[x265] [PATCH] asm: assembly code for pixel_sse_ss_4x4
yuvaraj at multicorewareinc.com
yuvaraj at multicorewareinc.com
Fri Nov 22 14:28:27 CET 2013
# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1385126838 -19800
# Fri Nov 22 18:57:18 2013 +0530
# Node ID 98bcf33302ef613f814056d152c8b8deee2ee0a6
# Parent d2173ec27a151a1c1adaaba387f4f20a43742246
asm: assembly code for pixel_sse_ss_4x4
diff -r d2173ec27a15 -r 98bcf33302ef source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Nov 21 20:16:39 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp Fri Nov 22 18:57:18 2013 +0530
@@ -87,6 +87,9 @@
p.sse_pp[LUMA_32x64] = x265_pixel_ssd_32x64_ ## cpu; \
p.sse_pp[LUMA_16x64] = x265_pixel_ssd_16x64_ ## cpu
+// Install the assembly sse_ss primitives (int16_t block vs. int16_t block) for
+// the given CPU suffix.  The macro deliberately has no trailing semicolon: each
+// call site supplies its own, so the expansion stays a single statement.
+#define ASSGN_SSE_SS(cpu) \
+    p.sse_ss[LUMA_4x4] = x265_pixel_ssd_ss_4x4_ ## cpu
+
#define SA8D_INTER_FROM_BLOCK(cpu) \
p.sa8d[BLOCK_32x32] = x265_pixel_sa8d_32x32_ ## cpu; \
p.sa8d[BLOCK_64x64] = x265_pixel_sa8d_64x64_ ## cpu; \
@@ -489,6 +492,7 @@
p.sad[LUMA_12x16] = x265_pixel_sad_12x16_sse2;
ASSGN_SSE(sse2);
+ ASSGN_SSE_SS(sse2);
INIT2(sad, _sse2);
INIT2(sad_x3, _sse2);
INIT2(sad_x4, _sse2);
@@ -564,6 +568,7 @@
SA8D_INTER_FROM_BLOCK(ssse3);
p.sse_pp[LUMA_4x4] = x265_pixel_ssd_4x4_ssse3;
ASSGN_SSE(ssse3);
+ ASSGN_SSE_SS(ssse3);
PIXEL_AVG(ssse3);
PIXEL_AVG_W4(ssse3);
@@ -625,7 +630,7 @@
p.sa8d[BLOCK_8x8] = x265_pixel_sa8d_8x8_sse4;
p.sa8d[BLOCK_16x16] = x265_pixel_sa8d_16x16_sse4;
SA8D_INTER_FROM_BLOCK(sse4);
-
+ ASSGN_SSE_SS(sse4);
CHROMA_PIXELSUB_PS(_sse4);
CHROMA_FILTERS(_sse4);
@@ -675,6 +680,7 @@
SA8D_INTER_FROM_BLOCK(avx);
ASSGN_SSE(avx);
HEVC_SATD(avx);
+ ASSGN_SSE_SS(avx);
p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_avx;
p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_avx;
diff -r d2173ec27a15 -r 98bcf33302ef source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Thu Nov 21 20:16:39 2013 +0530
+++ b/source/common/x86/pixel-a.asm Fri Nov 22 18:57:18 2013 +0530
@@ -258,6 +258,45 @@
SSD_ONE 16, 16
%endif ; HIGH_BIT_DEPTH
+;-----------------------------------------------------------------------------
+; int pixel_ssd_ss_4x4( int16_t *pix1, intptr_t stride1,
+;                       int16_t *pix2, intptr_t stride2 )
+;
+; Sum of squared differences between two 4x4 blocks of signed 16-bit samples.
+; In:    r0 = pix1, r1 = stride1, r2 = pix2, r3 = stride2
+;        (strides are in int16_t elements; they are doubled to bytes below)
+; Out:   eax = sum over the block of (pix1 - pix2)^2, accumulated modulo 2^32
+; Clobb: r0-r3, m0-m2
+;-----------------------------------------------------------------------------
+
+; Add the squared differences of one row of four int16_t samples to m0.
+;   %1 = memory operand addressing the row in the first block
+;   %2 = memory operand addressing the row in the second block
+; Clobbers m1 and m2.
+; With SSE4.1 all four dwords of m0 carry partial sums.  Without it only
+; dwords 0 and 2 are meaningful: dwords 1 and 3 collect the high halves of the
+; 64-bit pmuludq products and must be ignored by the final reduction.
+%macro SSD_SS_ROW_W4 2
+%if cpuflag(sse4)
+    pmovsxwd    m1, %1              ; sign-extend 4 x int16 -> 4 x int32
+    pmovsxwd    m2, %2
+    psubd       m1, m2
+    pmulld      m1, m1              ; low 32 bits of each squared difference
+%else
+    ; SSE2-only equivalent of the path above.
+    movq        m1, %1
+    movq        m2, %2
+    punpcklwd   m1, m1              ; copy each word into the top of its dword
+    punpcklwd   m2, m2
+    psrad       m1, 16              ; arithmetic shift completes sign-extension
+    psrad       m2, 16
+    psubd       m1, m2              ; m1 = d0 d1 d2 d3
+    pshufd      m2, m1, 0xF5        ; m2 = d1 d1 d3 d3
+    pmuludq     m1, m1              ; dword0 = d0^2, dword2 = d2^2 (mod 2^32)
+    pmuludq     m2, m2              ; dword0 = d1^2, dword2 = d3^2 (mod 2^32)
+    paddd       m1, m2              ; the sign is irrelevant for the low 32 bits
+%endif
+    paddd       m0, m1
+%endmacro
+
+%macro HEVC_SSD_SS 0
+cglobal pixel_ssd_ss_4x4, 4,4,3
+    add         r1, r1              ; strides: int16_t elements -> bytes
+    add         r3, r3
+    pxor        m0, m0              ; m0 = running sum
+    SSD_SS_ROW_W4 [r0], [r2]
+    SSD_SS_ROW_W4 [r0 + r1], [r2 + r3]
+    lea         r0, [r0 + r1*2]     ; advance both blocks by two rows
+    lea         r2, [r2 + r3*2]
+    SSD_SS_ROW_W4 [r0], [r2]
+    SSD_SS_ROW_W4 [r0 + r1], [r2 + r3]
+    pshufd      m1, m0, 0x4E        ; swap 64-bit halves
+    paddd       m0, m1              ; dword0 = lane0 + lane2, dword1 = lane1 + lane3
+%if cpuflag(sse4)
+    pshufd      m1, m0, 0xB1        ; swap dwords within each half
+    paddd       m0, m1              ; fold in the odd lanes (valid only on this path)
+%endif
+    movd        eax, m0
+    RET
+%endmacro
+
%if HIGH_BIT_DEPTH == 0
%macro SSD_LOAD_FULL 5
mova m1, [t0+%1]
@@ -512,12 +551,17 @@
%define SSD_CORE SSD_CORE_SSE2
%define JOIN JOIN_SSE2
HEVC_SSD
+HEVC_SSD_SS
INIT_XMM ssse3
%define SSD_CORE SSD_CORE_SSSE3
%define JOIN JOIN_SSSE3
HEVC_SSD
+HEVC_SSD_SS
+INIT_XMM sse4
+HEVC_SSD_SS
INIT_XMM avx
HEVC_SSD
+HEVC_SSD_SS
INIT_MMX ssse3
SSD 4, 4
SSD 4, 8
diff -r d2173ec27a15 -r 98bcf33302ef source/common/x86/pixel.h
--- a/source/common/x86/pixel.h Thu Nov 21 20:16:39 2013 +0530
+++ b/source/common/x86/pixel.h Fri Nov 22 18:57:18 2013 +0530
@@ -59,6 +59,9 @@
#define DECL_X1(name, suffix) \
DECL_PIXELS(int, name, suffix, (pixel *, intptr_t, pixel *, intptr_t))
+/* Prototype helper for the "ss" pixel primitives: same parameter shape as
+ * DECL_X1, but both block pointers are int16_t * instead of pixel *.
+ * NOTE(review): DECL_PIXELS is defined outside this hunk; presumably it emits
+ * one prototype per block size for the given name/suffix - confirm there. */
+#define DECL_X1_SS(name, suffix) \
+ DECL_PIXELS(int, name, suffix, (int16_t *, intptr_t, int16_t *, intptr_t))
+
#define DECL_X4(name, suffix) \
DECL_PIXELS(void, name ## _x3, suffix, (pixel *, pixel *, pixel *, pixel *, intptr_t, int *)) \
DECL_PIXELS(void, name ## _x4, suffix, (pixel *, pixel *, pixel *, pixel *, pixel *, intptr_t, int *))
@@ -86,6 +89,15 @@
DECL_X1(ssd, avx)
DECL_X1(ssd, xop)
DECL_X1(ssd, avx2)
+/* ssd_ss prototypes, one per CPU suffix, alongside the DECL_X1(ssd, ...) list.
+ * NOTE(review): the pixel-a.asm part of this change instantiates HEVC_SSD_SS
+ * only under sse2, ssse3, sse4 and avx; the other suffixes declared here
+ * appear to have no definition yet - confirm before referencing them. */
+DECL_X1_SS(ssd_ss, mmx)
+DECL_X1_SS(ssd_ss, mmx2)
+DECL_X1_SS(ssd_ss, sse2slow)
+DECL_X1_SS(ssd_ss, sse2)
+DECL_X1_SS(ssd_ss, ssse3)
+DECL_X1_SS(ssd_ss, sse4)
+DECL_X1_SS(ssd_ss, avx)
+DECL_X1_SS(ssd_ss, xop)
+DECL_X1_SS(ssd_ss, avx2)
DECL_X1(satd, mmx2)
DECL_X1(satd, sse2)
DECL_X1(satd, ssse3)
More information about the x265-devel mailing list