[x265] [PATCH] asm: assembly code for sse_ss - 4xN, 8xN, 16xN
yuvaraj at multicorewareinc.com
yuvaraj at multicorewareinc.com
Mon Nov 25 14:01:08 CET 2013
# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1385384449 -19800
# Mon Nov 25 18:30:49 2013 +0530
# Node ID a69a8392ffeb32d5b136bd315b456b2067cceb29
# Parent 10f605bd053009c8c981c7529322fecd1e54af7b
asm: assembly code for sse_ss - 4xN, 8xN, 16xN
diff -r 10f605bd0530 -r a69a8392ffeb source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Fri Nov 22 14:59:34 2013 -0600
+++ b/source/common/x86/asm-primitives.cpp Mon Nov 25 18:30:49 2013 +0530
@@ -87,6 +87,21 @@
p.sse_pp[LUMA_32x64] = x265_pixel_ssd_32x64_ ## cpu; \
p.sse_pp[LUMA_16x64] = x265_pixel_ssd_16x64_ ## cpu
+#define ASSGN_SSE_SS(cpu) \
+ p.sse_ss[LUMA_4x4] = x265_pixel_ssd_ss_4x4_ ## cpu; \
+ p.sse_ss[LUMA_4x8] = x265_pixel_ssd_ss_4x8_ ## cpu; \
+ p.sse_ss[LUMA_4x16] = x265_pixel_ssd_ss_4x16_ ## cpu; \
+ p.sse_ss[LUMA_8x4] = x265_pixel_ssd_ss_8x4_ ## cpu; \
+ p.sse_ss[LUMA_8x8] = x265_pixel_ssd_ss_8x8_ ## cpu; \
+ p.sse_ss[LUMA_8x16] = x265_pixel_ssd_ss_8x16_ ## cpu; \
+ p.sse_ss[LUMA_8x32] = x265_pixel_ssd_ss_8x32_ ## cpu; \
+ p.sse_ss[LUMA_16x4] = x265_pixel_ssd_ss_16x4_ ## cpu; \
+ p.sse_ss[LUMA_16x8] = x265_pixel_ssd_ss_16x8_ ## cpu; \
+ p.sse_ss[LUMA_16x12] = x265_pixel_ssd_ss_16x12_ ## cpu; \
+ p.sse_ss[LUMA_16x16] = x265_pixel_ssd_ss_16x16_ ## cpu; \
+ p.sse_ss[LUMA_16x32] = x265_pixel_ssd_ss_16x32_ ## cpu; \
+ p.sse_ss[LUMA_16x64] = x265_pixel_ssd_ss_16x64_ ## cpu; \
+ /* wires up the int16_t-plane SSD (sse_ss) primitives for the 4xN/8xN/16xN luma partitions of one ISA level */
#define SA8D_INTER_FROM_BLOCK(cpu) \
p.sa8d_inter[LUMA_4x8] = x265_pixel_satd_4x8_ ## cpu; \
p.sa8d_inter[LUMA_8x4] = x265_pixel_satd_8x4_ ## cpu; \
@@ -464,6 +479,7 @@
p.sad[LUMA_12x16] = x265_pixel_sad_12x16_sse2;
ASSGN_SSE(sse2);
+ ASSGN_SSE_SS(sse2);
INIT2(sad, _sse2);
INIT2(sad_x3, _sse2);
INIT2(sad_x4, _sse2);
@@ -608,6 +624,7 @@
CHROMA_FILTERS(_sse4);
LUMA_FILTERS(_sse4);
HEVC_SATD(sse4);
+ ASSGN_SSE_SS(sse4);
p.chroma[X265_CSP_I420].copy_sp[CHROMA_2x4] = x265_blockcopy_sp_2x4_sse4;
p.chroma[X265_CSP_I420].copy_sp[CHROMA_2x8] = x265_blockcopy_sp_2x8_sse4;
p.chroma[X265_CSP_I420].copy_sp[CHROMA_6x8] = x265_blockcopy_sp_6x8_sse4;
@@ -675,6 +692,7 @@
SA8D_INTER_FROM_BLOCK(avx);
ASSGN_SSE(avx);
HEVC_SATD(avx);
+ ASSGN_SSE_SS(avx);
p.sad_x3[LUMA_12x16] = x265_pixel_sad_x3_12x16_avx;
p.sad_x4[LUMA_12x16] = x265_pixel_sad_x4_12x16_avx;
diff -r 10f605bd0530 -r a69a8392ffeb source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Fri Nov 22 14:59:34 2013 -0600
+++ b/source/common/x86/pixel-a.asm Mon Nov 25 18:30:49 2013 +0530
@@ -258,6 +258,134 @@
SSD_ONE 16, 16
%endif ; HIGH_BIT_DEPTH
+;-----------------------------------------------------------------------------
+; int pixel_ssd_ss_WxH( int16_t *, intptr_t, int16_t *, intptr_t )
+;-----------------------------------------------------------------------------
+%if HIGH_BIT_DEPTH == 0
+%macro SSD_SS 2 ; %1 = block width (4/8/16), %2 = block height
+cglobal pixel_ssd_ss_%1x%2, 4,7,6 ; r0 = a, r1 = strideA, r2 = b, r3 = strideB; returns sum((a-b)^2) in eax
+ FIX_STRIDES r1, r3 ; NOTE(review): no-op when HIGH_BIT_DEPTH == 0 -- presumably strides arrive in int16_t units, confirm against callers
+%if mmsize == %1*4 ; width 4: one 8-byte load covers a whole row; offsets select rows 1..3
+ %define offset0_1 r1*2 ; a: row 1 (stride * sizeof(int16_t) bytes)
+ %define offset0_2 r1*4 ; a: row 2
+ %define offset0_3 r5 ; a: row 3 (r5 = 6*r1, built below)
+ %define offset1_1 r3*2 ; b: row 1
+ %define offset1_2 r3*4 ; b: row 2
+ %define offset1_3 r6 ; b: row 3 (r6 = 6*r3)
+ lea r5, [4*r1]
+ lea r6, [4*r3]
+ lea r5, [r5 + 2*r1] ; r5 = 6*r1
+ lea r6, [r6 + 2*r3] ; r6 = 6*r3
+%elif mmsize == %1*2 ; width 8: a row is 16 bytes = two 8-byte halves; offsets cover rows 0 and 1
+ %define offset0_1 8 ; a: row 0, second half
+ %define offset0_2 r1*2 ; a: row 1, first half
+ %define offset0_3 r1*2+8 ; a: row 1, second half
+ %define offset1_1 8
+ %define offset1_2 r3*2
+ %define offset1_3 r3*2+8
+%elif mmsize == %1 ; width 16: a row is 32 bytes = four 8-byte quarters of row 0
+ %define offset0_1 8
+ %define offset0_2 16
+ %define offset0_3 24
+ %define offset1_1 8
+ %define offset1_2 16
+ %define offset1_3 24
+%endif
+%if %1 == 4
+ %assign %%n %2/(mmsize/%1) ; width 4: 4 rows consumed per loop iteration
+%else
+ %assign %%n %2/(2*mmsize/%1) ; widths 8/16: row count per iteration doubled by the %1 > 4 section below
+%endif
+%if %%n > 1
+ mov r4d, %%n ; loop counter
+%endif
+ pxor m0, m0 ; m0 = running SSD accumulator (4 packed dwords)
+.loop ; NOTE(review): label written without ':' -- NASM accepts it but emits an orphan-labels warning
+ pmovsxwd m1, [r0] ; NOTE(review): pmovsxwd/pmulld are SSE4.1 -- the INIT_XMM sse2 instantiation below will fault on SSE2-only CPUs
+ pmovsxwd m2, [r2] ; sign-extend 4 int16_t from each plane to dwords
+ psubd m1, m2 ; difference
+ pmulld m1, m1 ; square (SSE4.1)
+ paddd m0, m1 ; accumulate
+ pmovsxwd m1, [r0 + offset0_1]
+ pmovsxwd m2, [r2 + offset1_1]
+ psubd m1, m2
+ pmulld m1, m1
+ paddd m0, m1
+ pmovsxwd m1, [r0 + offset0_2]
+ pmovsxwd m2, [r2 + offset1_2]
+ psubd m1, m2
+ pmulld m1, m1
+ paddd m0, m1
+ pmovsxwd m1, [r0 + offset0_3]
+ pmovsxwd m2, [r2 + offset1_3]
+ psubd m1, m2
+ pmulld m1, m1
+ paddd m0, m1
+%if %1 > 4 ; widths 8/16: advance and process a second batch of rows
+ %assign %%m 4/(%1/8) ; byte advance r1*%%m: 2 rows for width 8, 1 row for width 16
+ lea r0, [r0+r1*%%m]
+ lea r2, [r2+r3*%%m]
+ pmovsxwd m1, [r0]
+ pmovsxwd m2, [r2]
+ psubd m1, m2
+ pmulld m1, m1
+ paddd m0, m1
+ pmovsxwd m1, [r0 + offset0_1]
+ pmovsxwd m2, [r2 + offset1_1]
+ psubd m1, m2
+ pmulld m1, m1
+ paddd m0, m1
+ pmovsxwd m1, [r0 + offset0_2]
+ pmovsxwd m2, [r2 + offset1_2]
+ psubd m1, m2
+ pmulld m1, m1
+ paddd m0, m1
+ pmovsxwd m1, [r0 + offset0_3]
+ pmovsxwd m2, [r2 + offset1_3]
+ psubd m1, m2
+ pmulld m1, m1
+ paddd m0, m1
+%endif
+%if %1 == 4
+ lea r0, [r0+r1*(%2/%%n)*2] ; step past the 4 rows just processed
+ lea r2, [r2+r3*(%2/%%n)*2]
+%else
+ lea r0, [r0+r1*(%2/%%n)] ; remaining step (the inner lea above already advanced partway)
+ lea r2, [r2+r3*(%2/%%n)]
+%endif
+%if %%n > 1
+ dec r4d
+ jg .loop
+%endif
+ phaddd m0, m0 ; NOTE(review): phaddd is SSSE3 -- also unavailable on plain SSE2
+ phaddd m0, m0 ; horizontal reduce 4 dwords -> 1
+ movd eax, m0 ; return the scalar sum
+ RET
+%endmacro
+%macro SSD_SS_ONE 0 ; instantiate every sse_ss block size registered by ASSGN_SSE_SS in asm-primitives.cpp
+SSD_SS 4, 4
+SSD_SS 4, 8
+SSD_SS 4, 16
+SSD_SS 8, 4
+SSD_SS 8, 8
+SSD_SS 8, 16
+SSD_SS 8, 32
+SSD_SS 16, 4
+SSD_SS 16, 8
+SSD_SS 16, 12
+SSD_SS 16, 16
+SSD_SS 16, 32
+SSD_SS 16, 64
+%endmacro
+
+INIT_XMM sse2 ; NOTE(review): SSD_SS emits pmovsxwd/pmulld (SSE4.1) and phaddd (SSSE3); this "sse2" build is mislabeled and will SIGILL on SSE2-only hardware
+SSD_SS_ONE
+INIT_XMM sse4 ; SSE4.1 build -- matches the instructions actually used
+SSD_SS_ONE
+INIT_XMM avx ; AVX (VEX-encoded) build of the same macro
+SSD_SS_ONE
+%endif ; !HIGH_BIT_DEPTH
+
%if HIGH_BIT_DEPTH == 0
%macro SSD_LOAD_FULL 5
mova m1, [t0+%1]
diff -r 10f605bd0530 -r a69a8392ffeb source/common/x86/pixel.h
--- a/source/common/x86/pixel.h Fri Nov 22 14:59:34 2013 -0600
+++ b/source/common/x86/pixel.h Mon Nov 25 18:30:49 2013 +0530
@@ -59,6 +59,9 @@
#define DECL_X1(name, suffix) \
DECL_PIXELS(int, name, suffix, (pixel *, intptr_t, pixel *, intptr_t))
+#define DECL_X1_SS(name, suffix) \
+ DECL_PIXELS(int, name, suffix, (int16_t *, intptr_t, int16_t *, intptr_t))
+ /* like DECL_X1, but for primitives that compare two int16_t planes instead of pixel planes */
#define DECL_X4(name, suffix) \
DECL_PIXELS(void, name ## _x3, suffix, (pixel *, pixel *, pixel *, pixel *, intptr_t, int *)) \
DECL_PIXELS(void, name ## _x4, suffix, (pixel *, pixel *, pixel *, pixel *, pixel *, intptr_t, int *))
@@ -86,6 +89,15 @@
DECL_X1(ssd, avx)
DECL_X1(ssd, xop)
DECL_X1(ssd, avx2)
+DECL_X1_SS(ssd_ss, mmx) /* declares x265_pixel_ssd_ss_WxH_<suffix> for every LUMA partition */
+DECL_X1_SS(ssd_ss, mmx2)
+DECL_X1_SS(ssd_ss, sse2slow)
+DECL_X1_SS(ssd_ss, sse2)
+DECL_X1_SS(ssd_ss, ssse3)
+DECL_X1_SS(ssd_ss, sse4)
+DECL_X1_SS(ssd_ss, avx)
+DECL_X1_SS(ssd_ss, xop)
+DECL_X1_SS(ssd_ss, avx2) /* NOTE(review): this patch only defines sse2/sse4/avx in pixel-a.asm; the other suffixes are declarations only, mirroring the DECL_X1 style above */
DECL_X1(satd, mmx2)
DECL_X1(satd, sse2)
DECL_X1(satd, ssse3)
More information about the x265-devel
mailing list