[x265] [PATCH] asm: use x264 code for sse2 sad[16x64]
sumalatha at multicorewareinc.com
sumalatha at multicorewareinc.com
Mon May 4 11:58:33 CEST 2015
# HG changeset patch
# User Sumalatha Polureddy
# Date 1430733507 -19800
# Mon May 04 15:28:27 2015 +0530
# Node ID 9a1900ca0e4660e28e1d92c0a2a771b94496706c
# Parent d3fa8d99e44ff4edbf589595401a307d3f79ed9b
asm: use x264 code for sse2 sad[16x64]
diff -r d3fa8d99e44f -r 9a1900ca0e46 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon May 04 14:08:10 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon May 04 15:28:27 2015 +0530
@@ -262,7 +262,7 @@
p.pu[LUMA_16x4].sad = x265_pixel_sad_16x4_ ## cpu; \
p.pu[LUMA_16x12].sad = x265_pixel_sad_16x12_ ## cpu; \
p.pu[LUMA_16x32].sad = x265_pixel_sad_16x32_ ## cpu; \
- /*p.pu[LUMA_16x64].sad = x265_pixel_sad_16x64_ ## cpu;*/ \
+ p.pu[LUMA_16x64].sad = x265_pixel_sad_16x64_ ## cpu; \
p.pu[LUMA_32x8].sad = x265_pixel_sad_32x8_ ## cpu; \
p.pu[LUMA_32x16].sad = x265_pixel_sad_32x16_ ## cpu; \
p.pu[LUMA_32x24].sad = x265_pixel_sad_32x24_ ## cpu; \
diff -r d3fa8d99e44f -r 9a1900ca0e46 source/common/x86/sad16-a.asm
--- a/source/common/x86/sad16-a.asm Mon May 04 14:08:10 2015 +0530
+++ b/source/common/x86/sad16-a.asm Mon May 04 15:28:27 2015 +0530
@@ -291,9 +291,40 @@
%endif
%endmacro
-;-----------------------------------------------------------------------------
-; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t )
-;-----------------------------------------------------------------------------
+%macro SAD_INC_2ROW_Nx64 1 ; arg 1 = block width in pixels. Adds the absolute differences of two rows into m0 / m8 and steps r0, r2 down two rows.
+%if 2*%1 > mmsize ; wide case: a row of 2*width bytes does not fit in one register, so each row takes two loads 16 bytes apart
+ movu m1, [r2 + 0]             ; first row of the r2 block, low half
+ movu m2, [r2 + 16]            ; first row of the r2 block, high half
+ movu m3, [r2 + 2 * r3 + 0]    ; second row of the r2 block (row pitch is 2*r3 bytes), low half
+ movu m4, [r2 + 2 * r3 + 16]   ; second row of the r2 block, high half
+ psubw m1, [r0 + 0]            ; word-wise difference against the matching row of the r0 block
+ psubw m2, [r0 + 16]
+ psubw m3, [r0 + 2 * r1 + 0]   ; second row of the r0 block (row pitch is 2*r1 bytes)
+ psubw m4, [r0 + 2 * r1 + 16]
+ ABSW2 m1, m2, m1, m2, m5, m6  ; ABSW2 is defined elsewhere; presumably m1 = |m1|, m2 = |m2| with m5, m6 as scratch - confirm
+ lea r0, [r0 + 4 * r1]         ; advance r0 by two rows (2 * 2*r1 bytes)
+ lea r2, [r2 + 4 * r3]         ; advance r2 by two rows (2 * 2*r3 bytes)
+ ABSW2 m3, m4, m3, m4, m7, m5  ; same for the second row; m5 is reused as a scratch operand here
+ paddw m1, m2                  ; fold the two halves of the first row together
+ paddw m3, m4                  ; fold the two halves of the second row together
+ paddw m0, m1                  ; m0 accumulates the first row of every pair (16-bit lanes)
+ paddw m8, m3                  ; m8 accumulates the second row of every pair (16-bit lanes)
+%else ; narrow case: one register load covers a whole row
+ movu m1, [r2]                 ; first row of the r2 block
+ movu m2, [r2 + 2 * r3]        ; second row of the r2 block
+ psubw m1, [r0]                ; word-wise difference against the first row of the r0 block
+ psubw m2, [r0 + 2 * r1]       ; word-wise difference against the second row of the r0 block
+ ABSW2 m1, m2, m1, m2, m3, m4  ; presumably absolute values of both rows with m3, m4 as scratch - confirm against the ABSW2 definition
+ lea r0, [r0 + 4 * r1]         ; advance r0 by two rows
+ lea r2, [r2 + 4 * r3]         ; advance r2 by two rows
+ paddw m0, m1                  ; m0 accumulates the first row of every pair
+ paddw m8, m2                  ; m8 accumulates the second row of every pair
+%endif
+%endmacro ; callers must zero m0 and m8 before the first use and combine them afterwards
+
+;-----------------------------------------------------------------------------
+; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t )
+;-----------------------------------------------------------------------------
%macro SAD 2
cglobal pixel_sad_%1x%2, 4,5-(%2&4/4),8*(%1/mmsize)
pxor m0, m0
@@ -317,12 +348,36 @@
RET
%endmacro
+;-----------------------------------------------------------------------------
+; int pixel_sad_Nx64( uint16_t *, intptr_t, uint16_t *, intptr_t )
+;-----------------------------------------------------------------------------
+%macro SAD_Nx64 1 ; arg 1 = block width in pixels; defines the 64-row SAD function for that width
+cglobal pixel_sad_%1x64, 4,5-(64&4/4), 9 ; 4 arguments; the GPR expression is 5 because '/' binds tighter than '&' (64&1 = 0), and r4 is the loop counter; 9 vector registers because m8 is used
+ pxor m0, m0                   ; clear the accumulator for the first row of each pair
+ pxor m8, m8                   ; clear the accumulator for the second row of each pair
+ mov r4d, 64 / 2               ; 32 iterations, each consuming two rows
+.loop:
+ SAD_INC_2ROW_Nx64 %1          ; adds two rows into m0 / m8 with 16-bit adds and advances r0, r2. NOTE(review): for width 16 each word lane sums 64 differences, which fits 16 bits only while every difference is at most 1023 - confirm the supported bit depth
+ dec r4d
+ jg .loop                      ; repeat while the row-pair counter is still positive
+
+ HADDUWD m0, m1                ; HADDUWD is defined elsewhere; presumably widens and adds adjacent unsigned words into dwords with m1 as scratch - confirm
+ HADDUWD m8, m1
+ HADDD m0, m1                  ; HADDD is defined elsewhere; presumably reduces the dwords to a single sum in the low lane - confirm
+ HADDD m8, m1
+ paddd m0, m8                  ; combine the two partial sums
+
+ movd eax, xm0                 ; low 32 bits of m0 become the int return value
+ RET
+%endmacro
+
INIT_XMM sse2
SAD 16, 4
SAD 16, 8
SAD 16, 12
SAD 16, 16
SAD 16, 32
+SAD_Nx64 16 ; 16x64 goes through the dedicated 64-row macro; this defines pixel_sad_16x64, which the asm-primitives.cpp hunk of this patch assigns to p.pu[LUMA_16x64].sad
INIT_XMM sse2
SAD 8, 4
More information about the x265-devel
mailing list