[x265] [PATCH] asm: use x264 code for sse2 sad[16x64]

sumalatha at multicorewareinc.com sumalatha at multicorewareinc.com
Mon May 4 11:58:33 CEST 2015


# HG changeset patch
# User Sumalatha Polureddy
# Date 1430733507 -19800
#      Mon May 04 15:28:27 2015 +0530
# Node ID 9a1900ca0e4660e28e1d92c0a2a771b94496706c
# Parent  d3fa8d99e44ff4edbf589595401a307d3f79ed9b
asm: use x264 code for sse2 sad[16x64]

diff -r d3fa8d99e44f -r 9a1900ca0e46 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon May 04 14:08:10 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Mon May 04 15:28:27 2015 +0530
@@ -262,7 +262,7 @@
     p.pu[LUMA_16x4].sad  = x265_pixel_sad_16x4_ ## cpu; \
     p.pu[LUMA_16x12].sad = x265_pixel_sad_16x12_ ## cpu; \
     p.pu[LUMA_16x32].sad = x265_pixel_sad_16x32_ ## cpu; \
-    /*p.pu[LUMA_16x64].sad = x265_pixel_sad_16x64_ ## cpu;*/ \
+    p.pu[LUMA_16x64].sad = x265_pixel_sad_16x64_ ## cpu; \
     p.pu[LUMA_32x8].sad  = x265_pixel_sad_32x8_ ## cpu; \
     p.pu[LUMA_32x16].sad = x265_pixel_sad_32x16_ ## cpu; \
     p.pu[LUMA_32x24].sad = x265_pixel_sad_32x24_ ## cpu; \
diff -r d3fa8d99e44f -r 9a1900ca0e46 source/common/x86/sad16-a.asm
--- a/source/common/x86/sad16-a.asm	Mon May 04 14:08:10 2015 +0530
+++ b/source/common/x86/sad16-a.asm	Mon May 04 15:28:27 2015 +0530
@@ -291,9 +291,40 @@
 %endif
 %endmacro
 
-;-----------------------------------------------------------------------------
-; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t )
-;-----------------------------------------------------------------------------
+%macro SAD_INC_2ROW_Nx64 1
+%if 2*%1 > mmsize
+    movu    m1, [r2 + 0]
+    movu    m2, [r2 + 16]
+    movu    m3, [r2 + 2 * r3 + 0]
+    movu    m4, [r2 + 2 * r3 + 16]
+    psubw   m1, [r0 + 0]
+    psubw   m2, [r0 + 16]
+    psubw   m3, [r0 + 2 * r1 + 0]
+    psubw   m4, [r0 + 2 * r1 + 16]
+    ABSW2   m1, m2, m1, m2, m5, m6
+    lea     r0, [r0 + 4 * r1]
+    lea     r2, [r2 + 4 * r3]
+    ABSW2   m3, m4, m3, m4, m7, m5
+    paddw   m1, m2
+    paddw   m3, m4
+    paddw   m0, m1
+    paddw   m8, m3
+%else
+    movu    m1, [r2]
+    movu    m2, [r2 + 2 * r3]
+    psubw   m1, [r0]
+    psubw   m2, [r0 + 2 * r1]
+    ABSW2   m1, m2, m1, m2, m3, m4
+    lea     r0, [r0 + 4 * r1]
+    lea     r2, [r2 + 4 * r3]
+    paddw   m0, m1
+    paddw   m8, m2
+%endif
+%endmacro
+
+; ---------------------------------------------------------------------------- -
+; int pixel_sad_NxM(uint16_t *, intptr_t, uint16_t *, intptr_t)
+; ---------------------------------------------------------------------------- -
 %macro SAD 2
 cglobal pixel_sad_%1x%2, 4,5-(%2&4/4),8*(%1/mmsize)
     pxor    m0, m0
@@ -317,12 +348,36 @@
     RET
 %endmacro
 
+; ---------------------------------------------------------------------------- -
+; int pixel_sad_Nx64(uint16_t *, intptr_t, uint16_t *, intptr_t)
+; ---------------------------------------------------------------------------- -
+%macro SAD_Nx64 1
+cglobal pixel_sad_%1x64, 4,5-(64&4/4), 9
+    pxor    m0, m0
+    pxor    m8, m8
+    mov     r4d, 64 / 2
+.loop:
+    SAD_INC_2ROW_Nx64 %1
+    dec    r4d
+    jg .loop
+
+    HADDUWD m0, m1
+    HADDUWD m8, m1
+    HADDD   m0, m1
+    HADDD   m8, m1
+    paddd   m0, m8
+
+    movd    eax, xm0
+    RET
+%endmacro
+
 INIT_XMM sse2
 SAD  16,  4
 SAD  16,  8
 SAD  16, 12
 SAD  16, 16
 SAD  16, 32
+SAD_Nx64  16
 
 INIT_XMM sse2
 SAD  8,  4


More information about the x265-devel mailing list