[x264-devel] x86: port SSE2+ SATD functions to high bit depth

Oskar Arvidsson git at videolan.org
Wed Feb 27 00:18:04 CET 2013


x264 | branch: master | Oskar Arvidsson <oskar at irock.se> | Tue Jan 29 23:44:32 2013 +0100| [5c2ca5dee339a215cb331c426d40fa548675f088] | committer: Jason Garrett-Glaser

x86: port SSE2+ SATD functions to high bit depth

Makes SATD 20-50% faster across all partition sizes but 4x4.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=5c2ca5dee339a215cb331c426d40fa548675f088
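
For readers new to the metric: SATD is the sum of absolute values of the
Hadamard-transformed differences between a source block and its prediction.
The scalar 4x4 sketch below is illustrative, in the spirit of x264's C
reference, and is not the code from this patch. It shows why high bit depth
affects the SIMD: x264's pixel type widens from uint8_t to uint16_t, so the
byte-oriented SSE2 kernels must switch to word loads and word arithmetic.

    #include <stdint.h>
    #include <stdlib.h>

    /* Illustrative scalar SATD for one 4x4 block (not the patched code).
     * At HIGH_BIT_DEPTH x264 typedefs pixel to uint16_t, which is why the
     * assembly below loads words, uses psubw, and rescales strides. */
    typedef uint16_t pixel;

    static int satd_4x4( const pixel *pix1, intptr_t i_pix1,
                         const pixel *pix2, intptr_t i_pix2 )
    {
        int d[4][4], sum = 0;
        for( int i = 0; i < 4; i++ )
            for( int j = 0; j < 4; j++ )
                d[i][j] = pix1[i*i_pix1+j] - pix2[i*i_pix2+j];
        for( int i = 0; i < 4; i++ )
        {   /* horizontal 4-point Hadamard on each row */
            int s01 = d[i][0]+d[i][1], d01 = d[i][0]-d[i][1];
            int s23 = d[i][2]+d[i][3], d23 = d[i][2]-d[i][3];
            d[i][0] = s01+s23; d[i][1] = s01-s23;
            d[i][2] = d01+d23; d[i][3] = d01-d23;
        }
        for( int j = 0; j < 4; j++ )
        {   /* vertical 4-point Hadamard on each column, sum |coeff| */
            int s01 = d[0][j]+d[1][j], d01 = d[0][j]-d[1][j];
            int s23 = d[2][j]+d[3][j], d23 = d[2][j]-d[3][j];
            sum += abs(s01+s23) + abs(s01-s23)
                 + abs(d01+d23) + abs(d01-d23);
        }
        return sum >> 1; /* x264 scales SATD by 1/2 */
    }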
---

 common/pixel.c         |    8 ++++
 common/x86/pixel-a.asm |  103 ++++++++++++++++++++++++++++++++++--------------
 common/x86/x86util.asm |   22 +++++++----
 3 files changed, 96 insertions(+), 37 deletions(-)
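
A note on the pixel.c hunks: the INITn helpers fill the dispatch table for
the n largest partition sizes, so INIT6( satd, _sse2 ) covers 16x16 down to
4x8, PIXEL_4x16 gets an explicit assignment because it has no INITn slot,
and 4x4 is deliberately left on its old path, matching the "all partition
sizes but 4x4" note above. Approximately, paraphrasing the INIT macros
rather than quoting their literal definitions, the first hunk expands to:

    pixf->satd[PIXEL_16x16] = x264_pixel_satd_16x16_sse2;
    pixf->satd[PIXEL_16x8]  = x264_pixel_satd_16x8_sse2;
    pixf->satd[PIXEL_8x16]  = x264_pixel_satd_8x16_sse2;
    pixf->satd[PIXEL_8x8]   = x264_pixel_satd_8x8_sse2;
    pixf->satd[PIXEL_8x4]   = x264_pixel_satd_8x4_sse2;
    pixf->satd[PIXEL_4x8]   = x264_pixel_satd_4x8_sse2;
    pixf->satd[PIXEL_4x16]  = x264_pixel_satd_4x16_sse2; /* by hand */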

diff --git a/common/pixel.c b/common/pixel.c
index 4df3892..a09d567 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -874,6 +874,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
     {
         INIT4_NAME( sad_aligned, sad, _sse2_aligned );
         INIT5( ssd, _sse2 );
+        INIT6( satd, _sse2 );
+        pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_sse2;
 
         pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2;
         pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_sse2;
@@ -924,6 +926,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         INIT7( sad_x3, _ssse3 );
         INIT7( sad_x4, _ssse3 );
         INIT_ADS( _ssse3 );
+        INIT6( satd, _ssse3 );
+        pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_ssse3;
 
         if( !(cpu&X264_CPU_STACK_MOD4) )
         {
@@ -941,6 +945,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
     }
     if( cpu&X264_CPU_SSE4 )
     {
+        INIT6( satd, _sse4 );
+        pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_sse4;
         if( !(cpu&X264_CPU_STACK_MOD4) )
         {
             INIT4( hadamard_ac, _sse4 );
@@ -951,6 +957,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
     if( cpu&X264_CPU_AVX )
     {
         INIT_ADS( _avx );
+        INIT6( satd, _avx );
+        pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_avx;
         if( !(cpu&X264_CPU_STACK_MOD4) )
         {
             INIT4( hadamard_ac, _avx );
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index d4aaa3f..587cc80 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -1021,7 +1021,7 @@ VAR2_8x8_SSSE3 16, 7
     DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
 %endmacro
 
-%macro LOAD_SUMSUB_8x4P_SSSE3 7-10 r0, r2, 0
+%macro LOAD_SUMSUB_8x4P_SSSE3 7-11 r0, r2, 0, 0
 ; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
     LOAD_SUMSUB_8x2P %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3]
     LOAD_SUMSUB_8x2P %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5]
@@ -1079,7 +1079,7 @@ VAR2_8x8_SSSE3 16, 7
 %endmacro
 
 %macro SATD_8x4_SSE 8-9
-%ifidn %1, sse2
+%if %1
     HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax
 %else
     HADAMARD4_V %2, %3, %4, %5, %6
@@ -1093,7 +1093,7 @@ VAR2_8x8_SSSE3 16, 7
 %else
     SWAP %8, %2
 %endif
-%ifidn %1, sse2
+%if %1
     paddw m%8, m%4
 %else
     HADAMARD 1, max, %3, %5, %6, %7
@@ -1248,8 +1248,11 @@ cglobal pixel_satd_4x4, 4,6
     SATD_4x4_MMX m0, 0, 0
     SATD_END_MMX
 
-%macro SATD_START_SSE2 2
-%if cpuflag(ssse3)
+%macro SATD_START_SSE2 2-3 0
+    FIX_STRIDES r1, r3
+%if HIGH_BIT_DEPTH && %3
+    pxor    %2, %2
+%elif cpuflag(ssse3)
     mova    %2, [hmul_8p]
 %endif
     lea     r4, [3*r1]
@@ -1257,12 +1260,27 @@ cglobal pixel_satd_4x4, 4,6
     pxor    %1, %1
 %endmacro
 
-%macro SATD_END_SSE2 1
+%macro SATD_END_SSE2 1-2
+%if HIGH_BIT_DEPTH
+    HADDUW  %1, m0
+%if %0 == 2
+    paddd   %1, %2
+%endif
+%else
     HADDW   %1, m7
+%endif
     movd   eax, %1
     RET
 %endmacro
 
+%macro SATD_ACCUM 3
+%if HIGH_BIT_DEPTH
+    HADDUW %1, %2
+    paddd  %3, %1
+    pxor   %1, %1
+%endif
+%endmacro
+
 %macro BACKUP_POINTERS 0
 %if ARCH_X86_64
 %if WIN64
@@ -1275,20 +1293,44 @@ cglobal pixel_satd_4x4, 4,6
 
 %macro RESTORE_AND_INC_POINTERS 0
 %if ARCH_X86_64
-    lea     r0, [r6+8]
-    lea     r2, [r7+8]
+    lea     r0, [r6+8*SIZEOF_PIXEL]
+    lea     r2, [r7+8*SIZEOF_PIXEL]
 %if WIN64
     POP r7
 %endif
 %else
     mov     r0, r0mp
     mov     r2, r2mp
-    add     r0, 8
-    add     r2, 8
+    add     r0, 8*SIZEOF_PIXEL
+    add     r2, 8*SIZEOF_PIXEL
 %endif
 %endmacro
 
 %macro SATD_4x8_SSE 2
+%if HIGH_BIT_DEPTH
+    movh    m0, [r0+0*r1]
+    movh    m4, [r2+0*r3]
+    movh    m1, [r0+1*r1]
+    movh    m5, [r2+1*r3]
+    movhps  m0, [r0+4*r1]
+    movhps  m4, [r2+4*r3]
+    movh    m2, [r0+2*r1]
+    movh    m6, [r2+2*r3]
+    psubw   m0, m4
+    movh    m3, [r0+r4]
+    movh    m4, [r2+r5]
+    lea     r0, [r0+4*r1]
+    lea     r2, [r2+4*r3]
+    movhps  m1, [r0+1*r1]
+    movhps  m5, [r2+1*r3]
+    movhps  m2, [r0+2*r1]
+    movhps  m6, [r2+2*r3]
+    psubw   m1, m5
+    movhps  m3, [r0+r4]
+    movhps  m4, [r2+r5]
+    psubw   m2, m6
+    psubw   m3, m4
+%else ; !HIGH_BIT_DEPTH
     movd m4, [r2]
     movd m5, [r2+r3]
     movd m6, [r2+2*r3]
@@ -1329,7 +1371,8 @@ cglobal pixel_satd_4x4, 4,6
 %else
     DIFFOP 2, 6, 3, 5, 7
 %endif
-    SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 7, %2
+%endif ; HIGH_BIT_DEPTH
+    SATD_8x4_SSE (HIGH_BIT_DEPTH || cpuflags == cpuflags_sse2), 0, 1, 2, 3, 4, 5, 7, %2
 %endmacro
 
 ;-----------------------------------------------------------------------------
@@ -1355,43 +1398,42 @@ cglobal pixel_satd_4x4, 4, 6, 6
 
 cglobal pixel_satd_4x8, 4, 6, 8
     SATD_START_MMX
-%if cpuflag(ssse3)
-    mova m7, [hmul_4p]
+%if HIGH_BIT_DEPTH == 0 && cpuflag(ssse3)
+    mova   m7, [hmul_4p]
 %endif
     SATD_4x8_SSE 0, swap
-    HADDW m7, m1
-    movd eax, m7
+    HADDW  m7, m1
+    movd  eax, m7
     RET
 
 cglobal pixel_satd_4x16, 4, 6, 8
     SATD_START_MMX
-%if cpuflag(ssse3)
+%if HIGH_BIT_DEPTH == 0 && cpuflag(ssse3)
     mova m7, [hmul_4p]
 %endif
     SATD_4x8_SSE 0, swap
-    lea r0, [r0+r1*2]
-    lea r2, [r2+r3*2]
+    lea r0, [r0+r1*2*SIZEOF_PIXEL]
+    lea r2, [r2+r3*2*SIZEOF_PIXEL]
     SATD_4x8_SSE 1, add
     HADDW m7, m1
     movd eax, m7
     RET
 
 cglobal pixel_satd_8x8_internal
-    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1
-    SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 6
+    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
+    SATD_8x4_SSE (HIGH_BIT_DEPTH || cpuflags == cpuflags_sse2), 0, 1, 2, 3, 4, 5, 6
 %%pixel_satd_8x4_internal:
-    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1
-    SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 6
+    LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
+    SATD_8x4_SSE (HIGH_BIT_DEPTH || cpuflags == cpuflags_sse2), 0, 1, 2, 3, 4, 5, 6
     ret
 
-%if UNIX64 ; 16x8 regresses on phenom win64, 16x16 is almost the same
+%if HIGH_BIT_DEPTH == 0 && UNIX64 ; 16x8 regresses on phenom win64, 16x16 is almost the same
 cglobal pixel_satd_16x4_internal
     LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11
     lea  r2, [r2+4*r3]
     lea  r0, [r0+4*r1]
-    ; FIXME: this doesn't really mean ssse3, but rather selects between two different behaviors implemented with sse2?
-    SATD_8x4_SSE ssse3, 0, 1, 2, 3, 6, 11, 10
-    SATD_8x4_SSE ssse3, 4, 8, 5, 9, 6, 3, 10
+    SATD_8x4_SSE 0, 0, 1, 2, 3, 6, 11, 10
+    SATD_8x4_SSE 0, 4, 8, 5, 9, 6, 3, 10
     ret
 
 cglobal pixel_satd_16x8, 4,6,12
@@ -1422,14 +1464,15 @@ cglobal pixel_satd_16x8, 4,6,8
     SATD_END_SSE2 m6
 
 cglobal pixel_satd_16x16, 4,6,8
-    SATD_START_SSE2 m6, m7
+    SATD_START_SSE2 m6, m7, 1
     BACKUP_POINTERS
     call pixel_satd_8x8_internal
     call pixel_satd_8x8_internal
+    SATD_ACCUM m6, m0, m7
     RESTORE_AND_INC_POINTERS
     call pixel_satd_8x8_internal
     call pixel_satd_8x8_internal
-    SATD_END_SSE2 m6
+    SATD_END_SSE2 m6, m7
 %endif
 
 cglobal pixel_satd_8x16, 4,6,8
@@ -2525,7 +2568,7 @@ ALIGN 16
     psubw      m1, m9
     psubw      m2, m10
     psubw      m3, m11
-    SATD_8x4_SSE cpuname, 0, 1, 2, 3, 13, 14, 0, swap
+    SATD_8x4_SSE (cpuflags == cpuflags_sse2), 0, 1, 2, 3, 13, 14, 0, swap
     pmaddwd    m0, [pw_1]
 %if cpuflag(sse4)
     pshufd     m1, m0, q0032
@@ -2633,7 +2676,7 @@ ALIGN 16
     psubw      m2, [fenc_buf+0x20]
 .satd_8x4b:
     psubw      m3, [fenc_buf+0x30]
-    SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 0, swap
+    SATD_8x4_SSE (cpuflags == cpuflags_sse2), 0, 1, 2, 3, 4, 5, 0, swap
     pmaddwd    m0, [pw_1]
 %if cpuflag(sse4)
     pshufd     m1, m0, q0032
diff --git a/common/x86/x86util.asm b/common/x86/x86util.asm
index 4b88863..dc189c0 100644
--- a/common/x86/x86util.asm
+++ b/common/x86/x86util.asm
@@ -675,11 +675,18 @@
 %endmacro
 
 
-%macro LOAD_DIFF 5
+%macro LOAD_DIFF 5-6 1
 %if HIGH_BIT_DEPTH
+%if %6 ; %5 aligned?
     mova       %1, %4
     psubw      %1, %5
-%elifidn %3, none
+%else
+    movu       %1, %4
+    movu       %2, %5
+    psubw      %1, %2
+%endif
+%else ; !HIGH_BIT_DEPTH
+%ifidn %3, none
     movh       %1, %4
     movh       %2, %5
     punpcklbw  %1, %2
@@ -692,6 +699,7 @@
     punpcklbw  %2, %3
     psubw      %1, %2
 %endif
+%endif ; HIGH_BIT_DEPTH
 %endmacro
 
 %macro LOAD_DIFF8x4 8 ; 4x dst, 1x tmp, 1x mul, 2x ptr
@@ -742,11 +750,11 @@
     movh   [r0+3*FDEC_STRIDE], %4
 %endmacro
 
-%macro LOAD_DIFF_8x4P 7-10 r0,r2,0 ; 4x dest, 2x temp, 2x pointer, increment?
-    LOAD_DIFF m%1, m%5, m%7, [%8],      [%9]
-    LOAD_DIFF m%2, m%6, m%7, [%8+r1],   [%9+r3]
-    LOAD_DIFF m%3, m%5, m%7, [%8+2*r1], [%9+2*r3]
-    LOAD_DIFF m%4, m%6, m%7, [%8+r4],   [%9+r5]
+%macro LOAD_DIFF_8x4P 7-11 r0,r2,0,1 ; 4x dest, 2x temp, 2x pointer, increment, aligned?
+    LOAD_DIFF m%1, m%5, m%7, [%8],      [%9],      %11
+    LOAD_DIFF m%2, m%6, m%7, [%8+r1],   [%9+r3],   %11
+    LOAD_DIFF m%3, m%5, m%7, [%8+2*r1], [%9+2*r3], %11
+    LOAD_DIFF m%4, m%6, m%7, [%8+r4],   [%9+r5],   %11
 %if %10
     lea %8, [%8+4*r1]
     lea %9, [%9+4*r3]
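
Two design notes on the assembly. First, the SIZEOF_PIXEL and FIX_STRIDES
edits simply rescale byte offsets and stride registers for 2-byte pixels.
Second, the new SATD_ACCUM macro exists because the SSE2 kernels keep eight
16-bit running sums in m6: that was already safe for 8-bit input across a
whole 16x16 block, but 10-bit differences are roughly four times larger, so
pixel_satd_16x16 now spills the word lanes into a 32-bit accumulator (m7)
after the first 8x16 half, and SATD_END_SSE2 folds the two together at the
end. A scalar model of the scheme, with hypothetical names:

    #include <stdint.h>

    /* Hypothetical stand-in for the lane sum two 8x8 calls contribute. */
    extern uint32_t satd_8x16_lane_sum( int half );

    static uint32_t satd_16x16_model( void )
    {
        uint16_t word_acc  = 0; /* one 16-bit lane of m6; could wrap at 10-bit */
        uint32_t dword_acc = 0; /* m7, zeroed by SATD_START_SSE2 (pxor)        */

        word_acc += satd_8x16_lane_sum( 0 );   /* left 8x16 half */
        /* SATD_ACCUM: HADDUW widens m6 to dwords, paddd into m7, then
         * pxor m6 so the word lanes restart before they can overflow. */
        dword_acc += word_acc;
        word_acc   = 0;

        word_acc += satd_8x16_lane_sum( 1 );   /* right 8x16 half */
        /* SATD_END_SSE2 m6, m7: final HADDUW plus the spilled half. */
        return dword_acc + word_acc;
    }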


