[x264-devel] x86: port SSE2+ SATD functions to high bit depth
Oskar Arvidsson
git at videolan.org
Wed Feb 27 00:18:04 CET 2013
x264 | branch: master | Oskar Arvidsson <oskar at irock.se> | Tue Jan 29 23:44:32 2013 +0100| [5c2ca5dee339a215cb331c426d40fa548675f088] | committer: Jason Garrett-Glaser
x86: port SSE2+ SATD functions to high bit depth
Makes SATD 20-50% faster across all partition sizes except 4x4.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=5c2ca5dee339a215cb331c426d40fa548675f088
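For context (not part of the commit): SATD is the sum of absolute values of the 2D Hadamard transform of the residual between the two blocks, halved. Below is a minimal scalar sketch of the 4x4 case; the helper name satd_4x4_ref and the plain-C loop structure are illustrative only, not code from x264. The SIMD routines in the diff compute the same quantity with packed 16-bit arithmetic.

#include <stdint.h>
#include <stdlib.h>

/* Illustrative reference (hypothetical helper, not from this patch):
 * SATD of a 4x4 block = sum of |2D Hadamard(pix1 - pix2)| / 2. */
static int satd_4x4_ref( const uint16_t *pix1, int stride1,
                         const uint16_t *pix2, int stride2 )
{
    int d[4][4], sum = 0;
    /* residual between the two blocks */
    for( int i = 0; i < 4; i++ )
        for( int j = 0; j < 4; j++ )
            d[i][j] = pix1[i*stride1+j] - pix2[i*stride2+j];
    /* horizontal 4-point Hadamard transform on each row */
    for( int i = 0; i < 4; i++ )
    {
        int a0 = d[i][0]+d[i][1], a1 = d[i][0]-d[i][1];
        int a2 = d[i][2]+d[i][3], a3 = d[i][2]-d[i][3];
        d[i][0] = a0+a2; d[i][1] = a1+a3;
        d[i][2] = a0-a2; d[i][3] = a1-a3;
    }
    /* vertical transform on each column, then sum of absolute values */
    for( int j = 0; j < 4; j++ )
    {
        int a0 = d[0][j]+d[1][j], a1 = d[0][j]-d[1][j];
        int a2 = d[2][j]+d[3][j], a3 = d[2][j]-d[3][j];
        sum += abs(a0+a2) + abs(a1+a3) + abs(a0-a2) + abs(a1-a3);
    }
    return sum / 2;
}

With 8-bit pixels the intermediate values fit comfortably in 16-bit SIMD lanes, so word-sized running sums are enough; with 10-bit input the per-block sums can exceed 16 bits, which is what the HADDUW widening and the new SATD_ACCUM macro in the diff below deal with by accumulating partial results in 32-bit.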
---
common/pixel.c | 8 ++++
common/x86/pixel-a.asm | 103 ++++++++++++++++++++++++++++++++++--------------
common/x86/x86util.asm | 22 +++++++----
3 files changed, 96 insertions(+), 37 deletions(-)
diff --git a/common/pixel.c b/common/pixel.c
index 4df3892..a09d567 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -874,6 +874,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
{
INIT4_NAME( sad_aligned, sad, _sse2_aligned );
INIT5( ssd, _sse2 );
+ INIT6( satd, _sse2 );
+ pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_sse2;
pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse2;
@@ -924,6 +926,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT7( sad_x3, _ssse3 );
INIT7( sad_x4, _ssse3 );
INIT_ADS( _ssse3 );
+ INIT6( satd, _ssse3 );
+ pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_ssse3;
if( !(cpu&X264_CPU_STACK_MOD4) )
{
@@ -941,6 +945,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
}
if( cpu&X264_CPU_SSE4 )
{
+ INIT6( satd, _sse4 );
+ pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_sse4;
if( !(cpu&X264_CPU_STACK_MOD4) )
{
INIT4( hadamard_ac, _sse4 );
@@ -951,6 +957,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
if( cpu&X264_CPU_AVX )
{
INIT_ADS( _avx );
+ INIT6( satd, _avx );
+ pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_avx;
if( !(cpu&X264_CPU_STACK_MOD4) )
{
INIT4( hadamard_ac, _avx );
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index d4aaa3f..587cc80 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -1021,7 +1021,7 @@ VAR2_8x8_SSSE3 16, 7
DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
%endmacro
-%macro LOAD_SUMSUB_8x4P_SSSE3 7-10 r0, r2, 0
+%macro LOAD_SUMSUB_8x4P_SSSE3 7-11 r0, r2, 0, 0
; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
LOAD_SUMSUB_8x2P %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3]
LOAD_SUMSUB_8x2P %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5]
@@ -1079,7 +1079,7 @@ VAR2_8x8_SSSE3 16, 7
%endmacro
%macro SATD_8x4_SSE 8-9
-%ifidn %1, sse2
+%if %1
HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax
%else
HADAMARD4_V %2, %3, %4, %5, %6
@@ -1093,7 +1093,7 @@ VAR2_8x8_SSSE3 16, 7
%else
SWAP %8, %2
%endif
-%ifidn %1, sse2
+%if %1
paddw m%8, m%4
%else
HADAMARD 1, max, %3, %5, %6, %7
@@ -1248,8 +1248,11 @@ cglobal pixel_satd_4x4, 4,6
SATD_4x4_MMX m0, 0, 0
SATD_END_MMX
-%macro SATD_START_SSE2 2
-%if cpuflag(ssse3)
+%macro SATD_START_SSE2 2-3 0
+ FIX_STRIDES r1, r3
+%if HIGH_BIT_DEPTH && %3
+ pxor %2, %2
+%elif cpuflag(ssse3)
mova %2, [hmul_8p]
%endif
lea r4, [3*r1]
@@ -1257,12 +1260,27 @@ cglobal pixel_satd_4x4, 4,6
pxor %1, %1
%endmacro
-%macro SATD_END_SSE2 1
+%macro SATD_END_SSE2 1-2
+%if HIGH_BIT_DEPTH
+ HADDUW %1, m0
+%if %0 == 2
+ paddd %1, %2
+%endif
+%else
HADDW %1, m7
+%endif
movd eax, %1
RET
%endmacro
+%macro SATD_ACCUM 3
+%if HIGH_BIT_DEPTH
+ HADDUW %1, %2
+ paddd %3, %1
+ pxor %1, %1
+%endif
+%endmacro
+
%macro BACKUP_POINTERS 0
%if ARCH_X86_64
%if WIN64
@@ -1275,20 +1293,44 @@ cglobal pixel_satd_4x4, 4,6
%macro RESTORE_AND_INC_POINTERS 0
%if ARCH_X86_64
- lea r0, [r6+8]
- lea r2, [r7+8]
+ lea r0, [r6+8*SIZEOF_PIXEL]
+ lea r2, [r7+8*SIZEOF_PIXEL]
%if WIN64
POP r7
%endif
%else
mov r0, r0mp
mov r2, r2mp
- add r0, 8
- add r2, 8
+ add r0, 8*SIZEOF_PIXEL
+ add r2, 8*SIZEOF_PIXEL
%endif
%endmacro
%macro SATD_4x8_SSE 2
+%if HIGH_BIT_DEPTH
+ movh m0, [r0+0*r1]
+ movh m4, [r2+0*r3]
+ movh m1, [r0+1*r1]
+ movh m5, [r2+1*r3]
+ movhps m0, [r0+4*r1]
+ movhps m4, [r2+4*r3]
+ movh m2, [r0+2*r1]
+ movh m6, [r2+2*r3]
+ psubw m0, m4
+ movh m3, [r0+r4]
+ movh m4, [r2+r5]
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ movhps m1, [r0+1*r1]
+ movhps m5, [r2+1*r3]
+ movhps m2, [r0+2*r1]
+ movhps m6, [r2+2*r3]
+ psubw m1, m5
+ movhps m3, [r0+r4]
+ movhps m4, [r2+r5]
+ psubw m2, m6
+ psubw m3, m4
+%else ; !HIGH_BIT_DEPTH
movd m4, [r2]
movd m5, [r2+r3]
movd m6, [r2+2*r3]
@@ -1329,7 +1371,8 @@ cglobal pixel_satd_4x4, 4,6
%else
DIFFOP 2, 6, 3, 5, 7
%endif
- SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 7, %2
+%endif ; HIGH_BIT_DEPTH
+ SATD_8x4_SSE (HIGH_BIT_DEPTH || cpuflags == cpuflags_sse2), 0, 1, 2, 3, 4, 5, 7, %2
%endmacro
;-----------------------------------------------------------------------------
@@ -1355,43 +1398,42 @@ cglobal pixel_satd_4x4, 4, 6, 6
cglobal pixel_satd_4x8, 4, 6, 8
SATD_START_MMX
-%if cpuflag(ssse3)
- mova m7, [hmul_4p]
+%if HIGH_BIT_DEPTH == 0 && cpuflag(ssse3)
+ mova m7, [hmul_4p]
%endif
SATD_4x8_SSE 0, swap
- HADDW m7, m1
- movd eax, m7
+ HADDW m7, m1
+ movd eax, m7
RET
cglobal pixel_satd_4x16, 4, 6, 8
SATD_START_MMX
-%if cpuflag(ssse3)
+%if HIGH_BIT_DEPTH == 0 && cpuflag(ssse3)
mova m7, [hmul_4p]
%endif
SATD_4x8_SSE 0, swap
- lea r0, [r0+r1*2]
- lea r2, [r2+r3*2]
+ lea r0, [r0+r1*2*SIZEOF_PIXEL]
+ lea r2, [r2+r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE 1, add
HADDW m7, m1
movd eax, m7
RET
cglobal pixel_satd_8x8_internal
- LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1
- SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 6
+ LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
+ SATD_8x4_SSE (HIGH_BIT_DEPTH || cpuflags == cpuflags_sse2), 0, 1, 2, 3, 4, 5, 6
%%pixel_satd_8x4_internal:
- LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1
- SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 6
+ LOAD_SUMSUB_8x4P 0, 1, 2, 3, 4, 5, 7, r0, r2, 1, 0
+ SATD_8x4_SSE (HIGH_BIT_DEPTH || cpuflags == cpuflags_sse2), 0, 1, 2, 3, 4, 5, 6
ret
-%if UNIX64 ; 16x8 regresses on phenom win64, 16x16 is almost the same
+%if HIGH_BIT_DEPTH == 0 && UNIX64 ; 16x8 regresses on phenom win64, 16x16 is almost the same
cglobal pixel_satd_16x4_internal
LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11
lea r2, [r2+4*r3]
lea r0, [r0+4*r1]
- ; FIXME: this doesn't really mean ssse3, but rather selects between two different behaviors implemented with sse2?
- SATD_8x4_SSE ssse3, 0, 1, 2, 3, 6, 11, 10
- SATD_8x4_SSE ssse3, 4, 8, 5, 9, 6, 3, 10
+ SATD_8x4_SSE 0, 0, 1, 2, 3, 6, 11, 10
+ SATD_8x4_SSE 0, 4, 8, 5, 9, 6, 3, 10
ret
cglobal pixel_satd_16x8, 4,6,12
@@ -1422,14 +1464,15 @@ cglobal pixel_satd_16x8, 4,6,8
SATD_END_SSE2 m6
cglobal pixel_satd_16x16, 4,6,8
- SATD_START_SSE2 m6, m7
+ SATD_START_SSE2 m6, m7, 1
BACKUP_POINTERS
call pixel_satd_8x8_internal
call pixel_satd_8x8_internal
+ SATD_ACCUM m6, m0, m7
RESTORE_AND_INC_POINTERS
call pixel_satd_8x8_internal
call pixel_satd_8x8_internal
- SATD_END_SSE2 m6
+ SATD_END_SSE2 m6, m7
%endif
cglobal pixel_satd_8x16, 4,6,8
@@ -2525,7 +2568,7 @@ ALIGN 16
psubw m1, m9
psubw m2, m10
psubw m3, m11
- SATD_8x4_SSE cpuname, 0, 1, 2, 3, 13, 14, 0, swap
+ SATD_8x4_SSE (cpuflags == cpuflags_sse2), 0, 1, 2, 3, 13, 14, 0, swap
pmaddwd m0, [pw_1]
%if cpuflag(sse4)
pshufd m1, m0, q0032
@@ -2633,7 +2676,7 @@ ALIGN 16
psubw m2, [fenc_buf+0x20]
.satd_8x4b:
psubw m3, [fenc_buf+0x30]
- SATD_8x4_SSE cpuname, 0, 1, 2, 3, 4, 5, 0, swap
+ SATD_8x4_SSE (cpuflags == cpuflags_sse2), 0, 1, 2, 3, 4, 5, 0, swap
pmaddwd m0, [pw_1]
%if cpuflag(sse4)
pshufd m1, m0, q0032
diff --git a/common/x86/x86util.asm b/common/x86/x86util.asm
index 4b88863..dc189c0 100644
--- a/common/x86/x86util.asm
+++ b/common/x86/x86util.asm
@@ -675,11 +675,18 @@
%endmacro
-%macro LOAD_DIFF 5
+%macro LOAD_DIFF 5-6 1
%if HIGH_BIT_DEPTH
+%if %6 ; %5 aligned?
mova %1, %4
psubw %1, %5
-%elifidn %3, none
+%else
+ movu %1, %4
+ movu %2, %5
+ psubw %1, %2
+%endif
+%else ; !HIGH_BIT_DEPTH
+%ifidn %3, none
movh %1, %4
movh %2, %5
punpcklbw %1, %2
@@ -692,6 +699,7 @@
punpcklbw %2, %3
psubw %1, %2
%endif
+%endif ; HIGH_BIT_DEPTH
%endmacro
%macro LOAD_DIFF8x4 8 ; 4x dst, 1x tmp, 1x mul, 2x ptr
@@ -742,11 +750,11 @@
movh [r0+3*FDEC_STRIDE], %4
%endmacro
-%macro LOAD_DIFF_8x4P 7-10 r0,r2,0 ; 4x dest, 2x temp, 2x pointer, increment?
- LOAD_DIFF m%1, m%5, m%7, [%8], [%9]
- LOAD_DIFF m%2, m%6, m%7, [%8+r1], [%9+r3]
- LOAD_DIFF m%3, m%5, m%7, [%8+2*r1], [%9+2*r3]
- LOAD_DIFF m%4, m%6, m%7, [%8+r4], [%9+r5]
+%macro LOAD_DIFF_8x4P 7-11 r0,r2,0,1 ; 4x dest, 2x temp, 2x pointer, increment, aligned?
+ LOAD_DIFF m%1, m%5, m%7, [%8], [%9], %11
+ LOAD_DIFF m%2, m%6, m%7, [%8+r1], [%9+r3], %11
+ LOAD_DIFF m%3, m%5, m%7, [%8+2*r1], [%9+2*r3], %11
+ LOAD_DIFF m%4, m%6, m%7, [%8+r4], [%9+r5], %11
%if %10
lea %8, [%8+4*r1]
lea %9, [%9+4*r3]