[x264-devel] x86: AVX2 deblock strength
Jason Garrett-Glaser
git at videolan.org
Mon May 20 23:06:47 CEST 2013
x264 | branch: master | Jason Garrett-Glaser <jason at x264.com> | Wed Apr 24 14:22:15 2013 -0700| [b2c30e1a470181b591619b211ae0342e9cc8aac9] | committer: Jason Garrett-Glaser
x86: AVX2 deblock strength
30->18 cycles
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=b2c30e1a470181b591619b211ae0342e9cc8aac9
---
common/deblock.c | 7 +++++
common/x86/deblock-a.asm | 71 +++++++++++++++++++++++++++++++++++++++++++++-
tools/checkasm.c | 6 ++--
3 files changed, 80 insertions(+), 4 deletions(-)
diff --git a/common/deblock.c b/common/deblock.c
index 18ed14f..4faf66a 100644
--- a/common/deblock.c
+++ b/common/deblock.c
@@ -686,6 +686,9 @@ void x264_deblock_strength_ssse3( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X2
void x264_deblock_strength_avx ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
int mvy_limit, int bframe );
+void x264_deblock_strength_avx2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
+ int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
+ int mvy_limit, int bframe );
void x264_deblock_h_chroma_intra_mbaff_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_mbaff_sse2( pixel *pix, intptr_t stride, int alpha, int beta );
@@ -816,6 +819,10 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
#endif
}
}
+ if( cpu&X264_CPU_AVX2 )
+ {
+ pf->deblock_strength = x264_deblock_strength_avx2;
+ }
}
#endif
diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm
index cb94ef8..7d69a56 100644
--- a/common/x86/deblock-a.asm
+++ b/common/x86/deblock-a.asm
@@ -28,8 +28,10 @@
%include "x86inc.asm"
%include "x86util.asm"
-SECTION_RODATA
+SECTION_RODATA 32
+load_bytes_shuf: times 2 db 3,4,5,6,11,12,13,14,4,5,6,7,12,13,14,15
+insert_top_shuf: dd 0,1,4,5,7,2,3,6
transpose_shuf: db 0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15
SECTION .text
@@ -2367,3 +2369,70 @@ INIT_XMM ssse3
DEBLOCK_STRENGTH_XMM
INIT_XMM avx
DEBLOCK_STRENGTH_XMM
+
+%macro LOAD_BYTES_YMM 1 ; %1 = base address of a byte table with 8-byte rows; out: m1 = four 4-byte rows, m0 = their neighbours; clobbers m2
+ movu m0, [%1-4] ; ___E FGHI ___J KLMN ___O PQRS ___T UVWX
+ pshufb m0, [load_bytes_shuf] ; EFGH JKLM FGHI KLMN OPQR TUVW PQRS UVWX
+ mova m2, [insert_top_shuf] ; dword permutation indices for the vpermd below
+ vpermq m1, m0, q3131 ; FGHI KLMN PQRS UVWX x2
+ vpermd m0, m2, m0 ; EFGH JKLM OPQR TUVW ____ FGHI KLMN PQRS
+ vpbroadcastd m2, [%1-8] ; ABCD .... (the 4 bytes one 8-byte row before FGHI)
+ vpblendd m0, m0, m2, 00010000b ; EFGH JKLM OPQR TUVW ABCD FGHI KLMN PQRS (mask bit 4: only dword 4 is taken from m2)
+%endmacro ; result: low lane of m0 = rows shifted back one byte, high lane of m0 = rows shifted back one row
+
+INIT_YMM avx2
+cglobal deblock_strength, 6,6,7 ; x86inc: 6 args, 6 GPRs, 7 vector regs; args per the C prototype: nnz, ref, mv, bs, mvy_limit, bframe
+ ; Prepare mv comparison register
+ shl r4d, 8 ; r4d = mvy_limit (5th argument)
+ add r4d, 3 - (1<<8) ; word = ((mvy_limit-1)<<8) + 3: low byte 3, high byte mvy_limit-1
+ movd xm6, r4d
+ vpbroadcastw m6, xm6 ; replicate the threshold word to every 16-bit lane (bytes alternate 3, mvy_limit-1)
+ pxor m5, m5 ; bs0,bs1 (accumulator, starts at zero)
+
+.lists: ; one pass per reference list; executed bframe+1 times (see dec/jge below)
+ ; Check refs
+ LOAD_BYTES_YMM ref ; m1 = current entries, m0 = neighbours (low lane: one byte back, high lane: one row back); ref is a symbol defined outside this hunk
+ pxor m0, m1 ; nonzero byte wherever the two entries differ
+ por m5, m0 ; accumulate into the bs flags
+
+ ; Check mvs
+ movu xm0, [mv-4+4*8*0] ; low lane: row 0 shifted back by one mv (4 bytes)
+ vinserti128 m0, m0, [mv+4*8*-1], 1 ; high lane: the row before row 0 (rows are 4*8 bytes apart)
+ vbroadcasti128 m2, [mv+4*8* 0] ; row 0 in both lanes
+ vinserti128 m1, m2, [mv-4+4*8*1], 0 ; low: row 1 shifted back one mv, high: row 0
+ vbroadcasti128 m3, [mv+4*8* 1] ; row 1 in both lanes
+ psubw m0, m2 ; row 0 deltas: low lane vs previous mv, high lane vs previous row
+ psubw m1, m3 ; row 1 deltas, same lane layout
+
+ vinserti128 m2, m3, [mv-4+4*8*2], 0 ; low: row 2 shifted back one mv, high: row 1
+ vbroadcasti128 m4, [mv+4*8* 2] ; row 2 in both lanes
+ vinserti128 m3, m4, [mv-4+4*8*3], 0 ; low: row 3 shifted back one mv, high: row 2
+ psubw m2, m4 ; row 2 deltas
+ vbroadcasti128 m4, [mv+4*8* 3] ; row 3 in both lanes
+ psubw m3, m4 ; row 3 deltas
+ packsswb m0, m1 ; saturate 16-bit deltas to signed bytes: rows 0,1 per lane
+ packsswb m2, m3 ; same for rows 2,3
+ pabsb m0, m0 ; |delta|
+ pabsb m2, m2
+ psubusb m0, m6 ; unsigned saturating subtract: nonzero iff |d0| > 3 or |d1| > mvy_limit-1 (d0/d1 = the two int16 components, presumably x and y)
+ psubusb m2, m6
+ packsswb m0, m2 ; fold each two-byte component pair into one byte per mv: rows 0..3, four bytes each, per lane
+ por m5, m0 ; accumulate into the bs flags
+
+ add r1, 40 ; advance the ref base by 40 bytes (presumably X264_SCAN8_LUMA_SIZE, i.e. the next list - confirm)
+ add r2, 4*8*5 ; advance the mv base by 160 bytes = 40 mvs * 4 bytes
+ dec r5d ; r5d = bframe (6th argument)
+ jge .lists ; repeat while the decremented counter is >= 0
+
+ ; Check nnz
+ LOAD_BYTES_YMM nnz ; same neighbour layout as for ref; nnz is a symbol defined outside this hunk
+ por m0, m1 ; nonzero if either of the two neighbouring entries is nonzero
+ mova m6, [pb_1] ; m6 is free to reuse: the mv threshold is no longer needed (pb_1 is defined outside this hunk)
+ pminub m0, m6 ; clamp to 0/1
+ pminub m5, m6 ; mv ? 1 : 0
+ paddb m0, m0 ; nnz ? 2 : 0
+ pmaxub m5, m0 ; final value per byte = max(nnz term, ref/mv term)
+ vextracti128 [bs1], m5, 1 ; high lane (row-before comparisons) -> bs1
+ pshufb xm5, [transpose_shuf] ; 4x4 byte transpose of the low lane
+ mova [bs0], xm5 ; low lane (byte-before comparisons) -> bs0
+ RET
diff --git a/tools/checkasm.c b/tools/checkasm.c
index ba363ef..c8ef06f 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -1689,8 +1689,8 @@ static int check_deblock( int cpu_ref, int cpu_new )
ALIGNED_ARRAY_16( uint8_t, nnz, [X264_SCAN8_SIZE] );
ALIGNED_4( int8_t ref[2][X264_SCAN8_LUMA_SIZE] );
ALIGNED_ARRAY_16( int16_t, mv, [2],[X264_SCAN8_LUMA_SIZE][2] );
- ALIGNED_ARRAY_16( uint8_t, bs, [2],[2][8][4] );
- memset( bs, 99, sizeof(bs) );
+ ALIGNED_ARRAY_N( uint8_t, bs, [2],[2][8][4] );
+ memset( bs, 99, sizeof(uint8_t)*2*4*8*2 );
for( int j = 0; j < X264_SCAN8_SIZE; j++ )
nnz[j] = ((rand()&7) == 7) * rand() & 0xf;
for( int j = 0; j < 2; j++ )
@@ -1703,7 +1703,7 @@ static int check_deblock( int cpu_ref, int cpu_new )
set_func_name( "deblock_strength" );
call_c( db_c.deblock_strength, nnz, ref, mv, bs[0], 2<<(i&1), ((i>>1)&1) );
call_a( db_a.deblock_strength, nnz, ref, mv, bs[1], 2<<(i&1), ((i>>1)&1) );
- if( memcmp( bs[0], bs[1], sizeof(bs[0]) ) )
+ if( memcmp( bs[0], bs[1], sizeof(uint8_t)*2*4*8 ) )
{
ok = 0;
fprintf( stderr, "deblock_strength: [FAILED]\n" );
More information about the x264-devel
mailing list