[x264-devel] x86: AVX2 deblock strength

Jason Garrett-Glaser git at videolan.org
Mon May 20 23:06:47 CEST 2013


x264 | branch: master | Jason Garrett-Glaser <jason at x264.com> | Wed Apr 24 14:22:15 2013 -0700| [b2c30e1a470181b591619b211ae0342e9cc8aac9] | committer: Jason Garrett-Glaser

x86: AVX2 deblock strength

30->18 cycles

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=b2c30e1a470181b591619b211ae0342e9cc8aac9
---

 common/deblock.c         |    7 +++++
 common/x86/deblock-a.asm |   71 +++++++++++++++++++++++++++++++++++++++++++++-
 tools/checkasm.c         |    6 ++--
 3 files changed, 80 insertions(+), 4 deletions(-)

diff --git a/common/deblock.c b/common/deblock.c
index 18ed14f..4faf66a 100644
--- a/common/deblock.c
+++ b/common/deblock.c
@@ -686,6 +686,9 @@ void x264_deblock_strength_ssse3( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X2
 void x264_deblock_strength_avx  ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
                                   int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
                                   int mvy_limit, int bframe );
+void x264_deblock_strength_avx2 ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
+                                  int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
+                                  int mvy_limit, int bframe );
 
 void x264_deblock_h_chroma_intra_mbaff_mmx2( pixel *pix, intptr_t stride, int alpha, int beta );
 void x264_deblock_h_chroma_intra_mbaff_sse2( pixel *pix, intptr_t stride, int alpha, int beta );
@@ -816,6 +819,10 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
 #endif
             }
         }
+        if( cpu&X264_CPU_AVX2 )
+        {
+            pf->deblock_strength = x264_deblock_strength_avx2;
+        }
     }
 #endif
 
diff --git a/common/x86/deblock-a.asm b/common/x86/deblock-a.asm
index cb94ef8..7d69a56 100644
--- a/common/x86/deblock-a.asm
+++ b/common/x86/deblock-a.asm
@@ -28,8 +28,10 @@
 %include "x86inc.asm"
 %include "x86util.asm"
 
-SECTION_RODATA
+SECTION_RODATA 32
 
+load_bytes_shuf: times 2 db 3,4,5,6,11,12,13,14,4,5,6,7,12,13,14,15
+insert_top_shuf: dd 0,1,4,5,7,2,3,6
 transpose_shuf: db 0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15
 
 SECTION .text
@@ -2367,3 +2369,70 @@ INIT_XMM ssse3
 DEBLOCK_STRENGTH_XMM
 INIT_XMM avx
 DEBLOCK_STRENGTH_XMM
+
+%macro LOAD_BYTES_YMM 1
+    movu         m0, [%1-4]             ; ___E FGHI ___J KLMN ___O PQRS ___T UVWX
+    pshufb       m0, [load_bytes_shuf]  ; EFGH JKLM FGHI KLMN OPQR TUVW PQRS UVWX
+    mova         m2, [insert_top_shuf]
+    vpermq       m1, m0, q3131          ; FGHI KLMN PQRS UVWX x2
+    vpermd       m0, m2, m0             ; EFGH JKLM OPQR TUVW ____ FGHI KLMN PQRS
+    vpbroadcastd m2, [%1-8]             ; ABCD ....
+    vpblendd     m0, m0, m2, 00010000b  ; EFGH JKLM OPQR TUVW ABCD FGHI KLMN PQRS
+%endmacro
+
+INIT_YMM avx2
+cglobal deblock_strength, 6,6,7
+    ; Prepare mv comparison register
+    shl      r4d, 8
+    add      r4d, 3 - (1<<8)
+    movd     xm6, r4d
+    vpbroadcastw m6, xm6
+    pxor      m5, m5 ; bs0,bs1
+
+.lists:
+    ; Check refs
+    LOAD_BYTES_YMM ref
+    pxor      m0, m1
+    por       m5, m0
+
+    ; Check mvs
+    movu     xm0, [mv-4+4*8*0]
+    vinserti128 m0, m0, [mv+4*8*-1], 1
+    vbroadcasti128  m2, [mv+4*8* 0]
+    vinserti128 m1, m2, [mv-4+4*8*1], 0
+    vbroadcasti128  m3, [mv+4*8* 1]
+    psubw     m0, m2
+    psubw     m1, m3
+
+    vinserti128 m2, m3, [mv-4+4*8*2], 0
+    vbroadcasti128  m4, [mv+4*8* 2]
+    vinserti128 m3, m4, [mv-4+4*8*3], 0
+    psubw     m2, m4
+    vbroadcasti128  m4, [mv+4*8* 3]
+    psubw     m3, m4
+    packsswb  m0, m1
+    packsswb  m2, m3
+    pabsb     m0, m0
+    pabsb     m2, m2
+    psubusb   m0, m6
+    psubusb   m2, m6
+    packsswb  m0, m2
+    por       m5, m0
+
+    add       r1, 40
+    add       r2, 4*8*5
+    dec      r5d
+    jge .lists
+
+    ; Check nnz
+    LOAD_BYTES_YMM nnz
+    por       m0, m1
+    mova      m6, [pb_1]
+    pminub    m0, m6
+    pminub    m5, m6 ; mv ? 1 : 0
+    paddb     m0, m0 ; nnz ? 2 : 0
+    pmaxub    m5, m0
+    vextracti128 [bs1], m5, 1
+    pshufb   xm5, [transpose_shuf]
+    mova   [bs0], xm5
+    RET
diff --git a/tools/checkasm.c b/tools/checkasm.c
index ba363ef..c8ef06f 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -1689,8 +1689,8 @@ static int check_deblock( int cpu_ref, int cpu_new )
             ALIGNED_ARRAY_16( uint8_t, nnz, [X264_SCAN8_SIZE] );
             ALIGNED_4( int8_t ref[2][X264_SCAN8_LUMA_SIZE] );
             ALIGNED_ARRAY_16( int16_t, mv, [2],[X264_SCAN8_LUMA_SIZE][2] );
-            ALIGNED_ARRAY_16( uint8_t, bs, [2],[2][8][4] );
-            memset( bs, 99, sizeof(bs) );
+            ALIGNED_ARRAY_N( uint8_t, bs, [2],[2][8][4] );
+            memset( bs, 99, sizeof(uint8_t)*2*4*8*2 );
             for( int j = 0; j < X264_SCAN8_SIZE; j++ )
                 nnz[j] = ((rand()&7) == 7) * rand() & 0xf;
             for( int j = 0; j < 2; j++ )
@@ -1703,7 +1703,7 @@ static int check_deblock( int cpu_ref, int cpu_new )
             set_func_name( "deblock_strength" );
             call_c( db_c.deblock_strength, nnz, ref, mv, bs[0], 2<<(i&1), ((i>>1)&1) );
             call_a( db_a.deblock_strength, nnz, ref, mv, bs[1], 2<<(i&1), ((i>>1)&1) );
-            if( memcmp( bs[0], bs[1], sizeof(bs[0]) ) )
+            if( memcmp( bs[0], bs[1], sizeof(uint8_t)*2*4*8 ) )
             {
                 ok = 0;
                 fprintf( stderr, "deblock_strength: [FAILED]\n" );



More information about the x264-devel mailing list