[x264-devel] x86: AVX-512 pixel_sad

Henrik Gramner git at videolan.org
Mon May 22 00:04:20 CEST 2017


x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Sun May  7 23:35:49 2017 +0200| [993eb2079e45619098241e14806fc70030968af6] | committer: Henrik Gramner

x86: AVX-512 pixel_sad

Covers all variants: 4x4, 4x8, 4x16, 8x4, 8x8, 8x16, 16x8, and 16x16.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=993eb2079e45619098241e14806fc70030968af6
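
The 4xN and 8xN kernels pack two rows into each XMM register: a plain
mov loads the first row into lane 0, and a merge-masked broadcast (with
the 0xFE mask built by kxnorb+kaddb below) drops the next row into the
remaining lanes, so a single psadbw covers both rows at once. A rough
standalone C intrinsics sketch of the same idea, using a hypothetical
sad_4x2 helper that is not part of x264 (requires AVX-512VL):

    #include <immintrin.h>
    #include <stdint.h>
    #include <string.h>

    /* SAD of two 4-pixel rows via a merge-masked broadcast, mirroring
     * the trick used by the pixel_sad_4xN_avx512 kernels. */
    static uint32_t sad_4x2( const uint8_t *pix1, intptr_t stride1,
                             const uint8_t *pix2, intptr_t stride2 )
    {
        int32_t a0, a1, b0, b1;
        memcpy( &a0, pix1, 4 );
        memcpy( &a1, pix1 + stride1, 4 );
        memcpy( &b0, pix2, 4 );
        memcpy( &b1, pix2 + stride2, 4 );
        __mmask8 k = 0xFE; /* what kxnorb+kaddb leave in k1 */
        /* lane 0 keeps row 0; the masked broadcast writes row 1 to lanes 1-3 */
        __m128i a = _mm_mask_broadcastd_epi32( _mm_cvtsi32_si128( a0 ), k,
                                               _mm_cvtsi32_si128( a1 ) );
        __m128i b = _mm_mask_broadcastd_epi32( _mm_cvtsi32_si128( b0 ), k,
                                               _mm_cvtsi32_si128( b1 ) );
        /* the low qword of psadbw holds SAD(row0)+SAD(row1); the high qword
         * only sums duplicates of row 1 and is discarded, just as the asm
         * extracts a single dword with movd */
        return (uint32_t)_mm_cvtsi128_si32( _mm_sad_epu8( a, b ) );
    }
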
---

 common/pixel.c       |   2 +
 common/x86/pixel.h   |   1 +
 common/x86/sad-a.asm | 129 +++++++++++++++++++++++++++++++++++++++++++++++++++
 encoder/slicetype.c  |   2 +-
 tools/checkasm.c     |   5 +-
 5 files changed, 136 insertions(+), 3 deletions(-)

diff --git a/common/pixel.c b/common/pixel.c
index 020b450b..23e9c7a1 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -1348,6 +1348,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
 
     if( cpu&X264_CPU_AVX512 )
     {
+        INIT8( sad, _avx512 );
+        INIT8_NAME( sad_aligned, sad, _avx512 );
         INIT8( satd, _avx512 );
         pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx512;
         pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_avx512;
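
For context, INIT8 wires up all eight block sizes in one go, and the
INIT8_NAME line points the sad_aligned table at the plain sad functions,
since the AVX-512 kernels make no aligned/unaligned distinction. A
paraphrased sketch of pixel.c's init macros, not the verbatim
definitions:

    /* paraphrased; the real macros chain INIT2_NAME..INIT8_NAME */
    #define INIT8_NAME( name1, name2, cpu ) \
        pixf->name1[PIXEL_16x16] = x264_pixel_##name2##_16x16##cpu; \
        /* ... 16x8, 8x16, 8x8, 8x4, 4x8, 4x4 ... */ \
        pixf->name1[PIXEL_4x16]  = x264_pixel_##name2##_4x16##cpu;
    #define INIT8( name, cpu ) INIT8_NAME( name, name, cpu )
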
diff --git a/common/x86/pixel.h b/common/x86/pixel.h
index 57229986..9bf71134 100644
--- a/common/x86/pixel.h
+++ b/common/x86/pixel.h
@@ -52,6 +52,7 @@ DECL_X1( sad, sse2_aligned )
 DECL_X1( sad, ssse3 )
 DECL_X1( sad, ssse3_aligned )
 DECL_X1( sad, avx2 )
+DECL_X1( sad, avx512 )
 DECL_X4( sad, mmx2 )
 DECL_X4( sad, sse2 )
 DECL_X4( sad, sse3 )
diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm
index 358b9ec1..6b248a4d 100644
--- a/common/x86/sad-a.asm
+++ b/common/x86/sad-a.asm
@@ -122,6 +122,9 @@ SAD  4,  4
 ;-----------------------------------------------------------------------------
 %macro SAD_W16 1 ; h
 cglobal pixel_sad_16x%1, 4,4
+%ifidn cpuname, sse2
+.skip_prologue: ; entry point for the AVX-512 16xN versions when the fenc rows aren't contiguous
+%endif
 %assign %%i 0
 %if ARCH_X86_64
     lea  r6, [3*r1] ; r6 results in fewer REX prefixes than r4 and both are volatile
@@ -201,6 +204,132 @@ cglobal pixel_sad_8x16_sse2, 4,4
     SAD_INC_4x8P_SSE 1
     SAD_END_SSE2
 
+%macro SAD_W48_AVX512 3 ; w, h, d/q
+cglobal pixel_sad_%1x%2, 4,4
+    kxnorb        k1, k1, k1 ; k1 = 0xFF
+    kaddb         k1, k1, k1 ; k1 = 0xFE; keep lane 0, broadcast the next row into the rest
+%assign %%i 0
+%if ARCH_X86_64 && %2 != 4
+    lea           r6, [3*r1]
+    lea           r5, [3*r3]
+%rep %2/4
+    mov%3         m1,      [r0]
+    vpbroadcast%3 m1 {k1}, [r0+r1]
+    mov%3         m3,      [r2]
+    vpbroadcast%3 m3 {k1}, [r2+r3]
+    mov%3         m2,      [r0+2*r1]
+    vpbroadcast%3 m2 {k1}, [r0+r6]
+    mov%3         m4,      [r2+2*r3]
+    vpbroadcast%3 m4 {k1}, [r2+r5]
+%if %%i != %2/4-1
+    lea           r0, [r0+4*r1]
+    lea           r2, [r2+4*r3]
+%endif
+    psadbw        m1, m3
+    psadbw        m2, m4
+    ACCUM      paddd, 0, 1, %%i
+    paddd         m0, m2
+    %assign %%i %%i+1
+%endrep
+%else
+%rep %2/2
+    mov%3         m1,      [r0]
+    vpbroadcast%3 m1 {k1}, [r0+r1]
+    mov%3         m2,      [r2]
+    vpbroadcast%3 m2 {k1}, [r2+r3]
+%if %%i != %2/2-1
+    lea           r0, [r0+2*r1]
+    lea           r2, [r2+2*r3]
+%endif
+    psadbw        m1, m2
+    ACCUM      paddd, 0, 1, %%i
+    %assign %%i %%i+1
+%endrep
+%endif
+%if %1 == 8 ; for w=4 the high qword only holds duplicated rows and movd ignores it
+    punpckhqdq    m1, m0, m0 ; fold the high qword's row sums into the low qword
+    paddd         m0, m1
+%endif
+    movd         eax, m0
+    RET
+%endmacro
+
+INIT_XMM avx512
+SAD_W48_AVX512 4,  4, d
+SAD_W48_AVX512 4,  8, d
+SAD_W48_AVX512 4, 16, d
+SAD_W48_AVX512 8,  4, q
+SAD_W48_AVX512 8,  8, q
+SAD_W48_AVX512 8, 16, q
+
+%macro SAD_W16_AVX512_START 1 ; h
+    cmp  r1d, FENC_STRIDE                  ; optimized for the most common fenc case, which
+    jne pixel_sad_16x%1_sse2.skip_prologue ; has the rows laid out contiguously in memory
+    lea   r1, [3*r3] ; the fenc stride is known to be FENC_STRIDE here, so r1 is free to reuse
+%endmacro
+
+%macro SAD_W16_AVX512_END 0
+    paddd          m0, m1
+    paddd          m0, m2
+    paddd          m0, m3
+%if mmsize == 64
+    vextracti32x8 ym1, m0, 1
+    paddd         ym0, ym1
+%endif
+    vextracti128  xm1, ym0, 1
+    paddd        xmm0, xm0, xm1
+    punpckhqdq   xmm1, xmm0, xmm0
+    paddd        xmm0, xmm1
+    movd          eax, xmm0
+    RET
+%endmacro
+
+INIT_YMM avx512
+cglobal pixel_sad_16x8, 4,4
+    SAD_W16_AVX512_START 8
+    movu         xm0, [r2]
+    vinserti128   m0, [r2+r3], 1
+    psadbw        m0, [r0+0*32]
+    movu         xm1, [r2+2*r3]
+    vinserti128   m1, [r2+r1], 1
+    lea           r2, [r2+4*r3]
+    psadbw        m1, [r0+1*32]
+    movu         xm2, [r2]
+    vinserti128   m2, [r2+r3], 1
+    psadbw        m2, [r0+2*32]
+    movu         xm3, [r2+2*r3]
+    vinserti128   m3, [r2+r1], 1
+    psadbw        m3, [r0+3*32]
+    SAD_W16_AVX512_END
+
+INIT_ZMM avx512
+cglobal pixel_sad_16x16, 4,4
+    SAD_W16_AVX512_START 16
+    movu          xm0, [r2]
+    vinserti128   ym0, [r2+r3],   1
+    movu          xm1, [r2+4*r3]
+    vinserti32x4   m0, [r2+2*r3], 2
+    vinserti32x4   m1, [r2+2*r1], 2
+    vinserti32x4   m0, [r2+r1],   3
+    lea            r2, [r2+4*r3]
+    vinserti32x4   m1, [r2+r3],   1
+    psadbw         m0, [r0+0*64]
+    vinserti32x4   m1, [r2+r1],   3
+    lea            r2, [r2+4*r3]
+    psadbw         m1, [r0+1*64]
+    movu          xm2, [r2]
+    vinserti128   ym2, [r2+r3],   1
+    movu          xm3, [r2+4*r3]
+    vinserti32x4   m2, [r2+2*r3], 2
+    vinserti32x4   m3, [r2+2*r1], 2
+    vinserti32x4   m2, [r2+r1],   3
+    lea            r2, [r2+4*r3]
+    vinserti32x4   m3, [r2+r3],   1
+    psadbw         m2, [r0+2*64]
+    vinserti32x4   m3, [r2+r1],   3
+    psadbw         m3, [r0+3*64]
+    SAD_W16_AVX512_END
+
 ;-----------------------------------------------------------------------------
 ; void pixel_vsad( pixel *src, intptr_t stride );
 ;-----------------------------------------------------------------------------
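
The 16-wide versions build on the fenc fast path checked in
SAD_W16_AVX512_START: with a stride of FENC_STRIDE (16 bytes), four
16-pixel rows form one contiguous 64-byte block, so a single ZMM load
plus one vpsadbw processes a 16x4 tile against four reference rows
stitched together with vinserti32x4. A rough C intrinsics sketch of that
core, using a hypothetical sad_16x4_contig helper that is not x264 code
(requires AVX-512F+BW):

    #include <immintrin.h>
    #include <stdint.h>

    /* SAD of a 16x4 tile when the fenc rows are contiguous in memory. */
    static uint64_t sad_16x4_contig( const uint8_t *fenc, const uint8_t *ref,
                                     intptr_t ref_stride )
    {
        __m512i f = _mm512_loadu_si512( fenc ); /* rows 0-3, one 64-byte block */
        __m512i r = _mm512_castsi128_si512( _mm_loadu_si128( (const __m128i*)ref ) );
        r = _mm512_inserti32x4( r, _mm_loadu_si128( (const __m128i*)(ref + 1*ref_stride) ), 1 );
        r = _mm512_inserti32x4( r, _mm_loadu_si128( (const __m128i*)(ref + 2*ref_stride) ), 2 );
        r = _mm512_inserti32x4( r, _mm_loadu_si128( (const __m128i*)(ref + 3*ref_stride) ), 3 );
        /* vpsadbw leaves eight 64-bit partial sums; the real asm accumulates
         * four such results before doing this horizontal reduction */
        return (uint64_t)_mm512_reduce_add_epi64( _mm512_sad_epu8( f, r ) );
    }

When the stride check fails, the code instead jumps into the existing
SSE2 loop through the .skip_prologue label added above.
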
diff --git a/encoder/slicetype.c b/encoder/slicetype.c
index 7fd61b2e..6c0aaa8c 100644
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -267,7 +267,7 @@ static NOINLINE unsigned int x264_weight_cost_chroma444( x264_t *h, x264_frame_t
     int i_lines = fenc->i_lines[p];
     int i_width = fenc->i_width[p];
     pixel *src = fenc->plane[p];
-    ALIGNED_ARRAY_16( pixel, buf, [16*16] );
+    ALIGNED_ARRAY_64( pixel, buf, [16*16] ); /* the AVX-512 kernels read this with 64-byte loads */
     int pixoff = 0;
     if( w )
     {
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 2ad9c06a..b367db67 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -370,8 +370,9 @@ static int check_pixel( int cpu_ref, int cpu_new )
             used_asm = 1; \
             for( int j = 0; j < 64; j++ ) \
             { \
-                res_c   = call_c( pixel_c.name[i],   pbuf1, (intptr_t)16, pbuf2+j*!align, (intptr_t)64 ); \
-                res_asm = call_a( pixel_asm.name[i], pbuf1, (intptr_t)16, pbuf2+j*!align, (intptr_t)64 ); \
+                intptr_t stride1 = (j&31) == 31 ? 32 : FENC_STRIDE; /* occasionally use a non-fenc stride so the fallback path gets tested */ \
+                res_c   = call_c( pixel_c.name[i],   pbuf1, stride1, pbuf2+j*!align, (intptr_t)64 ); \
+                res_asm = call_a( pixel_asm.name[i], pbuf1, stride1, pbuf2+j*!align, (intptr_t)64 ); \
                 if( res_c != res_asm ) \
                 { \
                     ok = 0; \


