[x264-devel] x86: AVX-512 cabac_block_residual

Henrik Gramner git at videolan.org
Mon May 22 00:04:28 CEST 2017


x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Fri May 12 00:43:43 2017 +0200| [6151882671b6f9e1ceec2cdb76dd1123c8dc766f] | committer: Henrik Gramner

x86: AVX-512 cabac_block_residual

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=6151882671b6f9e1ceec2cdb76dd1123c8dc766f
---

 common/bitstream.c     | 10 ++++++
 common/x86/cabac-a.asm | 82 ++++++++++++++++++++++++++++++--------------------
 2 files changed, 60 insertions(+), 32 deletions(-)

diff --git a/common/bitstream.c b/common/bitstream.c
index 6d3f9c6c..34e643ce 100644
--- a/common/bitstream.c
+++ b/common/bitstream.c
@@ -46,13 +46,16 @@ void x264_cabac_block_residual_rd_internal_sse2       ( dctcoef *l, int b_interl
 void x264_cabac_block_residual_rd_internal_lzcnt      ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
 void x264_cabac_block_residual_rd_internal_ssse3      ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
 void x264_cabac_block_residual_rd_internal_ssse3_lzcnt( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
+void x264_cabac_block_residual_rd_internal_avx512     ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
 void x264_cabac_block_residual_8x8_rd_internal_sse2       ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
 void x264_cabac_block_residual_8x8_rd_internal_lzcnt      ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
 void x264_cabac_block_residual_8x8_rd_internal_ssse3      ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
 void x264_cabac_block_residual_8x8_rd_internal_ssse3_lzcnt( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
+void x264_cabac_block_residual_8x8_rd_internal_avx512     ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
 void x264_cabac_block_residual_internal_sse2  ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
 void x264_cabac_block_residual_internal_lzcnt ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
 void x264_cabac_block_residual_internal_avx2  ( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
+void x264_cabac_block_residual_internal_avx512( dctcoef *l, int b_interlaced, intptr_t ctx_block_cat, x264_cabac_t *cb );
 
 uint8_t *x264_nal_escape_neon( uint8_t *dst, uint8_t *src, uint8_t *end );
 
@@ -153,6 +156,13 @@ void x264_bitstream_init( int cpu, x264_bitstream_function_t *pf )
         pf->nal_escape = x264_nal_escape_avx2;
         pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_avx2;
     }
+
+    if( cpu&X264_CPU_AVX512 )
+    {
+        pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_avx512;
+        pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_avx512;
+        pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_avx512;
+    }
 #endif
 #endif
 #if HAVE_ARMV6
diff --git a/common/x86/cabac-a.asm b/common/x86/cabac-a.asm
index 37e33d4a..e2f613cf 100644
--- a/common/x86/cabac-a.asm
+++ b/common/x86/cabac-a.asm
@@ -54,13 +54,17 @@ coeff_abs_level_transition: db 1, 2, 3, 3, 4, 5, 6, 7
 
 cextern coeff_last4_mmx2
 cextern coeff_last4_lzcnt
+cextern coeff_last4_avx512
 cextern coeff_last15_sse2
 cextern coeff_last15_lzcnt
+cextern coeff_last15_avx512
 cextern coeff_last16_sse2
 cextern coeff_last16_lzcnt
+cextern coeff_last16_avx512
 cextern coeff_last64_sse2
 cextern coeff_last64_lzcnt
 cextern coeff_last64_avx2
+cextern coeff_last64_avx512
 
 %ifdef PIC
 SECTION .data
@@ -68,6 +72,11 @@ SECTION .data
 coeff_last_sse2:   COEFF_LAST_TABLE mmx2,   sse2,   sse2,   16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
 coeff_last_lzcnt:  COEFF_LAST_TABLE lzcnt,  lzcnt,  lzcnt,  16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
 coeff_last_avx2:   COEFF_LAST_TABLE lzcnt,  avx2,   lzcnt,  16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
+%if HIGH_BIT_DEPTH
+coeff_last_avx512: COEFF_LAST_TABLE avx512, avx512, avx512, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
+%else
+coeff_last_avx512: COEFF_LAST_TABLE lzcnt,  avx512, avx512, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
+%endif
 %endif
 
 SECTION .text
@@ -352,25 +361,33 @@ CABAC bmi2
 %endmacro
 
 %macro ABS_DCTCOEFS 2
-%assign i 0
-%rep %2/16
 %if HIGH_BIT_DEPTH
-    ABSD   m0, [%1+ 0+i*64], m4
-    ABSD   m1, [%1+16+i*64], m5
-    ABSD   m2, [%1+32+i*64], m4
-    ABSD   m3, [%1+48+i*64], m5
-    mova [rsp+ 0+i*64], m0
-    mova [rsp+16+i*64], m1
-    mova [rsp+32+i*64], m2
-    mova [rsp+48+i*64], m3
+    %define %%abs ABSD
 %else
-    ABSW   m0, [%1+ 0+i*32], m2
-    ABSW   m1, [%1+16+i*32], m3
-    mova [rsp+ 0+i*32], m0
-    mova [rsp+16+i*32], m1
-%endif
+    %define %%abs ABSW
+%endif
+%if mmsize == %2*SIZEOF_DCTCOEF
+    %%abs   m0, [%1], m1
+    mova [rsp], m0
+%elif mmsize == %2*SIZEOF_DCTCOEF/2
+    %%abs   m0, [%1+0*mmsize], m2
+    %%abs   m1, [%1+1*mmsize], m3
+    mova [rsp+0*mmsize], m0
+    mova [rsp+1*mmsize], m1
+%else
+%assign i 0
+%rep %2*SIZEOF_DCTCOEF/(4*mmsize)
+    %%abs  m0, [%1+(4*i+0)*mmsize], m4
+    %%abs  m1, [%1+(4*i+1)*mmsize], m5
+    %%abs  m2, [%1+(4*i+2)*mmsize], m4
+    %%abs  m3, [%1+(4*i+3)*mmsize], m5
+    mova [rsp+(4*i+0)*mmsize], m0
+    mova [rsp+(4*i+1)*mmsize], m1
+    mova [rsp+(4*i+2)*mmsize], m2
+    mova [rsp+(4*i+3)*mmsize], m3
 %assign i i+1
 %endrep
+%endif
 %endmacro
 
 %macro SIG_OFFSET 1
@@ -403,16 +420,14 @@ CABAC bmi2
 %endif
 
 %ifdef PIC
-    cglobal func, 4,13
+    cglobal func, 4,13,6,-maxcoeffs*SIZEOF_DCTCOEF
     lea     r12, [$$]
     %define GLOBAL +r12-$$
 %else
-    cglobal func, 4,12
+    cglobal func, 4,12,6,-maxcoeffs*SIZEOF_DCTCOEF
     %define GLOBAL
 %endif
 
-%assign pad gprsize+SIZEOF_DCTCOEF*maxcoeffs-(stack_offset&15)
-    SUB     rsp, pad
     shl     r1d, 4                                            ; MB_INTERLACED*16
 %if %1
     lea      r4, [significant_coeff_flag_offset_8x8+r1*4 GLOBAL]     ; r12 = sig offset 8x8
@@ -429,15 +444,13 @@ CABAC bmi2
     ABS_DCTCOEFS r0, 64
 %else
     mov      r4, r0                                           ; r4 = dct
-    mov      r6, ~SIZEOF_DCTCOEF
-    and      r6, r4                                           ; handle AC coefficient case
-    ABS_DCTCOEFS r6, 16
-    sub      r4, r6                                           ; calculate our new dct pointer
+    and      r4, ~SIZEOF_DCTCOEF                              ; handle AC coefficient case
+    ABS_DCTCOEFS r4, 16
+    xor      r4, r0                                           ; calculate our new dct pointer
     add      r4, rsp                                          ; restore AC coefficient offset
 %endif
-    mov      r1, [%2+gprsize*r2 GLOBAL]
 ; for improved OOE performance, run coeff_last on the original coefficients.
-    call     r1                                               ; coeff_last[ctx_block_cat]( dct )
+    call [%2+gprsize*r2 GLOBAL]                               ; coeff_last[ctx_block_cat]( dct )
 ; we know on 64-bit that the SSE2 versions of this function only
 ; overwrite r0, r1, and rax (r6). last64 overwrites r2 too, but we
 ; don't need r2 in 8x8 mode.
@@ -521,7 +534,6 @@ CABAC bmi2
     jge .coeff_loop
 .end:
     mov [r3+cb.bits_encoded-cb.state], r0d
-    ADD     rsp, pad
     RET
 %endmacro
 
@@ -538,6 +550,14 @@ CABAC_RESIDUAL_RD 1, coeff_last_sse2
 INIT_XMM ssse3,lzcnt
 CABAC_RESIDUAL_RD 0, coeff_last_lzcnt
 CABAC_RESIDUAL_RD 1, coeff_last_lzcnt
+%if HIGH_BIT_DEPTH
+INIT_ZMM avx512
+%else
+INIT_YMM avx512
+%endif
+CABAC_RESIDUAL_RD 0, coeff_last_avx512
+INIT_ZMM avx512
+CABAC_RESIDUAL_RD 1, coeff_last_avx512
 %endif
 
 ;-----------------------------------------------------------------------------
@@ -615,7 +635,7 @@ CABAC_RESIDUAL_RD 1, coeff_last_lzcnt
 %endmacro
 
 %macro CABAC_RESIDUAL 1
-cglobal cabac_block_residual_internal, 4,15
+cglobal cabac_block_residual_internal, 4,15,0,-4*64
 %ifdef PIC
 ; if we use the same r7 as in cabac_encode_decision, we can cheat and save a register.
     lea     r7, [$$]
@@ -625,8 +645,6 @@ cglobal cabac_block_residual_internal, 4,15
     %define lastm r7d
     %define GLOBAL
 %endif
-%assign pad gprsize+4*2+4*64-(stack_offset&15)
-    SUB     rsp, pad
     shl     r1d, 4
 
     %define sigoffq r8
@@ -653,8 +671,7 @@ cglobal cabac_block_residual_internal, 4,15
     mov     dct, r0
     mov leveloffm, leveloffd
 
-    mov      r1, [%1+gprsize*r2 GLOBAL]
-    call     r1
+    call [%1+gprsize*r2 GLOBAL]
     mov   lastm, eax
 ; put cabac in r0; needed for cabac_encode_decision
     mov      r0, r3
@@ -742,7 +759,6 @@ cglobal cabac_block_residual_internal, 4,15
 %endif
     dec coeffidxd
     jge .level_loop
-    ADD     rsp, pad
     RET
 %endmacro
 
@@ -753,4 +769,6 @@ INIT_XMM lzcnt
 CABAC_RESIDUAL coeff_last_lzcnt
 INIT_XMM avx2
 CABAC_RESIDUAL coeff_last_avx2
+INIT_XMM avx512
+CABAC_RESIDUAL coeff_last_avx512
 %endif



More information about the x264-devel mailing list