[x264-devel] x86: AVX-512 zigzag_scan_8x8_field

Henrik Gramner git at videolan.org
Mon May 22 00:03:11 CEST 2017


x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Sun Mar 26 11:34:18 2017 +0200| [edb22f57ba03718c1cb9781ba005aec20a1e50e0] | committer: Henrik Gramner

x86: AVX-512 zigzag_scan_8x8_field

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=edb22f57ba03718c1cb9781ba005aec20a1e50e0
---

 common/dct.c         |  2 ++
 common/x86/dct-a.asm | 36 ++++++++++++++++++++++++++++++++++++
 common/x86/dct.h     |  9 +++++----
 3 files changed, 43 insertions(+), 4 deletions(-)

diff --git a/common/dct.c b/common/dct.c
index 1be89350..cd263c92 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -990,6 +990,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig
     {
         pf_interlaced->scan_4x4  = x264_zigzag_scan_4x4_field_avx512;
         pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx512;
+        pf_interlaced->scan_8x8  = x264_zigzag_scan_8x8_field_avx512;
         pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx512;
     }
 #endif // HAVE_MMX
@@ -1036,6 +1037,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig
     {
         pf_interlaced->scan_4x4  = x264_zigzag_scan_4x4_field_avx512;
         pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx512;
+        pf_interlaced->scan_8x8  = x264_zigzag_scan_8x8_field_avx512;
         pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx512;
     }
 #endif // HAVE_MMX
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index 5a4f316a..ec696f49 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -38,11 +38,19 @@ scan_frame_avx512: dd 0x00bf0200, 0x00fd7484, 0x0033a611, 0x0069d822 ; bits 0-3:
                    dd 0x0072f6a6, 0x003c8433, 0x007e5247, 0x00b6a0ba ; bits 9-13:  8x8_frame2
                    dd 0x00ecf12d, 0x00f3239e, 0x00b9540b, 0x00ff868f ; bits 14-18: 8x8_frame3
                                                                      ; bits 19-23: 8x8_frame4
+scan_field_avx512: dd 0x0006b240, 0x000735a1, 0x0007b9c2, 0x0009bde8 ; bits 0-4:   8x8_field1
+                   dd 0x000c4e69, 0x000ce723, 0x000a0004, 0x000aeb4a ; bits 5-9:   8x8_field2
+                   dd 0x000b5290, 0x000bd6ab, 0x000d5ac5, 0x000ddee6 ; bits 10-14: 8x8_field3
+                   dd 0x000e6f67, 0x000e842c, 0x000f0911, 0x000ff058 ; bits 15-19: 8x8_field4
 %else
 scan_frame_avx512: dw 0x7000, 0x5484, 0x3811, 0x1c22, 0x3c95, 0x5908, 0x758c, 0x9119 ; bits 0-3:   4x4_frame
                    dw 0xaca6, 0xc833, 0xe447, 0xe8ba, 0xcd2d, 0xb19e, 0x960b, 0x7a8f ; bits 4-9:   8x8_frame1
                    dw 0x5e10, 0x7da0, 0x9930, 0xb4c0, 0xd050, 0xec60, 0xf0d0, 0xd540 ; bits 10-15: 8x8_frame2
                    dw 0xb9b0, 0x9e20, 0xbe90, 0xdb00, 0xf780, 0xfb10, 0xdea0, 0xfe30
+scan_field_avx512: dw 0x0700, 0x0741, 0x0782, 0x07c8, 0x08c9, 0x0a43, 0x0c04, 0x0a8a ; bits 0-5:   8x8_field1
+                   dw 0x0910, 0x094b, 0x0985, 0x09c6, 0x0ac7, 0x0c4c, 0x0c91, 0x0b18 ; bits 6-11:  8x8_field2
+                   dw 0x0b52, 0x0b8d, 0x0bce, 0x0ccf, 0x0e13, 0x0e59, 0x0d20, 0x0d5a
+                   dw 0x0d94, 0x0dd5, 0x0e96, 0x0ed7, 0x0f1b, 0x0f61, 0x0fa8, 0x0fe2
 %endif
 
 pw_ppmmmmpp:    dw 1,1,-1,-1,-1,-1,1,1
@@ -1943,6 +1951,29 @@ cglobal zigzag_scan_8x8_frame, 2,2
     vmovdqa64   m2 {k2}, m3
     mova [r0+3*64], m2
     RET
+
+cglobal zigzag_scan_8x8_field, 2,2
+    mova        m0, [scan_field_avx512]
+    mova        m1, [r1+0*64]
+    mova        m2, [r1+1*64]
+    mova        m3, [r1+2*64]
+    mova        m4, [r1+3*64]
+    mov        r1d, 0x3f
+    kmovb       k1, r1d
+    psrld       m5, m0, 5
+    vpermi2d    m0, m1, m2
+    vmovdqa64   m1 {k1}, m3 ; 32 33 34 35 36 37 38 39 40 41 42 43 12 13 14 15
+    vpermt2d    m1, m5, m2
+    psrld       m5, 5
+    vmovdqa64   m2 {k1}, m4 ; 48 49 50 51 52 53 54 55 56 57 58 59 28 29 30 31
+    vpermt2d    m2, m5, m3
+    psrld       m5, 5
+    vpermt2d    m3, m5, m4
+    mova [r0+0*64], m0
+    mova [r0+1*64], m1
+    mova [r0+2*64], m2
+    mova [r0+3*64], m3
+    RET
 %else ; !HIGH_BIT_DEPTH
 INIT_YMM avx512
 cglobal zigzag_scan_4x4_frame, 2,2
@@ -1961,6 +1992,7 @@ cglobal zigzag_scan_4x4_field, 2,2
 INIT_ZMM avx512
 cglobal zigzag_scan_8x8_frame, 2,2
     psrlw       m0, [scan_frame_avx512], 4
+scan8_avx512:
     mova        m1, [r1]
     mova        m2, [r1+64]
     psrlw       m3, m0, 6
@@ -1969,4 +2001,8 @@ cglobal zigzag_scan_8x8_frame, 2,2
     mova      [r0], m0
     mova   [r0+64], m1
     RET
+
+cglobal zigzag_scan_8x8_field, 2,2
+    mova        m0, [scan_field_avx512]
+    jmp scan8_avx512
 %endif ; !HIGH_BIT_DEPTH
diff --git a/common/x86/dct.h b/common/x86/dct.h
index 1a5c75c4..a11a6dcd 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -116,10 +116,11 @@ void x264_zigzag_scan_4x4_frame_avx512( dctcoef level[16], dctcoef dct[16] );
 void x264_zigzag_scan_4x4_field_sse   ( int16_t level[16], int16_t dct[16] );
 void x264_zigzag_scan_4x4_field_sse2  ( int32_t level[16], int32_t dct[16] );
 void x264_zigzag_scan_4x4_field_avx512( dctcoef level[16], dctcoef dct[16] );
-void x264_zigzag_scan_8x8_field_xop  ( int16_t level[64], int16_t dct[64] );
-void x264_zigzag_scan_8x8_field_avx  ( int32_t level[64], int32_t dct[64] );
-void x264_zigzag_scan_8x8_field_sse4 ( int32_t level[64], int32_t dct[64] );
-void x264_zigzag_scan_8x8_field_mmx2 ( int16_t level[64], int16_t dct[64] );
+void x264_zigzag_scan_8x8_field_mmx2  ( int16_t level[64], int16_t dct[64] );
+void x264_zigzag_scan_8x8_field_sse4  ( int32_t level[64], int32_t dct[64] );
+void x264_zigzag_scan_8x8_field_avx   ( int32_t level[64], int32_t dct[64] );
+void x264_zigzag_scan_8x8_field_xop   ( int16_t level[64], int16_t dct[64] );
+void x264_zigzag_scan_8x8_field_avx512( dctcoef level[64], dctcoef dct[64] );
 int  x264_zigzag_sub_4x4_frame_avx    ( int16_t level[16], const uint8_t *src, uint8_t *dst );
 int  x264_zigzag_sub_4x4_frame_ssse3  ( int16_t level[16], const uint8_t *src, uint8_t *dst );
 int  x264_zigzag_sub_4x4ac_frame_avx  ( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );



More information about the x264-devel mailing list