[x265] [PATCH 286 of 307] x86: AVX512 intra_pred_ang32 modes 9 and 27 for high bit depth

mythreyi at multicorewareinc.com
Sat Apr 7 04:34:44 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar<vignesh at multicorewareinc.com>
# Date 1515575222 -19800
#      Wed Jan 10 14:37:02 2018 +0530
# Node ID 59e596ff83801d7c3e3e01f6d6f64d26b2e8010f
# Parent  a4d60c45fdce6797486f25f5f319615b25bd86f0
x86: AVX512 intra_pred_ang32 modes 9 and 27 for high bit depth
TODO: optimise the TRANSPOSE_STORE macro for the AVX512 code path

AVX2 performance   : 12.63x
AVX512 performance : 16.73x

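Both modes share one helper because HEVC assigns them the same prediction angle (intraPredAngle = 2): mode 9 is the horizontal variant and mode 27 the vertical one, differing only in whether the block is transposed on store (hence the TRANSPOSE_STORE_AVX2 call in the mode-9 path). As a rough orientation, the scalar per-sample filter that the new ang16_mode_9_27 kernel vectorizes is sketched below; the function name, signature and the uint16_t pixel type are illustrative placeholders, not the x265 primitive interface.

    #include <cstdint>
    #include <cstddef>

    // Minimal sketch: angular prediction with intraPredAngle = 2 (modes 9/27)
    // at high bit depth. 'ref' is the main reference array: the row above the
    // block for mode 27, the column to its left for mode 9, with ref[0] being
    // the corner sample.
    static void ang32_frac2_sketch(uint16_t* dst, intptr_t stride,
                                   const uint16_t* ref, bool vertical)
    {
        for (int y = 0; y < 32; y++)
        {
            int pos    = (y + 1) * 2;   // angle 2: advance 2/32 per predicted line
            int offset = pos >> 5;      // integer part of the reference step
            int fract  = pos & 31;      // weight; corresponds to the ang_table loads [2]..[30]
            for (int x = 0; x < 32; x++)
            {
                int p0 = ref[x + offset + 1];
                int p1 = ref[x + offset + 2];
                int v  = ((32 - fract) * p0 + fract * p1 + 16) >> 5;
                if (vertical)
                    dst[y * stride + x] = (uint16_t)v;   // mode 27: store rows directly
                else
                    dst[x * stride + y] = (uint16_t)v;   // mode 9: transposed store
            }
        }
    }

The asm computes two weighted rows per 512-bit register (pmaddwd with the ang_table coefficients, paddd with pd_16, psrld by 5, packusdw), which is where the speedup over the AVX2 kernel comes from.
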
diff -r a4d60c45fdce -r 59e596ff8380 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Jan 10 10:15:39 2018 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Jan 10 14:37:02 2018 +0530
@@ -3097,9 +3097,11 @@
         p.cu[BLOCK_32x32].intra_pred[DC_IDX] = PFX(intra_pred_dc32_avx512);
         p.cu[BLOCK_32x32].intra_pred[2]      = PFX(intra_pred_ang32_2_avx512);
         p.cu[BLOCK_32x32].intra_pred[34]     = PFX(intra_pred_ang32_2_avx512);
+        p.cu[BLOCK_32x32].intra_pred[9] = PFX(intra_pred_ang32_9_avx512);
         p.cu[BLOCK_32x32].intra_pred[10] = PFX(intra_pred_ang32_10_avx512);
         p.cu[BLOCK_32x32].intra_pred[18]    = PFX(intra_pred_ang32_18_avx512);
         p.cu[BLOCK_32x32].intra_pred[26]    = PFX(intra_pred_ang32_26_avx512);
+        p.cu[BLOCK_32x32].intra_pred[27]    = PFX(intra_pred_ang32_27_avx512);
 
         p.pu[LUMA_64x64].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_64x64>;
         p.pu[LUMA_64x48].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_64x48>;
diff -r a4d60c45fdce -r 59e596ff8380 source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm	Wed Jan 10 10:15:39 2018 +0530
+++ b/source/common/x86/intrapred16.asm	Wed Jan 10 14:37:02 2018 +0530
@@ -11125,35 +11125,35 @@
 
 %macro TRANSPOSE_STORE_AVX2 11
     jnz             .skip%11
-    punpckhwd       m%9,  m%1,  m%2
-    punpcklwd       m%1,  m%2
-    punpckhwd       m%2,  m%3,  m%4
-    punpcklwd       m%3,  m%4
-
-    punpckldq       m%4,  m%1,  m%3
-    punpckhdq       m%1,  m%3
-    punpckldq       m%3,  m%9,  m%2
-    punpckhdq       m%9,  m%2
-
-    punpckhwd       m%10, m%5,  m%6
-    punpcklwd       m%5,  m%6
-    punpckhwd       m%6,  m%7,  m%8
-    punpcklwd       m%7,  m%8
-
-    punpckldq       m%8,  m%5,  m%7
-    punpckhdq       m%5,  m%7
-    punpckldq       m%7,  m%10, m%6
-    punpckhdq       m%10, m%6
-
-    punpcklqdq      m%6,  m%4,  m%8
-    punpckhqdq      m%2,  m%4,  m%8
-    punpcklqdq      m%4,  m%1,  m%5
-    punpckhqdq      m%8,  m%1,  m%5
-
-    punpcklqdq      m%1,  m%3,  m%7
-    punpckhqdq      m%5,  m%3,  m%7
-    punpcklqdq      m%3,  m%9,  m%10
-    punpckhqdq      m%7,  m%9,  m%10
+    punpckhwd       ym%9,  ym%1,  ym%2
+    punpcklwd       ym%1,  ym%2
+    punpckhwd       ym%2,  ym%3,  ym%4
+    punpcklwd       ym%3,  ym%4
+
+    punpckldq       ym%4,  ym%1,  ym%3
+    punpckhdq       ym%1,  ym%3
+    punpckldq       ym%3,  ym%9,  ym%2
+    punpckhdq       ym%9,  ym%2
+
+    punpckhwd       ym%10, ym%5,  ym%6
+    punpcklwd       ym%5,  ym%6
+    punpckhwd       ym%6,  ym%7,  ym%8
+    punpcklwd       ym%7,  ym%8
+
+    punpckldq       ym%8,  ym%5,  ym%7
+    punpckhdq       ym%5,  ym%7
+    punpckldq       ym%7,  ym%10, ym%6
+    punpckhdq       ym%10, ym%6
+
+    punpcklqdq      ym%6,  ym%4,  ym%8
+    punpckhqdq      ym%2,  ym%4,  ym%8
+    punpcklqdq      ym%4,  ym%1,  ym%5
+    punpckhqdq      ym%8,  ym%1,  ym%5
+
+    punpcklqdq      ym%1,  ym%3,  ym%7
+    punpckhqdq      ym%5,  ym%3,  ym%7
+    punpcklqdq      ym%3,  ym%9,  ym%10
+    punpckhqdq      ym%7,  ym%9,  ym%10
 
     movu            [r0 + r1 * 0 + %11], xm%6
     movu            [r0 + r1 * 1 + %11], xm%2
@@ -11167,28 +11167,28 @@
     movu            [r5 + r4 * 1 + %11], xm%7
 
     lea             r5, [r5 + r1 * 4]
-    vextracti128    [r5 + r1 * 0 + %11], m%6, 1
-    vextracti128    [r5 + r1 * 1 + %11], m%2, 1
-    vextracti128    [r5 + r1 * 2 + %11], m%4, 1
-    vextracti128    [r5 + r4 * 1 + %11], m%8, 1
+    vextracti128    [r5 + r1 * 0 + %11], ym%6, 1
+    vextracti128    [r5 + r1 * 1 + %11], ym%2, 1
+    vextracti128    [r5 + r1 * 2 + %11], ym%4, 1
+    vextracti128    [r5 + r4 * 1 + %11], ym%8, 1
 
     lea             r5, [r5 + r1 * 4]
-    vextracti128    [r5 + r1 * 0 + %11], m%1, 1
-    vextracti128    [r5 + r1 * 1 + %11], m%5, 1
-    vextracti128    [r5 + r1 * 2 + %11], m%3, 1
-    vextracti128    [r5 + r4 * 1 + %11], m%7, 1
+    vextracti128    [r5 + r1 * 0 + %11], ym%1, 1
+    vextracti128    [r5 + r1 * 1 + %11], ym%5, 1
+    vextracti128    [r5 + r1 * 2 + %11], ym%3, 1
+    vextracti128    [r5 + r4 * 1 + %11], ym%7, 1
     jmp             .end%11
 .skip%11:
-    movu            [r0 + r1 * 0], m%1
-    movu            [r0 + r1 * 1], m%2
-    movu            [r0 + r1 * 2], m%3
-    movu            [r0 + r4 * 1], m%4
+    movu            [r0 + r1 * 0], ym%1
+    movu            [r0 + r1 * 1], ym%2
+    movu            [r0 + r1 * 2], ym%3
+    movu            [r0 + r4 * 1], ym%4
 
     lea             r0, [r0 + r1 * 4]
-    movu            [r0 + r1 * 0], m%5
-    movu            [r0 + r1 * 1], m%6
-    movu            [r0 + r1 * 2], m%7
-    movu            [r0 + r4 * 1], m%8
+    movu            [r0 + r1 * 0], ym%5
+    movu            [r0 + r1 * 1], ym%6
+    movu            [r0 + r1 * 2], ym%7
+    movu            [r0 + r4 * 1], ym%8
     lea             r0, [r0 + r1 * 4]
 .end%11:
 %endmacro
@@ -18640,6 +18640,145 @@
     movu        [r0 + r1 * 2],      m0
     movu        [r0 + r2],          m0
     RET
+
+;; angle 16, modes 9 and 27
+cglobal ang16_mode_9_27
+    test            r6d, r6d
+
+    vbroadcasti32x8 m0, [r2 + 2]                    ; [16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
+    vbroadcasti32x8 m1, [r2 + 4]                    ; [17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2]
+
+    punpcklwd       m3, m0, m1                       ; [13 12 12 11 11 10 10  9  5  4  4  3  3  2  2  1]
+    punpckhwd       m0, m1                           ; [17 16 16 15 15 14 14 13  9  8  8  7  7  6  6  5]
+
+    vbroadcasti32x8 m2, [r2 + 18]                    ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10  9]
+    vbroadcasti32x8 m4, [r2 + 20]                    ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10]
+    punpcklwd       m2, m4                           ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10  9]
+
+    movu            ym16, [r3 - 14 * 32]            ; [2]
+    vinserti32x8    m16,  [r3 - 12 * 32], 1         ; [4]
+    pmaddwd         m4, m3, m16
+    paddd           m4, m15
+    psrld           m4, 5
+    pmaddwd         m5, m0, m16
+    paddd           m5, m15
+    psrld           m5, 5
+    packusdw        m4, m5
+    vextracti32x8   ym5, m4, 1
+    movu            ym16, [r3 - 10 * 32]            ; [6]
+    vinserti32x8    m16,  [r3 - 8 * 32], 1          ; [8]
+    pmaddwd         m6, m3, m16
+    paddd           m6, m15
+    psrld           m6, 5
+    pmaddwd         m9, m0, m16
+    paddd           m9, m15
+    psrld           m9, 5
+    packusdw        m6, m9
+    vextracti32x8   ym7, m6, 1
+    movu            ym16, [r3 - 6 * 32]             ; [10]
+    vinserti32x8    m16,  [r3 - 4 * 32], 1          ; [12]
+    pmaddwd         m8, m3, m16
+    paddd           m8, m15
+    psrld           m8, 5
+    pmaddwd         m9, m0, m16
+    paddd           m9, m15
+    psrld           m9, 5
+    packusdw        m8, m9
+    vextracti32x8   ym9, m8, 1
+    movu            ym16, [r3 - 2 * 32]             ; [14]
+    vinserti32x8    m16,  [r3], 1                   ; [16]
+    pmaddwd         m10, m3, m16
+    paddd           m10, m15
+    psrld           m10, 5
+    pmaddwd         m1, m0, m16
+    paddd           m1, m15
+    psrld           m1, 5
+    packusdw        m10, m1
+    vextracti32x8   ym11, m10, 1
+    TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 2, 1, 0
+
+    movu            ym16, [r3 + 2 * 32]             ; [18]
+    vinserti32x8    m16,  [r3 + 4 * 32], 1          ; [20]
+    pmaddwd         m4, m3, m16
+    paddd           m4, m15
+    psrld           m4, 5
+    pmaddwd         m5, m0, m16
+    paddd           m5, m15
+    psrld           m5, 5
+    packusdw        m4, m5
+    vextracti32x8   ym5, m4, 1
+    movu            ym16, [r3 + 6 * 32]             ; [22]
+    vinserti32x8    m16,  [r3 + 8 * 32], 1          ; [24]
+    pmaddwd         m6, m3, m16
+    paddd           m6, m15
+    psrld           m6, 5
+    pmaddwd         m8, m0, m16
+    paddd           m8, m15
+    psrld           m8, 5
+    packusdw        m6, m8
+    vextracti32x8   ym7, m6, 1
+    movu            ym16, [r3 + 10 * 32]            ; [26]
+    vinserti32x8    m16,  [r3 + 12 * 32], 1         ; [28]
+    pmaddwd         m8, m3, m16
+    paddd           m8, m15
+    psrld           m8, 5
+    pmaddwd         m9, m0, m16
+    paddd           m9, m15
+    psrld           m9, 5
+    packusdw        m8, m9
+    vextracti32x8   ym9, m8, 1
+    movu            ym16, [r3 + 14 * 32]            ; [30]
+    pmaddwd         ym3, ym16
+    paddd           ym3, ym15
+    psrld           ym3, 5
+    pmaddwd         ym0, ym16
+    paddd           ym0, ym15
+    psrld           ym0, 5
+    packusdw        ym3, ym0
+
+    movu            ym1, [r2 + 4]
+    TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 3, 1, 0, 2, 16
+    ret
+
+cglobal intra_pred_ang32_9, 3,8,17
+    add         r2,        128
+    xor         r6d,       r6d
+    lea         r3,        [ang_table_avx2 + 16 * 32]
+    shl         r1d,       1
+    lea         r4,        [r1 * 3]
+    lea         r7,        [r0 + 8 * r1]
+    vbroadcasti32x8  m15,  [pd_16]
+
+    call        ang16_mode_9_27
+    add         r2,        2
+    lea         r0,        [r0 + 32]
+    call        ang16_mode_9_27
+    add         r2,        30
+    lea         r0,        [r7 + 8 * r1]
+    call        ang16_mode_9_27
+    add         r2,        2
+    lea         r0,        [r0 + 32]
+    call        ang16_mode_9_27
+    RET
+
+cglobal intra_pred_ang32_27, 3,7,17
+    xor         r6d,       r6d
+    inc         r6d
+    lea         r3,        [ang_table_avx2 + 16 * 32]
+    shl         r1d,       1
+    lea         r4,        [r1 * 3]
+    lea         r5,        [r0 + 32]
+    vbroadcasti32x8  m15,  [pd_16]
+
+    call        ang16_mode_9_27
+    add         r2,        2
+    call        ang16_mode_9_27
+    add         r2,        30
+    mov         r0,        r5
+    call        ang16_mode_9_27
+    add         r2,        2
+    call        ang16_mode_9_27
+    RET
 ;-------------------------------------------------------------------------------------------------------
 ; avx512 code for intra_pred_ang32 mode 2 to 34 end
 ;-------------------------------------------------------------------------------------------------------

