[x265] [PATCH 5 of 5] asm: optimized intra_ang16 mode 11 avx2 asm, 520c->370c

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Tue Aug 18 06:11:39 CEST 2015


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1439816850 -19800
#      Mon Aug 17 18:37:30 2015 +0530
# Node ID 6ff0bcad1688f5ee1e393c648739ed2ae7e79b61
# Parent  e75f3a2f1d29f01ca2d71f1b8be970d471b5e1f6
asm: optimized intra_ang16 mode 11 avx2 asm, 520c->370c

diff -r e75f3a2f1d29 -r 6ff0bcad1688 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm	Mon Aug 17 17:24:37 2015 +0530
+++ b/source/common/x86/intrapred8.asm	Mon Aug 17 18:37:30 2015 +0530
@@ -425,6 +425,9 @@
 const ang32_shuf_mode11,    times 8 db 1, 2
                             times 8 db 0, 1
 
+const ang16_shuf_mode11,    times 8 db 0, 1     ; first 16 bytes: pshufb control selecting source bytes 0,1 eight times
+                            times 8 db 1, 2     ; next 16 bytes: pshufb control selecting source bytes 1,2 eight times
+
 const ang_table
 %assign x 0
 %rep 32
@@ -15630,130 +15633,106 @@
     INTRA_PRED_TRANS_STORE_16x16
     RET
 
-
-INIT_YMM avx2
-cglobal intra_pred_ang16_11, 3,4,5
-    mova                m0, [angHor_tab_11]
-    mova                m1, [pw_1024]
+INIT_YMM avx2
+cglobal intra_pred_ang16_11, 3,4,8              ; body uses r0-r3 and m0-m7; r0 is written, r1 scales the row offset, r2 is read
+    vbroadcasti128      m0, [angHor_tab_11]                 ; first 16 bytes of the weight table, copied to both 128-bit lanes
+    vbroadcasti128      m1, [angHor_tab_11 + mmsize/2]      ; following 16 bytes of the weight table, copied to both lanes
+    mova                m2, [pw_1024]                       ; pmulhrsw multiplier; if these are words of 1024 the result is (x + 16) / 32 rounded down - defined outside this hunk, confirm
+    mova                m7, [ang16_shuf_mode11]             ; byte-pair selector: low lane picks bytes 0,1 and high lane picks bytes 1,2
     lea                 r3, [r1 * 3]
 
     ; prepare for [0 -1 -2 ...]
-    movu               xm2, [r2 + 32]
     ; TODO: input reference pixel buffer need a duplicate of pixel_lt to avoid reduce instruction in every mode
-    pinsrb             xm2, [r2], 0
-    pshufb             xm2, [intra_pred_shuff_0_8]      ; [0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8]
-
-
-    vpbroadcastw        m3, xm2                         ; word [1 0]
-    psrldq             xm2, 2
-    vpbroadcastw        m4, xm2                         ; word [2 1]
-    psrldq             xm2, 2
-    pmaddubsw           m3, m0
-    pmaddubsw           m4, m0
-    pmulhrsw            m3, m1
-    pmulhrsw            m4, m1
-    packuswb            m3, m4
-    vpermq              m3, m3, q3120
-    movu                [r0], xm3
-    vextracti128        [r0 + r1], m3, 1
-
-    vpbroadcastw        m3, xm2                         ; word [3 2]
-    psrldq             xm2, 2
-    vpbroadcastw        m4, xm2                         ; word [4 3]
-    psrldq             xm2, 2
-    pmaddubsw           m3, m0
-    pmaddubsw           m4, m0
-    pmulhrsw            m3, m1
-    pmulhrsw            m4, m1
-    packuswb            m3, m4
-    vpermq              m3, m3, q3120
-    movu                [r0 + r1 * 2], xm3
-    vextracti128        [r0 + r3], m3, 1
+    movu               xm3, [r2 + mmsize]       ; 16 bytes starting at r2 + 32 (mmsize is 32 under INIT_YMM)
+    pinsrb             xm3, [r2], 0             ; overwrite byte 0 with the byte stored at [r2]
+    vbroadcasti128      m6, [r2 + mmsize + 16]  ; the next 16 bytes (r2 + 48) in both lanes; supplies the bytes palignr shifts in
+    vinserti128         m3, m3, xm3, 1          ; duplicate the low lane so both lanes hold the same 16 source bytes
+
+    pshufb              m5, m3, m7              ; [ 0  1  0  1  0  1  0  1  0  1  0  1  0  1  0  1  1  2  1  2  1  2  1  2  1  2  1  2  1  2  1  2]
+    pmaddubsw           m4, m5, m0              ; weighted pair sums using the first 8 weight pairs (8 words per lane)
+    pmaddubsw           m5, m1                  ; weighted pair sums using the next 8 weight pairs
+    pmulhrsw            m4, m2                  ; rounding scale of the word sums by m2
+    pmulhrsw            m5, m2
+    packuswb            m4, m5                  ; per lane: 8 bytes from m4 then 8 from m5, so each lane is one finished 16-byte row
+    movu                [r0], xm4               ; low lane  -> row at r0
+    vextracti128        [r0 + r1], m4, 1        ; high lane -> row at r0 + r1
+
+    palignr             m5, m6, m3, 2           ; per lane: m6:m3 shifted right 2 bytes, byte 0 is now source byte 2
+    pshufb              m5, m7                  ; low lane pairs source bytes 2,3; high lane pairs 3,4
+    pmaddubsw           m4, m5, m0
+    pmaddubsw           m5, m1
+    pmulhrsw            m4, m2
+    pmulhrsw            m5, m2
+    packuswb            m4, m5
+    movu                [r0 + r1 * 2], xm4      ; low lane  -> row at r0 + 2 * r1
+    vextracti128        [r0 + r3], m4, 1        ; high lane -> row at r0 + 3 * r1 (r3 = r1 * 3)
+
     lea                 r0, [r0 + r1 * 4]
 
-    vpbroadcastw        m3, xm2                         ; word [5 4]
-    psrldq             xm2, 2
-    vpbroadcastw        m4, xm2                         ; word [6 5]
-    psrldq             xm2, 2
-    pmaddubsw           m3, m0
-    pmaddubsw           m4, m0
-    pmulhrsw            m3, m1
-    pmulhrsw            m4, m1
-    packuswb            m3, m4
-    vpermq              m3, m3, q3120
-    movu                [r0], xm3
-    vextracti128        [r0 + r1], m3, 1
-
-    vpbroadcastw        m3, xm2                         ; word [7 6]
-    psrldq             xm2, 2
-    vpbroadcastw        m4, xm2                         ; word [8 7]
-    pmaddubsw           m3, m0
-    pmaddubsw           m4, m0
-    pmulhrsw            m3, m1
-    pmulhrsw            m4, m1
-    packuswb            m3, m4
-    vpermq              m3, m3, q3120
-    movu                [r0 + r1 * 2], xm3
-    vextracti128        [r0 + r3], m3, 1
+    palignr             m5, m6, m3, 4           ; byte 0 is now source byte 4
+    pshufb              m5, m7                  ; low lane pairs source bytes 4,5; high lane pairs 5,6
+    pmaddubsw           m4, m5, m0
+    pmaddubsw           m5, m1
+    pmulhrsw            m4, m2
+    pmulhrsw            m5, m2
+    packuswb            m4, m5
+    movu                [r0], xm4
+    vextracti128        [r0 + r1], m4, 1
+
+    palignr             m5, m6, m3, 6           ; byte 0 is now source byte 6
+    pshufb              m5, m7                  ; low lane pairs source bytes 6,7; high lane pairs 7,8
+    pmaddubsw           m4, m5, m0
+    pmaddubsw           m5, m1
+    pmulhrsw            m4, m2
+    pmulhrsw            m5, m2
+    packuswb            m4, m5
+    movu                [r0 + r1 * 2], xm4
+    vextracti128        [r0 + r3], m4, 1
+
     lea                 r0, [r0 + r1 * 4]
 
-    ; loading new reference pixels
-    movu               xm2, [r2 + 32 + 8]
-    pshufb             xm2, [intra_pred_shuff_0_8]      ; [8 9 9 A A B B C C D D E E F F 10]
-
-    vpbroadcastw        m3, xm2                         ; word [9 8]
-    psrldq             xm2, 2
-    vpbroadcastw        m4, xm2                         ; word [A 9]
-    psrldq             xm2, 2
-    pmaddubsw           m3, m0
-    pmaddubsw           m4, m0
-    pmulhrsw            m3, m1
-    pmulhrsw            m4, m1
-    packuswb            m3, m4
-    vpermq              m3, m3, q3120
-    movu                [r0], xm3
-    vextracti128        [r0 + r1], m3, 1
-
-    vpbroadcastw        m3, xm2                         ; word [B A]
-    psrldq             xm2, 2
-    vpbroadcastw        m4, xm2                         ; word [C B]
-    psrldq             xm2, 2
-    pmaddubsw           m3, m0
-    pmaddubsw           m4, m0
-    pmulhrsw            m3, m1
-    pmulhrsw            m4, m1
-    packuswb            m3, m4
-    vpermq              m3, m3, q3120
-    movu                [r0 + r1 * 2], xm3
-    vextracti128        [r0 + r3], m3, 1
+    palignr             m5, m6, m3, 8           ; byte 0 is now source byte 8
+    pshufb              m5, m7                  ; low lane pairs source bytes 8,9; high lane pairs 9,10
+    pmaddubsw           m4, m5, m0
+    pmaddubsw           m5, m1
+    pmulhrsw            m4, m2
+    pmulhrsw            m5, m2
+    packuswb            m4, m5
+    movu                [r0], xm4
+    vextracti128        [r0 + r1], m4, 1
+
+    palignr             m5, m6, m3, 10          ; byte 0 is now source byte 10
+    pshufb              m5, m7                  ; low lane pairs source bytes 10,11; high lane pairs 11,12
+    pmaddubsw           m4, m5, m0
+    pmaddubsw           m5, m1
+    pmulhrsw            m4, m2
+    pmulhrsw            m5, m2
+    packuswb            m4, m5
+    movu                [r0 + r1 * 2], xm4
+    vextracti128        [r0 + r3], m4, 1
+
     lea                 r0, [r0 + r1 * 4]
 
-    vpbroadcastw        m3, xm2                         ; word [D C]
-    psrldq             xm2, 2
-    vpbroadcastw        m4, xm2                         ; word [E D]
-    psrldq             xm2, 2
-    pmaddubsw           m3, m0
-    pmaddubsw           m4, m0
-    pmulhrsw            m3, m1
-    pmulhrsw            m4, m1
-    packuswb            m3, m4
-    vpermq              m3, m3, q3120
-    movu                [r0], xm3
-    vextracti128        [r0 + r1], m3, 1
-
-    vpbroadcastw        m3, xm2                         ; word [F E]
-    psrldq             xm2, 2
-    vpbroadcastw        m4, xm2                         ; word [10 F]
-    pmaddubsw           m3, m0
-    pmaddubsw           m4, m0
-    pmulhrsw            m3, m1
-    pmulhrsw            m4, m1
-    packuswb            m3, m4
-    vpermq              m3, m3, q3120
-    movu                [r0 + r1 * 2], xm3
-    vextracti128        [r0 + r3], m3, 1
-    RET    
-
+    palignr             m5, m6, m3, 12          ; byte 0 is now source byte 12
+    pshufb              m5, m7                  ; low lane pairs source bytes 12,13; high lane pairs 13,14
+    pmaddubsw           m4, m5, m0
+    pmaddubsw           m5, m1
+    pmulhrsw            m4, m2
+    pmulhrsw            m5, m2
+    packuswb            m4, m5
+    movu                [r0], xm4
+    vextracti128        [r0 + r1], m4, 1
+
+    palignr             m5, m6, m3, 14          ; byte 0 is now source byte 14; bytes 2 and up come from m6
+    pshufb              m5, m7                  ; low lane pairs source bytes 14,15; high lane pairs 15,16 (16 = first byte of m6)
+    pmaddubsw           m4, m5, m0
+    pmaddubsw           m5, m1
+    pmulhrsw            m4, m2
+    pmulhrsw            m5, m2
+    packuswb            m4, m5
+    movu                [r0 + r1 * 2], xm4
+    vextracti128        [r0 + r3], m4, 1
+    RET
 
 ; transpose 8x32 to 16x16, used for intra_ang16x16 avx2 asm
 %if ARCH_X86_64 == 1


More information about the x265-devel mailing list