[x265] [PATCH] asm: avx2 code for intra_ang_16 mode 17, improved over 65% than SSE asm

rajesh at multicorewareinc.com rajesh at multicorewareinc.com
Tue Aug 25 07:58:28 CEST 2015


# HG changeset patch
# User Rajesh Paulraj<rajesh at multicorewareinc.com>
# Date 1440482220 -19800
#      Tue Aug 25 11:27:00 2015 +0530
# Node ID 65feb1620237d624296276635b2f658c0b1b1719
# Parent  8a414544bfbf64b119fa6dd2e23cef8cb89d0a54
asm: avx2 code for intra_ang_16 mode 17, improved over 65% than SSE asm

diff -r 8a414544bfbf -r 65feb1620237 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Aug 24 16:25:39 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Aug 25 11:27:00 2015 +0530
@@ -2977,6 +2977,7 @@
         p.cu[BLOCK_16x16].intra_pred[14] = PFX(intra_pred_ang16_14_avx2);
         p.cu[BLOCK_16x16].intra_pred[15] = PFX(intra_pred_ang16_15_avx2);
         p.cu[BLOCK_16x16].intra_pred[16] = PFX(intra_pred_ang16_16_avx2);
+        p.cu[BLOCK_16x16].intra_pred[17] = PFX(intra_pred_ang16_17_avx2);
         p.cu[BLOCK_16x16].intra_pred[25] = PFX(intra_pred_ang16_25_avx2);
         p.cu[BLOCK_16x16].intra_pred[28] = PFX(intra_pred_ang16_28_avx2);
         p.cu[BLOCK_16x16].intra_pred[27] = PFX(intra_pred_ang16_27_avx2);
diff -r 8a414544bfbf -r 65feb1620237 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm	Mon Aug 24 16:25:39 2015 +0530
+++ b/source/common/x86/intrapred8.asm	Tue Aug 25 11:27:00 2015 +0530
@@ -500,6 +500,13 @@
 const angHor_tab_16, db (32-11), 11, (32-22), 22, (32-1), 1, (32-12), 12, (32-23), 23, (32-2), 2, (32-13), 13, (32-24), 24
                      db (32-3), 3, (32-14), 14, (32-25), 25, (32-4), 4, (32-15), 15, (32-26), 26, (32-5), 5, (32-16), 16
 
+const ang16_shuf_mode17,   db 12, 13, 11, 12, 10, 11, 9, 10, 8, 9, 8, 9, 7, 8, 6, 7, 13, 14, 12, 13, 11, 12, 10, 11, 9, 10, 9, 10, 8, 9, 7, 8
+                           db 5, 6, 4, 5, 4, 5, 3, 4, 2, 3, 1, 2, 0, 1, 0, 1, 6, 7, 5, 6, 5, 6, 4, 5, 3, 4, 2, 3, 1, 2, 1, 2
+                           db 0, 0, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 0, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0
+
+const angHor_tab_17, db (32- 6),  6, (32-12), 12, (32-18), 18, (32-24), 24, (32-30), 30, (32- 4),  4, (32-10), 10, (32-16), 16
+                     db (32-22), 22, (32-28), 28, (32- 2),  2, (32- 8),  8, (32-14), 14, (32-20), 20, (32-26), 26, (32- 0),  0
+
 const ang_table
 %assign x 0
 %rep 32
@@ -15195,6 +15202,119 @@
     RET
 
 INIT_YMM avx2
+cglobal intra_pred_ang16_17, 3,4,9
+    vbroadcasti128    m0, [angHor_tab_17]
+    vbroadcasti128    m1, [angHor_tab_17 + mmsize/2]
+    mova              m2, [pw_1024]
+    mova              m7, [ang16_shuf_mode17]
+    mova              m8, [ang16_shuf_mode17 + mmsize]
+    lea               r3, [r1 * 3]
+
+    vbroadcasti128    m3, [r2 + mmsize + 1]
+    vbroadcasti128    m4, [r2]
+    pshufb            m4, [ang16_shuf_mode17 + mmsize * 2]
+    palignr           m3, m4, 3
+    vbroadcasti128    m6, [r2 + mmsize + 4]
+
+    pshufb            m4, m3, m7
+    pshufb            m5, m3, m8
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0], xm4
+    vextracti128      [r0 + r1], m4, 1
+
+    palignr           m5, m6, m3, 2
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0 + r1 * 2], xm4
+    vextracti128      [r0 + r3], m4, 1
+    lea               r0, [r0 + r1 * 4]
+
+    palignr           m5, m6, m3, 4
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0], xm4
+    vextracti128      [r0 + r1], m4, 1
+
+    palignr           m5, m6, m3, 6
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0 + r1 * 2], xm4
+    vextracti128      [r0 + r3], m4, 1
+    lea               r0, [r0 + r1 * 4]
+
+    palignr           m5, m6, m3, 8
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0], xm4
+    vextracti128      [r0 + r1], m4, 1
+
+    palignr           m5, m6, m3, 10
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0 + r1 * 2], xm4
+    vextracti128      [r0 + r3], m4, 1
+    lea               r0, [r0 + r1 * 4]
+
+    palignr           m5, m6, m3, 12
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0], xm4
+    vextracti128      [r0 + r1], m4, 1
+
+    palignr           m5, m6, m3, 14
+    pshufb            m4, m5, m7
+    pshufb            m5, m8
+
+    pmaddubsw         m4, m0
+    pmaddubsw         m5, m1
+    pmulhrsw          m4, m2
+    pmulhrsw          m5, m2
+    packuswb          m4, m5
+    movu              [r0 + r1 * 2], xm4
+    vextracti128      [r0 + r3], m4, 1
+    RET
+
+INIT_YMM avx2
 cglobal intra_pred_ang16_11, 3,4,5
     mova                m0, [angHor_tab_11]
     mova                m1, [pw_1024]


More information about the x265-devel mailing list