[x265] [PATCH] asm: avx2 code for intra_pred_ang32x32 mode 2, 3, 33 & 34

dnyaneshwar at multicorewareinc.com
Tue Jun 9 10:32:13 CEST 2015


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1433833289 -19800
#      Tue Jun 09 12:31:29 2015 +0530
# Node ID 05df60c805d3f423db573885eb9f27b17dbc12a7
# Parent  b252468dde7ffca57da27575388d95ce538945d2
asm: avx2 code for intra_pred_ang32x32 mode 2, 3, 33 & 34

Performance improvement over SSE (cycles, percentage reduction):
intra_ang_32x32[ 2]    1382c->669c, 51%
intra_ang_32x32[ 3]    9887c->4088c, 58%
intra_ang_32x32[33]    7021c->2773c, 60%
intra_ang_32x32[34]    1278c->670c, 47%
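
For reviewers less familiar with the angular modes: 2 and 34 are the two pure
45-degree directions, so every predicted row is simply the main reference array
advanced by one more sample, which is why intra_pred_ang32_2 below reduces to
overlapping loads plus palignr shifts. A rough scalar sketch of that behaviour
(illustrative only; the ang32_diag_scalar/refMain names and the x + y + 2
indexing follow the generic HEVC angular formula with intraPredAngle = 32,
not x265's exact C primitive):

    #include <stdint.h>

    // Illustrative scalar model, 16-bit path as in intrapred16.asm:
    // 32x32 pure diagonal prediction (modes 2 and 34, intraPredAngle = 32).
    // Every row copies the main reference advanced by one sample;
    // refMain must hold at least 65 reference samples.
    typedef uint16_t pixel;

    static void ang32_diag_scalar(pixel* dst, intptr_t dstStride,
                                  const pixel* refMain)
    {
        for (int y = 0; y < 32; y++)
            for (int x = 0; x < 32; x++)
                dst[y * dstStride + x] = refMain[x + y + 2];
    }

Modes 3 and 33 use intraPredAngle = 26 instead, so each sample blends two
adjacent reference samples with a 5-bit fractional weight; that work stays in
the shared ang16_mode_3_33 helper, which the new 32x32 routines call four times.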

diff -r b252468dde7f -r 05df60c805d3 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Jun 09 10:16:44 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Jun 09 12:31:29 2015 +0530
@@ -1256,6 +1256,11 @@
         p.cu[BLOCK_16x16].intra_pred[33]    = x265_intra_pred_ang16_33_avx2;
         p.cu[BLOCK_16x16].intra_pred[34]    = x265_intra_pred_ang16_2_avx2;
 
+        p.cu[BLOCK_32x32].intra_pred[2]     = x265_intra_pred_ang32_2_avx2;
+        p.cu[BLOCK_32x32].intra_pred[3]     = x265_intra_pred_ang32_3_avx2;
+        p.cu[BLOCK_32x32].intra_pred[33]    = x265_intra_pred_ang32_33_avx2;
+        p.cu[BLOCK_32x32].intra_pred[34]    = x265_intra_pred_ang32_2_avx2;
+
         p.pu[LUMA_8x4].addAvg   = x265_addAvg_8x4_avx2;
         p.pu[LUMA_8x8].addAvg   = x265_addAvg_8x8_avx2;
         p.pu[LUMA_8x16].addAvg  = x265_addAvg_8x16_avx2;
diff -r b252468dde7f -r 05df60c805d3 source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h	Tue Jun 09 10:16:44 2015 +0530
+++ b/source/common/x86/intrapred.h	Tue Jun 09 12:31:29 2015 +0530
@@ -276,6 +276,7 @@
 void x265_intra_pred_ang16_22_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang32_34_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang32_2_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
+void x265_intra_pred_ang32_3_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang32_26_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang32_27_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang32_28_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
diff -r b252468dde7f -r 05df60c805d3 source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm	Tue Jun 09 10:16:44 2015 +0530
+++ b/source/common/x86/intrapred16.asm	Tue Jun 09 12:31:29 2015 +0530
@@ -12578,16 +12578,6 @@
     call        ang16_mode_3_33
     RET
 
-cglobal intra_pred_ang32_3, 3,7,13
-    add         r2,        128
-    xor         r6d,       r6d
-    lea         r3,        [ang_table_avx2 + 16 * 32]
-    add         r1d,       r1d
-    lea         r4,        [r1 * 3]
-
-    call        ang16_mode_3_33
-    RET
-
 cglobal intra_pred_ang16_33, 3,7,13
     xor         r6d,       r6d
     inc         r6d
@@ -13146,6 +13136,218 @@
 ; end of avx2 code for intra_pred_ang16 mode 2 to 34
 ;-------------------------------------------------------------------------------------------------------
 
+;-------------------------------------------------------------------------------------------------------
+; avx2 code for intra_pred_ang32 mode 2 to 34 start
+;-------------------------------------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal intra_pred_ang32_2, 3,5,6
+    lea         r4,                 [r2]
+    add         r2,                 128
+    cmp         r3m,                byte 34
+    cmove       r2,                 r4
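+    ; modes 2 and 34 share this routine: for dirMode 34 the cmove keeps r2 at
+    ; the above-neighbour samples, for mode 2 the reads go through r2 + 128
+    ; into the left-neighbour samples; the prediction is the same pure
+    ; diagonal copy either way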
+    add         r1d,                r1d
+    lea         r3,                 [r1 * 3]
+    movu        m0,                 [r2 + 4]
+    movu        m1,                 [r2 + 20]
+    movu        m3,                 [r2 + 36]
+    movu        m4,                 [r2 + 52]
+
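+    ; each output row is the 32-pixel reference window advanced by one pixel
+    ; (2 bytes); palignr over the overlapping loads in m0/m1 and m3/m4
+    ; produces the shifted rows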
+    movu        [r0],               m0
+    movu        [r0 + 32],          m3
+    palignr     m2,                 m1, m0, 2
+    palignr     m5,                 m4, m3, 2
+    movu        [r0 + r1],          m2
+    movu        [r0 + r1 + 32],     m5
+    palignr     m2,                 m1, m0, 4
+    palignr     m5,                 m4, m3, 4
+    movu        [r0 + r1 * 2],      m2
+    movu        [r0 + r1 * 2 + 32], m5
+    palignr     m2,                 m1, m0, 6
+    palignr     m5,                 m4, m3, 6
+    movu        [r0 + r3],          m2
+    movu        [r0 + r3 + 32],     m5
+
+    lea         r0,                 [r0 + r1 * 4]
+    palignr     m2,                 m1, m0, 8
+    palignr     m5,                 m4, m3, 8
+    movu        [r0],               m2
+    movu        [r0 + 32],          m5
+    palignr     m2,                 m1, m0, 10
+    palignr     m5,                 m4, m3, 10
+    movu        [r0 + r1],          m2
+    movu        [r0 + r1 + 32],     m5
+    palignr     m2,                 m1, m0, 12
+    palignr     m5,                 m4, m3, 12
+    movu        [r0 + r1 * 2],      m2
+    movu        [r0 + r1 * 2 + 32], m5
+    palignr     m2,                 m1, m0, 14
+    palignr     m5,                 m4, m3, 14
+    movu        [r0 + r3],          m2
+    movu        [r0 + r3 + 32],     m5
+
+    movu        m0,                 [r2 + 36]
+    movu        m3,                 [r2 + 68]
+    lea         r0,                 [r0 + r1 * 4]
+    movu        [r0],               m1
+    movu        [r0 + 32],          m4
+    palignr     m2,                 m0, m1, 2
+    palignr     m5,                 m3, m4, 2
+    movu        [r0 + r1],          m2
+    movu        [r0 + r1 + 32],     m5
+    palignr     m2,                 m0, m1, 4
+    palignr     m5,                 m3, m4, 4
+    movu        [r0 + r1 * 2],      m2
+    movu        [r0 + r1 * 2 + 32], m5
+    palignr     m2,                 m0, m1, 6
+    palignr     m5,                 m3, m4, 6
+    movu        [r0 + r3],          m2
+    movu        [r0 + r3 + 32],     m5
+
+    lea         r0,                 [r0 + r1 * 4]
+    palignr     m2,                 m0, m1, 8
+    palignr     m5,                 m3, m4, 8
+    movu        [r0],               m2
+    movu        [r0 + 32],          m5
+    palignr     m2,                 m0, m1, 10
+    palignr     m5,                 m3, m4, 10
+    movu        [r0 + r1],          m2
+    movu        [r0 + r1 + 32],     m5
+    palignr     m2,                 m0, m1, 12
+    palignr     m5,                 m3, m4, 12
+    movu        [r0 + r1 * 2],      m2
+    movu        [r0 + r1 * 2 + 32], m5
+    palignr     m2,                 m0, m1, 14
+    palignr     m5,                 m3, m4, 14
+    movu        [r0 + r3],          m2
+    movu        [r0 + r3 + 32],     m5
+
+    lea         r0,                 [r0 + r1 * 4]
+    movu        m1,                 [r2 + 52]
+    movu        m4,                 [r2 + 84]
+
+    movu        [r0],               m0
+    movu        [r0 + 32],          m3
+    palignr     m2,                 m1, m0, 2
+    palignr     m5,                 m4, m3, 2
+    movu        [r0 + r1],          m2
+    movu        [r0 + r1 + 32],     m5
+    palignr     m2,                 m1, m0, 4
+    palignr     m5,                 m4, m3, 4
+    movu        [r0 + r1 * 2],      m2
+    movu        [r0 + r1 * 2 + 32], m5
+    palignr     m2,                 m1, m0, 6
+    palignr     m5,                 m4, m3, 6
+    movu        [r0 + r3],          m2
+    movu        [r0 + r3 + 32],     m5
+
+    lea         r0,                 [r0 + r1 * 4]
+    palignr     m2,                 m1, m0, 8
+    palignr     m5,                 m4, m3, 8
+    movu        [r0],               m2
+    movu        [r0 + 32],          m5
+    palignr     m2,                 m1, m0, 10
+    palignr     m5,                 m4, m3, 10
+    movu        [r0 + r1],          m2
+    movu        [r0 + r1 + 32],     m5
+    palignr     m2,                 m1, m0, 12
+    palignr     m5,                 m4, m3, 12
+    movu        [r0 + r1 * 2],      m2
+    movu        [r0 + r1 * 2 + 32], m5
+    palignr     m2,                 m1, m0, 14
+    palignr     m5,                 m4, m3, 14
+    movu        [r0 + r3],          m2
+    movu        [r0 + r3 + 32],     m5
+
+    movu        m0,                 [r2 + 68]
+    movu        m3,                 [r2 + 100]
+    lea         r0,                 [r0 + r1 * 4]
+    movu        [r0],               m1
+    movu        [r0 + 32],          m4
+    palignr     m2,                 m0, m1, 2
+    palignr     m5,                 m3, m4, 2
+    movu        [r0 + r1],          m2
+    movu        [r0 + r1 + 32],     m5
+    palignr     m2,                 m0, m1, 4
+    palignr     m5,                 m3, m4, 4
+    movu        [r0 + r1 * 2],      m2
+    movu        [r0 + r1 * 2 + 32], m5
+    palignr     m2,                 m0, m1, 6
+    palignr     m5,                 m3, m4, 6
+    movu        [r0 + r3],          m2
+    movu        [r0 + r3 + 32],     m5
+
+    lea         r0,                 [r0 + r1 * 4]
+    palignr     m2,                 m0, m1, 8
+    palignr     m5,                 m3, m4, 8
+    movu        [r0],               m2
+    movu        [r0 + 32],          m5
+    palignr     m2,                 m0, m1, 10
+    palignr     m5,                 m3, m4, 10
+    movu        [r0 + r1],          m2
+    movu        [r0 + r1 + 32],     m5
+    palignr     m2,                 m0, m1, 12
+    palignr     m5,                 m3, m4, 12
+    movu        [r0 + r1 * 2],      m2
+    movu        [r0 + r1 * 2 + 32], m5
+    palignr     m2,                 m0, m1, 14
+    palignr     m5,                 m3, m4, 14
+    movu        [r0 + r3],          m2
+    movu        [r0 + r3 + 32],     m5
+    RET
+
+cglobal intra_pred_ang32_3, 3,8,13
+    add         r2,        128
+    xor         r6d,       r6d
+    lea         r3,        [ang_table_avx2 + 16 * 32]
+    add         r1d,       r1d
+    lea         r4,        [r1 * 3]
+    lea         r7,        [r0 + 8 * r1]
+
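+    ; build the 32x32 block as four 16x16 blocks with the shared
+    ; ang16_mode_3_33 helper (r6d = 0 selects its mode 3 path), stepping
+    ; r2 and the destination between the calls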
+    call        ang16_mode_3_33
+
+    add         r2,        26
+    lea         r0,        [r0 + 32]
+
+    call        ang16_mode_3_33
+
+    add         r2,        6
+    lea         r0,        [r7 + 8 * r1]
+
+    call        ang16_mode_3_33
+
+    add         r2,        26
+    lea         r0,        [r0 + 32]
+
+    call        ang16_mode_3_33
+    RET
+
+cglobal intra_pred_ang32_33, 3,7,13
+    xor         r6d,       r6d
+    inc         r6d
+    lea         r3,        [ang_table_avx2 + 16 * 32]
+    add         r1d,       r1d
+    lea         r4,        [r1 * 3]
+    lea         r5,        [r0 + 32]
+
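+    ; same four-call scheme as intra_pred_ang32_3, with r6d = 1 selecting the
+    ; helper's mode 33 path; r5 saves the start of the right 16 columns for
+    ; the last two calls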
+    call        ang16_mode_3_33
+
+    add         r2,        26
+
+    call        ang16_mode_3_33
+
+    add         r2,        6
+    mov         r0,        r5
+
+    call        ang16_mode_3_33
+
+    add         r2,        26
+
+    call        ang16_mode_3_33
+    RET
+;-------------------------------------------------------------------------------------------------------
+; end of avx2 code for intra_pred_ang32 mode 2 to 34
+;-------------------------------------------------------------------------------------------------------
+
 %macro MODE_2_34 0
     movu            m0, [r2 + 4]
     movu            m1, [r2 + 20]

