[x265] [PATCH 265 of 307] x86: AVX512 intra_pred_ang32x32 mode 2 and 34 for high bit depth

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:34:23 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar<vignesh at multicorewareinc.com>
# Date 1513084512 -19800
#      Tue Dec 12 18:45:12 2017 +0530
# Node ID 3d780e0d48827cd1cc4e664c3bf96dce6f515810
# Parent  0b38182facb66543adfbf9664cda085f7e641327
x86: AVX512 intra_pred_ang32x32 mode 2 and 34 for high bit depth

Primitive           | AVX2 performance | AVX512 performance
-------------------------------------------------------------
intra_ang_32x32[ 2] |     24.45x       |      39.65x
intra_ang_32x32[34] |      3.10x       |       5.26x

diff -r 0b38182facb6 -r 3d780e0d4882 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Dec 14 11:36:42 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Dec 12 18:45:12 2017 +0530
@@ -3059,8 +3059,10 @@
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = PFX(pixel_satd_32x64_avx512);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = PFX(pixel_satd_32x32_avx512);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_avx512);
+
         p.cu[BLOCK_32x32].intra_pred[DC_IDX] = PFX(intra_pred_dc32_avx512);
-
+        p.cu[BLOCK_32x32].intra_pred[2]      = PFX(intra_pred_ang32_2_avx512);
+        p.cu[BLOCK_32x32].intra_pred[34]     = PFX(intra_pred_ang32_2_avx512);
         p.pu[LUMA_64x64].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_64x64>;
         p.pu[LUMA_64x48].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_64x48>;
         p.pu[LUMA_64x32].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_64x32>;
diff -r 0b38182facb6 -r 3d780e0d4882 source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h	Thu Dec 14 11:36:42 2017 +0530
+++ b/source/common/x86/intrapred.h	Tue Dec 12 18:45:12 2017 +0530
@@ -85,7 +85,7 @@
 DECL_ALL(ssse3);
 DECL_ALL(sse4);
 DECL_ALL(avx2);
-
+DECL_ALL(avx512);
 #undef DECL_ALL
 #undef DECL_ANGS
 #undef DECL_ANG
diff -r 0b38182facb6 -r 3d780e0d4882 source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm	Thu Dec 14 11:36:42 2017 +0530
+++ b/source/common/x86/intrapred16.asm	Tue Dec 12 18:45:12 2017 +0530
@@ -18296,7 +18296,102 @@
 ;-------------------------------------------------------------------------------------------------------
 ; end of avx2 code for intra_pred_ang32 mode 2 to 34
 ;-------------------------------------------------------------------------------------------------------
-
+;-------------------------------------------------------------------------------------------------------
+; avx512 code for intra_pred_ang32 mode 2 to 34 start
+;-------------------------------------------------------------------------------------------------------
+INIT_ZMM avx512                                                 ; m0..m2 below are 512-bit ZMM registers (64 bytes each)
+cglobal intra_pred_ang32_2, 3,5,3                               ; shared entry for modes 2 and 34: stores 32 rows of 64 bytes at r0, row k = 64 bytes at [r2 + 4 + 2*k]
+    lea         r4,                 [r2]                        ; r4 = unmodified source pointer (plain copy of r2)
+    add         r2,                 128                         ; default source = base + 128 bytes; NOTE(review): presumably the left-neighbour half of the reference array - confirm
+    cmp         r3m,                byte 34                     ; 4th argument compared with 34; NOTE(review): presumably dirMode - confirm against the prototype
+    cmove       r2,                 r4                          ; when it equals 34, read from the unshifted base instead
+    add         r1d,                 r1d                        ; r1 *= 2; NOTE(review): presumably converts a stride in 16-bit pixels to bytes - confirm
+    lea         r3,                 [r1 * 3]                    ; r3 = 3 * stride, addresses the 4th row of each 4-row group
+    movu        m0,                 [r2 + 4]                    ; m0 = 64 source bytes starting at offset 4
+    movu        m1,                 [r2 + 20]                   ; m1 = same data advanced by 16 bytes (one 128-bit lane)
+
+    movu        [r0],               m0                          ; row 0
+    palignr     m2,                 m1, m0, 2                   ; palignr shifts per 128-bit lane; since m1 is m0 one lane ahead, m2 = 64 bytes at [r2 + 4 + 2]
+    movu        [r0 + r1],          m2                          ; row 1
+    palignr     m2,                 m1, m0, 4                   ; each further row advances the source window by 2 bytes
+    movu        [r0 + r1 * 2],      m2                          ; row 2
+    palignr     m2,                 m1, m0, 6
+    movu        [r0 + r3],          m2                          ; row 3
+
+    lea         r0,                 [r0 + r1 * 4]               ; advance destination by 4 rows
+    palignr     m2,                 m1, m0, 8
+    movu        [r0],               m2                          ; row 4
+    palignr     m2,                 m1, m0, 10
+    movu        [r0 + r1],          m2                          ; row 5
+    palignr     m2,                 m1, m0, 12
+    movu        [r0 + r1 * 2],      m2                          ; row 6
+    palignr     m2,                 m1, m0, 14
+    movu        [r0 + r3],          m2                          ; row 7
+
+    movu        m0,                 [r2 + 36]                   ; m0 now holds the data one lane past m1; roles of m0/m1 swap for rows 8-15
+    lea         r0,                 [r0 + r1 * 4]
+    movu        [r0],               m1                          ; row 8 = 64 bytes at [r2 + 20], stored without shifting
+    palignr     m2,                 m0, m1, 2
+    movu        [r0 + r1],          m2                          ; row 9
+    palignr     m2,                 m0, m1, 4
+    movu        [r0 + r1 * 2],      m2                          ; row 10
+    palignr     m2,                 m0, m1, 6
+    movu        [r0 + r3],          m2                          ; row 11
+
+    lea         r0,                 [r0 + r1 * 4]
+    palignr     m2,                 m0, m1, 8
+    movu        [r0],               m2                          ; row 12
+    palignr     m2,                 m0, m1, 10
+    movu        [r0 + r1],          m2                          ; row 13
+    palignr     m2,                 m0, m1, 12
+    movu        [r0 + r1 * 2],      m2                          ; row 14
+    palignr     m2,                 m0, m1, 14
+    movu        [r0 + r3],          m2                          ; row 15
+
+    lea         r0,                 [r0 + r1 * 4]
+    movu        m1,                 [r2 + 52]                   ; m1 = data one lane past m0; roles swap back for rows 16-23
+
+    movu        [r0],               m0                          ; row 16 = 64 bytes at [r2 + 36]
+    palignr     m2,                 m1, m0, 2
+    movu        [r0 + r1],          m2                          ; row 17
+    palignr     m2,                 m1, m0, 4
+    movu        [r0 + r1 * 2],      m2                          ; row 18
+    palignr     m2,                 m1, m0, 6
+    movu        [r0 + r3],          m2                          ; row 19
+
+    lea         r0,                 [r0 + r1 * 4]
+    palignr     m2,                 m1, m0, 8
+    movu        [r0],               m2                          ; row 20
+    palignr     m2,                 m1, m0, 10
+    movu        [r0 + r1],          m2                          ; row 21
+    palignr     m2,                 m1, m0, 12
+    movu        [r0 + r1 * 2],      m2                          ; row 22
+    palignr     m2,                 m1, m0, 14
+    movu        [r0 + r3],          m2                          ; row 23
+
+    movu        m0,                 [r2 + 68]                   ; last load: 64 bytes at [r2 + 68], one lane past m1, feeds rows 25-31
+    lea         r0,                 [r0 + r1 * 4]
+    movu        [r0],               m1                          ; row 24 = 64 bytes at [r2 + 52]
+    palignr     m2,                 m0, m1, 2
+    movu        [r0 + r1],          m2                          ; row 25
+    palignr     m2,                 m0, m1, 4
+    movu        [r0 + r1 * 2],      m2                          ; row 26
+    palignr     m2,                 m0, m1, 6
+    movu        [r0 + r3],          m2                          ; row 27
+
+    lea         r0,                 [r0 + r1 * 4]
+    palignr     m2,                 m0, m1, 8
+    movu        [r0],               m2                          ; row 28
+    palignr     m2,                 m0, m1, 10
+    movu        [r0 + r1],          m2                          ; row 29
+    palignr     m2,                 m0, m1, 12
+    movu        [r0 + r1 * 2],      m2                          ; row 30
+    palignr     m2,                 m0, m1, 14
+    movu        [r0 + r3],          m2                          ; row 31 (final row)
+    RET
+;-------------------------------------------------------------------------------------------------------
+; avx512 code for intra_pred_ang32 mode 2 to 34 end
+;-------------------------------------------------------------------------------------------------------
 %macro MODE_2_34 0
     movu            m0, [r2 + 4]
     movu            m1, [r2 + 20]


More information about the x265-devel mailing list