[x265] [PATCH 252 of 307] x86: AVX512 intra_pred_dc32 for high bit depth

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:34:10 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar<vignesh at multicorewareinc.com>
# Date 1512723573 -19800
#      Fri Dec 08 14:29:33 2017 +0530
# Node ID ddd64f4b2ff382d05e86708750b20332ed93f3c9
# Parent  fa954ed4a1e7ce2741f3cac14006f78c3199191b
x86: AVX512 intra_pred_dc32 for high bit depth

AVX2 performance   : 15.53x
AVX512 performance : 23.96x

diff -r fa954ed4a1e7 -r ddd64f4b2ff3 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Dec 08 12:12:43 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Fri Dec 08 14:29:33 2017 +0530
@@ -3053,6 +3053,7 @@
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = PFX(pixel_satd_32x64_avx512);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = PFX(pixel_satd_32x32_avx512);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_avx512);
+        p.cu[BLOCK_32x32].intra_pred[DC_IDX] = PFX(intra_pred_dc32_avx512);
 
     }
 #endif
diff -r fa954ed4a1e7 -r ddd64f4b2ff3 source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h	Fri Dec 08 12:12:43 2017 +0530
+++ b/source/common/x86/intrapred.h	Fri Dec 08 14:29:33 2017 +0530
@@ -76,7 +76,7 @@
 FUNCDEF_TU_S2(void, intra_pred_dc, sse2, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
 FUNCDEF_TU_S2(void, intra_pred_dc, sse4, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
 FUNCDEF_TU_S2(void, intra_pred_dc, avx2, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
-
+FUNCDEF_TU_S2(void, intra_pred_dc, avx512, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
 FUNCDEF_TU_S2(void, intra_pred_planar, sse2, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
 FUNCDEF_TU_S2(void, intra_pred_planar, sse4, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
 FUNCDEF_TU_S2(void, intra_pred_planar, avx2, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter);
diff -r fa954ed4a1e7 -r ddd64f4b2ff3 source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm	Fri Dec 08 12:12:43 2017 +0530
+++ b/source/common/x86/intrapred16.asm	Fri Dec 08 14:29:33 2017 +0530
@@ -688,6 +688,68 @@
     movu            [r0 + r2 * 1 +  0], m0
     movu            [r0 + r2 * 1 + mmsize], m0
     RET
+
+INIT_ZMM avx512
+cglobal intra_pred_dc32, 3,3,2                          ; r0 = dst, r1 = dstStride, r2 = srcPix; only m0/m1 used
+    add              r2, 2                              ; advance srcPix by 2 bytes (presumably one 16-bit pixel - confirm)
+    add             r1d, r1d                            ; r1 = dstStride * 2 (presumably pixels -> bytes - confirm)
+    movu             m0, [r2]                           ; first 32 words of the reference samples
+    movu             m1, [r2 + 2 * mmsize]              ; 32 words starting 128 bytes further on
+    paddw            m0, m1                             ; 32 word partial sums
+    vextracti32x8   ym1, m0, 1
+    paddw           ym0, ym1                            ; fold to 16 word partial sums
+    vextracti32x4   xm1, m0, 1
+    paddw           xm0, xm1                            ; fold to 8 word partial sums
+    pmaddwd         xm0, [pw_1]                         ; widen to 4 dwords (pw_1 presumably holds words of 1)
+    movhlps         xm1, xm0
+    paddd           xm0, xm1                            ; fold to 2 dword partial sums
+    phaddd          xm0, xm0                            ; total in dword 0; phaddd is VEX-only, so xmm0-15 required
+    paddd           xm0, [pd_32]                        ; sum = sum + 32
+    psrld           xm0, 6                              ; sum = sum / 64
+    vpbroadcastw     m0, xm0                            ; replicate low word (the DC value) to all 32 words
+
+    lea              r2, [r1 * 3]                       ; r2 = 3 * row stride in bytes
+    ; store DC 32x32
+    movu            [r0 + r1 * 0 +  0], m0
+    movu            [r0 + r1 * 1 +  0], m0
+    movu            [r0 + r1 * 2 +  0], m0
+    movu            [r0 + r2 * 1 +  0], m0
+    lea             r0, [r0 + r1 * 4]
+    movu            [r0 + r1 * 0 +  0], m0
+    movu            [r0 + r1 * 1 +  0], m0
+    movu            [r0 + r1 * 2 +  0], m0
+    movu            [r0 + r2 * 1 +  0], m0
+    lea             r0, [r0 + r1 * 4]
+    movu            [r0 + r1 * 0 +  0], m0
+    movu            [r0 + r1 * 1 +  0], m0
+    movu            [r0 + r1 * 2 +  0], m0
+    movu            [r0 + r2 * 1 +  0], m0
+    lea             r0, [r0 + r1 * 4]
+    movu            [r0 + r1 * 0 +  0], m0
+    movu            [r0 + r1 * 1 +  0], m0
+    movu            [r0 + r1 * 2 +  0], m0
+    movu            [r0 + r2 * 1 +  0], m0
+    lea             r0, [r0 + r1 * 4]
+    movu            [r0 + r1 * 0 +  0], m0
+    movu            [r0 + r1 * 1 +  0], m0
+    movu            [r0 + r1 * 2 +  0], m0
+    movu            [r0 + r2 * 1 +  0], m0
+    lea             r0, [r0 + r1 * 4]
+    movu            [r0 + r1 * 0 +  0], m0
+    movu            [r0 + r1 * 1 +  0], m0
+    movu            [r0 + r1 * 2 +  0], m0
+    movu            [r0 + r2 * 1 +  0], m0
+    lea             r0, [r0 + r1 * 4]
+    movu            [r0 + r1 * 0 +  0], m0
+    movu            [r0 + r1 * 1 +  0], m0
+    movu            [r0 + r1 * 2 +  0], m0
+    movu            [r0 + r2 * 1 +  0], m0
+    lea             r0, [r0 + r1 * 4]
+    movu            [r0 + r1 * 0 +  0], m0
+    movu            [r0 + r1 * 1 +  0], m0
+    movu            [r0 + r1 * 2 +  0], m0
+    movu            [r0 + r2 * 1 +  0], m0
+    RET
 %endif
 
 ;---------------------------------------------------------------------------------------


More information about the x265-devel mailing list