[x265] [PATCH 071 of 307] x86: AVX512 addAvg_48x64 for high bit depth

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:31:09 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar
# Date 1501589225 -19800
#      Tue Aug 01 17:37:05 2017 +0530
# Node ID aac415b7223acced7fc844c4a07225704b811df0
# Parent  ad756cf6d35f0d1460c5a079bea8781ffd67b7c7
x86: AVX512 addAvg_48x64 for high bit depth

AVX2 performance:   10.61x
AVX512 performance: 13.18x

diff -r ad756cf6d35f -r aac415b7223a source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Aug 07 16:30:18 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Aug 01 17:37:05 2017 +0530
@@ -2276,6 +2276,7 @@
         p.pu[LUMA_32x24].addAvg = PFX(addAvg_32x24_avx512);
         p.pu[LUMA_32x32].addAvg = PFX(addAvg_32x32_avx512);
         p.pu[LUMA_32x64].addAvg = PFX(addAvg_32x64_avx512);
+        p.pu[LUMA_48x64].addAvg = PFX(addAvg_48x64_avx512);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg = PFX(addAvg_32x8_avx512);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg = PFX(addAvg_32x16_avx512);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg = PFX(addAvg_32x24_avx512);
diff -r ad756cf6d35f -r aac415b7223a source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm	Mon Aug 07 16:30:18 2017 +0530
+++ b/source/common/x86/mc-a.asm	Tue Aug 01 17:37:05 2017 +0530
@@ -1812,6 +1812,79 @@
     movu        [r2 + r8 + mmsize],       m0
 %endmacro
 
+%macro PROCESS_ADDAVG_48x4_HBD_AVX512 0 ; avg 4 rows of 48 16-bit pixels: per row one zmm (32 px) + one ymm (16 px); m3=factor, m4=round, m2=0, m5=pixel_max (set up by caller)
+    movu        m0,              [r0] ; row 0, pixels 0-31 of src0
+    movu        m1,              [r1] ; row 0, pixels 0-31 of src1
+    paddw       m0,              m1 ; src0 + src1
+    pmulhrsw    m0,              m3 ; high-multiply with rounding by ADDAVG_FACTOR (scales sum down)
+    paddw       m0,              m4 ; add ADDAVG_ROUND bias
+    pmaxsw      m0,              m2 ; clamp low end to 0
+    pminsw      m0,              m5 ; clamp high end to pixel_max
+    movu        [r2],            m0 ; store row 0, pixels 0-31
+
+    movu        ym0,              [r0 + mmsize] ; row 0, pixels 32-47 (mmsize = 64 bytes = 32 16-bit samples)
+    movu        ym1,              [r1 + mmsize]
+    paddw       ym0,              ym1
+    pmulhrsw    ym0,              ym3
+    paddw       ym0,              ym4
+    pmaxsw      ym0,              ym2
+    pminsw      ym0,              ym5
+    movu        [r2 + mmsize],    ym0
+
+    movu        m0,              [r0 + r3] ; row 1, pixels 0-31 (r3/r4/r5 = byte strides)
+    movu        m1,              [r1 + r4]
+    paddw       m0,              m1
+    pmulhrsw    m0,              m3
+    paddw       m0,              m4
+    pmaxsw      m0,              m2
+    pminsw      m0,              m5
+    movu        [r2 + r5],       m0
+
+    movu        ym0,              [r0 + r3 + mmsize] ; row 1, pixels 32-47
+    movu        ym1,              [r1 + r4 + mmsize]
+    paddw       ym0,              ym1
+    pmulhrsw    ym0,              ym3
+    paddw       ym0,              ym4
+    pmaxsw      ym0,              ym2
+    pminsw      ym0,              ym5
+    movu        [r2 + r5 + mmsize],       ym0
+
+    movu        m0,              [r0 + 2 * r3] ; row 2, pixels 0-31
+    movu        m1,              [r1 + 2 * r4]
+    paddw       m0,              m1
+    pmulhrsw    m0,              m3
+    paddw       m0,              m4
+    pmaxsw      m0,              m2
+    pminsw      m0,              m5
+    movu        [r2 + 2 * r5],   m0
+
+    movu        ym0,              [r0 + 2 * r3 + mmsize] ; row 2, pixels 32-47
+    movu        ym1,              [r1 + 2 * r4 + mmsize]
+    paddw       ym0,              ym1
+    pmulhrsw    ym0,              ym3
+    paddw       ym0,              ym4
+    pmaxsw      ym0,              ym2
+    pminsw      ym0,              ym5
+    movu        [r2 + 2 * r5 + mmsize],   ym0
+
+    movu        m0,              [r0 + r6] ; row 3, pixels 0-31 (r6/r7/r8 = 3 * stride, precomputed by caller)
+    movu        m1,              [r1 + r7]
+    paddw       m0,              m1
+    pmulhrsw    m0,              m3
+    paddw       m0,              m4
+    pmaxsw      m0,              m2
+    pminsw      m0,              m5
+    movu        [r2 + r8],       m0
+
+    movu        ym0,              [r0 + r6 + mmsize] ; row 3, pixels 32-47
+    movu        ym1,              [r1 + r7 + mmsize]
+    paddw       ym0,              ym1
+    pmulhrsw    ym0,              ym3
+    paddw       ym0,              ym4
+    pmaxsw      ym0,              ym2
+    pminsw      ym0,              ym5
+    movu        [r2 + r8 + mmsize],       ym0
+%endmacro
 ;-----------------------------------------------------------------------------
 ;void addAvg (int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
 ;-----------------------------------------------------------------------------
@@ -1874,6 +1947,28 @@
 ADDAVG_W64_HBD_AVX512 32
 ADDAVG_W64_HBD_AVX512 48
 ADDAVG_W64_HBD_AVX512 64
+
+INIT_ZMM avx512
+cglobal addAvg_48x64, 6,9,6 ; args: r0=src0, r1=src1, r2=dst, r3=src0Stride, r4=src1Stride, r5=dstStride (see prototype above); 9 GPRs, 6 vector regs
+    vbroadcasti32x8        m4,              [pw_ %+ ADDAVG_ROUND] ; m4 = rounding bias, broadcast to all lanes
+    vbroadcasti32x8        m5,              [pw_pixel_max] ; m5 = clamp ceiling
+    vbroadcasti32x8        m3,              [pw_ %+ ADDAVG_FACTOR] ; m3 = pmulhrsw scale factor
+    pxor        m2,              m2 ; m2 = 0 (clamp floor)
+    add         r3,              r3 ; element strides -> byte strides (16-bit samples)
+    add         r4,              r4
+    add         r5,              r5
+    lea         r6,              [3 * r3] ; 3*stride for row 3 of each 4-row group
+    lea         r7,              [3 * r4]
+    lea         r8,              [3 * r5]
+
+%rep 15 ; 15 groups of 4 rows, plus the final group below = 64 rows
+    PROCESS_ADDAVG_48x4_HBD_AVX512
+    lea         r2,              [r2 + 4 * r5] ; advance all three pointers by 4 rows
+    lea         r0,              [r0 + 4 * r3]
+    lea         r1,              [r1 + 4 * r4]
+%endrep
+    PROCESS_ADDAVG_48x4_HBD_AVX512 ; last 4 rows: no pointer advance needed
+    RET
 ;-----------------------------------------------------------------------------
 ;addAvg avx512 high bit depth code end
 ;-----------------------------------------------------------------------------


More information about the x265-devel mailing list