[x265] [PATCH 064 of 307] x86: AVX512 pixel_avg_weight_48x64 for high bit depth

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:31:02 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar
# Date 1501593743 -19800
#      Tue Aug 01 18:52:23 2017 +0530
# Node ID 200e6c43adc0c77e588a44d734e7d340e4753ccd
# Parent  fabc3475654f222b469c57b6cf8fd41b334d71be
x86: AVX512 pixel_avg_weight_48x64 for high bit depth

AVX2 performance:   11.84x
AVX512 performance: 17.79x

diff -r fabc3475654f -r 200e6c43adc0 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Aug 01 18:42:25 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Aug 01 18:52:23 2017 +0530
@@ -2305,6 +2305,7 @@
         p.pu[LUMA_64x32].pixelavg_pp = PFX(pixel_avg_64x32_avx512);
         p.pu[LUMA_64x48].pixelavg_pp = PFX(pixel_avg_64x48_avx512);
         p.pu[LUMA_64x64].pixelavg_pp = PFX(pixel_avg_64x64_avx512);
+        p.pu[LUMA_48x64].pixelavg_pp = PFX(pixel_avg_48x64_avx512);
 
     }
 }
diff -r fabc3475654f -r 200e6c43adc0 source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm	Tue Aug 01 18:42:25 2017 +0530
+++ b/source/common/x86/mc-a.asm	Tue Aug 01 18:52:23 2017 +0530
@@ -5754,6 +5754,84 @@
     movu    [r0 + r8 + mmsize], m2
 %endmacro
 
+%macro PROCESS_PIXELAVG_48x8_HBD_AVX512 0    ; 8 rows: [r0] = pavgw([r2], [r4]), one m-reg + one ym-reg per row; expects r6 = 3*r3, r7 = 3*r5, r8 = 3*r1
+    movu    m0, [r2]                         ; row 0, first source, leading full-width chunk
+    movu    m1, [r4]                         ; row 0, second source, same chunk
+    movu    m2, [r2 + r3]                    ; row 1, first source (r3 = its row stride)
+    movu    m3, [r4 + r5]                    ; row 1, second source (r5 = its row stride)
+    pavgw   m0, m1                           ; unsigned 16-bit rounded average, row 0
+    pavgw   m2, m3                           ; same for row 1
+    movu    [r0], m0                         ; store row 0 to destination
+    movu    [r0 + r1], m2                    ; store row 1 (r1 = destination row stride)
+
+    movu    ym0, [r2 + mmsize]               ; rows 0-1 tail: half-width chunk right after the full one
+    movu    ym1, [r4 + mmsize]
+    movu    ym2, [r2 + r3 + mmsize]
+    movu    ym3, [r4 + r5 + mmsize]
+    pavgw   ym0, ym1
+    pavgw   ym2, ym3
+    movu    [r0 + mmsize], ym0
+    movu    [r0 + r1 + mmsize], ym2
+
+    movu    m0, [r2 + r3 * 2]                ; row 2, leading chunk
+    movu    m1, [r4 + r5 * 2]
+    movu    m2, [r2 + r6]                    ; row 3: r6 is used as 3 * r3
+    movu    m3, [r4 + r7]                    ; row 3: r7 is used as 3 * r5
+    pavgw   m0, m1
+    pavgw   m2, m3
+    movu    [r0 + r1 * 2], m0
+    movu    [r0 + r8], m2                    ; row 3: r8 is used as 3 * r1
+
+    movu    ym0, [r2 + r3 * 2 + mmsize]      ; rows 2-3 tail chunk
+    movu    ym1, [r4 + r5 * 2 + mmsize]
+    movu    ym2, [r2 + r6 + mmsize]
+    movu    ym3, [r4 + r7 + mmsize]
+    pavgw   ym0, ym1
+    pavgw   ym2, ym3
+    movu    [r0 + r1 * 2 + mmsize], ym0
+    movu    [r0 + r8 + mmsize], ym2
+
+    lea     r0, [r0 + 4 * r1]                ; step all three pointers down 4 rows for the second half
+    lea     r2, [r2 + 4 * r3]
+    lea     r4, [r4 + 4 * r5]
+
+    movu    m0, [r2]                         ; rows 4-5, leading chunk (same pattern as rows 0-1)
+    movu    m1, [r4]
+    movu    m2, [r2 + r3]
+    movu    m3, [r4 + r5]
+    pavgw   m0, m1
+    pavgw   m2, m3
+    movu    [r0], m0
+    movu    [r0 + r1], m2
+
+    movu    ym0, [r2 + mmsize]               ; rows 4-5 tail chunk
+    movu    ym1, [r4 + mmsize]
+    movu    ym2, [r2 + r3 + mmsize]
+    movu    ym3, [r4 + r5 + mmsize]
+    pavgw   ym0, ym1
+    pavgw   ym2, ym3
+    movu    [r0 + mmsize], ym0
+    movu    [r0 + r1 + mmsize], ym2
+
+    movu    m0, [r2 + r3 * 2]                ; rows 6-7, leading chunk (same pattern as rows 2-3)
+    movu    m1, [r4 + r5 * 2]
+    movu    m2, [r2 + r6]
+    movu    m3, [r4 + r7]
+    pavgw   m0, m1
+    pavgw   m2, m3
+    movu    [r0 + r1 * 2], m0
+    movu    [r0 + r8], m2
+
+    movu    ym0, [r2 + r3 * 2 + mmsize]      ; rows 6-7 tail chunk
+    movu    ym1, [r4 + r5 * 2 + mmsize]
+    movu    ym2, [r2 + r6 + mmsize]
+    movu    ym3, [r4 + r7 + mmsize]
+    pavgw   ym0, ym1
+    pavgw   ym2, ym3
+    movu    [r0 + r1 * 2 + mmsize], ym0
+    movu    [r0 + r8 + mmsize], ym2
+%endmacro                                    ; on exit r0/r2/r4 sit 4 rows past entry (rows 4-7 were addressed by offset); caller adds the other 4
+
 %macro PIXEL_AVG_HBD_W32 1
 INIT_ZMM avx512
 cglobal pixel_avg_32x%1, 6,9,4
@@ -5804,6 +5882,24 @@
 PIXEL_AVG_HBD_W64 32
 PIXEL_AVG_HBD_W64 48
 PIXEL_AVG_HBD_W64 64
+
+INIT_ZMM avx512
+cglobal pixel_avg_48x64, 6,9,4               ; writes the pavgw average of the r2 and r4 blocks to r0, 64 rows; r1/r3/r5 are the respective row strides
+    add     r1d, r1d                         ; double each stride (presumably sample count -> bytes for 16-bit samples)
+    add     r3d, r3d                         ; NOTE(review): 32-bit adds zero-extend, so strides must be non-negative - confirm callers
+    add     r5d, r5d
+    lea     r6, [r3 * 3]                     ; 3-row offsets consumed by the macro for its 4th/8th row addressing
+    lea     r7, [r5 * 3]
+    lea     r8, [r1 * 3]
+
+%rep 7                                       ; 7 unrolled 8-row groups here + 1 final group below = 64 rows
+    PROCESS_PIXELAVG_48x8_HBD_AVX512
+    lea     r0, [r0 + 4 * r1]                ; macro leaves pointers 4 rows into the group; skip the remaining 4
+    lea     r2, [r2 + 4 * r3]
+    lea     r4, [r4 + 4 * r5]
+%endrep
+    PROCESS_PIXELAVG_48x8_HBD_AVX512         ; last group: no pointer advance needed afterwards
+    RET
 ;-----------------------------------------------------------------------------
 ;pixel_avg_pp avx512 high bit depth code end
 ;-----------------------------------------------------------------------------


More information about the x265-devel mailing list