[x265] [PATCH 016 of 307] x86: AVX512 addAvg_W64

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:30:14 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar
# Date 1499410957 -19800
#      Fri Jul 07 12:32:37 2017 +0530
# Node ID abdbd144d5b4cdfc7f84b540d713147d9b5143fc
# Parent  cc3a93869b28b7d5b3478a2524d07e7e630a0eca
x86: AVX512 addAvg_W64

 Size    | AVX2 performance | AVX512 performance
---------|------------------|--------------------
64x16    |      14.46x      |       22.25x
64x32    |      13.93x      |       23.96x
64x48    |      13.90x      |       24.27x
64x64    |      14.74x      |       24.31x

diff -r cc3a93869b28 -r abdbd144d5b4 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Jul 07 12:32:37 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Fri Jul 07 12:32:37 2017 +0530
@@ -3790,6 +3790,11 @@
         p.pu[LUMA_64x48].addAvg = PFX(addAvg_64x48_avx512);
         p.pu[LUMA_64x64].addAvg = PFX(addAvg_64x64_avx512);
 
+        p.pu[LUMA_64x16].addAvg = PFX(addAvg_64x16_avx512);
+        p.pu[LUMA_64x32].addAvg = PFX(addAvg_64x32_avx512);
+        p.pu[LUMA_64x48].addAvg = PFX(addAvg_64x48_avx512);
+        p.pu[LUMA_64x64].addAvg = PFX(addAvg_64x64_avx512);
+
     }
 #endif
 }
diff -r cc3a93869b28 -r abdbd144d5b4 source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm	Fri Jul 07 12:32:37 2017 +0530
+++ b/source/common/x86/mc-a.asm	Fri Jul 07 12:32:37 2017 +0530
@@ -2951,6 +2951,65 @@
 ADDAVG_W64_H2_AVX512 48
 ADDAVG_W64_H2_AVX512 64
 
+%macro ADDAVG_W64_H2_AVX512 1           ; %1 = row count; emits addAvg_64x%1 (width 64, word inputs -> byte dst)
+INIT_ZMM avx512
+cglobal addAvg_64x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride ; r0=src0 r1=src1 r2=dst r3/r4=src strides r5=dstStride
+    vbroadcasti32x8 m4, [pw_256]        ; pmulhrsw by 256 == rounding shift right by 7
+    vbroadcasti32x8 m5, [pw_128]        ; +128 offset added back after the shift
+    add             r3, r3              ; src0 stride: elements -> bytes (16-bit samples)
+    add             r4, r4              ; src1 stride: elements -> bytes
+    mov             r6d, %1/16          ; each .loop iteration covers 16 rows (8 unrolled x 2 rows)
+
+.loop:
+%rep 8
+    movu            m0, [r0]            ; row 0: src0 pixels 0-31 (words)
+    movu            m1, [r1]            ; row 0: src1 pixels 0-31
+    movu            m2, [r0 + 64]       ; row 0: src0 pixels 32-63
+    movu            m3, [r1 + 64]       ; row 0: src1 pixels 32-63
+    paddw           m0, m1              ; sum the two predictions
+    pmulhrsw        m0, m4              ; (sum + 64) >> 7, with rounding
+    paddw           m0, m5              ; re-apply +128 bias before unsigned pack
+    paddw           m2, m3              ; same for the upper 32 pixels
+    pmulhrsw        m2, m4
+    paddw           m2, m5
+
+    packuswb        m0, m2              ; saturate words -> bytes; interleaves per 128-bit lane
+    vpermq          m0, m0, 11011000b   ; reorder qwords within each 256-bit half ...
+    vshufi64x2      m0, m0, 11011000b   ; ... then 128-bit blocks, restoring linear byte order
+    movu            [r2], m0            ; store 64 dst pixels for row 0
+
+
+    movu            m0, [r0 + r3]       ; row 1: same sequence one source stride down
+    movu            m1, [r1 + r4]
+    movu            m2, [r0 + r3 + 64]
+    movu            m3, [r1 + r4 + 64]
+    paddw           m0, m1
+    pmulhrsw        m0, m4
+    paddw           m0, m5
+    paddw           m2, m3
+    pmulhrsw        m2, m4
+    paddw           m2, m5
+
+    packuswb        m0, m2
+    vpermq          m0, m0, 11011000b
+    vshufi64x2      m0, m0, 11011000b
+    movu            [r2 + r5], m0       ; store row 1
+
+    lea             r2, [r2 + 2 * r5]   ; advance dst two rows
+    lea             r0, [r0 + 2 * r3]   ; advance src0 two rows
+    lea             r1, [r1 + 2 * r4]   ; advance src1 two rows
+%endrep
+
+    dec             r6d
+    jnz             .loop
+    RET
+%endmacro
+
+ADDAVG_W64_H2_AVX512 16                 ; instantiate addAvg_64x16 .. addAvg_64x64
+ADDAVG_W64_H2_AVX512 32
+ADDAVG_W64_H2_AVX512 48
+ADDAVG_W64_H2_AVX512 64
+
 %macro ADDAVG_W48_H2_AVX2 1
 INIT_YMM avx2
 cglobal addAvg_48x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride


More information about the x265-devel mailing list