[x265] [PATCH 015 of 307] x86: AVX512 addAvg_W64
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:30:13 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1499410957 -19800
# Fri Jul 07 12:32:37 2017 +0530
# Node ID cc3a93869b28b7d5b3478a2524d07e7e630a0eca
# Parent 7283818f2dd7191c8258030c7424fa6b4ed5330f
x86: AVX512 addAvg_W64
Size | AVX2 performance | AVX512 performance
--------------------------------------------------
64x16 | 14.46x | 22.25x
64x32 | 13.93x | 23.96x
64x48 | 13.90x | 24.27x
64x64 | 14.74x | 24.31x
diff -r 7283818f2dd7 -r cc3a93869b28 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Jul 06 17:32:24 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Fri Jul 07 12:32:37 2017 +0530
@@ -3785,6 +3785,11 @@
p.scale1D_128to64 = PFX(scale1D_128to64_avx512);
+ p.pu[LUMA_64x16].addAvg = PFX(addAvg_64x16_avx512);
+ p.pu[LUMA_64x32].addAvg = PFX(addAvg_64x32_avx512);
+ p.pu[LUMA_64x48].addAvg = PFX(addAvg_64x48_avx512);
+ p.pu[LUMA_64x64].addAvg = PFX(addAvg_64x64_avx512);
+
}
#endif
}
diff -r 7283818f2dd7 -r cc3a93869b28 source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm Thu Jul 06 17:32:24 2017 +0530
+++ b/source/common/x86/mc-a.asm Fri Jul 07 12:32:37 2017 +0530
@@ -2892,6 +2892,65 @@
ADDAVG_W64_H2_AVX2 48
ADDAVG_W64_H2_AVX2 64
+;-----------------------------------------------------------------------------
+; ADDAVG_W64_H2_AVX512 height
+; Emits addAvg_64x<height> using ZMM registers.
+;
+; Register roles, as used by the body:
+;   r0, r1 = source pointers; each row is read as 2 x 64 bytes (64 words)
+;   r2     = destination pointer; each row is written as 64 bytes
+;   r3     = stride applied to r0, doubled on entry
+;   r4     = stride applied to r1, doubled on entry
+;            (presumably strides arrive in 16-bit elements -- confirm with caller)
+;   r5     = stride applied to r2, used unscaled
+;   r6d    = loop counter = height / 16 (one pass handles 8 row pairs)
+;   m4, m5 = pw_256 / pw_128 broadcast to all lanes
+;
+; Per row: dst = packuswb(pmulhrsw(src0 + src1, pw_256) + pw_128)
+;
+; The argument names below follow register order r0..r5; the body itself
+; addresses the registers by number.
+;-----------------------------------------------------------------------------
+%macro ADDAVG_W64_H2_AVX512 1
+INIT_ZMM avx512
+cglobal addAvg_64x%1, 6,7,6, src0, src1, dst, src0Stride, src1Stride, dstStride
+    vbroadcasti32x8 m4, [pw_256]
+    vbroadcasti32x8 m5, [pw_128]
+    add             r3, r3
+    add             r4, r4
+    mov             r6d, %1/16
+
+.loop:
+%rep 8
+    ; first row of the pair
+    movu            m0, [r0]
+    movu            m1, [r1]
+    movu            m2, [r0 + 64]
+    movu            m3, [r1 + 64]
+    paddw           m0, m1
+    pmulhrsw        m0, m4
+    paddw           m0, m5
+    paddw           m2, m3
+    pmulhrsw        m2, m4
+    paddw           m2, m5
+
+    ; packuswb works per 128-bit lane, so bytes from m0 and m2 end up
+    ; interleaved in 8-byte groups. vpermq reorders qwords 0,2,1,3 inside each
+    ; 256-bit half, then vshufi64x2 reorders the 128-bit lanes 0,2,1,3, which
+    ; restores linear order for the 64 output bytes.
+    packuswb        m0, m2
+    vpermq          m0, m0, 11011000b
+    vshufi64x2      m0, m0, 11011000b
+    movu            [r2], m0
+
+    ; second row of the pair (same sequence, one stride further)
+    movu            m0, [r0 + r3]
+    movu            m1, [r1 + r4]
+    movu            m2, [r0 + r3 + 64]
+    movu            m3, [r1 + r4 + 64]
+    paddw           m0, m1
+    pmulhrsw        m0, m4
+    paddw           m0, m5
+    paddw           m2, m3
+    pmulhrsw        m2, m4
+    paddw           m2, m5
+
+    packuswb        m0, m2
+    vpermq          m0, m0, 11011000b
+    vshufi64x2      m0, m0, 11011000b
+    movu            [r2 + r5], m0
+
+    ; step all three pointers past the two rows just processed
+    lea             r2, [r2 + 2 * r5]
+    lea             r0, [r0 + 2 * r3]
+    lea             r1, [r1 + 2 * r4]
+%endrep
+
+    dec             r6d
+    jnz             .loop
+    RET
+%endmacro
+
+; Expand the macro once per block height: addAvg_64x16, 64x32, 64x48, 64x64.
+ADDAVG_W64_H2_AVX512 16
+ADDAVG_W64_H2_AVX512 32
+ADDAVG_W64_H2_AVX512 48
+ADDAVG_W64_H2_AVX512 64
+
%macro ADDAVG_W48_H2_AVX2 1
INIT_YMM avx2
cglobal addAvg_48x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride
More information about the x265-devel
mailing list