[x265] [PATCH 071 of 307] x86: AVX512 addAvg_48x64 for high bit depth
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:31:09 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1501589225 -19800
# Tue Aug 01 17:37:05 2017 +0530
# Node ID aac415b7223acced7fc844c4a07225704b811df0
# Parent ad756cf6d35f0d1460c5a079bea8781ffd67b7c7
x86: AVX512 addAvg_48x64 for high bit depth
AVX2 performance: 10.61x
AVX512 performance: 13.18x
diff -r ad756cf6d35f -r aac415b7223a source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Aug 07 16:30:18 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Aug 01 17:37:05 2017 +0530
@@ -2276,6 +2276,7 @@
p.pu[LUMA_32x24].addAvg = PFX(addAvg_32x24_avx512);
p.pu[LUMA_32x32].addAvg = PFX(addAvg_32x32_avx512);
p.pu[LUMA_32x64].addAvg = PFX(addAvg_32x64_avx512);
+ p.pu[LUMA_48x64].addAvg = PFX(addAvg_48x64_avx512);
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg = PFX(addAvg_32x8_avx512);
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg = PFX(addAvg_32x16_avx512);
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg = PFX(addAvg_32x24_avx512);
diff -r ad756cf6d35f -r aac415b7223a source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm Mon Aug 07 16:30:18 2017 +0530
+++ b/source/common/x86/mc-a.asm Tue Aug 01 17:37:05 2017 +0530
@@ -1812,6 +1812,79 @@
movu [r2 + r8 + mmsize], m0
%endmacro
+%macro PROCESS_ADDAVG_48x4_HBD_AVX512 0 ; 4 rows of dst = clamp(scale(src0 + src1) + round); each row = one m-sized chunk + one ym-sized chunk (32 + 16 words with zmm); clobbers m0/m1, leaves r0-r8 unchanged
+ movu m0, [r0] ; row 0, first chunk: src0 words (r0 = src0)
+ movu m1, [r1] ; matching src1 words (r1 = src1)
+ paddw m0, m1 ; word-wise src0 + src1
+ pmulhrsw m0, m3 ; rounded high multiply by m3 (addAvg_48x64 loads pw_<ADDAVG_FACTOR> there)
+ paddw m0, m4 ; add m4 (addAvg_48x64 loads pw_<ADDAVG_ROUND> there)
+ pmaxsw m0, m2 ; lower clamp against m2 (zeroed by addAvg_48x64)
+ pminsw m0, m5 ; upper clamp against m5 (addAvg_48x64 loads pw_pixel_max there)
+ movu [r2], m0 ; store to dst (r2 = dst)
+
+ movu ym0, [r0 + mmsize] ; row 0, second chunk: ym-sized remainder that follows the first mmsize bytes
+ movu ym1, [r1 + mmsize]
+ paddw ym0, ym1 ; same add / scale / round / clamp sequence on the narrower register
+ pmulhrsw ym0, ym3
+ paddw ym0, ym4
+ pmaxsw ym0, ym2
+ pminsw ym0, ym5
+ movu [r2 + mmsize], ym0
+
+ movu m0, [r0 + r3] ; row 1, first chunk: r3 / r4 / r5 are the src0 / src1 / dst strides as set up by addAvg_48x64
+ movu m1, [r1 + r4]
+ paddw m0, m1
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m2
+ pminsw m0, m5
+ movu [r2 + r5], m0
+
+ movu ym0, [r0 + r3 + mmsize] ; row 1, second chunk
+ movu ym1, [r1 + r4 + mmsize]
+ paddw ym0, ym1
+ pmulhrsw ym0, ym3
+ paddw ym0, ym4
+ pmaxsw ym0, ym2
+ pminsw ym0, ym5
+ movu [r2 + r5 + mmsize], ym0
+
+ movu m0, [r0 + 2 * r3] ; row 2, first chunk
+ movu m1, [r1 + 2 * r4]
+ paddw m0, m1
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m2
+ pminsw m0, m5
+ movu [r2 + 2 * r5], m0
+
+ movu ym0, [r0 + 2 * r3 + mmsize] ; row 2, second chunk
+ movu ym1, [r1 + 2 * r4 + mmsize]
+ paddw ym0, ym1
+ pmulhrsw ym0, ym3
+ paddw ym0, ym4
+ pmaxsw ym0, ym2
+ pminsw ym0, ym5
+ movu [r2 + 2 * r5 + mmsize], ym0
+
+ movu m0, [r0 + r6] ; row 3, first chunk: addAvg_48x64 sets r6 / r7 / r8 to 3 * r3 / r4 / r5
+ movu m1, [r1 + r7]
+ paddw m0, m1
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m2
+ pminsw m0, m5
+ movu [r2 + r8], m0
+
+ movu ym0, [r0 + r6 + mmsize] ; row 3, second chunk
+ movu ym1, [r1 + r7 + mmsize]
+ paddw ym0, ym1
+ pmulhrsw ym0, ym3
+ paddw ym0, ym4
+ pmaxsw ym0, ym2
+ pminsw ym0, ym5
+ movu [r2 + r8 + mmsize], ym0
+%endmacro
;-----------------------------------------------------------------------------
;void addAvg (int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
;-----------------------------------------------------------------------------
@@ -1874,6 +1947,28 @@
ADDAVG_W64_HBD_AVX512 32
ADDAVG_W64_HBD_AVX512 48
ADDAVG_W64_HBD_AVX512 64
+
+INIT_ZMM avx512 ; m# names map to zmm registers (mmsize = 64 bytes) for the function below
+cglobal addAvg_48x64, 6,9,6 ; x86inc: 6 arguments, 9 GPRs (r0-r8), 6 vector registers (m0-m5)
+ vbroadcasti32x8 m4, [pw_ %+ ADDAVG_ROUND] ; m4 = offset added after scaling; 256-bit constant replicated into both zmm halves
+ vbroadcasti32x8 m5, [pw_pixel_max] ; m5 = upper clamp bound
+ vbroadcasti32x8 m3, [pw_ %+ ADDAVG_FACTOR] ; m3 = pmulhrsw multiplier
+ pxor m2, m2 ; m2 = 0, lower clamp bound
+ add r3, r3 ; double src0Stride -- presumably element count -> bytes for 2-byte samples
+ add r4, r4 ; double src1Stride likewise
+ add r5, r5 ; double dstStride likewise
+ lea r6, [3 * r3] ; r6 = 3 * src0 stride, offset of the 4th row in each group
+ lea r7, [3 * r4] ; r7 = 3 * src1 stride
+ lea r8, [3 * r5] ; r8 = 3 * dst stride
+
+%rep 15 ; 15 unrolled groups of 4 rows, each followed by a pointer advance
+ PROCESS_ADDAVG_48x4_HBD_AVX512
+ lea r2, [r2 + 4 * r5] ; dst += 4 rows
+ lea r0, [r0 + 4 * r3] ; src0 += 4 rows
+ lea r1, [r1 + 4 * r4] ; src1 += 4 rows
+%endrep
+ PROCESS_ADDAVG_48x4_HBD_AVX512
+ RET ; the invocation above is the 16th group (rows 60-63); no pointer advance needed after it
;-----------------------------------------------------------------------------
;addAvg avx512 high bit depth code end
;-----------------------------------------------------------------------------
More information about the x265-devel
mailing list