[x265] [PATCH 094 of 307] x86: AVX512 addAvg_16xN for high bit depth
mythreyi at multicorewareinc.com
Sat Apr 7 04:31:32 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1503390093 -19800
# Tue Aug 22 13:51:33 2017 +0530
# Node ID 7868f1cb521d554dc77d768ec1f838e0b29824e4
# Parent 738f07186eb1d4bca84e9acdf70921ee9e2fee92
x86: AVX512 addAvg_16xN for high bit depth
Size   |  AVX2 performance  |  AVX512 performance
----------------------------------------------------
16x4   |       7.67x        |        9.12x
16x8   |       8.97x        |        9.14x
16x12  |       9.21x        |        9.89x
16x16  |       8.98x        |       10.64x
16x32  |       9.18x        |       12.66x
16x64  |       8.51x        |       12.72x
This patch also cleans up the other sizes.
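
For context, the scalar reference that these kernels accelerate computes, per output pixel, clip((src0 + src1 + offset) >> shift), where the offset folds in the rounding term and twice the internal bias of the 16-bit intermediate predictions. A minimal C++ sketch follows; the block-size and bit-depth parameters and the internal-precision constants are written out here for illustration and are assumptions, not code copied from the x265 sources.

#include <cstdint>
#include <algorithm>

typedef uint16_t pixel; // high-bit-depth build

static void addAvg_c(const int16_t* src0, const int16_t* src1, pixel* dst,
                     intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride,
                     int bx, int by, int bitDepth)
{
    const int internalPrec = 14;                      // assumed internal precision (IF_INTERNAL_PREC)
    const int internalOffs = 1 << (internalPrec - 1); // assumed internal offset (IF_INTERNAL_OFFS)
    const int shift  = internalPrec + 1 - bitDepth;
    const int offset = (1 << (shift - 1)) + 2 * internalOffs;
    const int maxVal = (1 << bitDepth) - 1;

    for (int y = 0; y < by; y++)
    {
        // Average the two biased predictions, round, shift down and clamp to the pixel range.
        for (int x = 0; x < bx; x++)
            dst[x] = (pixel)std::min(std::max((src0[x] + src1[x] + offset) >> shift, 0), maxVal);
        src0 += src0Stride;
        src1 += src1Stride;
        dst  += dstStride;
    }
}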
diff -r 738f07186eb1 -r 7868f1cb521d source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Aug 22 12:40:34 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Aug 22 13:51:33 2017 +0530
@@ -2276,15 +2276,33 @@
p.pu[LUMA_32x24].addAvg = PFX(addAvg_32x24_avx512);
p.pu[LUMA_32x32].addAvg = PFX(addAvg_32x32_avx512);
p.pu[LUMA_32x64].addAvg = PFX(addAvg_32x64_avx512);
+ p.pu[LUMA_16x4].addAvg = PFX(addAvg_16x4_avx512);
+ p.pu[LUMA_16x8].addAvg = PFX(addAvg_16x8_avx512);
+ p.pu[LUMA_16x12].addAvg = PFX(addAvg_16x12_avx512);
+ p.pu[LUMA_16x16].addAvg = PFX(addAvg_16x16_avx512);
+ p.pu[LUMA_16x32].addAvg = PFX(addAvg_16x32_avx512);
+ p.pu[LUMA_16x64].addAvg = PFX(addAvg_16x64_avx512);
p.pu[LUMA_48x64].addAvg = PFX(addAvg_48x64_avx512);
+
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg = PFX(addAvg_32x8_avx512);
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg = PFX(addAvg_32x16_avx512);
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg = PFX(addAvg_32x24_avx512);
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg = PFX(addAvg_32x32_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].addAvg = PFX(addAvg_16x4_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].addAvg = PFX(addAvg_16x8_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].addAvg = PFX(addAvg_16x12_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].addAvg = PFX(addAvg_16x16_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].addAvg = PFX(addAvg_16x32_avx512);
+
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg = PFX(addAvg_32x16_avx512);
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg = PFX(addAvg_32x32_avx512);
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg = PFX(addAvg_32x48_avx512);
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg = PFX(addAvg_32x64_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].addAvg = PFX(addAvg_16x32_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].addAvg = PFX(addAvg_16x16_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].addAvg = PFX(addAvg_16x64_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].addAvg = PFX(addAvg_16x24_avx512);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].addAvg = PFX(addAvg_16x8_avx512);
p.pu[LUMA_32x8].pixelavg_pp = PFX(pixel_avg_32x8_avx512);
p.pu[LUMA_32x16].pixelavg_pp = PFX(pixel_avg_32x16_avx512);
diff -r 738f07186eb1 -r 7868f1cb521d source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm Tue Aug 22 12:40:34 2017 +0530
+++ b/source/common/x86/mc-a.asm Tue Aug 22 13:51:33 2017 +0530
@@ -1657,47 +1657,37 @@
;-----------------------------------------------------------------------------
;addAvg avx512 high bit depth code start
;-----------------------------------------------------------------------------
-%macro PROCESS_ADDAVG_32x8_HBD_AVX512 0
- movu m0, [r0]
- movu m1, [r1]
- paddw m0, m1
- pmulhrsw m0, m3
- paddw m0, m4
- pmaxsw m0, m2
- pminsw m0, m5
- movu [r2], m0
-
- movu m0, [r0 + r3]
- movu m1, [r1 + r4]
- paddw m0, m1
- pmulhrsw m0, m3
- paddw m0, m4
- pmaxsw m0, m2
- pminsw m0, m5
- movu [r2 + r5], m0
-
- movu m0, [r0 + 2 * r3]
- movu m1, [r1 + 2 * r4]
- paddw m0, m1
- pmulhrsw m0, m3
- paddw m0, m4
- pmaxsw m0, m2
- pminsw m0, m5
- movu [r2 + 2 * r5], m0
-
- movu m0, [r0 + r6]
- movu m1, [r1 + r7]
- paddw m0, m1
- pmulhrsw m0, m3
- paddw m0, m4
- pmaxsw m0, m2
- pminsw m0, m5
- movu [r2 + r8], m0
-
- lea r2, [r2 + 4 * r5]
- lea r0, [r0 + 4 * r3]
- lea r1, [r1 + 4 * r4]
-
+%macro PROCESS_ADDAVG_16x4_HBD_AVX512 0
+ movu ym0, [r0]
+ vinserti32x8 m0, [r0 + r3], 1
+ movu ym1, [r1]
+ vinserti32x8 m1, [r1 + r4], 1
+
+ paddw m0, m1
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m2
+ pminsw m0, m5
+
+ movu [r2], ym0
+ vextracti32x8 [r2 + r5], m0, 1
+
+ movu ym0, [r0 + 2 * r3]
+ vinserti32x8 m0, [r0 + r6], 1
+ movu ym1, [r1 + 2 * r4]
+ vinserti32x8 m1, [r1 + r7], 1
+
+ paddw m0, m1
+ pmulhrsw m0, m3
+ paddw m0, m4
+ pmaxsw m0, m2
+ pminsw m0, m5
+
+ movu [r2 + 2 * r5], ym0
+ vextracti32x8 [r2 + r8], m0, 1
+%endmacro
+
+%macro PROCESS_ADDAVG_32x4_HBD_AVX512 0
movu m0, [r0]
movu m1, [r1]
paddw m0, m1
@@ -1885,6 +1875,52 @@
;-----------------------------------------------------------------------------
;void addAvg (int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
;-----------------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal addAvg_16x4, 6,9,6
+ vbroadcasti32x8 m4, [pw_ %+ ADDAVG_ROUND]
+ vbroadcasti32x8 m5, [pw_pixel_max]
+ vbroadcasti32x8 m3, [pw_ %+ ADDAVG_FACTOR]
+ pxor m2, m2
+ add r3, r3
+ add r4, r4
+ add r5, r5
+ lea r6, [3 * r3]
+ lea r7, [3 * r4]
+ lea r8, [3 * r5]
+ PROCESS_ADDAVG_16x4_HBD_AVX512
+ RET
+
+%macro ADDAVG_W16_HBD_AVX512 1
+INIT_ZMM avx512
+cglobal addAvg_16x%1, 6,9,6
+ vbroadcasti32x8 m4, [pw_ %+ ADDAVG_ROUND]
+ vbroadcasti32x8 m5, [pw_pixel_max]
+ vbroadcasti32x8 m3, [pw_ %+ ADDAVG_FACTOR]
+ pxor m2, m2
+ add r3, r3
+ add r4, r4
+ add r5, r5
+ lea r6, [3 * r3]
+ lea r7, [3 * r4]
+ lea r8, [3 * r5]
+
+%rep %1/4 - 1
+ PROCESS_ADDAVG_16x4_HBD_AVX512
+ lea r2, [r2 + 4 * r5]
+ lea r0, [r0 + 4 * r3]
+ lea r1, [r1 + 4 * r4]
+%endrep
+ PROCESS_ADDAVG_16x4_HBD_AVX512
+ RET
+%endmacro
+
+ADDAVG_W16_HBD_AVX512 8
+ADDAVG_W16_HBD_AVX512 12
+ADDAVG_W16_HBD_AVX512 16
+ADDAVG_W16_HBD_AVX512 24
+ADDAVG_W16_HBD_AVX512 32
+ADDAVG_W16_HBD_AVX512 64
+
%macro ADDAVG_W32_HBD_AVX512 1
INIT_ZMM avx512
cglobal addAvg_32x%1, 6,9,6
@@ -1899,13 +1935,13 @@
lea r7, [3 * r4]
lea r8, [3 * r5]
-%rep %1/8 - 1
- PROCESS_ADDAVG_32x8_HBD_AVX512
+%rep %1/4 - 1
+ PROCESS_ADDAVG_32x4_HBD_AVX512
lea r2, [r2 + 4 * r5]
lea r0, [r0 + 4 * r3]
lea r1, [r1 + 4 * r4]
%endrep
- PROCESS_ADDAVG_32x8_HBD_AVX512
+ PROCESS_ADDAVG_32x4_HBD_AVX512
RET
%endmacro
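
The new PROCESS_ADDAVG_16x4_HBD_AVX512 body is what buys the extra speed on 16xN: two 16-pixel rows (32 bytes of 16-bit data each) are packed into one ZMM register with vinserti32x8, so a single paddw/pmulhrsw/paddw/pmaxsw/pminsw chain covers two rows, and the halves are split back out with vextracti32x8 on store. A hedged C++ intrinsics sketch of one such row pair is below; the helper name, the element-unit strides and the pre-broadcast constant arguments (round, factor, pixel max, mirroring m4/m3/m5 in the prologue) are illustrative assumptions, not code from the patch.

#include <immintrin.h>
#include <cstdint>

static inline void addAvg_two_rows(const int16_t* src0, const int16_t* src1, uint16_t* dst,
                                   intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride,
                                   __m512i round, __m512i factor, __m512i pixMax)
{
    // Pack row 0 (low 256 bits) and row 1 (high 256 bits) of each source into one ZMM.
    __m512i s0 = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)src0));
    s0 = _mm512_inserti64x4(s0, _mm256_loadu_si256((const __m256i*)(src0 + src0Stride)), 1);
    __m512i s1 = _mm512_castsi256_si512(_mm256_loadu_si256((const __m256i*)src1));
    s1 = _mm512_inserti64x4(s1, _mm256_loadu_si256((const __m256i*)(src1 + src1Stride)), 1);

    // Same arithmetic chain as the macro: add, scale with rounding, add bias, clamp to [0, pixel max].
    __m512i sum = _mm512_add_epi16(s0, s1);
    sum = _mm512_mulhrs_epi16(sum, factor);
    sum = _mm512_add_epi16(sum, round);
    sum = _mm512_max_epi16(sum, _mm512_setzero_si512());
    sum = _mm512_min_epi16(sum, pixMax);

    // Unpack the two processed rows back to their destination lines.
    _mm256_storeu_si256((__m256i*)dst, _mm512_castsi512_si256(sum));
    _mm256_storeu_si256((__m256i*)(dst + dstStride), _mm512_extracti64x4_epi64(sum, 1));
}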