[x265] [PATCH 063 of 307] x86: AVX512 pixel_avg_weight_W64 for high bit depth
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:31:01 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1501593145 -19800
# Tue Aug 01 18:42:25 2017 +0530
# Node ID fabc3475654f222b469c57b6cf8fd41b334d71be
# Parent ef7fd93923fa24a8f77a557817b03078356443e7
x86: AVX512 pixel_avg_weight_W64 for high bit depth
Size | AVX2 performance | AVX512 performance
----------------------------------------------
64x16 | 11.78x | 20.54x
64x32 | 12.08x | 23.01x
64x48 | 12.26x | 22.62x
64x64 | 12.35x | 22.67x
diff -r ef7fd93923fa -r fabc3475654f source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Aug 01 18:27:37 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Aug 01 18:42:25 2017 +0530
@@ -2301,6 +2301,10 @@
p.pu[LUMA_32x24].pixelavg_pp = PFX(pixel_avg_32x24_avx512);
p.pu[LUMA_32x32].pixelavg_pp = PFX(pixel_avg_32x32_avx512);
p.pu[LUMA_32x64].pixelavg_pp = PFX(pixel_avg_32x64_avx512);
+ p.pu[LUMA_64x16].pixelavg_pp = PFX(pixel_avg_64x16_avx512);
+ p.pu[LUMA_64x32].pixelavg_pp = PFX(pixel_avg_64x32_avx512);
+ p.pu[LUMA_64x48].pixelavg_pp = PFX(pixel_avg_64x48_avx512);
+ p.pu[LUMA_64x64].pixelavg_pp = PFX(pixel_avg_64x64_avx512);
}
}
diff -r ef7fd93923fa -r fabc3475654f source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm Tue Aug 01 18:27:37 2017 +0530
+++ b/source/common/x86/mc-a.asm Tue Aug 01 18:42:25 2017 +0530
@@ -5676,6 +5676,84 @@
movu [r0 + r8], m2
%endmacro
+%macro PROCESS_PIXELAVG_64x8_HBD_AVX512 0 ; 8 rows: [r0] = pavgw([r2], [r4]); needs r6 = 3*r3, r7 = 3*r5, r8 = 3*r1; clobbers m0-m3; leaves r0/r2/r4 advanced by 4 rows
+ movu m0, [r2] ; first source, row 0, first mmsize bytes
+ movu m1, [r4] ; second source, row 0
+ movu m2, [r2 + r3] ; first source, row 1 (r3 = its stride)
+ movu m3, [r4 + r5] ; second source, row 1 (r5 = its stride)
+ pavgw m0, m1 ; per-word unsigned average, rounded up: (a + b + 1) >> 1
+ pavgw m2, m3
+ movu [r0], m0 ; destination, row 0
+ movu [r0 + r1], m2 ; destination, row 1 (r1 = its stride)
+ ; rows 0-1 again, next mmsize bytes of each row
+ movu m0, [r2 + mmsize]
+ movu m1, [r4 + mmsize]
+ movu m2, [r2 + r3 + mmsize]
+ movu m3, [r4 + r5 + mmsize]
+ pavgw m0, m1
+ pavgw m2, m3
+ movu [r0 + mmsize], m0
+ movu [r0 + r1 + mmsize], m2
+ ; rows 2-3, first mmsize bytes (row 3 is addressed through the 3*stride registers)
+ movu m0, [r2 + r3 * 2]
+ movu m1, [r4 + r5 * 2]
+ movu m2, [r2 + r6] ; r6 is expected to hold 3 * r3
+ movu m3, [r4 + r7] ; r7 is expected to hold 3 * r5
+ pavgw m0, m1
+ pavgw m2, m3
+ movu [r0 + r1 * 2], m0
+ movu [r0 + r8], m2 ; r8 is expected to hold 3 * r1
+ ; rows 2-3, next mmsize bytes
+ movu m0, [r2 + r3 * 2 + mmsize]
+ movu m1, [r4 + r5 * 2 + mmsize]
+ movu m2, [r2 + r6 + mmsize]
+ movu m3, [r4 + r7 + mmsize]
+ pavgw m0, m1
+ pavgw m2, m3
+ movu [r0 + r1 * 2 + mmsize], m0
+ movu [r0 + r8 + mmsize], m2
+ ; step all three pointers down 4 rows so the same offsets now address rows 4-7
+ lea r0, [r0 + 4 * r1]
+ lea r2, [r2 + 4 * r3]
+ lea r4, [r4 + 4 * r5]
+ ; rows 4-5, first mmsize bytes
+ movu m0, [r2]
+ movu m1, [r4]
+ movu m2, [r2 + r3]
+ movu m3, [r4 + r5]
+ pavgw m0, m1
+ pavgw m2, m3
+ movu [r0], m0
+ movu [r0 + r1], m2
+ ; rows 4-5, next mmsize bytes
+ movu m0, [r2 + mmsize]
+ movu m1, [r4 + mmsize]
+ movu m2, [r2 + r3 + mmsize]
+ movu m3, [r4 + r5 + mmsize]
+ pavgw m0, m1
+ pavgw m2, m3
+ movu [r0 + mmsize], m0
+ movu [r0 + r1 + mmsize], m2
+ ; rows 6-7, first mmsize bytes
+ movu m0, [r2 + r3 * 2]
+ movu m1, [r4 + r5 * 2]
+ movu m2, [r2 + r6]
+ movu m3, [r4 + r7]
+ pavgw m0, m1
+ pavgw m2, m3
+ movu [r0 + r1 * 2], m0
+ movu [r0 + r8], m2
+ ; rows 6-7, next mmsize bytes; the pointers are NOT advanced again after this group
+ movu m0, [r2 + r3 * 2 + mmsize]
+ movu m1, [r4 + r5 * 2 + mmsize]
+ movu m2, [r2 + r6 + mmsize]
+ movu m3, [r4 + r7 + mmsize]
+ pavgw m0, m1
+ pavgw m2, m3
+ movu [r0 + r1 * 2 + mmsize], m0
+ movu [r0 + r8 + mmsize], m2
+%endmacro
+
%macro PIXEL_AVG_HBD_W32 1
INIT_ZMM avx512
cglobal pixel_avg_32x%1, 6,9,4
@@ -5701,6 +5779,31 @@
PIXEL_AVG_HBD_W32 24
PIXEL_AVG_HBD_W32 32
PIXEL_AVG_HBD_W32 64
+
+%macro PIXEL_AVG_HBD_W64 1 ; %1 = block height in rows (used as a multiple of 8); emits pixel_avg_64x%1
+INIT_ZMM avx512
+cglobal pixel_avg_64x%1, 6,9,4 ; x86inc convention: 6 arguments, 9 GPRs (r0-r8), 4 vector registers (m0-m3)
+ add r1d, r1d ; double the destination stride -- presumably pixel units -> bytes for 16-bit samples
+ add r3d, r3d ; double the first source stride
+ add r5d, r5d ; double the second source stride
+ lea r6, [r3 * 3] ; r6 = 3 * first source stride
+ lea r7, [r5 * 3] ; r7 = 3 * second source stride
+ lea r8, [r1 * 3] ; r8 = 3 * destination stride
+ ; fully unrolled: %1/8 groups of 8 rows; only the groups before the last one advance the pointers afterwards
+%rep %1/8 - 1
+ PROCESS_PIXELAVG_64x8_HBD_AVX512
+ lea r0, [r0 + 4 * r1] ; the macro leaves the pointers 4 rows into the group; step 4 more to reach the next group
+ lea r2, [r2 + 4 * r3]
+ lea r4, [r4 + 4 * r5]
+%endrep
+ PROCESS_PIXELAVG_64x8_HBD_AVX512
+ RET
+%endmacro
+ ; instantiate the kernel for block heights 16, 32, 48 and 64
+PIXEL_AVG_HBD_W64 16
+PIXEL_AVG_HBD_W64 32
+PIXEL_AVG_HBD_W64 48
+PIXEL_AVG_HBD_W64 64
;-----------------------------------------------------------------------------
;pixel_avg_pp avx512 high bit depth code end
;-----------------------------------------------------------------------------
More information about the x265-devel
mailing list