[x265] [PATCH 209 of 307] x86: AVX512 optimise scale1D128to64 code
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:33:27 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar<vignesh at multicorewareinc.com>
# Date 1511864533 -19800
# Tue Nov 28 15:52:13 2017 +0530
# Node ID af867976d51969b1770e6bcffd80e0389c88b561
# Parent 651bf679ed5c7ec6b68714e81d3c24664f08ec6a
x86: AVX512 optimise scale1D128to64 code
Previous performance : 16.10x
Performance after optimisation : 20.71x
diff -r 651bf679ed5c -r af867976d519 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Tue Nov 28 15:09:00 2017 +0530
+++ b/source/common/x86/pixel-util8.asm Tue Nov 28 15:52:13 2017 +0530
@@ -26,7 +26,7 @@
%include "x86inc.asm"
%include "x86util.asm"
-SECTION_RODATA 32
+SECTION_RODATA 64
var_shuf_avx512: db 0,-1, 1,-1, 2,-1, 3,-1, 4,-1, 5,-1, 6,-1, 7,-1
db 8,-1, 9,-1,10,-1,11,-1,12,-1,13,-1,14,-1,15,-1
@@ -4713,65 +4713,63 @@
%if HIGH_BIT_DEPTH == 0
INIT_ZMM avx512
-cglobal scale1D_128to64, 2, 2, 6
+cglobal scale1D_128to64, 2, 2, 7
pxor m4, m4
+ mova m6, [dequant_shuf1_avx512]
vbroadcasti32x8 m5, [pb_1]
;Top pixel
movu m0, [r1]
- movu m1, [r1 + 64]
- movu m2, [r1 + 128]
- movu m3, [r1 + 192]
-
- pmaddubsw m0, m0, m5
+ movu m1, [r1 + 1 * mmsize]
+ movu m2, [r1 + 2 * mmsize]
+ movu m3, [r1 + 3 * mmsize]
+
+ pmaddubsw m0, m5
pavgw m0, m4
- pmaddubsw m1, m1, m5
+ pmaddubsw m1, m5
pavgw m1, m4
packuswb m0, m1
- vpermq m0, m0, q3120
- vshufi64x2 m0, m0, q3120
+ vpermq m0, m6, m0
movu [r0], m0
;Left pixel
- pmaddubsw m2, m2, m5
+ pmaddubsw m2, m5
pavgw m2, m4
- pmaddubsw m3, m3, m5
+ pmaddubsw m3, m5
pavgw m3, m4
packuswb m2, m3
- vpermq m2, m2, q3120
- vshufi64x2 m2, m2, q3120
- movu [r0 + 64], m2
+ vpermq m2, m6, m2
+ movu [r0 + mmsize], m2
RET
INIT_ZMM avx512
-cglobal scale1D_128to64_aligned, 2, 2, 6
+cglobal scale1D_128to64_aligned, 2, 2, 7
pxor m4, m4
+ mova m6, [dequant_shuf1_avx512]
vbroadcasti32x8 m5, [pb_1]
;Top pixel
mova m0, [r1]
- mova m1, [r1 + 64]
- mova m2, [r1 + 128]
- mova m3, [r1 + 192]
-
- pmaddubsw m0, m0, m5
+ mova m1, [r1 + 1 * mmsize]
+ mova m2, [r1 + 2 * mmsize]
+ mova m3, [r1 + 3 * mmsize]
+
+ pmaddubsw m0, m5
pavgw m0, m4
- pmaddubsw m1, m1, m5
+ pmaddubsw m1, m5
pavgw m1, m4
packuswb m0, m1
- vpermq m0, m0, q3120
- vshufi64x2 m0, m0, q3120
+ vpermq m0, m6, m0
mova [r0], m0
;Left pixel
- pmaddubsw m2, m2, m5
+ pmaddubsw m2, m5
pavgw m2, m4
- pmaddubsw m3, m3, m5
+ pmaddubsw m3, m5
pavgw m3, m4
packuswb m2, m3
- vpermq m2, m2, q3120
- vshufi64x2 m2, m2, q3120
- mova [r0 + 64], m2
+ vpermq m2, m6, m2
+ mova [r0 + mmsize], m2
RET
%endif
More information about the x265-devel mailing list