[x265] [PATCH 253 of 307] x86: AVX512 optimise chroma_hps_16xN for high bit depth
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:34:11 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar <vignesh at multicorewareinc.com>
# Date 1522981507 25200
# Thu Apr 05 19:25:07 2018 -0700
# Node ID 75d5a01d97daad790cecd35b40ff4b0e4cc34cac
# Parent ddd64f4b2ff382d05e86708750b20332ed93f3c9
x86: AVX512 optimise chroma_hps_16xN for high bit depth
diff -r ddd64f4b2ff3 -r 75d5a01d97da source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm Fri Dec 08 14:29:33 2017 +0530
+++ b/source/common/x86/ipfilter16.asm Thu Apr 05 19:25:07 2018 -0700
@@ -160,12 +160,15 @@
times 16 dw 58, -10
times 16 dw 4, -1
-const interp8_hpp_shuf1_load_avx512, times 2 db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
-
-const interp8_hpp_shuf2_load_avx512, times 2 db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13
-
-const interp8_hpp_shuf1_store_avx512, times 2 db 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15
-
+ALIGN 64
+const interp8_hpp_shuf1_load_avx512, times 4 db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
+
+ALIGN 64
+const interp8_hpp_shuf2_load_avx512, times 4 db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13
+
+ALIGN 64
+const interp8_hpp_shuf1_store_avx512, times 4 db 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15
+
SECTION .text
cextern pd_8
cextern pd_32
@@ -7135,32 +7138,23 @@
movu [r2], ym6
vextracti32x8 [r2 + r3], m6, 1
%endmacro
-
%macro PROCESS_IPFILTER_CHROMA_PS_16x1_AVX512 0
movu ym6, [r0]
- movu ym7, [r0 + 8]
-
- pshufb ym8, ym6, ym3
- pshufb ym6, ym2
- pmaddwd ym6, ym0
- pmaddwd ym8, ym1
- paddd ym6, ym8
- paddd ym6, ym4
- psrad ym6, INTERP_SHIFT_PS
-
- pshufb ym8, ym7, ym3
- pshufb ym7, ym2
- pmaddwd ym7, ym0
- pmaddwd ym8, ym1
- paddd ym7, ym8
- paddd ym7, ym4
- psrad ym7, INTERP_SHIFT_PS
-
- packssdw ym6, ym7
- pshufb ym6, ym5
- movu [r2], ym6
-%endmacro
-
+ vinserti32x8 m6, [r0 + 8], 1
+
+ pshufb m8, m6, m3
+ pshufb m6, m2
+ pmaddwd m6, m0
+ pmaddwd m8, m1
+ paddd m6, m8
+ paddd m6, m4
+ psrad m6, INTERP_SHIFT_PS
+
+ vextracti32x8 ym7, m6, 1
+ packssdw ym6, ym7
+ pshufb ym6, ym5
+ movu [r2], ym6
+%endmacro
%macro IPFILTER_CHROMA_PS_AVX512_16xN 1
%if ARCH_X86_64 == 1
INIT_ZMM avx512
@@ -7177,10 +7171,10 @@
vpbroadcastd m0, [tab_ChromaCoeff + r4 * 8]
vpbroadcastd m1, [tab_ChromaCoeff + r4 * 8 + 4]
%endif
- vbroadcasti32x8 m2, [interp8_hpp_shuf1_load_avx512]
- vbroadcasti32x8 m3, [interp8_hpp_shuf2_load_avx512]
+ mova m2, [interp8_hpp_shuf1_load_avx512]
+ mova m3, [interp8_hpp_shuf2_load_avx512]
vbroadcasti32x4 m4, [INTERP_OFFSET_PS]
- vbroadcasti32x8 m5,[interp8_hpp_shuf1_store_avx512]
+ mova m5, [interp8_hpp_shuf1_store_avx512]
mov r6d, %1
sub r0, 2
test r5d, r5d
More information about the x265-devel mailing list