[x265] [PATCH 091 of 307] x86: AVX512 cleanup interp_4tap_horiz_pp_32xN
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:31:29 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1504242228 -19800
# Fri Sep 01 10:33:48 2017 +0530
# Node ID dbfcd0ee40e9bd4ee351eb064d8aa0819bd9b3fd
# Parent d9200885420957bccd4edea62bf87bbe8831bc62
x86: AVX512 cleanup interp_4tap_horiz_pp_32xN
diff -r d92008854209 -r dbfcd0ee40e9 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Sun Aug 13 15:12:25 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Fri Sep 01 10:33:48 2017 +0530
@@ -4011,22 +4011,29 @@
p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512);
p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx512);
+ //i444 chroma_hpp
p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_hpp = PFX(interp_4tap_horiz_pp_64x64_avx512);
p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hpp = PFX(interp_4tap_horiz_pp_64x32_avx512);
p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_hpp = PFX(interp_4tap_horiz_pp_64x48_avx512);
p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_hpp = PFX(interp_4tap_horiz_pp_64x16_avx512);
-
p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_avx512);
p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_hpp = PFX(interp_4tap_horiz_pp_32x64_avx512);
p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_hpp = PFX(interp_4tap_horiz_pp_32x24_avx512);
p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_hpp = PFX(interp_4tap_horiz_pp_32x8_avx512);
-
p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512);
+ //i422 chroma_hpp
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_avx512);
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512);
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_hpp = PFX(interp_4tap_horiz_pp_32x64_avx512);
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hpp = PFX(interp_4tap_horiz_pp_32x48_avx512);
+
+ //i420 chroma_hpp
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hpp = PFX(interp_4tap_horiz_pp_32x24_avx512);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_hpp = PFX(interp_4tap_horiz_pp_32x8_avx512);
+
p.weight_pp = PFX(weight_pp_avx512);
//i444 chroma_hps
diff -r d92008854209 -r dbfcd0ee40e9 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Sun Aug 13 15:12:25 2017 +0530
+++ b/source/common/x86/ipfilter8.asm Fri Sep 01 10:33:48 2017 +0530
@@ -150,8 +150,6 @@
const interp4_horiz_shuf_load2_avx512, times 2 db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
const interp4_horiz_shuf_load3_avx512, times 2 db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
-const interp4_horiz_shuf_store1_avx512, dd 0 ,8, 1, 9, 4, 12, 5, 13, 2, 10, 3, 11, 6, 14, 7, 15
-
ALIGN 64
const interp8_hps_shuf_avx512, dq 0, 4, 1, 5, 2, 6, 3, 7
@@ -9881,31 +9879,30 @@
; m9 - store shuffle order table
movu ym5, [r0]
- vinserti32x8 m5, [r0 + 4], 1
+ vinserti32x8 m5, [r0 + r1], 1
+ movu ym7, [r0 + 4]
+ vinserti32x8 m7, [r0 + r1 + 4], 1
pshufb m6, m5, m2
- pshufb m5, m5, m1
+ pshufb m5, m1
+ pshufb m8, m7, m2
+ pshufb m7, m1
+
pmaddubsw m5, m0
+ pmaddubsw m7, m0
+ pmaddwd m5, m3
+ pmaddwd m7, m3
+
pmaddubsw m6, m0
- pmaddwd m5, m3
+ pmaddubsw m8, m0
pmaddwd m6, m3
-
- movu ym7, [r0 + r1]
- vinserti32x8 m7, [r0 + r1 + 4], 1
-
- pshufb m8, m7, m2
- pshufb m7, m7, m1
- pmaddubsw m7, m0
- pmaddubsw m8, m0
- pmaddwd m7, m3
pmaddwd m8, m3
- packssdw m5, m6
- packssdw m7, m8
+ packssdw m5, m7
+ packssdw m6, m8
pmulhrsw m5, m4
- pmulhrsw m7, m4
- packuswb m5, m7
- vpermd m5, m9, m5
+ pmulhrsw m6, m4
+ packuswb m5, m6
movu [r2], ym5
vextracti32x8 [r2 + r3], m5, 1
%endmacro
@@ -9947,7 +9944,7 @@
%macro IPFILTER_CHROMA_PP_32xN_AVX512 1
INIT_ZMM avx512
-cglobal interp_4tap_horiz_pp_32x%1, 4,6,10
+cglobal interp_4tap_horiz_pp_32x%1, 4,6,9
mov r4d, r4m
%ifdef PIC
@@ -9959,7 +9956,6 @@
vbroadcasti32x8 m1, [interp4_horiz_shuf_load1_avx512]
vbroadcasti32x8 m2, [interp4_horiz_shuf_load2_avx512]
- movu m9, [interp4_horiz_shuf_store1_avx512]
vbroadcasti32x8 m3, [pw_1]
vbroadcasti32x8 m4, [pw_512]
dec r0
More information about the x265-devel mailing list