[x265] [PATCH 235 of 307] x86: AVX512 interp_4tap_vert_ps_64xN
mythreyi at multicorewareinc.com
Sat Apr 7 04:33:53 CEST 2018
# HG changeset patch
# User Jayashri Murugan <jayashri at multicorewareinc.com>
# Date 1511781308 -19800
# Mon Nov 27 16:45:08 2017 +0530
# Node ID 1cd123613bbb28fd00da36a3cfe3765f8e07d00e
# Parent 283aa4d77cef296699167c041763d7115e7a88aa
x86: AVX512 interp_4tap_vert_ps_64xN
Size  | AVX2 performance | AVX512 performance
------|------------------|--------------------
64x16 |      39.17x      |       64.63x
64x32 |      40.14x      |       64.98x
64x48 |      39.97x      |       64.52x
64x64 |      40.32x      |       64.93x
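
For reference, a minimal scalar sketch of what the _ps kernels compute (a
hedged reconstruction, not the project's C reference code verbatim): unlike
the _pp variant, which rounds, shifts and clips to 8-bit output (pmulhrsw
with pw_512 in the macro below), the _ps variant keeps the 16-bit
intermediate and subtracts an 8192 offset (pw_2000 below) before writing to
an int16_t destination, which is why the destination stride is doubled.

    #include <cstdint>

    // Hypothetical reference routine for the 4-tap vertical "ps" chroma
    // filter on 8-bit input; coeff[] stands for the 4-entry coefficient
    // set that coeffIdx selects in the real primitive.
    static void interp_4tap_vert_ps_ref(const uint8_t* src, intptr_t srcStride,
                                        int16_t* dst, intptr_t dstStride,
                                        const int8_t coeff[4],
                                        int width, int height)
    {
        src -= srcStride;                        // filter reads one row above
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
            {
                int sum = src[x]                 * coeff[0]
                        + src[x + srcStride]     * coeff[1]
                        + src[x + 2 * srcStride] * coeff[2]
                        + src[x + 3 * srcStride] * coeff[3];
                dst[x] = (int16_t)(sum - 8192);  // keep 16-bit intermediate
            }
            src += srcStride;
            dst += dstStride;
        }
    }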
diff -r 283aa4d77cef -r 1cd123613bbb source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Dec 04 17:38:29 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Nov 27 16:45:08 2017 +0530
@@ -5087,6 +5087,11 @@
p.quant = PFX(quant_avx512);
p.nquant = PFX(nquant_avx512);
p.denoiseDct = PFX(denoise_dct_avx512);
+
+ p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vps = PFX(interp_4tap_vert_ps_64x64_avx512);
+ p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vps = PFX(interp_4tap_vert_ps_64x48_avx512);
+ p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vps = PFX(interp_4tap_vert_ps_64x32_avx512);
+ p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vps = PFX(interp_4tap_vert_ps_64x16_avx512);
}
#endif
}
diff -r 283aa4d77cef -r 1cd123613bbb source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Mon Dec 04 17:38:29 2017 +0530
+++ b/source/common/x86/ipfilter8.asm Mon Nov 27 16:45:08 2017 +0530
@@ -243,10 +243,13 @@
const interp4_horiz_shuf_load3_avx512, times 2 db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
ALIGN 64
+interp4_vps_store1_avx512: dq 0, 1, 8, 9, 2, 3, 10, 11
+interp4_vps_store2_avx512: dq 4, 5, 12, 13, 6, 7, 14, 15
const interp4_hps_shuf_avx512, dq 0, 4, 1, 5, 2, 6, 3, 7
const interp4_hps_store_16xN_avx512, dq 0, 2, 1, 3, 4, 6, 5, 7
const interp8_hps_store_avx512, dq 0, 1, 4, 5, 2, 3, 6, 7
const interp8_vsp_store_avx512, dq 0, 2, 4, 6, 1, 3, 5, 7
+
SECTION .text
cextern pb_128
cextern pw_1
@@ -10864,7 +10867,7 @@
%endif
;-------------------------------------------------------------------------------------------------------------
-;avx512 chroma_vpp code start
+;avx512 chroma_vpp and chroma_vps code start
;-------------------------------------------------------------------------------------------------------------
%macro PROCESS_CHROMA_VERT_PP_16x4_AVX512 0
lea r5, [r0 + 4 * r1]
@@ -11157,7 +11160,7 @@
RET
%endif
-%macro PROCESS_CHROMA_VERT_PP_64x4_AVX512 0
+%macro PROCESS_CHROMA_VERT_64x4_AVX512 1
movu m0, [r0] ; m0 = row 0
movu m1, [r0 + r1] ; m1 = row 1
punpcklbw m2, m0, m1
@@ -11179,10 +11182,21 @@
paddw m2, m8
paddw m3, m9
+%ifidn %1,pp
pmulhrsw m2, m12
pmulhrsw m3, m12
packuswb m2, m3
movu [r2], m2
+%else
+ psubw m2, m12
+ psubw m3, m12
+ movu m8, m13
+ movu m9, m14
+ vpermi2q m8, m2, m3
+ vpermi2q m9, m2, m3
+ movu [r2], m8
+ movu [r2 + mmsize], m9
+%endif
lea r0, [r0 + r1 * 4]
movu m0, [r0] ; m0 = row 4
@@ -11194,10 +11208,22 @@
pmaddubsw m3, m10
paddw m4, m8
paddw m5, m9
+
+%ifidn %1,pp
pmulhrsw m4, m12
pmulhrsw m5, m12
packuswb m4, m5
movu [r2 + r3], m4
+%else
+ psubw m4, m12
+ psubw m5, m12
+ movu m8, m13
+ movu m9, m14
+ vpermi2q m8, m4, m5
+ vpermi2q m9, m4, m5
+ movu [r2 + r3], m8
+ movu [r2 + r3 + mmsize], m9
+%endif
movu m1, [r0 + r1] ; m1 = row 5
punpcklbw m4, m0, m1
@@ -11207,11 +11233,21 @@
paddw m6, m4
paddw m7, m5
+%ifidn %1,pp
pmulhrsw m6, m12
pmulhrsw m7, m12
packuswb m6, m7
movu [r2 + r3 * 2], m6
-
+%else
+ psubw m6, m12
+ psubw m7, m12
+ movu m8, m13
+ movu m9, m14
+ vpermi2q m8, m6, m7
+ vpermi2q m9, m6, m7
+ movu [r2 + 2 * r3], m8
+ movu [r2 + 2 * r3 + mmsize], m9
+%endif
movu m0, [r0 + r1 * 2] ; m0 = row 6
punpcklbw m6, m1, m0
punpckhbw m7, m1, m0
@@ -11219,16 +11255,27 @@
pmaddubsw m7, m11
paddw m2, m6
paddw m3, m7
+
+%ifidn %1,pp
pmulhrsw m2, m12
pmulhrsw m3, m12
packuswb m2, m3
movu [r2 + r5], m2
-%endmacro
-
-%macro FILTER_VER_PP_CHROMA_AVX512_64xN 1
-%if ARCH_X86_64 == 1
+%else
+ psubw m2, m12
+ psubw m3, m12
+ movu m8, m13
+ movu m9, m14
+ vpermi2q m8, m2, m3
+ vpermi2q m9, m2, m3
+ movu [r2 + r5], m8
+ movu [r2 + r5 + mmsize], m9
+%endif
+%endmacro
+
+%macro FILTER_VER_CHROMA_AVX512_64xN 2
INIT_ZMM avx512
-cglobal interp_4tap_vert_pp_64x%1, 4, 6, 13
+cglobal interp_4tap_vert_%1_64x%2, 4, 6, 15
mov r4d, r4m
shl r4d, 7
@@ -11241,26 +11288,39 @@
mova m11, [tab_ChromaCoeffVer_32_avx512 + r4 + mmsize]
%endif
+%ifidn %1,pp
+ vbroadcasti32x8 m12, [pw_512]
+%else
+ add r3d, r3d
+ vbroadcasti32x8 m12, [pw_2000]
+ mova m13, [interp4_vps_store1_avx512]
+ mova m14, [interp4_vps_store2_avx512]
+%endif
lea r4, [r1 * 3]
sub r0, r1
- vbroadcasti32x8 m12, [pw_512]
lea r5, [r3 * 3]
-%rep %1/4 - 1
- PROCESS_CHROMA_VERT_PP_64x4_AVX512
+%rep %2/4 - 1
+ PROCESS_CHROMA_VERT_64x4_AVX512 %1
lea r2, [r2 + r3 * 4]
%endrep
- PROCESS_CHROMA_VERT_PP_64x4_AVX512
- RET
-%endif
-%endmacro
-
-FILTER_VER_PP_CHROMA_AVX512_64xN 64
-FILTER_VER_PP_CHROMA_AVX512_64xN 48
-FILTER_VER_PP_CHROMA_AVX512_64xN 32
-FILTER_VER_PP_CHROMA_AVX512_64xN 16
-;-------------------------------------------------------------------------------------------------------------
-;avx512 chroma_vpp code end
+ PROCESS_CHROMA_VERT_64x4_AVX512 %1
+ RET
+%endmacro
+
+%if ARCH_X86_64 == 1
+FILTER_VER_CHROMA_AVX512_64xN pp, 64
+FILTER_VER_CHROMA_AVX512_64xN pp, 48
+FILTER_VER_CHROMA_AVX512_64xN pp, 32
+FILTER_VER_CHROMA_AVX512_64xN pp, 16
+
+FILTER_VER_CHROMA_AVX512_64xN ps, 64
+FILTER_VER_CHROMA_AVX512_64xN ps, 48
+FILTER_VER_CHROMA_AVX512_64xN ps, 32
+FILTER_VER_CHROMA_AVX512_64xN ps, 16
+%endif
+;-------------------------------------------------------------------------------------------------------------
+;avx512 chroma_vpp and chroma_vps code end
;-------------------------------------------------------------------------------------------------------------
;-------------------------------------------------------------------------------------------------------------
;avx512 chroma_vss code start
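
A note on the new store constants (a hedged sketch of the data movement,
assuming standard VPERMI2Q semantics; the helper below is illustrative
only): in the ps path each 64-wide row produces 64 16-bit results split
across two registers, because punpcklbw/punpckhbw interleave within 128-bit
lanes. Register A (m2/m4/m6) ends up with the results for source bytes 0-7,
16-23, 32-39, 48-55 and register B (m3/m5/m7) with those for bytes 8-15,
24-31, 40-47, 56-63. vpermi2q with interp4_vps_store1_avx512 and
interp4_vps_store2_avx512 reassembles the qwords into natural order for the
two contiguous 64-byte stores.

    #include <cstdint>
    #include <cstring>

    // Illustrative scalar model of the two vpermi2q merges: index values
    // 0-7 pick qwords from A, 8-15 pick qwords from B, matching the
    // interp4_vps_store1/2_avx512 tables added in this patch.
    static void vps_store_permute(const int16_t A[32], const int16_t B[32],
                                  int16_t out[64])
    {
        static const int idx1[8] = { 0, 1, 8, 9, 2, 3, 10, 11 };
        static const int idx2[8] = { 4, 5, 12, 13, 6, 7, 14, 15 };
        const int16_t* q[16];                    // qword table: 0-7 = A, 8-15 = B
        for (int i = 0; i < 8; i++) { q[i] = A + 4 * i; q[8 + i] = B + 4 * i; }
        for (int i = 0; i < 8; i++)
        {
            memcpy(out + 4 * i,      q[idx1[i]], 4 * sizeof(int16_t)); // first 64-byte store
            memcpy(out + 32 + 4 * i, q[idx2[i]], 4 * sizeof(int16_t)); // second 64-byte store
        }
    }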