[x265] [PATCH 12 of 13] x86: AVX2 optimize luma_hps 32xN, 64xN and 48xN
vignesh at multicorewareinc.com
vignesh at multicorewareinc.com
Fri Jul 6 11:18:12 CEST 2018
# HG changeset patch
# User Jayashree <jayashree.c at multicorewareinc.com>
# Date 1524213977 -19800
# Fri Apr 20 14:16:17 2018 +0530
# Node ID 52ec2b63b870a1ec0c4ce45abc89ac522ade3c1f
# Parent 8ea2e5d0296aad4fba48ac36ff6d99d7770c7990
x86: AVX2 optimize luma_hps 32xN, 64xN and 48xN
diff -r 8ea2e5d0296a -r 52ec2b63b870 source/common/x86/h-ipfilter16.asm
--- a/source/common/x86/h-ipfilter16.asm Fri Apr 20 14:07:07 2018 +0530
+++ b/source/common/x86/h-ipfilter16.asm Fri Apr 20 14:16:17 2018 +0530
@@ -2268,116 +2268,7 @@
jnz .loop0
RET
%endif
-%macro IPFILTER_LUMA_PS_32_64_AVX2 2
-INIT_YMM avx2
-%if ARCH_X86_64 == 1
-cglobal interp_8tap_horiz_ps_%1x%2, 4, 6, 8
- add r1d, r1d
- add r3d, r3d
- mov r4d, r4m
- mov r5d, r5m
- shl r4d, 6
-%ifdef PIC
- lea r6, [h_tab_LumaCoeffV]
- movu m0, [r6 + r4]
- movu m1, [r6 + r4 + mmsize]
-%else
- movu m0, [h_tab_LumaCoeffV + r4]
- movu m1, [h_tab_LumaCoeffV + r4 + mmsize]
-%endif
- mova m3, [interp8_hpp_shuf_new]
- vbroadcasti128 m2, [INTERP_OFFSET_PS]
-
- ; register map
- ; m0 , m1 interpolate coeff
-
- sub r0, 6
- test r5d, r5d
- mov r4d, %2
- jz .loop0
- lea r6, [r1*3]
- sub r0, r6
- add r4d, 7
-
-.loop0:
-%assign x 0
-%rep %1/16
- vbroadcasti128 m4, [r0 + x]
- vbroadcasti128 m5, [r0 + 4 * SIZEOF_PIXEL + x]
- pshufb m4, m3
- pshufb m5, m3
-
- pmaddwd m4, m0
- pmaddwd m7, m5, m1
- paddd m4, m7
- vextracti128 xm7, m4, 1
- paddd xm4, xm7
- paddd xm4, xm2
- psrad xm4, INTERP_SHIFT_PS
-
- vbroadcasti128 m6, [r0 + 16 + x]
- pshufb m6, m3
-
- pmaddwd m5, m0
- pmaddwd m7, m6, m1
- paddd m5, m7
- vextracti128 xm7, m5, 1
- paddd xm5, xm7
- paddd xm5, xm2
- psrad xm5, INTERP_SHIFT_PS
-
- packssdw xm4, xm5
- movu [r2 + x], xm4
-
- vbroadcasti128 m5, [r0 + 24 + x]
- pshufb m5, m3
-
- pmaddwd m6, m0
- pmaddwd m7, m5, m1
- paddd m6, m7
- vextracti128 xm7, m6, 1
- paddd xm6, xm7
- paddd xm6, xm2
- psrad xm6, INTERP_SHIFT_PS
-
- vbroadcasti128 m7, [r0 + 32 + x]
- pshufb m7, m3
-
- pmaddwd m5, m0
- pmaddwd m7, m1
- paddd m5, m7
- vextracti128 xm7, m5, 1
- paddd xm5, xm7
- paddd xm5, xm2
- psrad xm5, INTERP_SHIFT_PS
-
- packssdw xm6, xm5
- movu [r2 + 16 + x], xm6
-
-%assign x x+32
-%endrep
-
- add r2, r3
- add r0, r1
- dec r4d
- jnz .loop0
- RET
-%endif
-%endmacro
-
- IPFILTER_LUMA_PS_32_64_AVX2 32, 8
- IPFILTER_LUMA_PS_32_64_AVX2 32, 16
- IPFILTER_LUMA_PS_32_64_AVX2 32, 24
- IPFILTER_LUMA_PS_32_64_AVX2 32, 32
- IPFILTER_LUMA_PS_32_64_AVX2 32, 64
-
- IPFILTER_LUMA_PS_32_64_AVX2 64, 16
- IPFILTER_LUMA_PS_32_64_AVX2 64, 32
- IPFILTER_LUMA_PS_32_64_AVX2 64, 48
- IPFILTER_LUMA_PS_32_64_AVX2 64, 64
-
- IPFILTER_LUMA_PS_32_64_AVX2 48, 64
%macro PROCESS_IPFILTER_LUMA_PS_16x1_AVX2 0
movu m7, [r0]
@@ -2470,7 +2361,289 @@
IPFILTER_LUMA_PS_16xN_AVX2 16
IPFILTER_LUMA_PS_16xN_AVX2 32
IPFILTER_LUMA_PS_16xN_AVX2 64
+%macro PROCESS_IPFILTER_LUMA_PS_32x1_AVX2 0 ; one row: expands the 16x1 macro, then filters the data one mmsize further along; reads m0-m4, m12-m14; overwrites m7-m11
+ PROCESS_IPFILTER_LUMA_PS_16x1_AVX2 ; defined outside this hunk; presumably handles [r0] -> [r2] -- confirm
+ movu m7, [r0 + mmsize]            ; window A: unaligned load at r0 + mmsize
+ movu m8, [r0 + 8+ mmsize]         ; window B: starts 8 bytes after A
+ pshufb m10, m7, m14               ; m10 = A permuted by m14
+ pshufb m7, m13                    ; m7  = A permuted by m13
+ pshufb m11, m8, m14               ; m11 = B permuted by m14 (kept for the second half)
+ pshufb m8, m13                    ; m8  = B permuted by m13 (kept for the second half)
+ pmaddwd m7, m0                    ; first half: (A by m13) * m0, word pairs summed into dwords
+ pmaddwd m10, m1                   ; (A by m14) * m1
+ paddd m7, m10
+ pmaddwd m10, m11, m3              ; (B by m14) * m3, three-operand form leaves m11 intact
+ pmaddwd m9, m8, m2                ; (B by m13) * m2, three-operand form leaves m8 intact
+ paddd m10, m9
+ paddd m7, m10                     ; m7 = sum of the four partial products
+ paddd m7, m4                      ; add the constant held in m4
+ psrad m7, INTERP_SHIFT_PS         ; arithmetic right shift by INTERP_SHIFT_PS
+ movu m9, [r0 + 16+ mmsize]        ; window C: starts 16 bytes after A
+ pshufb m10, m9, m14               ; m10 = C permuted by m14
+ pshufb m9, m13                    ; m9  = C permuted by m13
+ pmaddwd m8, m0                    ; second half: (B by m13) * m0
+ pmaddwd m11, m1                   ; (B by m14) * m1
+ paddd m8, m11
+ pmaddwd m10, m3                   ; (C by m14) * m3
+ pmaddwd m9, m2                    ; (C by m13) * m2
+ paddd m9, m10
+ paddd m8, m9                      ; m8 = sum of the four partial products
+ paddd m8, m4                      ; add the constant held in m4
+ psrad m8, INTERP_SHIFT_PS         ; arithmetic right shift by INTERP_SHIFT_PS
+ packssdw m7, m8                   ; narrow both dword vectors to words with signed saturation
+ pshufb m7, m12                    ; reorder the packed words using the table in m12
+ movu [r2+ mmsize], m7             ; unaligned store at r2 + mmsize
+%endmacro
+
+%macro IPFILTER_LUMA_PS_32xN_AVX2 1 ; %1 = number of rows; emits interp_8tap_horiz_ps_32x%1 (x86-64 builds only)
+INIT_YMM avx2
+%if ARCH_X86_64
+cglobal interp_8tap_horiz_ps_32x%1, 5, 7, 15 ; 7 GPRs declared because r6 is written below
+ ; r0 = read pointer, r1 = read stride, r2 = write pointer, r3 = write stride, r4 = table index, r5 = row-extension flag
+ shl r1d, 1 ; both strides are doubled; presumably element -> byte units for 16-bit samples
+ shl r3d, 1
+ mov r4d, r4m ; 32-bit write also clears the upper half of r4 before it is used as an index
+ mov r5d, r5m ; argument 5 is not auto-loaded (only 5 arguments are declared)
+ shl r4d, 4 ; 16 bytes of coefficients per table entry
+%ifdef PIC
+ lea r6, [h_tab_LumaCoeff] ; PIC: table base goes through a register so it can be indexed
+ vpbroadcastd m0, [r6 + r4]
+ vpbroadcastd m1, [r6 + r4 + 4]
+ vpbroadcastd m2, [r6 + r4 + 8]
+ vpbroadcastd m3, [r6 + r4 + 12]
+%else
+ vpbroadcastd m0, [h_tab_LumaCoeff + r4]
+ vpbroadcastd m1, [h_tab_LumaCoeff + r4 + 4]
+ vpbroadcastd m2, [h_tab_LumaCoeff + r4 + 8]
+ vpbroadcastd m3, [h_tab_LumaCoeff + r4 + 12]
+%endif
+ mova m13, [interp8_hpp_shuf1_load_avx512]
+ mova m14, [interp8_hpp_shuf2_load_avx512]
+ mova m12, [interp8_hpp_shuf1_store_avx512]
+ vbroadcasti128 m4, [INTERP_OFFSET_PS]
+ ; register map
+ ; m0-m3   : one broadcast dword of the selected coefficient entry each
+ ; m4      : INTERP_OFFSET_PS, added before the shift; m12 : store shuffle; m13, m14 : load shuffles
+ ; m7-m11  : scratch used by the per-row macro
+ sub r0, 6 ; start reading 6 bytes before the addressed sample
+ test r5d, r5d
+ mov r4d, %1 ; r4 = row counter; mov leaves the flags from test intact
+ jz .loop0 ; flag clear: filter exactly %1 rows
+ lea r6, [r1*3]
+ sub r0, r6 ; flag set: begin 3 rows earlier
+ add r4d, 7 ; and filter 7 additional rows
+
+.loop0:
+ PROCESS_IPFILTER_LUMA_PS_32x1_AVX2
+ lea r0, [r0 + r1] ; advance to the next source row
+ lea r2, [r2 + r3] ; advance to the next destination row
+ ; The pointer bumps use lea, which leaves the flags alone;
+ ; the loop branch relies only on ZF from the dec below.
+ dec r4d
+ jnz .loop0
+ RET
+%endif
+%endmacro
+
+ IPFILTER_LUMA_PS_32xN_AVX2 8
+ IPFILTER_LUMA_PS_32xN_AVX2 16
+ IPFILTER_LUMA_PS_32xN_AVX2 24
+ IPFILTER_LUMA_PS_32xN_AVX2 32
+ IPFILTER_LUMA_PS_32xN_AVX2 64
+
+%macro PROCESS_IPFILTER_LUMA_PS_64x1_AVX2 0 ; one row: expands the 16x1 macro, then three unrolled copies at byte offsets 32, 64, 96; reads m0-m4, m12-m14; overwrites m7-m11
+ PROCESS_IPFILTER_LUMA_PS_16x1_AVX2 ; defined outside this hunk; presumably handles [r0] -> [r2] -- confirm
+%assign x 32
+%rep 3
+ movu m7, [r0 + x]                 ; window A: unaligned load at r0 + x
+ movu m8, [r0 + 8+ x]              ; window B: starts 8 bytes after A
+ pshufb m10, m7, m14               ; m10 = A permuted by m14
+ pshufb m7, m13                    ; m7  = A permuted by m13
+ pshufb m11, m8, m14               ; m11 = B permuted by m14 (kept for the second half)
+ pshufb m8, m13                    ; m8  = B permuted by m13 (kept for the second half)
+ ; first half: four pmaddwd partial products are summed into m7
+ pmaddwd m7, m0                    ; (A by m13) * m0, word pairs summed into dwords
+ pmaddwd m10, m1                   ; (A by m14) * m1
+ paddd m7, m10
+ pmaddwd m10, m11, m3              ; (B by m14) * m3, three-operand form leaves m11 intact
+ pmaddwd m9, m8, m2                ; (B by m13) * m2, three-operand form leaves m8 intact
+ paddd m10, m9
+ paddd m7, m10                     ; m7 = sum of the four partial products
+ paddd m7, m4                      ; add the constant held in m4
+ psrad m7, INTERP_SHIFT_PS         ; arithmetic right shift by INTERP_SHIFT_PS
+ movu m9, [r0 + 16+ x]             ; window C: starts 16 bytes after A
+ pshufb m10, m9, m14               ; m10 = C permuted by m14
+ pshufb m9, m13                    ; m9  = C permuted by m13
+ pmaddwd m8, m0                    ; second half: (B by m13) * m0
+ pmaddwd m11, m1                   ; (B by m14) * m1
+ paddd m8, m11
+ pmaddwd m10, m3                   ; (C by m14) * m3
+ pmaddwd m9, m2                    ; (C by m13) * m2
+ paddd m9, m10
+ paddd m8, m9                      ; m8 = sum of the four partial products
+ paddd m8, m4                      ; add the constant held in m4
+ psrad m8, INTERP_SHIFT_PS         ; arithmetic right shift by INTERP_SHIFT_PS
+ packssdw m7, m8                   ; narrow both dword vectors to words with signed saturation
+ pshufb m7, m12                    ; reorder the packed words using the table in m12
+ movu [r2+ x], m7                  ; unaligned store at r2 + x
+%assign x x+32
+%endrep
+%endmacro
+
+%macro IPFILTER_LUMA_PS_64xN_AVX2 1 ; %1 = number of rows; emits interp_8tap_horiz_ps_64x%1 (x86-64 builds only)
+INIT_YMM avx2
+%if ARCH_X86_64
+cglobal interp_8tap_horiz_ps_64x%1, 5, 7, 15 ; 7 GPRs declared because r6 is written below
+ ; r0 = read pointer, r1 = read stride, r2 = write pointer, r3 = write stride, r4 = table index, r5 = row-extension flag
+ shl r1d, 1 ; both strides are doubled; presumably element -> byte units for 16-bit samples
+ shl r3d, 1
+ mov r4d, r4m ; 32-bit write also clears the upper half of r4 before it is used as an index
+ mov r5d, r5m ; argument 5 is not auto-loaded (only 5 arguments are declared)
+ shl r4d, 4 ; 16 bytes of coefficients per table entry
+%ifdef PIC
+ lea r6, [h_tab_LumaCoeff] ; PIC: table base goes through a register so it can be indexed
+ vpbroadcastd m0, [r6 + r4]
+ vpbroadcastd m1, [r6 + r4 + 4]
+ vpbroadcastd m2, [r6 + r4 + 8]
+ vpbroadcastd m3, [r6 + r4 + 12]
+%else
+ vpbroadcastd m0, [h_tab_LumaCoeff + r4]
+ vpbroadcastd m1, [h_tab_LumaCoeff + r4 + 4]
+ vpbroadcastd m2, [h_tab_LumaCoeff + r4 + 8]
+ vpbroadcastd m3, [h_tab_LumaCoeff + r4 + 12]
+%endif
+ mova m13, [interp8_hpp_shuf1_load_avx512]
+ mova m14, [interp8_hpp_shuf2_load_avx512]
+ mova m12, [interp8_hpp_shuf1_store_avx512]
+ vbroadcasti128 m4, [INTERP_OFFSET_PS]
+ ; register map
+ ; m0-m3   : one broadcast dword of the selected coefficient entry each
+ ; m4      : INTERP_OFFSET_PS, added before the shift; m12 : store shuffle; m13, m14 : load shuffles
+ ; m7-m11  : scratch used by the per-row macro
+ sub r0, 6 ; start reading 6 bytes before the addressed sample
+ test r5d, r5d
+ mov r4d, %1 ; r4 = row counter; mov leaves the flags from test intact
+ jz .loop0 ; flag clear: filter exactly %1 rows
+ lea r6, [r1*3]
+ sub r0, r6 ; flag set: begin 3 rows earlier
+ add r4d, 7 ; and filter 7 additional rows
+
+.loop0:
+ PROCESS_IPFILTER_LUMA_PS_64x1_AVX2
+ lea r0, [r0 + r1] ; advance to the next source row
+ lea r2, [r2 + r3] ; advance to the next destination row
+ ; The pointer bumps use lea, which leaves the flags alone;
+ ; the loop branch relies only on ZF from the dec below.
+ dec r4d
+ jnz .loop0
+ RET
+%endif
+%endmacro
+
+ IPFILTER_LUMA_PS_64xN_AVX2 16
+ IPFILTER_LUMA_PS_64xN_AVX2 32
+ IPFILTER_LUMA_PS_64xN_AVX2 48
+ IPFILTER_LUMA_PS_64xN_AVX2 64
+
+%macro IPFILTER_LUMA_PS_48xN_AVX2 1 ; %1 = number of rows; emits interp_8tap_horiz_ps_48x%1 (x86-64 builds only)
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_8tap_horiz_ps_48x%1, 5, 7, 8 ; only r0-r6 and m0-m7 are touched, so nothing more is declared (avoids needless saves)
+ ; r0 = read pointer, r1 = read stride, r2 = write pointer, r3 = write stride, r4 = table index, r5 = row-extension flag
+ add r1d, r1d ; both strides are doubled; presumably element -> byte units for 16-bit samples
+ add r3d, r3d
+ mov r4d, r4m ; 32-bit write also clears the upper half of r4 before it is used as an index
+ mov r5d, r5m ; argument 5 is not auto-loaded (only 5 arguments are declared)
+ shl r4d, 6 ; 64 bytes of coefficients per table entry (two mmsize vectors)
+%ifdef PIC
+ lea r6, [h_tab_LumaCoeffV] ; PIC: table base goes through a register so it can be indexed
+ movu m0, [r6 + r4]
+ movu m1, [r6 + r4 + mmsize]
+%else
+ movu m0, [h_tab_LumaCoeffV + r4]
+ movu m1, [h_tab_LumaCoeffV + r4 + mmsize]
+%endif
+ mova m3, [interp8_hpp_shuf_new]
+ vbroadcasti128 m2, [INTERP_OFFSET_PS]
+
+ ; register map
+ ; m0, m1 : coefficient vectors; m2 : INTERP_OFFSET_PS; m3 : load shuffle; m4-m7 : scratch
+
+ sub r0, 6 ; start reading 6 bytes before the addressed sample
+ test r5d, r5d
+ mov r4d, %1 ; r4 = row counter; mov leaves the flags from test intact
+ jz .loop0 ; flag clear: filter exactly %1 rows
+ lea r6, [r1*3]
+ sub r0, r6 ; flag set: begin 3 rows earlier
+ add r4d, 7 ; and filter 7 additional rows
+
+.loop0:
+%assign x 0
+%rep 3
+ vbroadcasti128 m4, [r0 + x] ; each 16-byte window is copied into both lanes
+ vbroadcasti128 m5, [r0 + 4 * SIZEOF_PIXEL + x]
+ pshufb m4, m3
+ pshufb m5, m3
+ ; words 0-3 of the first store
+ pmaddwd m4, m0
+ pmaddwd m7, m5, m1 ; three-operand form keeps m5 for the next group
+ paddd m4, m7
+ vextracti128 xm7, m4, 1
+ paddd xm4, xm7 ; fold the high lane into the low lane: 4 dword sums
+ paddd xm4, xm2
+ psrad xm4, INTERP_SHIFT_PS
+ ; words 4-7 of the first store
+ vbroadcasti128 m6, [r0 + 16 + x]
+ pshufb m6, m3
+
+ pmaddwd m5, m0
+ pmaddwd m7, m6, m1 ; three-operand form keeps m6 for the next group
+ paddd m5, m7
+ vextracti128 xm7, m5, 1
+ paddd xm5, xm7
+ paddd xm5, xm2
+ psrad xm5, INTERP_SHIFT_PS
+
+ packssdw xm4, xm5 ; 8 dwords -> 8 words with signed saturation
+ movu [r2 + x], xm4
+ ; words 0-3 of the second store
+ vbroadcasti128 m5, [r0 + 24 + x]
+ pshufb m5, m3
+
+ pmaddwd m6, m0
+ pmaddwd m7, m5, m1 ; three-operand form keeps m5 for the next group
+ paddd m6, m7
+ vextracti128 xm7, m6, 1
+ paddd xm6, xm7
+ paddd xm6, xm2
+ psrad xm6, INTERP_SHIFT_PS
+ ; words 4-7 of the second store
+ vbroadcasti128 m7, [r0 + 32 + x]
+ pshufb m7, m3
+
+ pmaddwd m5, m0
+ pmaddwd m7, m1
+ paddd m5, m7
+ vextracti128 xm7, m5, 1
+ paddd xm5, xm7
+ paddd xm5, xm2
+ psrad xm5, INTERP_SHIFT_PS
+
+ packssdw xm6, xm5 ; 8 dwords -> 8 words with signed saturation
+ movu [r2 + 16 + x], xm6
+
+%assign x x+32
+%endrep
+
+ add r2, r3 ; advance to the next destination row
+ add r0, r1 ; advance to the next source row
+ dec r4d
+ jnz .loop0
+ RET
+%endif
+%endmacro
+ IPFILTER_LUMA_PS_48xN_AVX2 64
INIT_YMM avx2
%if ARCH_X86_64 == 1
cglobal interp_8tap_horiz_ps_12x16, 4, 6, 8
-------------- next part --------------
A non-text attachment was scrubbed...
Name: x265-12.patch
Type: text/x-patch
Size: 12994 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20180706/049804ff/attachment-0001.bin>
More information about the x265-devel
mailing list