[x265] [PATCH 04 of 13] x86: AVX2 optimise luma_hpp 8xN and 48xN for high bit depth
vignesh at multicorewareinc.com
vignesh at multicorewareinc.com
Fri Jul 6 11:18:04 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1523937990 -19800
# Tue Apr 17 09:36:30 2018 +0530
# Node ID 66b26da73cf8da135ddd9137f471093cddc2a6d8
# Parent e5c5ebfc390ff164b94d0360b99ac54d3e5e1ba0
x86: AVX2 optimise luma_hpp 8xN and 48xN for high bit depth
~35% boost over existing code
diff -r e5c5ebfc390f -r 66b26da73cf8 source/common/x86/h-ipfilter16.asm
--- a/source/common/x86/h-ipfilter16.asm Tue Apr 17 09:01:28 2018 +0530
+++ b/source/common/x86/h-ipfilter16.asm Tue Apr 17 09:36:30 2018 +0530
@@ -1383,98 +1383,90 @@
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx
;-------------------------------------------------------------------------------------------------------------
-%macro FILTER_HOR_LUMA_W8 1
-INIT_YMM avx2
-cglobal interp_8tap_horiz_pp_8x%1, 4,6,8
- add r1d, r1d
- add r3d, r3d
- sub r0, 6
- mov r4d, r4m
- shl r4d, 4
-%ifdef PIC
- lea r5, [h_tab_LumaCoeff]
- vpbroadcastq m0, [r5 + r4]
- vpbroadcastq m1, [r5 + r4 + 8]
-%else
- vpbroadcastq m0, [h_tab_LumaCoeff + r4]
- vpbroadcastq m1, [h_tab_LumaCoeff + r4 + 8]
-%endif
- mova m3, [interp8_hpp_shuf]
- mova m7, [pd_32]
- pxor m2, m2
+%macro PROCESS_IPFILTER_LUMA_PP_8x2_AVX2 0
+ movu xm7, [r0]
+ movu xm8, [r0 + 8]
+ vinserti128 m7, m7, [r0 + r1], 1
+ vinserti128 m8, m8, [r0 + r1 + 8], 1
+ pshufb m10, m7, m14
+ pshufb m7, m13
+ pshufb m11, m8, m14
+ pshufb m8, m13
- ; register map
- ; m0 , m1 interpolate coeff
-
- mov r4d, %1/2
+ pmaddwd m7, m0
+ pmaddwd m10, m1
+ paddd m7, m10
+ pmaddwd m10, m11, m3
+ pmaddwd m9, m8, m2
+ paddd m10, m9
+ paddd m7, m10
+ paddd m7, m4
+ psrad m7, INTERP_SHIFT_PP
-.loop:
- vbroadcasti128 m4, [r0]
- vbroadcasti128 m5, [r0 + 8]
- pshufb m4, m3
- pshufb m5, m3
-
- pmaddwd m4, m0
- pmaddwd m5, m1
- paddd m4, m5
-
- vbroadcasti128 m5, [r0 + 8]
- vbroadcasti128 m6, [r0 + 16]
- pshufb m5, m3
- pshufb m6, m3
+ movu xm9, [r0 + 16]
+ vinserti128 m9, m9, [r0 + r1 + 16], 1
+ pshufb m10, m9, m14
+ pshufb m9, m13
+ pmaddwd m8, m0
+ pmaddwd m11, m1
+ paddd m8, m11
+ pmaddwd m10, m3
+ pmaddwd m9, m2
+ paddd m9, m10
+ paddd m8, m9
+ paddd m8, m4
+ psrad m8, INTERP_SHIFT_PP
- pmaddwd m5, m0
- pmaddwd m6, m1
- paddd m5, m6
-
- phaddd m4, m5
- vpermq m4, m4, q3120
- paddd m4, m7
- psrad m4, INTERP_SHIFT_PP
+ packusdw m7, m8
+ pshufb m7, m12
+ CLIPW m7, m5, m6
+ movu [r2], xm7
+ vextracti128 [r2 + r3], m7, 1
+%endmacro
- packusdw m4, m4
- vpermq m4, m4, q2020
- CLIPW m4, m2, [pw_pixel_max]
- movu [r2], xm4
-
- vbroadcasti128 m4, [r0 + r1]
- vbroadcasti128 m5, [r0 + r1 + 8]
- pshufb m4, m3
- pshufb m5, m3
-
- pmaddwd m4, m0
- pmaddwd m5, m1
- paddd m4, m5
+%macro IPFILTER_LUMA_AVX2_8xN 1
+INIT_YMM avx2
+cglobal interp_8tap_horiz_pp_8x%1, 5,6,15
+ shl r1d, 1
+ shl r3d, 1
+ sub r0, 6
+ mov r4d, r4m
+ shl r4d, 4
- vbroadcasti128 m5, [r0 + r1 + 8]
- vbroadcasti128 m6, [r0 + r1 + 16]
- pshufb m5, m3
- pshufb m6, m3
-
- pmaddwd m5, m0
- pmaddwd m6, m1
- paddd m5, m6
+%ifdef PIC
+ lea r5, [h_tab_LumaCoeff]
+ vpbroadcastd m0, [r5 + r4]
+ vpbroadcastd m1, [r5 + r4 + 4]
+ vpbroadcastd m2, [r5 + r4 + 8]
+ vpbroadcastd m3, [r5 + r4 + 12]
+%else
+ vpbroadcastd m0, [h_tab_LumaCoeff + r4]
+ vpbroadcastd m1, [h_tab_LumaCoeff + r4 + 4]
+ vpbroadcastd m2, [h_tab_LumaCoeff + r4 + 8]
+ vpbroadcastd m3, [h_tab_LumaCoeff + r4 + 12]
+%endif
+ mova m13, [interp8_hpp_shuf1_load_avx512]
+ mova m14, [interp8_hpp_shuf2_load_avx512]
+ mova m12, [interp8_hpp_shuf1_store_avx512]
+ mova m4, [pd_32]
+ pxor m5, m5
+ mova m6, [pw_pixel_max]
- phaddd m4, m5
- vpermq m4, m4, q3120
- paddd m4, m7
- psrad m4, INTERP_SHIFT_PP
-
- packusdw m4, m4
- vpermq m4, m4, q2020
- CLIPW m4, m2, [pw_pixel_max]
- movu [r2 + r3], xm4
-
- lea r2, [r2 + 2 * r3]
- lea r0, [r0 + 2 * r1]
- dec r4d
- jnz .loop
+%rep %1/2 - 1
+ PROCESS_IPFILTER_LUMA_PP_8x2_AVX2
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+%endrep
+ PROCESS_IPFILTER_LUMA_PP_8x2_AVX2
RET
%endmacro
-FILTER_HOR_LUMA_W8 4
-FILTER_HOR_LUMA_W8 8
-FILTER_HOR_LUMA_W8 16
-FILTER_HOR_LUMA_W8 32
+
+%if ARCH_X86_64
+ IPFILTER_LUMA_AVX2_8xN 4
+ IPFILTER_LUMA_AVX2_8xN 8
+ IPFILTER_LUMA_AVX2_8xN 16
+ IPFILTER_LUMA_AVX2_8xN 32
+%endif
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx
@@ -1949,125 +1941,82 @@
;-------------------------------------------------------------------------------------------------------------
; void interp_8tap_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx
;-------------------------------------------------------------------------------------------------------------
-INIT_YMM avx2
-cglobal interp_8tap_horiz_pp_48x64, 4,6,8
- add r1d, r1d
- add r3d, r3d
- sub r0, 6
- mov r4d, r4m
- shl r4d, 4
-%ifdef PIC
- lea r5, [h_tab_LumaCoeff]
- vpbroadcastq m0, [r5 + r4]
- vpbroadcastq m1, [r5 + r4 + 8]
-%else
- vpbroadcastq m0, [h_tab_LumaCoeff + r4]
- vpbroadcastq m1, [h_tab_LumaCoeff + r4 + 8]
-%endif
- mova m3, [interp8_hpp_shuf]
- mova m7, [pd_32]
- pxor m2, m2
+%macro PROCESS_IPFILTER_LUMA_PP_48x1_AVX2 0
+ PROCESS_IPFILTER_LUMA_PP_32x1_AVX2
- ; register map
- ; m0 , m1 interpolate coeff
+ movu m7, [r0 + 2 * mmsize]
+ movu m8, [r0 + 8 + 2 * mmsize]
- mov r4d, 64
+ pshufb m10, m7, m14
+ pshufb m7, m13
+ pshufb m11, m8, m14
+ pshufb m8, m13
-.loop:
-%assign x 0
-%rep 2
- vbroadcasti128 m4, [r0 + x]
- vbroadcasti128 m5, [r0 + 8 + x]
- pshufb m4, m3
- pshufb m5, m3
-
- pmaddwd m4, m0
- pmaddwd m5, m1
- paddd m4, m5
-
- vbroadcasti128 m5, [r0 + 8 + x]
- vbroadcasti128 m6, [r0 + 16 + x]
- pshufb m5, m3
- pshufb m6, m3
+ pmaddwd m7, m0
+ pmaddwd m10, m1
+ paddd m7, m10
+ pmaddwd m10, m11, m3
+ pmaddwd m9, m8, m2
+ paddd m10, m9
+ paddd m7, m10
+ paddd m7, m4
+ psrad m7, INTERP_SHIFT_PP
- pmaddwd m5, m0
- pmaddwd m6, m1
- paddd m5, m6
-
- phaddd m4, m5
- vpermq m4, m4, q3120
- paddd m4, m7
- psrad m4, INTERP_SHIFT_PP
-
- packusdw m4, m4
- vpermq m4, m4, q2020
- CLIPW m4, m2, [pw_pixel_max]
- movu [r2 + x], xm4
-
- vbroadcasti128 m4, [r0 + 16 + x]
- vbroadcasti128 m5, [r0 + 24 + x]
- pshufb m4, m3
- pshufb m5, m3
+ movu m9, [r0 + 16 + 2 * mmsize]
+ pshufb m10, m9, m14
+ pshufb m9, m13
+ pmaddwd m8, m0
+ pmaddwd m11, m1
+ paddd m8, m11
+ pmaddwd m10, m3
+ pmaddwd m9, m2
+ paddd m9, m10
+ paddd m8, m9
+ paddd m8, m4
+ psrad m8, INTERP_SHIFT_PP
- pmaddwd m4, m0
- pmaddwd m5, m1
- paddd m4, m5
-
- vbroadcasti128 m5, [r0 + 24 + x]
- vbroadcasti128 m6, [r0 + 32 + x]
- pshufb m5, m3
- pshufb m6, m3
-
- pmaddwd m5, m0
- pmaddwd m6, m1
- paddd m5, m6
+ packusdw m7, m8
+ pshufb m7, m12
+ CLIPW m7, m5, m6
+ movu [r2 + 2 * mmsize], m7
+%endmacro
- phaddd m4, m5
- vpermq m4, m4, q3120
- paddd m4, m7
- psrad m4, INTERP_SHIFT_PP
-
- packusdw m4, m4
- vpermq m4, m4, q2020
- CLIPW m4, m2, [pw_pixel_max]
- movu [r2 + 16 + x], xm4
-
- vbroadcasti128 m4, [r0 + 32 + x]
- vbroadcasti128 m5, [r0 + 40 + x]
- pshufb m4, m3
- pshufb m5, m3
+%if ARCH_X86_64
+INIT_YMM avx2
+cglobal interp_8tap_horiz_pp_48x64, 5,6,15
+ shl r1d, 1
+ shl r3d, 1
+ sub r0, 6
+ mov r4d, r4m
+ shl r4d, 4
- pmaddwd m4, m0
- pmaddwd m5, m1
- paddd m4, m5
-
- vbroadcasti128 m5, [r0 + 40 + x]
- vbroadcasti128 m6, [r0 + 48 + x]
- pshufb m5, m3
- pshufb m6, m3
-
- pmaddwd m5, m0
- pmaddwd m6, m1
- paddd m5, m6
+%ifdef PIC
+ lea r5, [h_tab_LumaCoeff]
+ vpbroadcastd m0, [r5 + r4]
+ vpbroadcastd m1, [r5 + r4 + 4]
+ vpbroadcastd m2, [r5 + r4 + 8]
+ vpbroadcastd m3, [r5 + r4 + 12]
+%else
+ vpbroadcastd m0, [h_tab_LumaCoeff + r4]
+ vpbroadcastd m1, [h_tab_LumaCoeff + r4 + 4]
+ vpbroadcastd m2, [h_tab_LumaCoeff + r4 + 8]
+ vpbroadcastd m3, [h_tab_LumaCoeff + r4 + 12]
+%endif
+ mova m13, [interp8_hpp_shuf1_load_avx512]
+ mova m14, [interp8_hpp_shuf2_load_avx512]
+ mova m12, [interp8_hpp_shuf1_store_avx512]
+ mova m4, [pd_32]
+ pxor m5, m5
+ mova m6, [pw_pixel_max]
- phaddd m4, m5
- vpermq m4, m4, q3120
- paddd m4, m7
- psrad m4, INTERP_SHIFT_PP
-
- packusdw m4, m4
- vpermq m4, m4, q2020
- CLIPW m4, m2, [pw_pixel_max]
- movu [r2 + 32 + x], xm4
-
-%assign x x+48
+%rep 63
+ PROCESS_IPFILTER_LUMA_PP_48x1_AVX2
+ lea r0, [r0 + r1]
+ lea r2, [r2 + r3]
%endrep
-
- add r2, r3
- add r0, r1
- dec r4d
- jnz .loop
+ PROCESS_IPFILTER_LUMA_PP_48x1_AVX2
RET
+%endif
;-----------------------------------------------------------------------------------------------------------------------------
;void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
-------------- next part --------------
A non-text attachment was scrubbed...
Name: x265-04.patch
Type: text/x-patch
Size: 12821 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20180706/1598bb93/attachment.bin>
More information about the x265-devel
mailing list