[x265] [PATCH 13 of 13] x86 : AVX2 Refactor luma_hps 8xN and 24x32
vignesh at multicorewareinc.com
vignesh at multicorewareinc.com
Fri Jul 6 11:18:13 CEST 2018
# HG changeset patch
# User Jayashree <jayashree.c at multicorewareinc.com>
# Date 1524473214 -19800
# Mon Apr 23 14:16:54 2018 +0530
# Node ID 0106f9f2f867ee20893a317e98c60e9ca626e7d2
# Parent 52ec2b63b870a1ec0c4ce45abc89ac522ade3c1f
x86 : AVX2 Refactor luma_hps 8xN and 24x32
diff -r 52ec2b63b870 -r 0106f9f2f867 source/common/x86/h-ipfilter16.asm
--- a/source/common/x86/h-ipfilter16.asm Fri Apr 20 14:16:17 2018 +0530
+++ b/source/common/x86/h-ipfilter16.asm Mon Apr 23 14:16:54 2018 +0530
@@ -2133,12 +2133,43 @@
IPFILTER_LUMA_PS_4xN_AVX2 8
IPFILTER_LUMA_PS_4xN_AVX2 16
+ %macro PROCESS_IPFILTER_LUMA_PS_8x1_AVX2 1 ; %1 = width argument; the call sites in this patch pass 8 and 24
+ ; Emits %1/8 unrolled copies of one filter step: reads source at r0 + x, stores 16 bytes at r2 + x. Writes m4-m7; reads m0-m3 set up by the caller.
+ %assign x 0 ; x = byte offset applied to both r0 (source) and r2 (destination) for the current copy
+ %rep %1/8 ; one copy per 8 units of %1
+ vbroadcasti128 m4, [r0 + x] ; 16 bytes at r0 + x, duplicated into both 128-bit lanes
+ vbroadcasti128 m5, [r0 + 8+ x] ; same, starting 8 bytes further on
+ pshufb m4, m3 ; m3 = byte-shuffle mask prepared outside this macro (not visible here)
+ pshufb m7, m5, m3 ; shuffled copy goes to m7 so m5 stays intact for the second half below
+ pmaddwd m4, m0 ; m0/m1 presumably hold the two halves of the 8-tap coefficient set - confirm in the caller
+ pmaddwd m7, m1
+ paddd m4, m7 ; m4 = dword partial sums built from the first and second loads
+
+ vbroadcasti128 m6, [r0 + 16 + x] ; third 16-byte window, 16 bytes past the first
+ pshufb m5, m3
+ pshufb m6, m3
+ pmaddwd m5, m0
+ pmaddwd m6, m1
+ paddd m5, m6 ; m5 = dword partial sums built from the second and third loads
+
+ phaddd m4, m5 ; add adjacent dword pairs of m4 and m5 within each 128-bit lane
+ vpermq m4, m4, q3120 ; reorder qwords to 0,2,1,3 so the m4-derived sums precede the m5-derived sums
+ paddd m4, m2 ; m2 is added before the shift - presumably the rounding/offset constant loaded by the caller
+ vextracti128 xm5,m4, 1 ; upper 128 bits of m4 -> xm5
+ psrad xm4, INTERP_SHIFT_PS ; arithmetic right shift of the low four dwords
+ psrad xm5, INTERP_SHIFT_PS ; arithmetic right shift of the high four dwords
+ packssdw xm4, xm5 ; saturate the 8 dwords down to 8 signed words
+ movu [r2 + x], xm4 ; unaligned 16-byte store of the 8 result words at r2 + x
+ %assign x x+16 ; advance 16 bytes for the next unrolled copy
+ %endrep
+ %endmacro
+
%macro IPFILTER_LUMA_PS_8xN_AVX2 1
INIT_YMM avx2
%if ARCH_X86_64 == 1
cglobal interp_8tap_horiz_ps_8x%1, 4, 6, 8
- add r1d, r1d
- add r3d, r3d
+ shl r1d, 1
+ shl r3d, 1
mov r4d, r4m
mov r5d, r5m
shl r4d, 4
@@ -2165,30 +2196,7 @@
add r4d, 7
.loop0:
- vbroadcasti128 m4, [r0]
- vbroadcasti128 m5, [r0 + 8]
- pshufb m4, m3
- pshufb m7, m5, m3
- pmaddwd m4, m0
- pmaddwd m7, m1
- paddd m4, m7
-
- vbroadcasti128 m6, [r0 + 16]
- pshufb m5, m3
- pshufb m6, m3
- pmaddwd m5, m0
- pmaddwd m6, m1
- paddd m5, m6
-
- phaddd m4, m5
- vpermq m4, m4, q3120
- paddd m4, m2
- vextracti128 xm5,m4, 1
- psrad xm4, INTERP_SHIFT_PS
- psrad xm5, INTERP_SHIFT_PS
- packssdw xm4, xm5
-
- movu [r2], xm4
+ PROCESS_IPFILTER_LUMA_PS_8x1_AVX2 8
add r2, r3
add r0, r1
dec r4d
@@ -2232,36 +2240,9 @@
sub r0, r6
add r4d, 7
+
.loop0:
-%assign x 0
-%rep 24/8
- vbroadcasti128 m4, [r0 + x]
- vbroadcasti128 m5, [r0 + 8 + x]
- pshufb m4, m3
- pshufb m7, m5, m3
- pmaddwd m4, m0
- pmaddwd m7, m1
- paddd m4, m7
-
- vbroadcasti128 m6, [r0 + 16 + x]
- pshufb m5, m3
- pshufb m6, m3
- pmaddwd m5, m0
- pmaddwd m6, m1
- paddd m5, m6
-
- phaddd m4, m5
- vpermq m4, m4, q3120
- paddd m4, m2
- vextracti128 xm5,m4, 1
- psrad xm4, INTERP_SHIFT_PS
- psrad xm5, INTERP_SHIFT_PS
- packssdw xm4, xm5
-
- movu [r2 + x], xm4
- %assign x x+16
- %endrep
-
+ PROCESS_IPFILTER_LUMA_PS_8x1_AVX2 24
add r2, r3
add r0, r1
dec r4d
-------------- next part --------------
A non-text attachment was scrubbed...
Name: x265-13.patch
Type: text/x-patch
Size: 3816 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20180706/59d5cb3e/attachment.bin>
More information about the x265-devel
mailing list