[x265] [PATCH] asm: improve ~5% on AVX2 interp_8tap_horiz_ps_4xN
Min Chen
chenm003 at 163.com
Fri Mar 13 04:46:52 CET 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1426218406 25200
# Node ID 91da3d8069fdc0d937097ff3d9d6ae91e25b852c
# Parent 1f125d14f656cfd253bd36c29a111764f007a349
asm: improve ~5% on AVX2 interp_8tap_horiz_ps_4xN
---
source/common/x86/ipfilter8.asm | 49 +++++++++++++++++---------------------
2 files changed, 38 insertions(+), 27 deletions(-)
diff -r 1f125d14f656 -r 91da3d8069fd source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Thu Mar 12 13:06:38 2015 -0500
+++ b/source/common/x86/ipfilter8.asm Thu Mar 12 20:46:46 2015 -0700
@@ -1749,10 +1749,10 @@
;void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
;-----------------------------------------------------------------------------------------------------------------------------
-%macro IPFILTER_LUMA_PS_4x_AVX2 2
+%macro IPFILTER_LUMA_PS_4xN_AVX2 1
INIT_YMM avx2
%if ARCH_X86_64 == 1
-cglobal interp_8tap_horiz_ps_%1x%2, 6, 11, 6
+cglobal interp_8tap_horiz_ps_4x%1, 6,7,6
mov r5d, r5m
mov r4d, r4m
%ifdef PIC
@@ -1762,7 +1762,6 @@
vpbroadcastq m0, [tab_LumaCoeff + r4 * 8]
%endif
mova m1, [tab_Lm]
- mov r9d, %2 ;height
add r3d, r3d
vbroadcasti128 m2, [pw_2000]
@@ -1771,17 +1770,17 @@
; m1 - shuffle order table
; m2 - pw_2000
- xor r10, r10 ; loop count variable
sub r0, 3
test r5d, r5d
- jz .label
- lea r8, [r1 * 3] ; r8 = (N / 2 - 1) * srcStride
- sub r0, r8 ; r0(src)-r8
- add r9, 4 ; blkheight += N - 1 (7 - 3 = 4 ; since the last three rows not in loop)
-
-.label
- add r10, 4
-
+ mov r5d, %1 ; loop count variable - height
+ jz .preloop
+ lea r6, [r1 * 3] ; r6 = (N / 2 - 1) * srcStride = 3 * srcStride
+ sub r0, r6 ; r0(src) - 3 * srcStride
+ add r5d, 7 ; isRowExt: need 7 extra rows, blkheight += N - 1 (7 = 4 + 3); this also acts as a flag: the count is no longer a multiple of 4, so the loop falls through to the code below it, which handles the last three rows
+
+.preloop:
+ lea r6, [r3 * 3]
+.loop
; Row 0-1
vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
pshufb m3, m1 ; shuffled based on the col order tab_Lm
@@ -1807,18 +1806,17 @@
psubw m3, m2
vextracti128 xm4, m3, 1
- lea r7, [r3 * 3]
movq [r2], xm3 ;row 0
movhps [r2 + r3], xm3 ;row 1
movq [r2 + r3 * 2], xm4 ;row 2
- movhps [r2 + r7], xm4 ;row 3
+ movhps [r2 + r6], xm4 ;row 3
lea r0, [r0 + r1 * 2] ; first loop src ->5th row(i.e 4)
lea r2, [r2 + r3 * 4] ; first loop dst ->5th row(i.e 4)
- cmp r10, r9
- jnz .label
- test r5d, r5d
- jz .end
+ sub r5d, 4
+ jz .end
+ cmp r5d, 4
+ jge .loop
; Row 8-9
vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
@@ -1830,15 +1828,13 @@
phaddw m3, m4 ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A]
; Row 10
- lea r0, [r0 + r1 * 2]
- vbroadcasti128 m4, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+ vbroadcasti128 m4, [r0 + r1 * 2] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
pshufb m4, m1
pmaddubsw m4, m0
phaddw m4, m4 ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A]
phaddw m3, m4
- mova m4, [interp8_hps_shuf]
- vpermd m3, m4, m3
+ vpermd m3, m5, m3 ; m5 is not clobbered by the code above
psubw m3, m2
vextracti128 xm4, m3, 1
@@ -1846,14 +1842,13 @@
movhps [r2 + r3], xm3
movq [r2 + r3 * 2], xm4
.end
-RET
-%endif
-%endmacro
-
-
- IPFILTER_LUMA_PS_4x_AVX2 4 , 4
- IPFILTER_LUMA_PS_4x_AVX2 4 , 8
- IPFILTER_LUMA_PS_4x_AVX2 4 , 16
+ RET
+%endif
+%endmacro
+
+ IPFILTER_LUMA_PS_4xN_AVX2 4
+ IPFILTER_LUMA_PS_4xN_AVX2 8
+ IPFILTER_LUMA_PS_4xN_AVX2 16
%macro IPFILTER_LUMA_PS_8xN_AVX2 1
; TODO: verify and enable on X86 mode
More information about the x265-devel
mailing list