<div dir="ltr">Thanks, queued<br></div><div class="gmail_extra"><br><div class="gmail_quote">On Fri, Mar 13, 2015 at 9:16 AM, Min Chen <span dir="ltr">&lt;<a href="mailto:chenm003@163.com" target="_blank">chenm003@163.com</a>&gt;</span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"># HG changeset patch<br>
# User Min Chen &lt;<a href="mailto:chenm003@163.com">chenm003@163.com</a>&gt;<br>
# Date 1426218406 25200<br>
# Node ID 91da3d8069fdc0d937097ff3d9d6ae91e25b852c<br>
# Parent 1f125d14f656cfd253bd36c29a111764f007a349<br>
asm: improve ~5% on AVX2 interp_8tap_horiz_ps_4xN<br>
---<br>
source/common/x86/ipfilter8.asm | 49 +++++++++++++++++---------------------<br>
2 files changed, 38 insertions(+), 27 deletions(-)<br>
<br>
diff -r 1f125d14f656 -r 91da3d8069fd source/common/x86/ipfilter8.asm<br>
--- a/source/common/x86/ipfilter8.asm Thu Mar 12 13:06:38 2015 -0500<br>
+++ b/source/common/x86/ipfilter8.asm Thu Mar 12 20:46:46 2015 -0700<br>
@@ -1749,10 +1749,10 @@<br>
;void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)<br>
;-----------------------------------------------------------------------------------------------------------------------------<br>
<br>
-%macro IPFILTER_LUMA_PS_4x_AVX2 2<br>
+%macro IPFILTER_LUMA_PS_4xN_AVX2 1<br>
INIT_YMM avx2<br>
%if ARCH_X86_64 == 1<br>
-cglobal interp_8tap_horiz_ps_%1x%2, 6, 11, 6<br>
+cglobal interp_8tap_horiz_ps_4x%1, 6,7,6<br>
mov r5d, r5m<br>
mov r4d, r4m<br>
%ifdef PIC<br>
@@ -1762,7 +1762,6 @@<br>
vpbroadcastq m0, [tab_LumaCoeff + r4 * 8]<br>
%endif<br>
mova m1, [tab_Lm]<br>
- mov r9d, %2 ;height<br>
add r3d, r3d<br>
vbroadcasti128 m2, [pw_2000]<br>
<br>
@@ -1771,17 +1770,17 @@<br>
; m1 - shuffle order table<br>
; m2 - pw_2000<br>
<br>
- xor r10, r10 ; loop count variable<br>
sub r0, 3<br>
test r5d, r5d<br>
- jz .label<br>
- lea r8, [r1 * 3] ; r8 = (N / 2 - 1) * srcStride<br>
- sub r0, r8 ; r0(src)-r8<br>
- add r9, 4 ; blkheight += N - 1 (7 - 3 = 4 ; since the last three rows not in loop)<br>
-<br>
-.label<br>
- add r10, 4<br>
-<br>
+ mov r5d, %1 ; loop count variable - height<br>
+ jz .preloop<br>
+ lea r6, [r1 * 3] ; r6 = (N / 2 - 1) * srcStride<br>
+ sub r0, r6 ; r0(src) - 3 * srcStride<br>
+ add r5d, 7 ; isRowExt: blkheight += N - 1 (7 extra rows); the last 3 rows are handled after the loop<br>
+<br>
+.preloop:<br>
+ lea r6, [r3 * 3]<br>
+.loop<br>
; Row 0-1<br>
vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]<br>
pshufb m3, m1 ; shuffled based on the col order tab_Lm<br>
@@ -1807,18 +1806,17 @@<br>
psubw m3, m2<br>
<br>
vextracti128 xm4, m3, 1<br>
- lea r7, [r3 * 3]<br>
movq [r2], xm3 ;row 0<br>
movhps [r2 + r3], xm3 ;row 1<br>
movq [r2 + r3 * 2], xm4 ;row 2<br>
- movhps [r2 + r7], xm4 ;row 3<br>
+ movhps [r2 + r6], xm4 ;row 3<br>
<br>
lea r0, [r0 + r1 * 2] ; first loop src ->5th row(i.e 4)<br>
lea r2, [r2 + r3 * 4] ; first loop dst ->5th row(i.e 4)<br>
- cmp r10, r9<br>
- jnz .label<br>
- test r5d, r5d<br>
- jz .end<br>
+ sub r5d, 4<br>
+ jz .end<br>
+ cmp r5d, 4<br>
+ jge .loop<br>
<br>
; Row 8-9<br>
vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]<br>
@@ -1830,15 +1828,13 @@<br>
phaddw m3, m4 ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A]<br>
<br>
; Row 10<br>
- lea r0, [r0 + r1 * 2]<br>
- vbroadcasti128 m4, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]<br>
+ vbroadcasti128 m4, [r0 + r1 * 2] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]<br>
pshufb m4, m1<br>
pmaddubsw m4, m0<br>
phaddw m4, m4 ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A]<br>
phaddw m3, m4<br>
<br>
- mova m4, [interp8_hps_shuf]<br>
- vpermd m3, m4, m3<br>
+ vpermd m3, m5, m3 ; m5 is not clobbered by the code above<br>
psubw m3, m2<br>
<br>
vextracti128 xm4, m3, 1<br>
@@ -1846,14 +1842,13 @@<br>
movhps [r2 + r3], xm3<br>
movq [r2 + r3 * 2], xm4<br>
.end<br>
-RET<br>
-%endif<br>
-%endmacro<br>
-<br>
-<br>
- IPFILTER_LUMA_PS_4x_AVX2 4 , 4<br>
- IPFILTER_LUMA_PS_4x_AVX2 4 , 8<br>
- IPFILTER_LUMA_PS_4x_AVX2 4 , 16<br>
+ RET<br>
+%endif<br>
+%endmacro<br>
+<br>
+ IPFILTER_LUMA_PS_4xN_AVX2 4<br>
+ IPFILTER_LUMA_PS_4xN_AVX2 8<br>
+ IPFILTER_LUMA_PS_4xN_AVX2 16<br>
<br>
%macro IPFILTER_LUMA_PS_8xN_AVX2 1<br>
; TODO: verify and enable on X86 mode<br>
<br>
_______________________________________________<br>
x265-devel mailing list<br>
<a href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a><br>
<a href="https://mailman.videolan.org/listinfo/x265-devel" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>
</blockquote></div><br></div>