<div dir="ltr">Thanks, queued<br></div><div class="gmail_extra"><br><div class="gmail_quote">On Fri, Mar 13, 2015 at 9:16 AM, Min Chen <span dir="ltr">&lt;<a href="mailto:chenm003@163.com" target="_blank">chenm003@163.com</a>&gt;</span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"># HG changeset patch<br>
# User Min Chen &lt;<a href="mailto:chenm003@163.com">chenm003@163.com</a>&gt;<br>
# Date 1426218406 25200<br>
# Node ID 91da3d8069fdc0d937097ff3d9d6ae91e25b852c<br>
# Parent  1f125d14f656cfd253bd36c29a111764f007a349<br>
asm: improve ~5% on AVX2 interp_8tap_horiz_ps_4xN<br>
---<br>
 source/common/x86/ipfilter8.asm |   49 +++++++++++++++++---------------------<br>
 2 files changed, 38 insertions(+), 27 deletions(-)<br>
<br>
diff -r 1f125d14f656 -r 91da3d8069fd source/common/x86/ipfilter8.asm<br>
--- a/source/common/x86/ipfilter8.asm   Thu Mar 12 13:06:38 2015 -0500<br>
+++ b/source/common/x86/ipfilter8.asm   Thu Mar 12 20:46:46 2015 -0700<br>
@@ -1749,10 +1749,10 @@<br>
 ;void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)<br>
 ;-----------------------------------------------------------------------------------------------------------------------------<br>
<br>
-%macro IPFILTER_LUMA_PS_4x_AVX2 2<br>
+%macro IPFILTER_LUMA_PS_4xN_AVX2 1<br>
 INIT_YMM avx2<br>
 %if ARCH_X86_64 == 1<br>
-cglobal interp_8tap_horiz_ps_%1x%2, 6, 11, 6<br>
+cglobal interp_8tap_horiz_ps_4x%1, 6,7,6<br>
     mov                         r5d,               r5m<br>
     mov                         r4d,               r4m<br>
 %ifdef PIC<br>
@@ -1762,7 +1762,6 @@<br>
     vpbroadcastq                m0,                [tab_LumaCoeff + r4 * 8]<br>
 %endif<br>
     mova                        m1,                [tab_Lm]<br>
-    mov                         r9d,               %2                           ;height<br>
     add                         r3d,               r3d<br>
     vbroadcasti128              m2,                [pw_2000]<br>
<br>
@@ -1771,17 +1770,17 @@<br>
     ; m1 - shuffle order table<br>
     ; m2 - pw_2000<br>
<br>
-    xor                         r10,               r10                          ; loop count variable<br>
     sub                         r0,                3<br>
     test                        r5d,               r5d<br>
-    jz                          .label<br>
-    lea                         r8,                [r1 * 3]                     ; r8 = (N / 2 - 1) * srcStride<br>
-    sub                         r0,                r8                           ; r0(src)-r8<br>
-    add                         r9,                4                            ; blkheight += N - 1  (7 - 3 = 4 ; since the last three rows not in loop)<br>
-<br>
-.label<br>
-      add                       r10,               4<br>
-<br>
+    mov                         r5d,               %1                           ; loop count variable - height<br>
+    jz                         .preloop<br>
+    lea                         r6,                [r1 * 3]                     ; r6 = (N / 2 - 1) * srcStride<br>
+    sub                         r0,                r6                           ; r0(src) - 3 * srcStride<br>
+    add                         r5d,               7                            ; isRowExt: blkheight += N - 1 (7 extra rows); the remainder of 3 left after the 4-row loop doubles as the flag for the 3-row tail below<br>
+<br>
+.preloop:<br>
+    lea                         r6,                [r3 * 3]<br>
+.loop<br>
     ; Row 0-1<br>
     vbroadcasti128              m3,                [r0]                         ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]<br>
     pshufb                      m3,                m1                           ; shuffled based on the col order tab_Lm<br>
@@ -1807,18 +1806,17 @@<br>
     psubw                       m3,                m2<br>
<br>
     vextracti128                xm4,               m3,               1<br>
-    lea                         r7,                [r3 * 3]<br>
     movq                        [r2],              xm3                          ;row 0<br>
     movhps                      [r2 + r3],         xm3                          ;row 1<br>
     movq                        [r2 + r3 * 2],     xm4                          ;row 2<br>
-    movhps                      [r2 + r7],         xm4                          ;row 3<br>
+    movhps                      [r2 + r6],         xm4                          ;row 3<br>
<br>
     lea                         r0,                [r0 + r1 * 2]                ; first loop src ->5th row(i.e 4)<br>
     lea                         r2,                [r2 + r3 * 4]                ; first loop dst ->5th row(i.e 4)<br>
-    cmp                         r10,               r9<br>
-    jnz                         .label<br>
-    test                        r5d,               r5d<br>
-    jz                          .end<br>
+    sub                         r5d,               4<br>
+    jz                         .end<br>
+    cmp                         r5d,               4<br>
+    jge                        .loop<br>
<br>
     ; Row 8-9<br>
     vbroadcasti128              m3,                [r0]                         ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]<br>
@@ -1830,15 +1828,13 @@<br>
     phaddw                      m3,                m4                           ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A]<br>
<br>
     ; Row 10<br>
-    lea                         r0,                [r0 + r1 * 2]<br>
-    vbroadcasti128              m4,                [r0]                         ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]<br>
+    vbroadcasti128              m4,                [r0 + r1 * 2]                ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]<br>
     pshufb                      m4,                m1<br>
     pmaddubsw                   m4,                m0<br>
     phaddw                      m4,                m4                           ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A]<br>
     phaddw                      m3,                m4<br>
<br>
-    mova                        m4,                [interp8_hps_shuf]<br>
-    vpermd                      m3,                m4,            m3<br>
+    vpermd                      m3,                m5,            m3            ; m5 is not clobbered by the code above<br>
     psubw                       m3,                m2<br>
<br>
     vextracti128                xm4,               m3,            1<br>
@@ -1846,14 +1842,13 @@<br>
     movhps                      [r2 + r3],         xm3<br>
     movq                        [r2 + r3 * 2],     xm4<br>
 .end<br>
-RET<br>
-%endif<br>
-%endmacro<br>
-<br>
-<br>
-    IPFILTER_LUMA_PS_4x_AVX2 4 , 4<br>
-    IPFILTER_LUMA_PS_4x_AVX2 4 , 8<br>
-    IPFILTER_LUMA_PS_4x_AVX2 4 , 16<br>
+    RET<br>
+%endif<br>
+%endmacro<br>
+<br>
+    IPFILTER_LUMA_PS_4xN_AVX2 4<br>
+    IPFILTER_LUMA_PS_4xN_AVX2 8<br>
+    IPFILTER_LUMA_PS_4xN_AVX2 16<br>
<br>
 %macro IPFILTER_LUMA_PS_8xN_AVX2 1<br>
 ; TODO: verify and enable on X86 mode<br>
<br>
_______________________________________________<br>
x265-devel mailing list<br>
<a href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a><br>
<a href="https://mailman.videolan.org/listinfo/x265-devel" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>
</blockquote></div><br></div>