[x265] [PATCH] asm: improve ~5% on AVX2 interp_8tap_horiz_ps_4xN

Min Chen chenm003 at 163.com
Fri Mar 13 04:46:52 CET 2015


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1426218406 25200
# Node ID 91da3d8069fdc0d937097ff3d9d6ae91e25b852c
# Parent  1f125d14f656cfd253bd36c29a111764f007a349
asm: improve ~5% on AVX2 interp_8tap_horiz_ps_4xN
---
 source/common/x86/ipfilter8.asm |   49 +++++++++++++++++---------------------
 2 files changed, 38 insertions(+), 27 deletions(-)

diff -r 1f125d14f656 -r 91da3d8069fd source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Thu Mar 12 13:06:38 2015 -0500
+++ b/source/common/x86/ipfilter8.asm	Thu Mar 12 20:46:46 2015 -0700
@@ -1749,10 +1749,10 @@
 ;void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
 ;-----------------------------------------------------------------------------------------------------------------------------
 
-%macro IPFILTER_LUMA_PS_4x_AVX2 2
+%macro IPFILTER_LUMA_PS_4xN_AVX2 1
 INIT_YMM avx2
 %if ARCH_X86_64 == 1
-cglobal interp_8tap_horiz_ps_%1x%2, 6, 11, 6
+cglobal interp_8tap_horiz_ps_4x%1, 6,7,6
     mov                         r5d,               r5m
     mov                         r4d,               r4m
 %ifdef PIC
@@ -1762,7 +1762,6 @@
     vpbroadcastq                m0,                [tab_LumaCoeff + r4 * 8]
 %endif
     mova                        m1,                [tab_Lm]
-    mov                         r9d,               %2                           ;height
     add                         r3d,               r3d
     vbroadcasti128              m2,                [pw_2000]
 
@@ -1771,17 +1770,17 @@
     ; m1 - shuffle order table
     ; m2 - pw_2000
 
-    xor                         r10,               r10                          ; loop count variable
     sub                         r0,                3
     test                        r5d,               r5d
-    jz                          .label
-    lea                         r8,                [r1 * 3]                     ; r8 = (N / 2 - 1) * srcStride
-    sub                         r0,                r8                           ; r0(src)-r8
-    add                         r9,                4                            ; blkheight += N - 1  (7 - 3 = 4 ; since the last three rows not in loop)
-
-.label
-      add                       r10,               4
-
+    mov                         r5d,               %1                           ; loop count variable - height
+    jz                         .preloop
+    lea                         r6,                [r1 * 3]                     ; r6 = (N / 2 - 1) * srcStride
+    sub                         r0,                r6                           ; r0(src) - 3 * srcStride
+    add                         r5d,               7                            ; isRowExt: blkheight += N - 1 (7 extra rows); the count is then no longer a multiple of 4, which acts as the flag that the last three rows are processed after the loop
+
+.preloop:
+    lea                         r6,                [r3 * 3]
+.loop
     ; Row 0-1
     vbroadcasti128              m3,                [r0]                         ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
     pshufb                      m3,                m1                           ; shuffled based on the col order tab_Lm
@@ -1807,18 +1806,17 @@
     psubw                       m3,                m2
 
     vextracti128                xm4,               m3,               1
-    lea                         r7,                [r3 * 3]
     movq                        [r2],              xm3                          ;row 0
     movhps                      [r2 + r3],         xm3                          ;row 1
     movq                        [r2 + r3 * 2],     xm4                          ;row 2
-    movhps                      [r2 + r7],         xm4                          ;row 3
+    movhps                      [r2 + r6],         xm4                          ;row 3
 
     lea                         r0,                [r0 + r1 * 2]                ; first loop src ->5th row(i.e 4)
     lea                         r2,                [r2 + r3 * 4]                ; first loop dst ->5th row(i.e 4)
-    cmp                         r10,               r9
-    jnz                         .label
-    test                        r5d,               r5d
-    jz                          .end             
+    sub                         r5d,               4
+    jz                         .end
+    cmp                         r5d,               4
+    jge                        .loop
 
     ; Row 8-9
     vbroadcasti128              m3,                [r0]                         ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
@@ -1830,15 +1828,13 @@
     phaddw                      m3,                m4                           ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A]
 
     ; Row 10
-    lea                         r0,                [r0 + r1 * 2]
-    vbroadcasti128              m4,                [r0]                         ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+    vbroadcasti128              m4,                [r0 + r1 * 2]                ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
     pshufb                      m4,                m1
     pmaddubsw                   m4,                m0
     phaddw                      m4,                m4                           ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A]
     phaddw                      m3,                m4 
 
-    mova                        m4,                [interp8_hps_shuf]
-    vpermd                      m3,                m4,            m3
+    vpermd                      m3,                m5,            m3            ; m5 is not clobbered by the code above
     psubw                       m3,                m2
 
     vextracti128                xm4,               m3,            1
@@ -1846,14 +1842,13 @@
     movhps                      [r2 + r3],         xm3
     movq                        [r2 + r3 * 2],     xm4
 .end
-RET
-%endif
-%endmacro
-
-
-    IPFILTER_LUMA_PS_4x_AVX2 4 , 4
-    IPFILTER_LUMA_PS_4x_AVX2 4 , 8
-    IPFILTER_LUMA_PS_4x_AVX2 4 , 16
+    RET
+%endif
+%endmacro
+
+    IPFILTER_LUMA_PS_4xN_AVX2 4
+    IPFILTER_LUMA_PS_4xN_AVX2 8
+    IPFILTER_LUMA_PS_4xN_AVX2 16
 
 %macro IPFILTER_LUMA_PS_8xN_AVX2 1
 ; TODO: verify and enable on X86 mode



More information about the x265-devel mailing list