[x265] [PATCH] asm: Optimizations and cleanups on ipfilter functions

murugan at multicorewareinc.com murugan at multicorewareinc.com
Tue Feb 11 14:51:59 CET 2014


# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1392125003 -19800
#      Tue Feb 11 18:53:23 2014 +0530
# Node ID 7eccc042e269ead4ff5d32f4d853287e30c59044
# Parent  07b5d6b82f5fbcb78ecab12cb8abcf13c78fe552
asm: Optimizations and cleanups on ipfilter functions

diff -r 07b5d6b82f5f -r 7eccc042e269 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Mon Feb 10 15:05:04 2014 -0600
+++ b/source/common/x86/ipfilter8.asm	Tue Feb 11 18:53:23 2014 +0530
@@ -814,7 +814,7 @@
     pmaddwd     %8, [r5 + %10 * 16]
     paddd       %4, %7              ; R4 = L[1+2+3+4] -- Row 1
     paddd       %6, %8              ; R1 = H[1+2+3+4]
-%endmacro ; FILTER_HV8_START
+%endmacro ; FILTER_HV8_MID
 
 ; Round and Saturate
 %macro FILTER_HV8_END 4 ; output in [1, 3]
@@ -830,8 +830,7 @@
     packssdw    %3, %4
 
     ; TODO: is merge better? I think this way is short dependency link
-    packuswb    %1, %1
-    packuswb    %3, %3
+    packuswb    %1, %3
 %endmacro ; FILTER_HV8_END
 
 ;-----------------------------------------------------------------------------
@@ -899,8 +898,8 @@
     FILTER_HV8_MID      m6, m5, m3, m4, m0, m1, m7, m2, 7, 3
     FILTER_HV8_END      m3, m0, m4, m1
 
-    movq        [r2],       m3
-    movq        [r2 + r3],  m4
+    movh        [r2],       m3
+    movhps      [r2 + r3],  m3
 
     lea         r0,         [r0 + 16 * 2]
     lea         r2,         [r2 + r3 * 2]
@@ -915,7 +914,7 @@
 ;void interp_4tap_vert_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal interp_4tap_vert_pp_2x4, 4, 7, 8
+cglobal interp_4tap_vert_pp_2x4, 4, 6, 8
 
 mov         r4d,       r4m
 sub         r0,        r1
@@ -926,16 +925,15 @@
 %else
 movd        m0,        [tab_ChromaCoeff + r4 * 4]
 %endif
-
+lea         r4,        [r1 * 3]
+lea         r5,        [r0 + 4 * r1]
 pshufb      m0,        [tab_Cm]
-
 mova        m1,        [tab_c_512]
 
 movd        m2,        [r0]
 movd        m3,        [r0 + r1]
 movd        m4,        [r0 + 2 * r1]
-lea         r5,        [r0 + 2 * r1]
-movd        m5,        [r5 + r1]
+movd        m5,        [r0 + r4]
 
 punpcklbw   m2,        m3
 punpcklbw   m6,        m4,        m5
@@ -943,7 +941,7 @@
 
 pmaddubsw   m2,        m0
 
-movd        m6,        [r0 + 4 * r1]
+movd        m6,        [r5]
 
 punpcklbw   m3,        m4
 punpcklbw   m7,        m5,        m6
@@ -954,16 +952,11 @@
 phaddw      m2,        m3
 
 pmulhrsw    m2,        m1
-packuswb    m2,        m2
-
-pextrw      [r2],      m2,  0
-pextrw      [r2 + r3], m2,  2
-
-lea         r5,        [r0 + 4 * r1]
-movd        m2,        [r5 + r1]
+
+movd        m7,        [r5 + r1]
 
 punpcklbw   m4,        m5
-punpcklbw   m3,        m6,        m2
+punpcklbw   m3,        m6,        m7
 punpcklbw   m4,        m3
 
 pmaddubsw   m4,        m0
@@ -971,19 +964,21 @@
 movd        m3,        [r5 + 2 * r1]
 
 punpcklbw   m5,        m6
-punpcklbw   m2,        m3
-punpcklbw   m5,        m2
+punpcklbw   m7,        m3
+punpcklbw   m5,        m7
 
 pmaddubsw   m5,        m0
 
 phaddw      m4,        m5
 
 pmulhrsw    m4,        m1
-packuswb    m4,        m4
-
-pextrw      [r2 + 2 * r3],    m4,    0
-lea         r6,               [r2 + 2 * r3]
-pextrw      [r6 + r3],        m4,    2
+packuswb    m2,        m4
+
+pextrw      [r2],      m2, 0
+pextrw      [r2 + r3], m2, 2
+lea         r2,        [r2 + 2 * r3]
+pextrw      [r2],      m2, 4
+pextrw      [r2 + r3], m2, 6
 
 RET
 
@@ -992,7 +987,7 @@
 ;-----------------------------------------------------------------------------
 %macro FILTER_V4_W2_H4 2
 INIT_XMM sse4
-cglobal interp_4tap_vert_pp_2x8, 4, 7, 8
+cglobal interp_4tap_vert_pp_2x8, 4, 6, 8
 
 mov         r4d,       r4m
 sub         r0,        r1
@@ -1009,13 +1004,13 @@
 mova        m1,        [tab_c_512]
 
 mov         r4d,       %2
+lea         r5,        [3 * r1]
 
 .loop
 movd        m2,        [r0]
 movd        m3,        [r0 + r1]
 movd        m4,        [r0 + 2 * r1]
-lea         r5,        [r0 + 2 * r1]
-movd        m5,        [r5 + r1]
+movd        m5,        [r0 + r5]
 
 punpcklbw   m2,        m3
 punpcklbw   m6,        m4,        m5
@@ -1023,7 +1018,8 @@
 
 pmaddubsw   m2,        m0
 
-movd        m6,        [r0 + 4 * r1]
+lea         r0,        [r0 + 4 * r1]
+movd        m6,        [r0]
 
 punpcklbw   m3,        m4
 punpcklbw   m7,        m5,        m6
@@ -1034,39 +1030,35 @@
 phaddw      m2,        m3
 
 pmulhrsw    m2,        m1
-packuswb    m2,        m2
-
-pextrw      [r2],      m2,  0
-pextrw      [r2 + r3], m2,  2
-
-lea         r5,        [r0 + 4 * r1]
-movd        m2,        [r5 + r1]
+
+movd        m7,        [r0 + r1]
 
 punpcklbw   m4,        m5
-punpcklbw   m3,        m6,        m2
+punpcklbw   m3,        m6,        m7
 punpcklbw   m4,        m3
 
 pmaddubsw   m4,        m0
 
-movd        m3,        [r5 + 2 * r1]
+movd        m3,        [r0 + 2 * r1]
 
 punpcklbw   m5,        m6
-punpcklbw   m2,        m3
-punpcklbw   m5,        m2
+punpcklbw   m7,        m3
+punpcklbw   m5,        m7
 
 pmaddubsw   m5,        m0
 
 phaddw      m4,        m5
 
 pmulhrsw    m4,        m1
-packuswb    m4,        m4
-
-pextrw      [r2 + 2 * r3],    m4,    0
-lea         r6,               [r2 + 2 * r3]
-pextrw      [r6 + r3],        m4,    2
-
-lea         r0,        [r0 + 4 * r1]
-lea         r2,        [r2 + 4 * r3]
+packuswb    m2,        m4
+
+pextrw      [r2],      m2, 0
+pextrw      [r2 + r3], m2, 2
+lea         r2,        [r2 + 2 * r3]
+pextrw      [r2],      m2, 4
+pextrw      [r2 + r3], m2, 6
+
+lea         r2,        [r2 + 2 * r3]
 
 sub         r4,        4
 jnz        .loop
@@ -1079,7 +1071,7 @@
 ; void interp_4tap_vert_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal interp_4tap_vert_pp_4x2, 4, 6, 8
+cglobal interp_4tap_vert_pp_4x2, 4, 6, 6
 
 mov         r4d,       r4m
 sub         r0,        r1
@@ -1092,32 +1084,30 @@
 %endif
 
 pshufb      m0,        [tab_Cm]
-
-mova        m1,        [tab_c_512]
+lea         r5,        [r0 + 2 * r1]
 
 movd        m2,        [r0]
 movd        m3,        [r0 + r1]
-movd        m4,        [r0 + 2 * r1]
-lea         r5,        [r0 + 2 * r1]
+movd        m4,        [r5]
 movd        m5,        [r5 + r1]
 
 punpcklbw   m2,        m3
-punpcklbw   m6,        m4,        m5
-punpcklbw   m2,        m6
+punpcklbw   m1,        m4,        m5
+punpcklbw   m2,        m1
 
 pmaddubsw   m2,        m0
 
-movd        m6,        [r0 + 4 * r1]
+movd        m1,        [r0 + 4 * r1]
 
 punpcklbw   m3,        m4
-punpcklbw   m5,        m6
+punpcklbw   m5,        m1
 punpcklbw   m3,        m5
 
 pmaddubsw   m3,        m0
 
 phaddw      m2,        m3
 
-pmulhrsw    m2,        m1
+pmulhrsw    m2,        [tab_c_512]
 packuswb    m2,        m2
 movd        [r2],      m2
 pextrd      [r2 + r3], m2,  1
@@ -1128,7 +1118,7 @@
 ; void interp_4tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal interp_4tap_vert_pp_4x4, 4, 7, 8
+cglobal interp_4tap_vert_pp_4x4, 4, 6, 8
 
 mov         r4d,       r4m
 sub         r0,        r1
@@ -1141,14 +1131,14 @@
 %endif
 
 pshufb      m0,        [tab_Cm]
-
 mova        m1,        [tab_c_512]
+lea         r5,        [r0 + 4 * r1]
+lea         r4,        [r1 * 3]
 
 movd        m2,        [r0]
 movd        m3,        [r0 + r1]
 movd        m4,        [r0 + 2 * r1]
-lea         r5,        [r0 + 2 * r1]
-movd        m5,        [r5 + r1]
+movd        m5,        [r0 + r4]
 
 punpcklbw   m2,        m3
 punpcklbw   m6,        m4,        m5
@@ -1156,7 +1146,7 @@
 
 pmaddubsw   m2,        m0
 
-movd        m6,        [r0 + 4 * r1]
+movd        m6,        [r5]
 
 punpcklbw   m3,        m4
 punpcklbw   m7,        m5,        m6
@@ -1167,34 +1157,33 @@
 phaddw      m2,        m3
 
 pmulhrsw    m2,        m1
-packuswb    m2,        m2
+
+movd        m7,        [r5 + r1]
+
+punpcklbw   m4,        m5
+punpcklbw   m3,        m6,        m7
+punpcklbw   m4,        m3
+
+pmaddubsw   m4,        m0
+
+movd        m3,        [r5 + 2 * r1]
+
+punpcklbw   m5,        m6
+punpcklbw   m7,        m3
+punpcklbw   m5,        m7
+
+pmaddubsw   m5,        m0
+
+phaddw      m4,        m5
+
+pmulhrsw    m4,        m1
+
+packuswb    m2,        m4
 movd        [r2],      m2
-pextrd      [r2 + r3], m2,  1
-
-lea         r5,        [r0 + 4 * r1]
-movd        m2,        [r5 + r1]
-
-punpcklbw   m4,        m5
-punpcklbw   m3,        m6,        m2
-punpcklbw   m4,        m3
-
-pmaddubsw   m4,        m0
-
-movd        m3,        [r5 + 2 * r1]
-
-punpcklbw   m5,        m6
-punpcklbw   m2,        m3
-punpcklbw   m5,        m2
-
-pmaddubsw   m5,        m0
-
-phaddw      m4,        m5
-
-pmulhrsw    m4,        m1
-packuswb    m4,        m4
-movd        [r2 + 2 * r3],      m4
-lea         r6,        [r2 + 2 * r3]
-pextrd      [r6 + r3], m4,  1
+pextrd      [r2 + r3], m2, 1
+lea         r2,        [r2 + 2 * r3]
+pextrd      [r2],      m2, 2
+pextrd      [r2 + r3], m2, 3
 
 RET
 
@@ -1203,7 +1192,7 @@
 ;-----------------------------------------------------------------------------
 %macro FILTER_V4_W4_H4 2
 INIT_XMM sse4
-cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8
+cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8
 
 mov         r4d,       r4m
 sub         r0,        r1
@@ -1221,12 +1210,13 @@
 
 mov         r4d,       %2
 
+lea         r5,        [3 * r1]
+
 .loop
 movd        m2,        [r0]
 movd        m3,        [r0 + r1]
 movd        m4,        [r0 + 2 * r1]
-lea         r5,        [r0 + 2 * r1]
-movd        m5,        [r5 + r1]
+movd        m5,        [r0 + r5]
 
 punpcklbw   m2,        m3
 punpcklbw   m6,        m4,        m5
@@ -1234,7 +1224,8 @@
 
 pmaddubsw   m2,        m0
 
-movd        m6,        [r0 + 4 * r1]
+lea         r0,        [r0 + 4 * r1]
+movd        m6,        [r0]
 
 punpcklbw   m3,        m4
 punpcklbw   m7,        m5,        m6
@@ -1245,37 +1236,34 @@
 phaddw      m2,        m3
 
 pmulhrsw    m2,        m1
-packuswb    m2,        m2
+
+movd        m7,        [r0 + r1]
+
+punpcklbw   m4,        m5
+punpcklbw   m3,        m6,        m7
+punpcklbw   m4,        m3
+
+pmaddubsw   m4,        m0
+
+movd        m3,        [r0 + 2 * r1]
+
+punpcklbw   m5,        m6
+punpcklbw   m7,        m3
+punpcklbw   m5,        m7
+
+pmaddubsw   m5,        m0
+
+phaddw      m4,        m5
+
+pmulhrsw    m4,        m1
+packuswb    m2,        m4
 movd        [r2],      m2
 pextrd      [r2 + r3], m2,  1
-
-lea         r5,        [r0 + 4 * r1]
-movd        m2,        [r5 + r1]
-
-punpcklbw   m4,        m5
-punpcklbw   m3,        m6,        m2
-punpcklbw   m4,        m3
-
-pmaddubsw   m4,        m0
-
-movd        m3,        [r5 + 2 * r1]
-
-punpcklbw   m5,        m6
-punpcklbw   m2,        m3
-punpcklbw   m5,        m2
-
-pmaddubsw   m5,        m0
-
-phaddw      m4,        m5
-
-pmulhrsw    m4,        m1
-packuswb    m4,        m4
-movd        [r2 + 2 * r3],      m4
-lea         r6,        [r2 + 2 * r3]
-pextrd      [r6 + r3], m4,  1
-
-lea         r0,        [r0 + 4 * r1]
-lea         r2,        [r2 + 4 * r3]
+lea         r2,        [r2 + 2 * r3]
+pextrd      [r2],      m2, 2
+pextrd      [r2 + r3], m2, 3
+
+lea         r2,        [r2 + 2 * r3]
 
 sub         r4,        4
 jnz        .loop
@@ -1450,7 +1438,7 @@
 ; void interp_4tap_vert_ps_4x2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
 ;-------------------------------------------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal interp_4tap_vert_ps_4x2, 4, 6, 8
+cglobal interp_4tap_vert_ps_4x2, 4, 6, 6
 
 mov         r4d, r4m
 sub         r0, r1
@@ -1465,32 +1453,30 @@
 
 pshufb      m0, [tab_Cm]
 
-mova        m1, [pw_2000]
-
 movd        m2, [r0]
 movd        m3, [r0 + r1]
-movd        m4, [r0 + 2 * r1]
 lea         r5, [r0 + 2 * r1]
+movd        m4, [r5]
 movd        m5, [r5 + r1]
 
 punpcklbw   m2, m3
-punpcklbw   m6, m4, m5
-punpcklbw   m2, m6
+punpcklbw   m1, m4, m5
+punpcklbw   m2, m1
 
 pmaddubsw   m2, m0
 
-movd        m6, [r0 + 4 * r1]
+movd        m1, [r0 + 4 * r1]
 
 punpcklbw   m3, m4
-punpcklbw   m5, m6
+punpcklbw   m5, m1
 punpcklbw   m3, m5
 
 pmaddubsw   m3, m0
 
 phaddw      m2, m3
 
-psubw       m2, m1
-movlps      [r2], m2
+psubw       m2, [pw_2000]
+movh        [r2], m2
 movhps      [r2 + r3], m2
 
 RET
@@ -1499,7 +1485,7 @@
 ; void interp_4tap_vert_ps_4x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
 ;-------------------------------------------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal interp_4tap_vert_ps_4x4, 4, 7, 8
+cglobal interp_4tap_vert_ps_4x4, 4, 6, 7
 
     mov        r4d, r4m
     sub        r0, r1
@@ -1514,13 +1500,13 @@
 
     pshufb     m0, [tab_Cm]
 
-    mova       m1, [pw_2000]
+    lea        r4, [r1 * 3]
+    lea        r5, [r0 + 4 * r1]
 
     movd       m2, [r0]
     movd       m3, [r0 + r1]
     movd       m4, [r0 + 2 * r1]
-    lea        r5, [r0 + 2 * r1]
-    movd       m5, [r5 + r1]
+    movd       m5, [r0 + r4]
 
     punpcklbw  m2, m3
     punpcklbw  m6, m4, m5
@@ -1528,21 +1514,22 @@
 
     pmaddubsw  m2, m0
 
-    movd       m6, [r0 + 4 * r1]
+    movd       m6, [r5]
 
     punpcklbw  m3, m4
-    punpcklbw  m7, m5, m6
-    punpcklbw  m3, m7
+    punpcklbw  m1, m5, m6
+    punpcklbw  m3, m1
 
     pmaddubsw  m3, m0
 
     phaddw     m2, m3
 
+    mova       m1, [pw_2000]
+
     psubw      m2, m1
-    movlps     [r2], m2
+    movh       [r2], m2
     movhps     [r2 + r3], m2
 
-    lea        r5, [r0 + 4 * r1]
     movd       m2, [r5 + r1]
 
     punpcklbw  m4, m5
@@ -1562,9 +1549,9 @@
     phaddw     m4, m5
 
     psubw      m4, m1
-    movlps     [r2 + 2 * r3], m4
-    lea        r6, [r2 + 2 * r3]
-    movhps     [r6 + r3], m4
+    lea        r2, [r2 + 2 * r3]
+    movh       [r2], m4
+    movhps     [r2 + r3], m4
 
     RET
 
@@ -1573,7 +1560,7 @@
 ;---------------------------------------------------------------------------------------------------------------
 %macro FILTER_V_PS_W4_H4 2
 INIT_XMM sse4
-cglobal interp_4tap_vert_ps_%1x%2, 4, 7, 8
+cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8
 
     mov        r4d, r4m
     sub        r0, r1
@@ -1591,13 +1578,13 @@
     mova       m1, [pw_2000]
 
     mov        r4d, %2/4
+    lea        r5, [3 * r1]
 
 .loop
     movd       m2, [r0]
     movd       m3, [r0 + r1]
     movd       m4, [r0 + 2 * r1]
-    lea        r5, [r0 + 2 * r1]
-    movd       m5, [r5 + r1]
+    movd       m5, [r0 + r5]
 
     punpcklbw  m2, m3
     punpcklbw  m6, m4, m5
@@ -1605,7 +1592,8 @@
 
     pmaddubsw  m2, m0
 
-    movd       m6, [r0 + 4 * r1]
+    lea        r0, [r0 + 4 * r1]
+    movd       m6, [r0]
 
     punpcklbw  m3, m4
     punpcklbw  m7, m5, m6
@@ -1616,11 +1604,10 @@
     phaddw     m2, m3
 
     psubw      m2, m1
-    movlps     [r2], m2
+    movh       [r2], m2
     movhps     [r2 + r3], m2
 
-    lea        r5, [r0 + 4 * r1]
-    movd       m2, [r5 + r1]
+    movd       m2, [r0 + r1]
 
     punpcklbw  m4, m5
     punpcklbw  m3, m6, m2
@@ -1628,7 +1615,7 @@
 
     pmaddubsw  m4, m0
 
-    movd       m3, [r5 + 2 * r1]
+    movd       m3, [r0 + 2 * r1]
 
     punpcklbw  m5, m6
     punpcklbw  m2, m3
@@ -1639,12 +1626,11 @@
     phaddw     m4, m5
 
     psubw      m4, m1
-    movlps     [r2 + 2 * r3], m4
-    lea        r6, [r2 + 2 * r3]
-    movhps     [r6 + r3], m4
-
-    lea        r0, [r0 + 4 * r1]
-    lea        r2, [r2 + 4 * r3]
+    lea        r2, [r2 + 2 * r3]
+    movh       [r2], m4
+    movhps     [r2 + r3], m4
+
+    lea        r2, [r2 + 2 * r3]
 
     dec        r4d
     jnz        .loop
@@ -1659,15 +1645,15 @@
 ;--------------------------------------------------------------------------------------------------------------
 %macro FILTER_V_PS_W8_H8_H16_H2 2
 INIT_XMM sse4
-cglobal interp_4tap_vert_ps_%1x%2, 4, 7, 8
+cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 7
 
     mov        r4d, r4m
     sub        r0, r1
     add        r3d, r3d
 
 %ifdef PIC
-    lea        r6, [tab_ChromaCoeff]
-    movd       m5, [r6 + r4 * 4]
+    lea        r5, [tab_ChromaCoeff]
+    movd       m5, [r5 + r4 * 4]
 %else
     movd       m5, [tab_ChromaCoeff + r4 * 4]
 %endif
@@ -1677,34 +1663,34 @@
     mova       m4, [pw_2000]
 
     mov        r4d, %2/2
+    lea        r5, [3 * r1]
 
 .loopH
     movq       m0, [r0]
     movq       m1, [r0 + r1]
     movq       m2, [r0 + 2 * r1]
-    lea        r5, [r0 + 2 * r1]
-    movq       m3, [r5 + r1]
+    movq       m3, [r0 + r5]
 
     punpcklbw  m0, m1
-    punpcklbw  m7, m2, m3
+    punpcklbw  m1, m2
+    punpcklbw  m2, m3
 
     pmaddubsw  m0, m6
-    pmaddubsw  m7, m5
-
-    paddw      m0, m7
+    pmaddubsw  m2, m5
+
+    paddw      m0, m2
 
     psubw      m0, m4
     movu       [r2], m0
 
     movq       m0, [r0 + 4 * r1]
 
-    punpcklbw  m1, m2
-    punpcklbw  m7, m3, m0
+    punpcklbw  m3, m0
 
     pmaddubsw  m1, m6
-    pmaddubsw  m7, m5
-
-    paddw      m1, m7
+    pmaddubsw  m3, m5
+
+    paddw      m1, m3
     psubw      m1, m4
 
     movu       [r2 + r3], m1
@@ -1727,15 +1713,15 @@
 ;--------------------------------------------------------------------------------------------------------------
 %macro FILTER_V_PS_W8_H8_H16_H32 2
 INIT_XMM sse4
-cglobal interp_4tap_vert_ps_%1x%2, 4, 7, 8
+cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8
 
     mov        r4d, r4m
     sub        r0, r1
     add        r3d, r3d
 
 %ifdef PIC
-    lea        r6, [tab_ChromaCoeff]
-    movd       m5, [r6 + r4 * 4]
+    lea        r5, [tab_ChromaCoeff]
+    movd       m5, [r5 + r4 * 4]
 %else
     movd       m5, [tab_ChromaCoeff + r4 * 4]
 %endif
@@ -1745,55 +1731,54 @@
     mova       m4, [pw_2000]
 
     mov        r4d, %2/4
+    lea        r5, [3 * r1]
 
 .loop
     movq       m0, [r0]
     movq       m1, [r0 + r1]
     movq       m2, [r0 + 2 * r1]
-    lea        r5, [r0 + 2 * r1]
-    movq       m3, [r5 + r1]
+    movq       m3, [r0 + r5]
 
     punpcklbw  m0, m1
-    punpcklbw  m7, m2, m3
+    punpcklbw  m1, m2
+    punpcklbw  m2, m3
 
     pmaddubsw  m0, m6
-    pmaddubsw  m7, m5
+    pmaddubsw  m7, m2, m5
 
     paddw      m0, m7
 
     psubw       m0, m4
     movu       [r2], m0
 
-    movq       m0, [r0 + 4 * r1]
-
-    punpcklbw  m1, m2
-    punpcklbw  m7, m3, m0
+    lea        r0, [r0 + 4 * r1]
+    movq       m0, [r0]
+
+    punpcklbw  m3, m0
 
     pmaddubsw  m1, m6
-    pmaddubsw  m7, m5
+    pmaddubsw  m7, m3, m5
 
     paddw      m1, m7
 
     psubw      m1, m4
     movu       [r2 + r3], m1
 
-    lea        r6, [r0 + 4 * r1]
-    movq       m1, [r6 + r1]
-
-    punpcklbw  m2, m3
-    punpcklbw  m7, m0, m1
+    movq       m1, [r0 + r1]
+
+    punpcklbw  m0, m1
 
     pmaddubsw  m2, m6
-    pmaddubsw  m7, m5
-
-    paddw      m2, m7
+    pmaddubsw  m0, m5
+
+    paddw      m2, m0
 
     psubw      m2, m4
-    movu       [r2 + 2 * r3], m2
-
-    movq       m2, [r6 + 2 * r1]
-
-    punpcklbw  m3, m0
+    lea        r2, [r2 + 2 * r3]
+    movu       [r2], m2
+
+    movq       m2, [r0 + 2 * r1]
+
     punpcklbw  m1, m2
 
     pmaddubsw  m3, m6
@@ -1802,11 +1787,9 @@
     paddw      m3, m1
     psubw      m3, m4
 
-    lea        r5, [r2 + 2 * r3]
-    movu       [r5 + r3], m3
-
-    lea        r0, [r0 + 4 * r1]
-    lea        r2, [r2 + 4 * r3]
+    movu       [r2 + r3], m3
+
+    lea        r2, [r2 + 2 * r3]
 
     dec        r4d
     jnz        .loop
@@ -1821,15 +1804,15 @@
 ;void interp_4tap_vert_ps_6x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
 ;------------------------------------------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal interp_4tap_vert_ps_6x8, 4, 7, 8
+cglobal interp_4tap_vert_ps_6x8, 4, 6, 8
 
     mov        r4d, r4m
     sub        r0, r1
     add        r3d, r3d
 
 %ifdef PIC
-    lea        r6, [tab_ChromaCoeff]
-    movd       m5, [r6 + r4 * 4]
+    lea        r5, [tab_ChromaCoeff]
+    movd       m5, [r5 + r4 * 4]
 %else
     movd       m5, [tab_ChromaCoeff + r4 * 4]
 %endif
@@ -1837,36 +1820,35 @@
     pshufb     m6, m5, [tab_Vm]
     pshufb     m5, [tab_Vm + 16]
     mova       m4, [pw_2000]
-
+    lea        r5, [3 * r1]
     mov        r4d, 2
 
 .loop
     movq       m0, [r0]
     movq       m1, [r0 + r1]
     movq       m2, [r0 + 2 * r1]
-    lea        r5, [r0 + 2 * r1]
-    movq       m3, [r5 + r1]
+    movq       m3, [r0 + r5]
 
     punpcklbw  m0, m1
-    punpcklbw  m7, m2, m3
+    punpcklbw  m1, m2
+    punpcklbw  m2, m3
 
     pmaddubsw  m0, m6
-    pmaddubsw  m7, m5
+    pmaddubsw  m7, m2, m5
 
     paddw      m0, m7
-
     psubw      m0, m4
+
     movh       [r2], m0
     pshufd     m0, m0, 2
     movd       [r2 + 8], m0
 
-    movq       m0, [r0 + 4 * r1]
-
-    punpcklbw  m1, m2
-    punpcklbw  m7, m3, m0
+    lea        r0, [r0 + 4 * r1]
+    movq       m0, [r0]
+    punpcklbw  m3, m0
 
     pmaddubsw  m1, m6
-    pmaddubsw  m7, m5
+    pmaddubsw  m7, m3, m5
 
     paddw      m1, m7
     psubw      m1, m4
@@ -1875,25 +1857,21 @@
     pshufd     m1, m1, 2
     movd       [r2 + r3 + 8], m1
 
-    lea        r6, [r0 + 4 * r1]
-    movq       m1, [r6 + r1]
-
-    punpcklbw  m2, m3
-    punpcklbw  m7, m0, m1
+    movq       m1, [r0 + r1]
+    punpcklbw  m0, m1
 
     pmaddubsw  m2, m6
-    pmaddubsw  m7, m5
-
-    paddw      m2, m7
+    pmaddubsw  m0, m5
+
+    paddw      m2, m0
     psubw      m2, m4
 
-    movh       [r2 + 2 * r3], m2
+    lea        r2,[r2 + 2 * r3]
+    movh       [r2], m2
     pshufd     m2, m2, 2
-    movd       [r2 + 2 * r3 + 8], m2
-
-    movq       m2,[r6 + 2 * r1]
-
-    punpcklbw  m3, m0
+    movd       [r2 + 8], m2
+
+    movq       m2,[r0 + 2 * r1]
     punpcklbw  m1, m2
 
     pmaddubsw  m3, m6
@@ -1902,13 +1880,11 @@
     paddw      m3, m1
     psubw      m3, m4
 
-    lea        r5,[r2 + 2 * r3]
-    movh       [r5 + r3], m3
+    movh       [r2 + r3], m3
     pshufd     m3, m3, 2
-    movd       [r5 + r3 + 8], m3
-
-    lea        r0, [r0 + 4 * r1]
-    lea        r2, [r2 + 4 * r3]
+    movd       [r2 + r3 + 8], m3
+
+    lea        r2, [r2 + 2 * r3]
 
     dec        r4d
     jnz        .loop
@@ -1934,68 +1910,61 @@
     pshufb     m1, m0, [tab_Vm]
     pshufb     m0, [tab_Vm + 16]
 
-    mova       m7, [pw_2000]
-
     mov        r4d, 16/2
 
 .loop
     movu       m2, [r0]
     movu       m3, [r0 + r1]
 
-    punpcklbw  m4, m2, m3,
-    punpckhbw  m2, m3,
+    punpcklbw  m4, m2, m3
+    punpckhbw  m2, m3
 
     pmaddubsw  m4, m1
     pmaddubsw  m2, m1
 
-    movu       m5, [r0 + 2 * r1]
-    lea        r5, [r0 + 2 * r1]
-    movu       m3, [r5 + r1]
-
-    punpcklbw  m6, m5, m3,
-    punpckhbw  m5, m3,
-
+    lea        r0, [r0 + 2 * r1]
+    movu       m5, [r0]
+    movu       m7, [r0 + r1]
+
+    punpcklbw  m6, m5, m7
     pmaddubsw  m6, m0
-    pmaddubsw  m5, m0
-
     paddw      m4, m6
-    paddw      m2, m5
-
-    psubw      m4, m7
-    psubw      m2, m7
+
+    punpckhbw  m6, m5, m7
+    pmaddubsw  m6, m0
+    paddw      m2, m6
+
+    mova       m6, [pw_2000]
+
+    psubw      m4, m6
+    psubw      m2, m6
 
     movu       [r2], m4
     movh       [r2 + 16], m2
 
-    movu       m2, [r0 + r1]
-    movu       m3, [r0 + 2 * r1]
-
-    punpcklbw  m4, m2, m3,
-    punpckhbw  m2, m3,
+    punpcklbw  m4, m3, m5
+    punpckhbw  m3, m5
 
     pmaddubsw  m4, m1
-    pmaddubsw  m2, m1
-
-    lea        r5, [r0 + 2 * r1]
-    movu       m5, [r5 + r1]
-    movu       m3, [r5 + 2 * r1]
-
-    punpcklbw  m6, m5, m3,
-    punpckhbw  m5, m3,
-
-    pmaddubsw  m6, m0
+    pmaddubsw  m3, m1
+
+    movu       m2, [r0 + 2 * r1]
+
+    punpcklbw  m5, m7, m2
+    punpckhbw  m7, m2
+
     pmaddubsw  m5, m0
-
-    paddw      m4, m6
-    paddw      m2, m5
-
-    psubw      m4, m7
-    psubw      m2, m7
+    pmaddubsw  m7, m0
+
+    paddw      m4, m5
+    paddw      m3, m7
+
+    psubw      m4, m6
+    psubw      m3, m6
 
     movu       [r2 + r3], m4
-    movh       [r2 + r3 + 16], m2
-
-    lea        r0, [r0 + 2 * r1]
+    movh       [r2 + r3 + 16], m3
+
     lea        r2, [r2 + 2 * r3]
 
     dec        r4d
@@ -2022,7 +1991,6 @@
 
     pshufb     m1, m0, [tab_Vm]
     pshufb     m0, [tab_Vm + 16]
-
     mov        r4d, %2/2
 
 .loop
@@ -2030,63 +1998,54 @@
     movu       m3, [r0 + r1]
 
     punpcklbw  m4, m2, m3
-    punpckhbw  m5, m2, m3
+    punpckhbw  m2, m3
 
     pmaddubsw  m4, m1
-    pmaddubsw  m5, m1
-
-    movu       m2, [r0 + 2 * r1]
-    lea        r5, [r0 + 2 * r1]
-    movu       m3, [r5 + r1]
-
-    punpcklbw  m6, m2, m3
-    punpckhbw  m7, m2, m3
-
+    pmaddubsw  m2, m1
+
+    lea        r0, [r0 + 2 * r1]
+    movu       m5, [r0]
+    movu       m7, [r0 + r1]
+
+    punpcklbw  m6, m5, m7
     pmaddubsw  m6, m0
+    paddw      m4, m6
+
+    punpckhbw  m6, m5, m7
+    pmaddubsw  m6, m0
+    paddw      m2, m6
+
+    mova       m6, [pw_2000]
+
+    psubw      m4, m6
+    psubw      m2, m6
+
+    movu       [r2], m4
+    movu       [r2 + 16], m2
+
+    punpcklbw  m4, m3, m5
+    punpckhbw  m3, m5
+
+    pmaddubsw  m4, m1
+    pmaddubsw  m3, m1
+
+    movu       m5, [r0 + 2 * r1]
+
+    punpcklbw  m2, m7, m5
+    punpckhbw  m7, m5
+
+    pmaddubsw  m2, m0
     pmaddubsw  m7, m0
 
-    paddw      m4, m6
-    paddw      m5, m7
-
-    mova       m6, [pw_2000]
+    paddw      m4, m2
+    paddw      m3, m7
 
     psubw      m4, m6
-    psubw      m5, m6
-
-    movu       [r2], m4
-    movu       [r2 + 16], m5
-
-    movu       m2, [r0 + r1]
-    movu       m3, [r0 + 2 * r1]
-
-    punpcklbw  m4, m2, m3
-    punpckhbw  m5, m2, m3
-
-    pmaddubsw  m4, m1
-    pmaddubsw  m5, m1
-
-    lea        r5, [r0 + 2 * r1]
-    movu       m2, [r5 + r1]
-    movu       m3, [r5 + 2 * r1]
-
-    punpcklbw  m6, m2, m3,
-    punpckhbw  m7, m2, m3,
-
-    pmaddubsw  m6, m0
-    pmaddubsw  m7, m0
-
-    paddw      m4, m6
-    paddw      m5, m7
-
-    mova       m6, [pw_2000]
-
-    psubw      m4, m6
-    psubw      m5, m6
+    psubw      m3, m6
 
     movu       [r2 + r3], m4
-    movu       [r2 + r3 + 16], m5
-
-    lea        r0, [r0 + 2 * r1]
+    movu       [r2 + r3 + 16], m3
+
     lea        r2, [r2 + 2 * r3]
 
     dec        r4d
@@ -2120,100 +2079,92 @@
     pshufb     m1, m0, [tab_Vm]
     pshufb     m0, [tab_Vm + 16]
 
-    mova       m7, [pw_2000]
-
     mov        r4d, 32/2
 
 .loop
     movu       m2, [r0]
     movu       m3, [r0 + r1]
 
-    punpcklbw  m4, m2, m3,
-    punpckhbw  m2, m3,
+    punpcklbw  m4, m2, m3
+    punpckhbw  m2, m3
 
     pmaddubsw  m4, m1
     pmaddubsw  m2, m1
 
-    movu       m5, [r0 + 2 * r1]
     lea        r5, [r0 + 2 * r1]
-    movu       m3, [r5 + r1]
-
-    punpcklbw  m6, m5, m3,
-    punpckhbw  m5, m3
-
+
+    movu       m5, [r5]
+    movu       m7, [r5 + r1]
+
+    punpcklbw  m6, m5, m7
     pmaddubsw  m6, m0
-    pmaddubsw  m5, m0
-
     paddw      m4, m6
-    paddw      m2, m5
-
-    psubw      m4, m7
-    psubw      m2, m7
+
+    punpckhbw  m6, m5, m7
+    pmaddubsw  m6, m0
+    paddw      m2, m6
+
+    mova       m6, [pw_2000]
+
+    psubw      m4, m6
+    psubw      m2, m6
 
     movu       [r2], m4
     movu       [r2 + 16], m2
 
+    punpcklbw  m4, m3, m5
+    punpckhbw  m3, m5
+
+    pmaddubsw  m4, m1
+    pmaddubsw  m3, m1
+
+    movu       m2, [r5 + 2 * r1]
+
+    punpcklbw  m5, m7, m2
+    punpckhbw  m7, m2
+
+    pmaddubsw  m5, m0
+    pmaddubsw  m7, m0
+
+    paddw      m4, m5
+    paddw      m3, m7
+
+    psubw      m4, m6
+    psubw      m3, m6
+
+    movu       [r2 + r3], m4
+    movu       [r2 + r3 + 16], m3
+
     movq       m2, [r0 + 16]
     movq       m3, [r0 + r1 + 16]
-    movq       m4, [r0 + 2 * r1 + 16]
+    movq       m4, [r5 + 16]
     movq       m5, [r5 + r1 + 16]
 
     punpcklbw  m2, m3
-    punpcklbw  m4, m5
+    punpcklbw  m7, m4, m5
 
     pmaddubsw  m2, m1
-    pmaddubsw  m4, m0
-
-    paddw      m2, m4
-    psubw      m2, m7
+    pmaddubsw  m7, m0
+
+    paddw      m2, m7
+    psubw      m2, m6
 
     movu       [r2 + 32], m2
 
-    movu       m2, [r0 + r1]
-    movu       m3, [r0 + 2 * r1]
-
-    punpcklbw  m4, m2, m3,
-    punpckhbw  m2, m3,
-
-    pmaddubsw  m4, m1
-    pmaddubsw  m2, m1
-
-    lea        r5, [r0 + 2 * r1]
-    movu       m5, [r5 +  r1]
-    movu       m3, [r5 + 2 * r1]
-
-    punpcklbw  m6, m5, m3,
-    punpckhbw  m5, m3
-
-    pmaddubsw  m6, m0
+    movq       m2, [r5 + 2 * r1 + 16]
+
+    punpcklbw  m3, m4
+    punpcklbw  m5, m2
+
+    pmaddubsw  m3, m1
     pmaddubsw  m5, m0
 
-    paddw      m4, m6
-    paddw      m2, m5
-
-    psubw      m4, m7
-    psubw      m2, m7
-
-    movu       [r2 + r3], m4
-    movu       [r2 + r3 + 16], m2
-
-    movq       m2, [r0 + r1 + 16]
-    movq       m3, [r0 + 2 * r1 + 16]
-    movq       m4, [r5 + r1 + 16]
-    movq       m5, [r5 + 2 * r1 + 16]
-
-    punpcklbw  m2, m3
-    punpcklbw  m4, m5
-
-    pmaddubsw  m2, m1
-    pmaddubsw  m4, m0
-
-    paddw      m2, m4
-
-    psubw      m2,  m7
-    movu       [r2 + r3 + 32], m2
-
-    lea        r0, [r0 + 2 * r1]
+    paddw      m3, m5
+    psubw      m3,  m6
+
+    movu       [r2 + r3 + 32], m3
+
+    mov        r0, r5
     lea        r2, [r2 + 2 * r3]
 
     dec        r4d
@@ -2249,18 +2200,18 @@
     movu       m2, [r0]
     movu       m3, [r0 + r1]
 
-    punpcklbw  m4, m2, m3,
-    punpckhbw  m2, m3,
+    punpcklbw  m4, m2, m3
+    punpckhbw  m2, m3
 
     pmaddubsw  m4, m1
     pmaddubsw  m2, m1
 
-    movu       m3, [r0 + 2 * r1]
     lea        r5, [r0 + 2 * r1]
+    movu       m3, [r5]
     movu       m5, [r5 + r1]
 
     punpcklbw  m6, m3, m5
-    punpckhbw  m3, m5,
+    punpckhbw  m3, m5
 
     pmaddubsw  m6, m0
     pmaddubsw  m3, m0
@@ -2277,17 +2228,17 @@
     movu       m2, [r0 + 16]
     movu       m3, [r0 + r1 + 16]
 
-    punpcklbw  m4, m2, m3,
-    punpckhbw  m2, m3,
+    punpcklbw  m4, m2, m3
+    punpckhbw  m2, m3
 
     pmaddubsw  m4, m1
     pmaddubsw  m2, m1
 
-    movu       m3, [r0 + 2 * r1 + 16]
+    movu       m3, [r5 + 16]
     movu       m5, [r5 + r1 + 16]
 
     punpcklbw  m6, m3, m5
-    punpckhbw  m3, m5,
+    punpckhbw  m3, m5
 
     pmaddubsw  m6, m0
     pmaddubsw  m3, m0
@@ -2319,14 +2270,14 @@
 ;-----------------------------------------------------------------------------
 %macro FILTER_V4_W8_H8_H16_H32 2
 INIT_XMM sse4
-cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8
+cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8
 
 mov         r4d,       r4m
 sub         r0,        r1
 
 %ifdef PIC
-lea         r6,        [tab_ChromaCoeff]
-movd        m5,        [r6 + r4 * 4]
+lea         r5,        [tab_ChromaCoeff]
+movd        m5,        [r5 + r4 * 4]
 %else
 movd        m5,        [tab_ChromaCoeff + r4 * 4]
 %endif
@@ -2334,6 +2285,7 @@
 pshufb      m6,        m5,       [tab_Vm]
 pshufb      m5,        [tab_Vm + 16]
 mova        m4,        [tab_c_512]
+lea         r5,        [r1 * 3]
 
 mov         r4d,       %2
 
@@ -2341,14 +2293,14 @@
 movq        m0,        [r0]
 movq        m1,        [r0 + r1]
 movq        m2,        [r0 + 2 * r1]
-lea         r5,        [r0 + 2 * r1]
-movq        m3,        [r5 + r1]
+movq        m3,        [r0 + r5]
 
 punpcklbw   m0,        m1
-punpcklbw   m7,        m2,        m3
+punpcklbw   m1,        m2
+punpcklbw   m2,        m3
 
 pmaddubsw   m0,        m6
-pmaddubsw   m7,        m5
+pmaddubsw   m7,        m2, m5
 
 paddw       m0,        m7
 
@@ -2356,13 +2308,13 @@
 packuswb    m0,        m0
 movh        [r2],      m0
 
-movq        m0,        [r0 + 4 * r1]
-
-punpcklbw   m1,        m2
-punpcklbw   m7,        m3,        m0
+lea         r0,        [r0 + 4 * r1]
+movq        m0,        [r0]
+
+punpcklbw   m3,        m0
 
 pmaddubsw   m1,        m6
-pmaddubsw   m7,        m5
+pmaddubsw   m7,        m3, m5
 
 paddw       m1,        m7
 
@@ -2370,25 +2322,19 @@
 packuswb    m1,        m1
 movh        [r2 + r3], m1
 
-lea         r6,        [r0 + 4 * r1]
-movq        m1,        [r6 + r1]
-
-punpcklbw   m2,        m3
-punpcklbw   m7,        m0,        m1
+movq        m1,        [r0 + r1]
+
+punpcklbw   m0,        m1
 
 pmaddubsw   m2,        m6
-pmaddubsw   m7,        m5
-
-paddw       m2,        m7
+pmaddubsw   m0,        m5
+
+paddw       m2,        m0
 
 pmulhrsw    m2,        m4
-packuswb    m2,        m2
-movh        [r2 + 2 * r3], m2
-
-movq        m2,        [r6 + 2 * r1]
-
-punpcklbw   m3,        m0
-punpcklbw   m1,        m2
+
+movq        m7,        [r0 + 2 * r1]
+punpcklbw   m1,        m7
 
 pmaddubsw   m3,        m6
 pmaddubsw   m1,        m5
@@ -2396,13 +2342,13 @@
 paddw       m3,        m1
 
 pmulhrsw    m3,        m4
-packuswb    m3,        m3
-
-lea         r5,        [r2 + 2 * r3]
-movh        [r5 + r3], m3
-
-lea         r0,        [r0 + 4 * r1]
-lea         r2,        [r2 + 4 * r3]
+packuswb    m2,        m3
+
+lea         r2,        [r2 + 2 * r3]
+movh        [r2],      m2
+movhps      [r2 + r3], m2
+
+lea         r2,        [r2 + 2 * r3]
 
 sub         r4,         4
 jnz        .loop
@@ -2418,14 +2364,14 @@
 ;-----------------------------------------------------------------------------
 %macro FILTER_V4_W6_H4 2
 INIT_XMM sse4
-cglobal interp_4tap_vert_pp_6x8, 4, 7, 8
+cglobal interp_4tap_vert_pp_6x8, 4, 6, 8
 
 mov         r4d,       r4m
 sub         r0,        r1
 
 %ifdef PIC
-lea         r6,        [tab_ChromaCoeff]
-movd        m5,        [r6 + r4 * 4]
+lea         r5,        [tab_ChromaCoeff]
+movd        m5,        [r5 + r4 * 4]
 %else
 movd        m5,        [tab_ChromaCoeff + r4 * 4]
 %endif
@@ -2435,19 +2381,20 @@
 mova        m4,        [tab_c_512]
 
 mov         r4d,       %2
+lea         r5,        [3 * r1]
 
 .loop
 movq        m0,        [r0]
 movq        m1,        [r0 + r1]
 movq        m2,        [r0 + 2 * r1]
-lea         r5,        [r0 + 2 * r1]
-movq        m3,        [r5 + r1]
+movq        m3,        [r0 + r5]
 
 punpcklbw   m0,        m1
-punpcklbw   m7,        m2,        m3
+punpcklbw   m1,        m2
+punpcklbw   m2,        m3
 
 pmaddubsw   m0,        m6
-pmaddubsw   m7,        m5
+pmaddubsw   m7,        m2, m5
 
 paddw       m0,        m7
 
@@ -2456,13 +2403,13 @@
 movd        [r2],      m0
 pextrw      [r2 + 4],  m0,    2
 
-movq        m0,        [r0 + 4 * r1]
-
-punpcklbw   m1,        m2
-punpcklbw   m7,        m3,        m0
+lea         r0,        [r0 + 4 * r1]
+
+movq        m0,        [r0]
+punpcklbw   m3,        m0
 
 pmaddubsw   m1,        m6
-pmaddubsw   m7,        m5
+pmaddubsw   m7,        m3, m5
 
 paddw       m1,        m7
 
@@ -2471,10 +2418,7 @@
 movd        [r2 + r3],      m1
 pextrw      [r2 + r3 + 4],  m1,    2
 
-lea         r6,        [r0 + 4 * r1]
-movq        m1,        [r6 + r1]
-
-punpcklbw   m2,        m3
+movq        m1,        [r0 + r1]
 punpcklbw   m7,        m0,        m1
 
 pmaddubsw   m2,        m6
@@ -2484,12 +2428,11 @@
 
 pmulhrsw    m2,        m4
 packuswb    m2,        m2
-movd        [r2 + 2 * r3],     m2
-pextrw      [r2 + 2 * r3 + 4], m2,    2
-
-movq        m2,        [r6 + 2 * r1]
-
-punpcklbw   m3,        m0
+lea         r2,        [r2 + 2 * r3]
+movd        [r2],      m2
+pextrw      [r2 + 4],  m2,    2
+
+movq        m2,        [r0 + 2 * r1]
 punpcklbw   m1,        m2
 
 pmaddubsw   m3,        m6
@@ -2500,12 +2443,10 @@
 pmulhrsw    m3,        m4
 packuswb    m3,        m3
 
-lea         r5,               [r2 + 2 * r3]
-movd        [r5 + r3],        m3
-pextrw      [r5 + r3 + 4],    m3,    2
-
-lea         r0,        [r0 + 4 * r1]
-lea         r2,        [r2 + 4 * r3]
+movd        [r2 + r3],        m3
+pextrw      [r2 + r3 + 4],    m3,    2
+
+lea         r2,        [r2 + 2 * r3]
 
 sub         r4,         4
 jnz        .loop
@@ -2534,72 +2475,65 @@
 pshufb      m1,        m0,       [tab_Vm]
 pshufb      m0,        [tab_Vm + 16]
 
-mova        m7,        [tab_c_512]
-
-mov          r4d,       %2
+mov         r4d,       %2
 
 .loop
 movu        m2,        [r0]
 movu        m3,        [r0 + r1]
 
-punpcklbw   m4,        m2,        m3,
-punpckhbw   m2,        m3,
+punpcklbw   m4,        m2,        m3
+punpckhbw   m2,        m3
 
 pmaddubsw   m4,        m1
 pmaddubsw   m2,        m1
 
-movu        m5,        [r0 + 2 * r1]
-lea         r5,        [r0 + 2 * r1]
-movu        m3,        [r5 + r1]
-
-punpcklbw   m6,        m5,        m3,
-punpckhbw   m5,        m3,
-
+lea         r0,        [r0 + 2 * r1]
+movu        m5,        [r0]
+movu        m7,        [r0 + r1]
+
+punpcklbw   m6,        m5,        m7
 pmaddubsw   m6,        m0
-pmaddubsw   m5,        m0
-
 paddw       m4,        m6
-paddw       m2,        m5
-
-pmulhrsw    m4,        m7
-pmulhrsw    m2,        m7
+
+punpckhbw   m6,        m5,        m7
+pmaddubsw   m6,        m0
+paddw       m2,        m6
+
+mova        m6,        [tab_c_512]
+
+pmulhrsw    m4,        m6
+pmulhrsw    m2,        m6
 
 packuswb    m4,        m2
 
 movh         [r2],     m4
 pextrd       [r2 + 8], m4,  2
 
-movu        m2,        [r0 + r1]
-movu        m3,        [r0 + 2 * r1]
-
-punpcklbw   m4,        m2,        m3,
-punpckhbw   m2,        m3,
+punpcklbw   m4,        m3,        m5
+punpckhbw   m3,        m5
 
 pmaddubsw   m4,        m1
-pmaddubsw   m2,        m1
-
-lea         r5,        [r0 + 2 * r1]
-movu        m5,        [r5 + r1]
-movu        m3,        [r5 + 2 * r1]
-
-punpcklbw   m6,        m5,        m3,
-punpckhbw   m5,        m3,
-
-pmaddubsw   m6,        m0
-pmaddubsw   m5,        m0
-
-paddw       m4,        m6
-paddw       m2,        m5
-
-pmulhrsw    m4,        m7
-pmulhrsw    m2,        m7
-
-packuswb    m4,        m2
+pmaddubsw   m3,        m1
+
+movu        m5,        [r0 + 2 * r1]
+
+punpcklbw   m2,        m7,        m5
+punpckhbw   m7,        m5
+
+pmaddubsw   m2,        m0
+pmaddubsw   m7,        m0
+
+paddw       m4,        m2
+paddw       m3,        m7
+
+pmulhrsw    m4,        m6
+pmulhrsw    m3,        m6
+
+packuswb    m4,        m3
 
 movh        [r2 + r3],      m4
 pextrd      [r2 + r3 + 8],  m4,  2
 
-lea         r0,        [r0 + 2 * r1]
 lea         r2,        [r2 + 2 * r3]
 
 sub         r4,        2
@@ -2635,66 +2569,57 @@
 movu        m2,        [r0]
 movu        m3,        [r0 + r1]
 
-punpcklbw   m4,        m2,        m3,
-punpckhbw   m5,        m2,        m3,
+punpcklbw   m4,        m2,        m3
+punpckhbw   m2,        m3
 
 pmaddubsw   m4,        m1
-pmaddubsw   m5,        m1
-
-movu        m2,        [r0 + 2 * r1]
-lea         r5,        [r0 + 2 * r1]
-movu        m3,        [r5 + r1]
-
-punpcklbw   m6,        m2,        m3,
-punpckhbw   m7,        m2,        m3,
-
+pmaddubsw   m2,        m1
+
+lea         r0,        [r0 + 2 * r1]
+movu        m5,        [r0]
+movu        m6,        [r0 + r1]
+
+punpckhbw   m7,        m5,        m6
+pmaddubsw   m7,        m0
+paddw       m2,        m7
+
+punpcklbw   m7,        m5,        m6
+pmaddubsw   m7,        m0
+paddw       m4,        m7
+
+mova        m7,        [tab_c_512]
+
+pmulhrsw    m4,        m7
+pmulhrsw    m2,        m7
+
+packuswb    m4,        m2
+
+movu        [r2],      m4
+
+punpcklbw   m4,        m3,        m5
+punpckhbw   m3,        m5
+
+pmaddubsw   m4,        m1
+pmaddubsw   m3,        m1
+
+movu        m5,        [r0 + 2 * r1]
+
+punpcklbw   m2,        m6,        m5
+punpckhbw   m6,        m5
+
+pmaddubsw   m2,        m0
 pmaddubsw   m6,        m0
-pmaddubsw   m7,        m0
-
-paddw       m4,        m6;
-paddw       m5,        m7;
-
-mova        m6,        [tab_c_512]
-
-pmulhrsw    m4,        m6
-pmulhrsw    m5,        m6
-
-packuswb    m4,        m5
-
-movu        [r2],      m4
-
-movu        m2,        [r0 + r1]
-movu        m3,        [r0 + 2 * r1]
-
-punpcklbw   m4,        m2,        m3,
-punpckhbw   m5,        m2,        m3,
-
-pmaddubsw   m4,        m1
-pmaddubsw   m5,        m1
-
-lea         r5,        [r0 + 2 * r1]
-movu        m2,        [r5 + r1]
-movu        m3,        [r5 + 2 * r1]
-
-punpcklbw   m6,        m2,        m3,
-punpckhbw   m7,        m2,        m3,
-
-pmaddubsw   m6,        m0
-pmaddubsw   m7,        m0
-
-paddw       m4,        m6
-paddw       m5,        m7
-
-mova        m6,        [tab_c_512]
-
-pmulhrsw    m4,        m6
-pmulhrsw    m5,        m6
-
-packuswb    m4,        m5
+
+paddw       m4,        m2
+paddw       m3,        m6
+
+pmulhrsw    m4,        m7
+pmulhrsw    m3,        m7
+
+packuswb    m4,        m3
 
 movu        [r2 + r3],      m4
 
-lea         r0,        [r0 + 2 * r1]
 lea         r2,        [r2 + 2 * r3]
 
 sub         r4,        2
@@ -2728,43 +2653,66 @@
 pshufb      m1,        m0,       [tab_Vm]
 pshufb      m0,        [tab_Vm + 16]
 
-mova        m7,        [tab_c_512]
-
 mov         r4d,       %2
 
 .loop
 movu        m2,        [r0]
 movu        m3,        [r0 + r1]
 
-punpcklbw   m4,        m2,        m3,
-punpckhbw   m2,        m3,
+punpcklbw   m4,        m2,        m3
+punpckhbw   m2,        m3
 
 pmaddubsw   m4,        m1
 pmaddubsw   m2,        m1
 
-movu        m5,        [r0 + 2 * r1]
 lea         r5,        [r0 + 2 * r1]
-movu        m3,        [r5 + r1]
-
-punpcklbw   m6,        m5,        m3,
-punpckhbw   m5,        m3
-
+movu        m5,        [r5]
+movu        m7,        [r5 + r1]
+
+punpcklbw   m6,        m5,        m7
 pmaddubsw   m6,        m0
+paddw       m4,        m6
+
+punpckhbw   m6,        m5,        m7
+pmaddubsw   m6,        m0
+paddw       m2,        m6
+
+mova        m6,        [tab_c_512]
+
+pmulhrsw    m4,        m6
+pmulhrsw    m2,        m6
+
+packuswb    m4,        m2
+
+movu        [r2],      m4
+
+punpcklbw   m4,        m3,        m5
+punpckhbw   m3,        m5
+
+pmaddubsw   m4,        m1
+pmaddubsw   m3,        m1
+
+movu        m2,        [r5 + 2 * r1]
+
+punpcklbw   m5,        m7,        m2
+punpckhbw   m7,        m2
+
 pmaddubsw   m5,        m0
-
-paddw       m4,        m6
-paddw       m2,        m5
-
-pmulhrsw    m4,        m7
-pmulhrsw    m2,        m7
-
-packuswb    m4,        m2
-
-movu        [r2],      m4
+pmaddubsw   m7,        m0
+
+paddw       m4,        m5
+paddw       m3,        m7
+
+pmulhrsw    m4,        m6
+pmulhrsw    m3,        m6
+
+packuswb    m4,        m3
+
+movu        [r2 + r3],      m4
 
 movq        m2,        [r0 + 16]
 movq        m3,        [r0 + r1 + 16]
-movq        m4,        [r0 + 2 * r1 + 16]
+movq        m4,        [r5 + 16]
 movq        m5,        [r5 + r1 + 16]
 
 punpcklbw   m2,        m3
@@ -2775,57 +2723,28 @@
 
 paddw       m2,        m4
 
-pmulhrsw    m2,        m7
-packuswb    m2,        m2
+pmulhrsw    m2,        m6
+
+movq        m3,        [r0 + r1 + 16]
+movq        m4,        [r5 + 16]
+movq        m5,        [r5 + r1 + 16]
+movq        m7,        [r5 + 2 * r1 + 16]
+
+punpcklbw   m3,        m4
+punpcklbw   m5,        m7
+
+pmaddubsw   m3,        m1
+pmaddubsw   m5,        m0
+
+paddw       m3,        m5
+
+pmulhrsw    m3,        m6
+packuswb    m2,        m3
+
 movh        [r2 + 16], m2
-
-movu        m2,        [r0 + r1]
-movu        m3,        [r0 + 2 * r1]
-
-punpcklbw   m4,        m2,        m3,
-punpckhbw   m2,        m3,
-
-pmaddubsw   m4,        m1
-pmaddubsw   m2,        m1
-
-lea         r5,        [r0 + 2 * r1]
-movu        m5,        [r5 +  r1]
-movu        m3,        [r5 + 2 * r1]
-
-punpcklbw   m6,        m5,        m3,
-punpckhbw   m5,        m3
-
-pmaddubsw   m6,        m0
-pmaddubsw   m5,        m0
-
-paddw       m4,        m6
-paddw       m2,        m5
-
-pmulhrsw    m4,        m7
-pmulhrsw    m2,        m7
-
-packuswb    m4,        m2
-
-movu        [r2 + r3],      m4
-
-movq        m2,        [r0 + r1 + 16]
-movq        m3,        [r0 + 2 * r1 + 16]
-movq        m4,        [r5 + r1 + 16]
-movq        m5,        [r5 + 2 * r1 + 16]
-
-punpcklbw   m2,        m3
-punpcklbw   m4,        m5
-
-pmaddubsw   m2,        m1
-pmaddubsw   m4,        m0
-
-paddw       m2,        m4
-
-pmulhrsw    m2,        m7
-packuswb    m2,        m2
-movh        [r2 + r3 + 16], m2
-
-lea         r0,        [r0 + 2 * r1]
+movhps      [r2 + r3 + 16], m2
+
+mov         r0,        r5
 lea         r2,        [r2 + 2 * r3]
 
 sub         r4,        2
@@ -2863,18 +2782,18 @@
 movu        m2,        [r0]
 movu        m3,        [r0 + r1]
 
-punpcklbw   m4,        m2,        m3,
-punpckhbw   m2,        m3,
+punpcklbw   m4,        m2,        m3
+punpckhbw   m2,        m3
 
 pmaddubsw   m4,        m1
 pmaddubsw   m2,        m1
 
-movu        m3,        [r0 + 2 * r1]
 lea         r5,        [r0 + 2 * r1]
+movu        m3,        [r5]
 movu        m5,        [r5 + r1]
 
 punpcklbw   m6,        m3,        m5
-punpckhbw   m3,        m5,
+punpckhbw   m3,        m5
 
 pmaddubsw   m6,        m0
 pmaddubsw   m3,        m0
@@ -2892,17 +2811,17 @@
 movu        m2,        [r0 + 16]
 movu        m3,        [r0 + r1 + 16]
 
-punpcklbw   m4,        m2,        m3,
-punpckhbw   m2,        m3,
+punpcklbw   m4,        m2,        m3
+punpckhbw   m2,        m3
 
 pmaddubsw   m4,        m1
 pmaddubsw   m2,        m1
 
-movu        m3,        [r0 + 2 * r1 + 16]
+movu        m3,        [r5 + 16]
 movu        m5,        [r5 + r1 + 16]
 
 punpcklbw   m6,        m3,        m5
-punpckhbw   m3,        m5,
+punpckhbw   m3,        m5
 
 pmaddubsw   m6,        m0
 pmaddubsw   m3,        m0
diff -r 07b5d6b82f5f -r 7eccc042e269 source/test/ipfilterharness.cpp
--- a/source/test/ipfilterharness.cpp	Mon Feb 10 15:05:04 2014 -0600
+++ b/source/test/ipfilterharness.cpp	Tue Feb 11 18:53:23 2014 +0530
@@ -158,7 +158,7 @@
         rand_coeffIdx = rand() % 8;                // Random coeffIdex in the filter
 
         rand_srcStride = rand() % 100;              // Randomly generated srcStride
-        rand_dstStride = rand() % 100;              // Randomly generated dstStride
+        rand_dstStride = rand() % 100 + 32;         // Randomly generated dstStride in [32, 131]
 
         opt(pixel_buff + 3 * rand_srcStride,
             rand_srcStride,
@@ -187,7 +187,7 @@
         rand_coeffIdx = rand() % 8;                // Random coeffIdex in the filter
 
         rand_srcStride = rand() % 100;              // Randomly generated srcStride
-        rand_dstStride = rand() % 100;              // Randomly generated dstStride
+        rand_dstStride = rand() % 100 + 32;         // Randomly generated dstStride in [32, 131]
 
         ref(pixel_buff + 3 * rand_srcStride,
             rand_srcStride,



More information about the x265-devel mailing list