[x265] [PATCH 2 of 3 ] x86: Split ipfilter16 kernels part2

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Tue Feb 20 09:06:23 CET 2018


# HG changeset patch
# User Mythreyi P <mythreyi at multicorewareinc.com>
# Date 1518510505 28800
#      Tue Feb 13 00:28:25 2018 -0800
# Node ID 53ecf411157337b133c75c418559a6f8bb01867a
# Parent  2f381d267c11dfeb6f3765c7105d7791242197f2
x86: Split ipfilter16 kernels part2

Port horizontal 4-tap kernels from h-ipfilter16.asm to a new source file, h4-ipfilter16.asm, to improve build time.

diff -r 2f381d267c11 -r 53ecf4111573 source/common/CMakeLists.txt
--- a/source/common/CMakeLists.txt	Thu Feb 15 00:56:16 2018 -0800
+++ b/source/common/CMakeLists.txt	Tue Feb 13 00:28:25 2018 -0800
@@ -62,7 +62,7 @@
                mc-a2.asm pixel-util8.asm blockcopy8.asm
                pixeladd8.asm dct8.asm seaintegral.asm)
     if(HIGH_BIT_DEPTH)
-        set(A_SRCS ${A_SRCS} sad16-a.asm intrapred16.asm h-ipfilter16.asm ipfilter16.asm loopfilter.asm)
+        set(A_SRCS ${A_SRCS} sad16-a.asm intrapred16.asm h4-ipfilter16.asm h-ipfilter16.asm ipfilter16.asm loopfilter.asm)
     else()
         set(A_SRCS ${A_SRCS} sad-a.asm intrapred8.asm intrapred8_allangs.asm v4-ipfilter8.asm h-ipfilter8.asm ipfilter8.asm loopfilter.asm)
     endif()
diff -r 2f381d267c11 -r 53ecf4111573 source/common/x86/h-ipfilter16.asm
--- a/source/common/x86/h-ipfilter16.asm	Thu Feb 15 00:56:16 2018 -0800
+++ b/source/common/x86/h-ipfilter16.asm	Tue Feb 13 00:28:25 2018 -0800
@@ -45,20 +45,7 @@
 
 SECTION_RODATA 32
 
-tab_c_32:         times 8 dd 32
 h_pd_524800:        times 8 dd 524800
-
-tab_Tm16:         db 0, 1, 2, 3, 4,  5,  6, 7, 2, 3, 4,  5, 6, 7, 8, 9
-
-
-tab_ChromaCoeff:  dw  0, 64,  0,  0
-                  dw -2, 58, 10, -2
-                  dw -4, 54, 16, -2
-                  dw -6, 46, 28, -4
-                  dw -4, 36, 36, -4
-                  dw -4, 28, 46, -6
-                  dw -2, 16, 54, -4
-                  dw -2, 10, 58, -2
                                     
 tab_LumaCoeff:    dw   0, 0,  0,  64,  0,   0,  0,  0
                   dw  -1, 4, -10, 58,  17, -5,  1,  0
@@ -352,198 +339,6 @@
     FILTER_HOR_LUMA_sse2 64, 32, ps
     FILTER_HOR_LUMA_sse2 64, 48, ps
     FILTER_HOR_LUMA_sse2 64, 64, ps
-    
-%macro FILTERH_W2_4_sse3 2
-    movh        m3,     [r0 + %1]
-    movhps      m3,     [r0 + %1 + 2]
-    pmaddwd     m3,     m0
-    movh        m4,     [r0 + r1 + %1]
-    movhps      m4,     [r0 + r1 + %1 + 2]
-    pmaddwd     m4,     m0
-    pshufd      m2,     m3,     q2301
-    paddd       m3,     m2
-    pshufd      m2,     m4,     q2301
-    paddd       m4,     m2
-    pshufd      m3,     m3,     q3120
-    pshufd      m4,     m4,     q3120
-    punpcklqdq  m3,     m4
-    paddd       m3,     m1
-    movh        m5,     [r0 + 2 * r1 + %1]
-    movhps      m5,     [r0 + 2 * r1 + %1 + 2]
-    pmaddwd     m5,     m0
-    movh        m4,     [r0 + r4 + %1]
-    movhps      m4,     [r0 + r4 + %1 + 2]
-    pmaddwd     m4,     m0
-    pshufd      m2,     m5,     q2301
-    paddd       m5,     m2
-    pshufd      m2,     m4,     q2301
-    paddd       m4,     m2
-    pshufd      m5,     m5,     q3120
-    pshufd      m4,     m4,     q3120
-    punpcklqdq  m5,     m4
-    paddd       m5,     m1
-%ifidn %2, pp
-    psrad       m3,     6
-    psrad       m5,     6
-    packssdw    m3,     m5
-    CLIPW       m3,     m7,     m6
-%else
-    psrad       m3,     INTERP_SHIFT_PS
-    psrad       m5,     INTERP_SHIFT_PS
-    packssdw    m3,     m5
-%endif
-    movd        [r2 + %1], m3
-    psrldq      m3,     4
-    movd        [r2 + r3 + %1], m3
-    psrldq      m3,     4
-    movd        [r2 + r3 * 2 + %1], m3
-    psrldq      m3,     4
-    movd        [r2 + r5 + %1], m3
-%endmacro
-
-%macro FILTERH_W2_3_sse3 1
-    movh        m3,     [r0 + %1]
-    movhps      m3,     [r0 + %1 + 2]
-    pmaddwd     m3,     m0
-    movh        m4,     [r0 + r1 + %1]
-    movhps      m4,     [r0 + r1 + %1 + 2]
-    pmaddwd     m4,     m0
-    pshufd      m2,     m3,     q2301
-    paddd       m3,     m2
-    pshufd      m2,     m4,     q2301
-    paddd       m4,     m2
-    pshufd      m3,     m3,     q3120
-    pshufd      m4,     m4,     q3120
-    punpcklqdq  m3,     m4
-    paddd       m3,     m1
-
-    movh        m5,     [r0 + 2 * r1 + %1]
-    movhps      m5,     [r0 + 2 * r1 + %1 + 2]
-    pmaddwd     m5,     m0
-
-    pshufd      m2,     m5,     q2301
-    paddd       m5,     m2
-    pshufd      m5,     m5,     q3120
-    paddd       m5,     m1
-
-    psrad       m3,     INTERP_SHIFT_PS
-    psrad       m5,     INTERP_SHIFT_PS
-    packssdw    m3,     m5
-
-    movd        [r2 + %1], m3
-    psrldq      m3,     4
-    movd        [r2 + r3 + %1], m3
-    psrldq      m3,     4
-    movd        [r2 + r3 * 2 + %1], m3
-%endmacro
-
-%macro FILTERH_W4_2_sse3 2
-    movh        m3,     [r0 + %1]
-    movhps      m3,     [r0 + %1 + 2]
-    pmaddwd     m3,     m0
-    movh        m4,     [r0 + %1 + 4]
-    movhps      m4,     [r0 + %1 + 6]
-    pmaddwd     m4,     m0
-    pshufd      m2,     m3,     q2301
-    paddd       m3,     m2
-    pshufd      m2,     m4,     q2301
-    paddd       m4,     m2
-    pshufd      m3,     m3,     q3120
-    pshufd      m4,     m4,     q3120
-    punpcklqdq  m3,     m4
-    paddd       m3,     m1
-
-    movh        m5,     [r0 + r1 + %1]
-    movhps      m5,     [r0 + r1 + %1 + 2]
-    pmaddwd     m5,     m0
-    movh        m4,     [r0 + r1 + %1 + 4]
-    movhps      m4,     [r0 + r1 + %1 + 6]
-    pmaddwd     m4,     m0
-    pshufd      m2,     m5,     q2301
-    paddd       m5,     m2
-    pshufd      m2,     m4,     q2301
-    paddd       m4,     m2
-    pshufd      m5,     m5,     q3120
-    pshufd      m4,     m4,     q3120
-    punpcklqdq  m5,     m4
-    paddd       m5,     m1
-%ifidn %2, pp
-    psrad       m3,     6
-    psrad       m5,     6
-    packssdw    m3,     m5
-    CLIPW       m3,     m7,     m6
-%else
-    psrad       m3,     INTERP_SHIFT_PS
-    psrad       m5,     INTERP_SHIFT_PS
-    packssdw    m3,     m5
-%endif
-    movh        [r2 + %1], m3
-    movhps      [r2 + r3 + %1], m3
-%endmacro
-
-%macro FILTERH_W4_1_sse3 1
-    movh        m3,     [r0 + 2 * r1 + %1]
-    movhps      m3,     [r0 + 2 * r1 + %1 + 2]
-    pmaddwd     m3,     m0
-    movh        m4,     [r0 + 2 * r1 + %1 + 4]
-    movhps      m4,     [r0 + 2 * r1 + %1 + 6]
-    pmaddwd     m4,     m0
-    pshufd      m2,     m3,     q2301
-    paddd       m3,     m2
-    pshufd      m2,     m4,     q2301
-    paddd       m4,     m2
-    pshufd      m3,     m3,     q3120
-    pshufd      m4,     m4,     q3120
-    punpcklqdq  m3,     m4
-    paddd       m3,     m1
-
-    psrad       m3,     INTERP_SHIFT_PS
-    packssdw    m3,     m3
-    movh        [r2 + r3 * 2 + %1], m3
-%endmacro
-
-%macro FILTERH_W8_1_sse3 2
-    movh        m3,     [r0 + %1]
-    movhps      m3,     [r0 + %1 + 2]
-    pmaddwd     m3,     m0
-    movh        m4,     [r0 + %1 + 4]
-    movhps      m4,     [r0 + %1 + 6]
-    pmaddwd     m4,     m0
-    pshufd      m2,     m3,     q2301
-    paddd       m3,     m2
-    pshufd      m2,     m4,     q2301
-    paddd       m4,     m2
-    pshufd      m3,     m3,     q3120
-    pshufd      m4,     m4,     q3120
-    punpcklqdq  m3,     m4
-    paddd       m3,     m1
-
-    movh        m5,     [r0 + %1 + 8]
-    movhps      m5,     [r0 + %1 + 10]
-    pmaddwd     m5,     m0
-    movh        m4,     [r0 + %1 + 12]
-    movhps      m4,     [r0 + %1 + 14]
-    pmaddwd     m4,     m0
-    pshufd      m2,     m5,     q2301
-    paddd       m5,     m2
-    pshufd      m2,     m4,     q2301
-    paddd       m4,     m2
-    pshufd      m5,     m5,     q3120
-    pshufd      m4,     m4,     q3120
-    punpcklqdq  m5,     m4
-    paddd       m5,     m1
-%ifidn %2, pp
-    psrad       m3,     6
-    psrad       m5,     6
-    packssdw    m3,     m5
-    CLIPW       m3,     m7,     m6
-%else
-    psrad       m3,     INTERP_SHIFT_PS
-    psrad       m5,     INTERP_SHIFT_PS
-    packssdw    m3,     m5
-%endif
-    movdqu      [r2 + %1], m3
-%endmacro
 
 ;-----------------------------------------------------------------------------
 ; void interp_4tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
@@ -652,96 +447,6 @@
     RET
 %endmacro
 
-;-----------------------------------------------------------------------------
-; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-
-FILTER_HOR_CHROMA_sse3 2, 4, pp
-FILTER_HOR_CHROMA_sse3 2, 8, pp
-FILTER_HOR_CHROMA_sse3 2, 16, pp
-FILTER_HOR_CHROMA_sse3 4, 2, pp
-FILTER_HOR_CHROMA_sse3 4, 4, pp
-FILTER_HOR_CHROMA_sse3 4, 8, pp
-FILTER_HOR_CHROMA_sse3 4, 16, pp
-FILTER_HOR_CHROMA_sse3 4, 32, pp
-FILTER_HOR_CHROMA_sse3 6, 8, pp
-FILTER_HOR_CHROMA_sse3 6, 16, pp
-FILTER_HOR_CHROMA_sse3 8, 2, pp
-FILTER_HOR_CHROMA_sse3 8, 4, pp
-FILTER_HOR_CHROMA_sse3 8, 6, pp
-FILTER_HOR_CHROMA_sse3 8, 8, pp
-FILTER_HOR_CHROMA_sse3 8, 12, pp
-FILTER_HOR_CHROMA_sse3 8, 16, pp
-FILTER_HOR_CHROMA_sse3 8, 32, pp
-FILTER_HOR_CHROMA_sse3 8, 64, pp
-FILTER_HOR_CHROMA_sse3 12, 16, pp
-FILTER_HOR_CHROMA_sse3 12, 32, pp
-FILTER_HOR_CHROMA_sse3 16, 4, pp
-FILTER_HOR_CHROMA_sse3 16, 8, pp
-FILTER_HOR_CHROMA_sse3 16, 12, pp
-FILTER_HOR_CHROMA_sse3 16, 16, pp
-FILTER_HOR_CHROMA_sse3 16, 24, pp
-FILTER_HOR_CHROMA_sse3 16, 32, pp
-FILTER_HOR_CHROMA_sse3 16, 64, pp
-FILTER_HOR_CHROMA_sse3 24, 32, pp
-FILTER_HOR_CHROMA_sse3 24, 64, pp
-FILTER_HOR_CHROMA_sse3 32, 8, pp
-FILTER_HOR_CHROMA_sse3 32, 16, pp
-FILTER_HOR_CHROMA_sse3 32, 24, pp
-FILTER_HOR_CHROMA_sse3 32, 32, pp
-FILTER_HOR_CHROMA_sse3 32, 48, pp
-FILTER_HOR_CHROMA_sse3 32, 64, pp
-FILTER_HOR_CHROMA_sse3 48, 64, pp
-FILTER_HOR_CHROMA_sse3 64, 16, pp
-FILTER_HOR_CHROMA_sse3 64, 32, pp
-FILTER_HOR_CHROMA_sse3 64, 48, pp
-FILTER_HOR_CHROMA_sse3 64, 64, pp
-
-;-----------------------------------------------------------------------------
-; void interp_4tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-
-FILTER_HOR_CHROMA_sse3 2, 4, ps
-FILTER_HOR_CHROMA_sse3 2, 8, ps
-FILTER_HOR_CHROMA_sse3 2, 16, ps
-FILTER_HOR_CHROMA_sse3 4, 2, ps
-FILTER_HOR_CHROMA_sse3 4, 4, ps
-FILTER_HOR_CHROMA_sse3 4, 8, ps
-FILTER_HOR_CHROMA_sse3 4, 16, ps
-FILTER_HOR_CHROMA_sse3 4, 32, ps
-FILTER_HOR_CHROMA_sse3 6, 8, ps
-FILTER_HOR_CHROMA_sse3 6, 16, ps
-FILTER_HOR_CHROMA_sse3 8, 2, ps
-FILTER_HOR_CHROMA_sse3 8, 4, ps
-FILTER_HOR_CHROMA_sse3 8, 6, ps
-FILTER_HOR_CHROMA_sse3 8, 8, ps
-FILTER_HOR_CHROMA_sse3 8, 12, ps
-FILTER_HOR_CHROMA_sse3 8, 16, ps
-FILTER_HOR_CHROMA_sse3 8, 32, ps
-FILTER_HOR_CHROMA_sse3 8, 64, ps
-FILTER_HOR_CHROMA_sse3 12, 16, ps
-FILTER_HOR_CHROMA_sse3 12, 32, ps
-FILTER_HOR_CHROMA_sse3 16, 4, ps
-FILTER_HOR_CHROMA_sse3 16, 8, ps
-FILTER_HOR_CHROMA_sse3 16, 12, ps
-FILTER_HOR_CHROMA_sse3 16, 16, ps
-FILTER_HOR_CHROMA_sse3 16, 24, ps
-FILTER_HOR_CHROMA_sse3 16, 32, ps
-FILTER_HOR_CHROMA_sse3 16, 64, ps
-FILTER_HOR_CHROMA_sse3 24, 32, ps
-FILTER_HOR_CHROMA_sse3 24, 64, ps
-FILTER_HOR_CHROMA_sse3 32, 8, ps
-FILTER_HOR_CHROMA_sse3 32, 16, ps
-FILTER_HOR_CHROMA_sse3 32, 24, ps
-FILTER_HOR_CHROMA_sse3 32, 32, ps
-FILTER_HOR_CHROMA_sse3 32, 48, ps
-FILTER_HOR_CHROMA_sse3 32, 64, ps
-FILTER_HOR_CHROMA_sse3 48, 64, ps
-FILTER_HOR_CHROMA_sse3 64, 16, ps
-FILTER_HOR_CHROMA_sse3 64, 32, ps
-FILTER_HOR_CHROMA_sse3 64, 48, ps
-FILTER_HOR_CHROMA_sse3 64, 64, ps
-
 %macro FILTER_P2S_2_4_sse2 1
     movd        m0,     [r0 + %1]
     movd        m2,     [r0 + r1 * 2 + %1]
@@ -1590,59 +1295,6 @@
 ;----------------------------------------------------------------------------------------------------------------------------
 FILTER_HOR_LUMA_W24 24, 32, ps
 
-%macro FILTER_W2_2 1
-    movu        m3,         [r0]
-    pshufb      m3,         m3, m2
-    pmaddwd     m3,         m0
-    movu        m4,         [r0 + r1]
-    pshufb      m4,         m4, m2
-    pmaddwd     m4,         m0
-    phaddd      m3,         m4
-    paddd       m3,         m1
-%ifidn %1, pp
-    psrad       m3,         INTERP_SHIFT_PP
-    packusdw    m3,         m3
-    CLIPW       m3,         m7,    m6
-%else
-    psrad       m3,         INTERP_SHIFT_PS
-    packssdw    m3,         m3
-%endif
-    movd        [r2],       m3
-    pextrd      [r2 + r3],  m3, 1
-%endmacro
-
-%macro FILTER_W4_2 1
-    movu        m3,         [r0]
-    pshufb      m3,         m3, m2
-    pmaddwd     m3,         m0
-    movu        m4,         [r0 + 4]
-    pshufb      m4,         m4, m2
-    pmaddwd     m4,         m0
-    phaddd      m3,         m4
-    paddd       m3,         m1
-
-    movu        m5,         [r0 + r1]
-    pshufb      m5,         m5, m2
-    pmaddwd     m5,         m0
-    movu        m4,         [r0 + r1 + 4]
-    pshufb      m4,         m4, m2
-    pmaddwd     m4,         m0
-    phaddd      m5,         m4
-    paddd       m5,         m1
-%ifidn %1, pp
-    psrad       m3,         INTERP_SHIFT_PP
-    psrad       m5,         INTERP_SHIFT_PP
-    packusdw    m3,         m5
-    CLIPW       m3,         m7,    m6
-%else
-    psrad       m3,         INTERP_SHIFT_PS
-    psrad       m5,         INTERP_SHIFT_PS
-    packssdw    m3,         m5
-%endif
-    movh        [r2],       m3
-    movhps      [r2 + r3],  m3
-%endmacro
-
 ;-------------------------------------------------------------------------------------------------------------
 ; void interp_8tap_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx
 ;-------------------------------------------------------------------------------------------------------------
@@ -2352,1415 +2004,6 @@
     jnz              .loop
     RET
 
-;-----------------------------------------------------------------------------
-; void interp_4tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-%macro FILTER_CHROMA_H 6
-INIT_XMM sse4
-cglobal interp_4tap_horiz_%3_%1x%2, 4, %4, %5
-
-    add         r3,       r3
-    add         r1,       r1
-    sub         r0,       2
-    mov         r4d,      r4m
-    add         r4d,      r4d
-
-%ifdef PIC
-    lea         r%6,      [tab_ChromaCoeff]
-    movh        m0,       [r%6 + r4 * 4]
-%else
-    movh        m0,       [tab_ChromaCoeff + r4 * 4]
-%endif
-
-    punpcklqdq  m0,       m0
-    mova        m2,       [tab_Tm16]
-
-%ifidn %3, ps
-    mova        m1,       [INTERP_OFFSET_PS]
-    cmp         r5m, byte 0
-    je          .skip
-    sub         r0,       r1
-    movu        m3,       [r0]
-    pshufb      m3,       m3, m2
-    pmaddwd     m3,       m0
-
-  %if %1 == 4
-    movu        m4,       [r0 + 4]
-    pshufb      m4,       m4, m2
-    pmaddwd     m4,       m0
-    phaddd      m3,       m4
-  %else
-    phaddd      m3,       m3
-  %endif
-
-    paddd       m3,       m1
-    psrad       m3,       INTERP_SHIFT_PS
-    packssdw    m3,       m3
-
-  %if %1 == 2
-    movd        [r2],     m3
-  %else
-    movh        [r2],     m3
-  %endif
-
-    add         r0,       r1
-    add         r2,       r3
-    FILTER_W%1_2 %3
-    lea         r0,       [r0 + 2 * r1]
-    lea         r2,       [r2 + 2 * r3]
-
-.skip:
-
-%else     ;%ifidn %3, ps
-    pxor        m7,       m7
-    mova        m6,       [pw_pixel_max]
-    mova        m1,       [tab_c_32]
-%endif    ;%ifidn %3, ps
-
-    FILTER_W%1_2 %3
-
-%rep (%2/2) - 1
-    lea         r0,       [r0 + 2 * r1]
-    lea         r2,       [r2 + 2 * r3]
-    FILTER_W%1_2 %3
-%endrep
-    RET
-%endmacro
-
-FILTER_CHROMA_H 2, 4, pp, 6, 8, 5
-FILTER_CHROMA_H 2, 8, pp, 6, 8, 5
-FILTER_CHROMA_H 4, 2, pp, 6, 8, 5
-FILTER_CHROMA_H 4, 4, pp, 6, 8, 5
-FILTER_CHROMA_H 4, 8, pp, 6, 8, 5
-FILTER_CHROMA_H 4, 16, pp, 6, 8, 5
-
-FILTER_CHROMA_H 2, 4, ps, 7, 5, 6
-FILTER_CHROMA_H 2, 8, ps, 7, 5, 6
-FILTER_CHROMA_H 4, 2, ps, 7, 6, 6
-FILTER_CHROMA_H 4, 4, ps, 7, 6, 6
-FILTER_CHROMA_H 4, 8, ps, 7, 6, 6
-FILTER_CHROMA_H 4, 16, ps, 7, 6, 6
-
-FILTER_CHROMA_H 2, 16, pp, 6, 8, 5
-FILTER_CHROMA_H 4, 32, pp, 6, 8, 5
-FILTER_CHROMA_H 2, 16, ps, 7, 5, 6
-FILTER_CHROMA_H 4, 32, ps, 7, 6, 6
-
-
-%macro FILTER_W6_1 1
-    movu        m3,         [r0]
-    pshufb      m3,         m3, m2
-    pmaddwd     m3,         m0
-    movu        m4,         [r0 + 4]
-    pshufb      m4,         m4, m2
-    pmaddwd     m4,         m0
-    phaddd      m3,         m4
-    paddd       m3,         m1
-
-    movu        m4,         [r0 + 8]
-    pshufb      m4,         m4, m2
-    pmaddwd     m4,         m0
-    phaddd      m4,         m4
-    paddd       m4,         m1
-%ifidn %1, pp
-    psrad       m3,         INTERP_SHIFT_PP
-    psrad       m4,         INTERP_SHIFT_PP
-    packusdw    m3,         m4
-    CLIPW       m3,         m6,    m7
-%else
-    psrad       m3,         INTERP_SHIFT_PS
-    psrad       m4,         INTERP_SHIFT_PS
-    packssdw    m3,         m4
-%endif
-    movh        [r2],       m3
-    pextrd      [r2 + 8],   m3, 2
-%endmacro
-
-cglobal chroma_filter_pp_6x1_internal
-    FILTER_W6_1 pp
-    ret
-
-cglobal chroma_filter_ps_6x1_internal
-    FILTER_W6_1 ps
-    ret
-
-%macro FILTER_W8_1 1
-    movu        m3,         [r0]
-    pshufb      m3,         m3, m2
-    pmaddwd     m3,         m0
-    movu        m4,         [r0 + 4]
-    pshufb      m4,         m4, m2
-    pmaddwd     m4,         m0
-    phaddd      m3,         m4
-    paddd       m3,         m1
-
-    movu        m5,         [r0 + 8]
-    pshufb      m5,         m5, m2
-    pmaddwd     m5,         m0
-    movu        m4,         [r0 + 12]
-    pshufb      m4,         m4, m2
-    pmaddwd     m4,         m0
-    phaddd      m5,         m4
-    paddd       m5,         m1
-%ifidn %1, pp
-    psrad       m3,         INTERP_SHIFT_PP
-    psrad       m5,         INTERP_SHIFT_PP
-    packusdw    m3,         m5
-    CLIPW       m3,         m6,    m7
-%else
-    psrad       m3,         INTERP_SHIFT_PS
-    psrad       m5,         INTERP_SHIFT_PS
-    packssdw    m3,         m5
-%endif
-    movh        [r2],       m3
-    movhps      [r2 + 8],   m3
-%endmacro
-
-cglobal chroma_filter_pp_8x1_internal
-    FILTER_W8_1 pp
-    ret
-
-cglobal chroma_filter_ps_8x1_internal
-    FILTER_W8_1 ps
-    ret
-
-%macro FILTER_W12_1 1
-    movu        m3,         [r0]
-    pshufb      m3,         m3, m2
-    pmaddwd     m3,         m0
-    movu        m4,         [r0 + 4]
-    pshufb      m4,         m4, m2
-    pmaddwd     m4,         m0
-    phaddd      m3,         m4
-    paddd       m3,         m1
-
-    movu        m5,         [r0 + 8]
-    pshufb      m5,         m5, m2
-    pmaddwd     m5,         m0
-    movu        m4,         [r0 + 12]
-    pshufb      m4,         m4, m2
-    pmaddwd     m4,         m0
-    phaddd      m5,         m4
-    paddd       m5,         m1
-%ifidn %1, pp
-    psrad       m3,         INTERP_SHIFT_PP
-    psrad       m5,         INTERP_SHIFT_PP
-    packusdw    m3,         m5
-    CLIPW       m3,         m6,    m7
-%else
-    psrad       m3,         INTERP_SHIFT_PS
-    psrad       m5,         INTERP_SHIFT_PS
-    packssdw    m3,         m5
-%endif
-    movh        [r2],       m3
-    movhps      [r2 + 8],   m3
-
-    movu        m3,         [r0 + 16]
-    pshufb      m3,         m3, m2
-    pmaddwd     m3,         m0
-    movu        m4,         [r0 + 20]
-    pshufb      m4,         m4, m2
-    pmaddwd     m4,         m0
-    phaddd      m3,         m4
-    paddd       m3,         m1
-
-%ifidn %1, pp
-    psrad       m3,         INTERP_SHIFT_PP
-    packusdw    m3,         m3
-    CLIPW       m3,         m6, m7
-%else
-    psrad       m3,         INTERP_SHIFT_PS
-    packssdw    m3,         m3
-%endif
-    movh        [r2 + 16],  m3
-%endmacro
-
-cglobal chroma_filter_pp_12x1_internal
-    FILTER_W12_1 pp
-    ret
-
-cglobal chroma_filter_ps_12x1_internal
-    FILTER_W12_1 ps
-    ret
-
-%macro FILTER_W16_1 1
-    movu        m3,         [r0]
-    pshufb      m3,         m3, m2
-    pmaddwd     m3,         m0
-    movu        m4,         [r0 + 4]
-    pshufb      m4,         m4, m2
-    pmaddwd     m4,         m0
-    phaddd      m3,         m4
-    paddd       m3,         m1
-
-    movu        m5,         [r0 + 8]
-    pshufb      m5,         m5, m2
-    pmaddwd     m5,         m0
-    movu        m4,         [r0 + 12]
-    pshufb      m4,         m4, m2
-    pmaddwd     m4,         m0
-    phaddd      m5,         m4
-    paddd       m5,         m1
-%ifidn %1, pp
-    psrad       m3,         INTERP_SHIFT_PP
-    psrad       m5,         INTERP_SHIFT_PP
-    packusdw    m3,         m5
-    CLIPW       m3,         m6,    m7
-%else
-    psrad       m3,         INTERP_SHIFT_PS
-    psrad       m5,         INTERP_SHIFT_PS
-    packssdw    m3,         m5
-%endif
-    movh        [r2],       m3
-    movhps      [r2 + 8],   m3
-
-    movu        m3,         [r0 + 16]
-    pshufb      m3,         m3, m2
-    pmaddwd     m3,         m0
-    movu        m4,         [r0 + 20]
-    pshufb      m4,         m4, m2
-    pmaddwd     m4,         m0
-    phaddd      m3,         m4
-    paddd       m3,         m1
-
-    movu        m5,         [r0 + 24]
-    pshufb      m5,         m5, m2
-    pmaddwd     m5,         m0
-    movu        m4,         [r0 + 28]
-    pshufb      m4,         m4, m2
-    pmaddwd     m4,         m0
-    phaddd      m5,         m4
-    paddd       m5,         m1
-%ifidn %1, pp
-    psrad       m3,         INTERP_SHIFT_PP
-    psrad       m5,         INTERP_SHIFT_PP
-    packusdw    m3,         m5
-    CLIPW       m3,         m6,    m7
-%else
-    psrad       m3,         INTERP_SHIFT_PS
-    psrad       m5,         INTERP_SHIFT_PS
-    packssdw    m3,         m5
-%endif
-    movh        [r2 + 16],  m3
-    movhps      [r2 + 24],  m3
-%endmacro
-
-cglobal chroma_filter_pp_16x1_internal
-    FILTER_W16_1 pp
-    ret
-
-cglobal chroma_filter_ps_16x1_internal
-    FILTER_W16_1 ps
-    ret
-
-%macro FILTER_W24_1 1
-    movu        m3,         [r0]
-    pshufb      m3,         m3, m2
-    pmaddwd     m3,         m0
-    movu        m4,         [r0 + 4]
-    pshufb      m4,         m4, m2
-    pmaddwd     m4,         m0
-    phaddd      m3,         m4
-    paddd       m3,         m1
-
-    movu        m5,         [r0 + 8]
-    pshufb      m5,         m5, m2
-    pmaddwd     m5,         m0
-    movu        m4,         [r0 + 12]
-    pshufb      m4,         m4, m2
-    pmaddwd     m4,         m0
-    phaddd      m5,         m4
-    paddd       m5,         m1
-%ifidn %1, pp
-    psrad       m3,         INTERP_SHIFT_PP
-    psrad       m5,         INTERP_SHIFT_PP
-    packusdw    m3,         m5
-    CLIPW       m3,         m6,    m7
-%else
-    psrad       m3,         INTERP_SHIFT_PS
-    psrad       m5,         INTERP_SHIFT_PS
-    packssdw    m3,         m5
-%endif
-    movh        [r2],       m3
-    movhps      [r2 + 8],   m3
-
-    movu        m3,         [r0 + 16]
-    pshufb      m3,         m3, m2
-    pmaddwd     m3,         m0
-    movu        m4,         [r0 + 20]
-    pshufb      m4,         m4, m2
-    pmaddwd     m4,         m0
-    phaddd      m3,         m4
-    paddd       m3,         m1
-
-    movu        m5,         [r0 + 24]
-    pshufb      m5,         m5, m2
-    pmaddwd     m5,         m0
-    movu        m4,         [r0 + 28]
-    pshufb      m4,         m4, m2
-    pmaddwd     m4,         m0
-    phaddd      m5,         m4
-    paddd       m5,         m1
-%ifidn %1, pp
-    psrad       m3,         INTERP_SHIFT_PP
-    psrad       m5,         INTERP_SHIFT_PP
-    packusdw    m3,         m5
-    CLIPW       m3,         m6,    m7
-%else
-    psrad       m3,         INTERP_SHIFT_PS
-    psrad       m5,         INTERP_SHIFT_PS
-    packssdw    m3,         m5
-%endif
-    movh        [r2 + 16],  m3
-    movhps      [r2 + 24],  m3
-
-    movu        m3,         [r0 + 32]
-    pshufb      m3,         m3, m2
-    pmaddwd     m3,         m0
-    movu        m4,         [r0 + 36]
-    pshufb      m4,         m4, m2
-    pmaddwd     m4,         m0
-    phaddd      m3,         m4
-    paddd       m3,         m1
-
-    movu        m5,         [r0 + 40]
-    pshufb      m5,         m5, m2
-    pmaddwd     m5,         m0
-    movu        m4,         [r0 + 44]
-    pshufb      m4,         m4, m2
-    pmaddwd     m4,         m0
-    phaddd      m5,         m4
-    paddd       m5,         m1
-%ifidn %1, pp
-    psrad       m3,         INTERP_SHIFT_PP
-    psrad       m5,         INTERP_SHIFT_PP
-    packusdw    m3,         m5
-    CLIPW       m3,         m6,    m7
-%else
-    psrad       m3,         INTERP_SHIFT_PS
-    psrad       m5,         INTERP_SHIFT_PS
-    packssdw    m3,         m5
-%endif
-    movh        [r2 + 32],  m3
-    movhps      [r2 + 40],  m3
-%endmacro
-
-cglobal chroma_filter_pp_24x1_internal
-    FILTER_W24_1 pp
-    ret
-
-cglobal chroma_filter_ps_24x1_internal
-    FILTER_W24_1 ps
-    ret
-
-%macro FILTER_W32_1 1
-    movu        m3,         [r0]
-    pshufb      m3,         m3, m2
-    pmaddwd     m3,         m0
-    movu        m4,         [r0 + 4]
-    pshufb      m4,         m4, m2
-    pmaddwd     m4,         m0
-    phaddd      m3,         m4
-    paddd       m3,         m1
-
-    movu        m5,         [r0 + 8]
-    pshufb      m5,         m5, m2
-    pmaddwd     m5,         m0
-    movu        m4,         [r0 + 12]
-    pshufb      m4,         m4, m2
-    pmaddwd     m4,         m0
-    phaddd      m5,         m4
-    paddd       m5,         m1
-%ifidn %1, pp
-    psrad       m3,         INTERP_SHIFT_PP
-    psrad       m5,         INTERP_SHIFT_PP
-    packusdw    m3,         m5
-    CLIPW       m3,         m6,    m7
-%else
-    psrad       m3,         INTERP_SHIFT_PS
-    psrad       m5,         INTERP_SHIFT_PS
-    packssdw    m3,         m5
-%endif
-    movh        [r2],       m3
-    movhps      [r2 + 8],   m3
-
-    movu        m3,         [r0 + 16]
-    pshufb      m3,         m3, m2
-    pmaddwd     m3,         m0
-    movu        m4,         [r0 + 20]
-    pshufb      m4,         m4, m2
-    pmaddwd     m4,         m0
-    phaddd      m3,         m4
-    paddd       m3,         m1
-
-    movu        m5,         [r0 + 24]
-    pshufb      m5,         m5, m2
-    pmaddwd     m5,         m0
-    movu        m4,         [r0 + 28]
-    pshufb      m4,         m4, m2
-    pmaddwd     m4,         m0
-    phaddd      m5,         m4
-    paddd       m5,         m1
-%ifidn %1, pp
-    psrad       m3,         INTERP_SHIFT_PP
-    psrad       m5,         INTERP_SHIFT_PP
-    packusdw    m3,         m5
-    CLIPW       m3,         m6,    m7
-%else
-    psrad       m3,         INTERP_SHIFT_PS
-    psrad       m5,         INTERP_SHIFT_PS
-    packssdw    m3,         m5
-%endif
-    movh        [r2 + 16],  m3
-    movhps      [r2 + 24],  m3
-
-    movu        m3,         [r0 + 32]
-    pshufb      m3,         m3, m2
-    pmaddwd     m3,         m0
-    movu        m4,         [r0 + 36]
-    pshufb      m4,         m4, m2
-    pmaddwd     m4,         m0
-    phaddd      m3,         m4
-    paddd       m3,         m1
-
-    movu        m5,         [r0 + 40]
-    pshufb      m5,         m5, m2
-    pmaddwd     m5,         m0
-    movu        m4,         [r0 + 44]
-    pshufb      m4,         m4, m2
-    pmaddwd     m4,         m0
-    phaddd      m5,         m4
-    paddd       m5,         m1
-%ifidn %1, pp
-    psrad       m3,         INTERP_SHIFT_PP
-    psrad       m5,         INTERP_SHIFT_PP
-    packusdw    m3,         m5
-    CLIPW       m3,         m6,    m7
-%else
-    psrad       m3,         INTERP_SHIFT_PS
-    psrad       m5,         INTERP_SHIFT_PS
-    packssdw    m3,         m5
-%endif
-    movh        [r2 + 32],  m3
-    movhps      [r2 + 40],  m3
-
-    movu        m3,         [r0 + 48]
-    pshufb      m3,         m3, m2
-    pmaddwd     m3,         m0
-    movu        m4,         [r0 + 52]
-    pshufb      m4,         m4, m2
-    pmaddwd     m4,         m0
-    phaddd      m3,         m4
-    paddd       m3,         m1
-
-    movu        m5,         [r0 + 56]
-    pshufb      m5,         m5, m2
-    pmaddwd     m5,         m0
-    movu        m4,         [r0 + 60]
-    pshufb      m4,         m4, m2
-    pmaddwd     m4,         m0
-    phaddd      m5,         m4
-    paddd       m5,         m1
-%ifidn %1, pp
-    psrad       m3,         INTERP_SHIFT_PP
-    psrad       m5,         INTERP_SHIFT_PP
-    packusdw    m3,         m5
-    CLIPW       m3,         m6,    m7
-%else
-    psrad       m3,         INTERP_SHIFT_PS
-    psrad       m5,         INTERP_SHIFT_PS
-    packssdw    m3,         m5
-%endif
-    movh        [r2 + 48],  m3
-    movhps      [r2 + 56],  m3
-%endmacro
-
-cglobal chroma_filter_pp_32x1_internal
-    FILTER_W32_1 pp
-    ret
-
-cglobal chroma_filter_ps_32x1_internal
-    FILTER_W32_1 ps
-    ret
-
-%macro FILTER_W8o_1 2
-    movu        m3,         [r0 + %2]
-    pshufb      m3,         m3, m2
-    pmaddwd     m3,         m0
-    movu        m4,         [r0 + %2 + 4]
-    pshufb      m4,         m4, m2
-    pmaddwd     m4,         m0
-    phaddd      m3,         m4
-    paddd       m3,         m1
-
-    movu        m5,         [r0 + %2 + 8]
-    pshufb      m5,         m5, m2
-    pmaddwd     m5,         m0
-    movu        m4,         [r0 + %2 + 12]
-    pshufb      m4,         m4, m2
-    pmaddwd     m4,         m0
-    phaddd      m5,         m4
-    paddd       m5,         m1
-%ifidn %1, pp
-    psrad       m3,         INTERP_SHIFT_PP
-    psrad       m5,         INTERP_SHIFT_PP
-    packusdw    m3,         m5
-    CLIPW       m3,         m6,    m7
-%else
-    psrad       m3,         INTERP_SHIFT_PS
-    psrad       m5,         INTERP_SHIFT_PS
-    packssdw    m3,         m5
-%endif
-    movh        [r2 + %2],       m3
-    movhps      [r2 + %2 + 8],   m3
-%endmacro
-
-%macro FILTER_W48_1 1
-    FILTER_W8o_1 %1, 0
-    FILTER_W8o_1 %1, 16
-    FILTER_W8o_1 %1, 32
-    FILTER_W8o_1 %1, 48
-    FILTER_W8o_1 %1, 64
-    FILTER_W8o_1 %1, 80
-%endmacro
-
-cglobal chroma_filter_pp_48x1_internal
-    FILTER_W48_1 pp
-    ret
-
-cglobal chroma_filter_ps_48x1_internal
-    FILTER_W48_1 ps
-    ret
-
-%macro FILTER_W64_1 1
-    FILTER_W8o_1 %1, 0
-    FILTER_W8o_1 %1, 16
-    FILTER_W8o_1 %1, 32
-    FILTER_W8o_1 %1, 48
-    FILTER_W8o_1 %1, 64
-    FILTER_W8o_1 %1, 80
-    FILTER_W8o_1 %1, 96
-    FILTER_W8o_1 %1, 112
-%endmacro
-
-cglobal chroma_filter_pp_64x1_internal
-    FILTER_W64_1 pp
-    ret
-
-cglobal chroma_filter_ps_64x1_internal
-    FILTER_W64_1 ps
-    ret
-    
-;-----------------------------------------------------------------------------
-; void interp_4tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-; IPFILTER_CHROMA width, height, pp|ps, index of the GPR used as PIC table base, GPR count, XMM count
-INIT_XMM sse4
-%macro IPFILTER_CHROMA 6
-cglobal interp_4tap_horiz_%3_%1x%2, 4, %5, %6
-; Setup shared by both flavours; the per-row work is done by chroma_filter_%3_%1x1_internal.
-    add         r3,        r3                       ; dstStride *= 2
-    add         r1,        r1                       ; srcStride *= 2
-    sub         r0,         2                       ; src -= 2 bytes
-    mov         r4d,        r4m                     ; coeffIdx, loaded as a 32-bit value
-    add         r4d,        r4d                     ; coeffIdx * 2; with the * 4 scale below -> 8-byte table rows
-
-%ifdef PIC
-    lea         r%4,       [tab_ChromaCoeff]        ; PIC: table address goes through scratch register r%4
-    movh        m0,       [r%4 + r4 * 4]
-%else
-    movh        m0,       [tab_ChromaCoeff + r4 * 4]
-%endif
-
-    punpcklqdq  m0,       m0                        ; duplicate the 4 coefficients into both halves of m0
-    mova        m2,       [tab_Tm16]                ; byte-shuffle table, presumably consumed by the row helper
-
-%ifidn %3, ps
-    mova        m1,       [INTERP_OFFSET_PS]
-    cmp         r5m, byte 0                         ; 6th argument (the banner above lists only five)
-    je          .skip
-    sub         r0, r1                              ; non-zero: start one source row earlier ...
-    call chroma_filter_%3_%1x1_internal             ; ... and filter three additional rows
-    add         r0, r1
-    add         r2, r3
-    call chroma_filter_%3_%1x1_internal
-    add         r0, r1
-    add         r2, r3
-    call chroma_filter_%3_%1x1_internal
-    add         r0, r1
-    add         r2, r3
-.skip:
-%else
-    mova        m1,         [tab_c_32]              ; pp: constant 32 in every dword
-    pxor        m6,         m6                      ; m6 = 0
-    mova        m7,         [pw_pixel_max]
-%endif
-; The regular %2 rows: first row here, remaining %2 - 1 rows unrolled by %rep.
-    call chroma_filter_%3_%1x1_internal
-%rep %2 - 1
-    add         r0,       r1                        ; next source row
-    add         r2,       r3                        ; next destination row
-    call chroma_filter_%3_%1x1_internal
-%endrep
-RET
-%endmacro
-IPFILTER_CHROMA 6, 8, pp, 5, 6, 8       ; pp instances: PIC base in r5, 6 GPRs, 8 XMM registers
-IPFILTER_CHROMA 8, 2, pp, 5, 6, 8
-IPFILTER_CHROMA 8, 4, pp, 5, 6, 8
-IPFILTER_CHROMA 8, 6, pp, 5, 6, 8
-IPFILTER_CHROMA 8, 8, pp, 5, 6, 8
-IPFILTER_CHROMA 8, 16, pp, 5, 6, 8
-IPFILTER_CHROMA 8, 32, pp, 5, 6, 8
-IPFILTER_CHROMA 12, 16, pp, 5, 6, 8
-IPFILTER_CHROMA 16, 4, pp, 5, 6, 8
-IPFILTER_CHROMA 16, 8, pp, 5, 6, 8
-IPFILTER_CHROMA 16, 12, pp, 5, 6, 8
-IPFILTER_CHROMA 16, 16, pp, 5, 6, 8
-IPFILTER_CHROMA 16, 32, pp, 5, 6, 8
-IPFILTER_CHROMA 24, 32, pp, 5, 6, 8
-IPFILTER_CHROMA 32, 8, pp, 5, 6, 8
-IPFILTER_CHROMA 32, 16, pp, 5, 6, 8
-IPFILTER_CHROMA 32, 24, pp, 5, 6, 8
-IPFILTER_CHROMA 32, 32, pp, 5, 6, 8
-; ps instances: r5m is read as a sixth argument, so the PIC base moves to r6 (7 GPRs); 6 XMM registers
-IPFILTER_CHROMA 6, 8, ps, 6, 7, 6
-IPFILTER_CHROMA 8, 2, ps, 6, 7, 6
-IPFILTER_CHROMA 8, 4, ps, 6, 7, 6
-IPFILTER_CHROMA 8, 6, ps, 6, 7, 6
-IPFILTER_CHROMA 8, 8, ps, 6, 7, 6
-IPFILTER_CHROMA 8, 16, ps, 6, 7, 6
-IPFILTER_CHROMA 8, 32, ps, 6, 7, 6
-IPFILTER_CHROMA 12, 16, ps, 6, 7, 6
-IPFILTER_CHROMA 16, 4, ps, 6, 7, 6
-IPFILTER_CHROMA 16, 8, ps, 6, 7, 6
-IPFILTER_CHROMA 16, 12, ps, 6, 7, 6
-IPFILTER_CHROMA 16, 16, ps, 6, 7, 6
-IPFILTER_CHROMA 16, 32, ps, 6, 7, 6
-IPFILTER_CHROMA 24, 32, ps, 6, 7, 6
-IPFILTER_CHROMA 32, 8, ps, 6, 7, 6
-IPFILTER_CHROMA 32, 16, ps, 6, 7, 6
-IPFILTER_CHROMA 32, 24, ps, 6, 7, 6
-IPFILTER_CHROMA 32, 32, ps, 6, 7, 6
-; Second group of sizes, pp then ps (same register arguments as above).
-IPFILTER_CHROMA 6, 16, pp, 5, 6, 8
-IPFILTER_CHROMA 8, 12, pp, 5, 6, 8
-IPFILTER_CHROMA 8, 64, pp, 5, 6, 8
-IPFILTER_CHROMA 12, 32, pp, 5, 6, 8
-IPFILTER_CHROMA 16, 24, pp, 5, 6, 8
-IPFILTER_CHROMA 16, 64, pp, 5, 6, 8
-IPFILTER_CHROMA 24, 64, pp, 5, 6, 8
-IPFILTER_CHROMA 32, 48, pp, 5, 6, 8
-IPFILTER_CHROMA 32, 64, pp, 5, 6, 8
-IPFILTER_CHROMA 6, 16, ps, 6, 7, 6
-IPFILTER_CHROMA 8, 12, ps, 6, 7, 6
-IPFILTER_CHROMA 8, 64, ps, 6, 7, 6
-IPFILTER_CHROMA 12, 32, ps, 6, 7, 6
-IPFILTER_CHROMA 16, 24, ps, 6, 7, 6
-IPFILTER_CHROMA 16, 64, ps, 6, 7, 6
-IPFILTER_CHROMA 24, 64, ps, 6, 7, 6
-IPFILTER_CHROMA 32, 48, ps, 6, 7, 6
-IPFILTER_CHROMA 32, 64, ps, 6, 7, 6
-; Widths 48 and 64; these call the chroma_filter_*_48x1/64x1_internal row helpers.
-IPFILTER_CHROMA 48, 64, pp, 5, 6, 8
-IPFILTER_CHROMA 64, 48, pp, 5, 6, 8
-IPFILTER_CHROMA 64, 64, pp, 5, 6, 8
-IPFILTER_CHROMA 64, 32, pp, 5, 6, 8
-IPFILTER_CHROMA 64, 16, pp, 5, 6, 8
-IPFILTER_CHROMA 48, 64, ps, 6, 7, 6
-IPFILTER_CHROMA 64, 48, ps, 6, 7, 6
-IPFILTER_CHROMA 64, 64, ps, 6, 7, 6
-IPFILTER_CHROMA 64, 32, ps, 6, 7, 6
-IPFILTER_CHROMA 64, 16, ps, 6, 7, 6
-
-;-------------------------------------------------------------------------------------------------------------
-; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-------------------------------------------------------------------------------------------------------------
-INIT_YMM avx2
-%macro IPFILTER_CHROMA_avx2_6xN 1       ; %1 = row count; two rows per loop iteration, so %1 must be even
-cglobal interp_4tap_horiz_pp_6x%1, 5,6,8
-    add             r1, r1                        ; srcStride *= 2; full width so an intptr_t stride is not truncated
-    add             r3, r3                        ; dstStride *= 2; full width, as in the SSE4 IPFILTER_CHROMA macro
-    sub             r0, 2                         ; src -= 2 bytes
-    mov             r4d, r4m                      ; coeffIdx as a 32-bit value
-%ifdef PIC
-    lea             r5, [tab_ChromaCoeff]
-    vpbroadcastq    m0, [r5 + r4 * 8]             ; 8-byte coefficient row replicated into every qword
-%else
-    vpbroadcastq    m0, [tab_ChromaCoeff + r4 * 8]
-%endif
-    mova            m1, [interp8_hpp_shuf]        ; m1 = input window shuffle
-    vpbroadcastd    m2, [pd_32]                   ; m2 = rounding term added before the shift
-    pxor            m5, m5                        ; m5 = 0, lower clamp bound
-    mova            m6, [idct8_shuf2]             ; m6 = shuffle restoring output order after the pack
-    mova            m7, [pw_pixel_max]            ; m7 = upper clamp bound
-
-    mov             r4d, %1/2                     ; r4 is free again: loop counter in row pairs
-.loop:
-    vbroadcasti128  m3, [r0]                      ; row 0: 16 bytes at src in both lanes
-    vbroadcasti128  m4, [r0 + 8]                  ; 16 bytes at src + 8 in both lanes
-    pshufb          m3, m1
-    pshufb          m4, m1
-
-    pmaddwd         m3, m0                        ; word products, summed in pairs
-    pmaddwd         m4, m0
-    phaddd          m3, m4                        ; add the pair sums -> one dword per output
-    paddd           m3, m2
-    psrad           m3, INTERP_SHIFT_PP           ; m3 = DWORD[7 6 3 2 5 4 1 0]
-
-    packusdw        m3, m3                        ; dwords -> words, unsigned saturation
-    vpermq          m3, m3, q2020                 ; qwords 0 and 2 into the low lane
-    pshufb          xm3, xm6                      ; m3 = WORD[7 6 5 4 3 2 1 0]
-    CLIPW           xm3, xm5, xm7                 ; clamp to [0, pw_pixel_max]
-    movq            [r2], xm3                     ; store words 0-3
-    pextrd          [r2 + 8], xm3, 2              ; store words 4-5 (dword 2) -> 6 outputs in total
-
-    vbroadcasti128  m3, [r0 + r1]                 ; row 1: same sequence one source row down
-    vbroadcasti128  m4, [r0 + r1 + 8]
-    pshufb          m3, m1
-    pshufb          m4, m1
-
-    pmaddwd         m3, m0
-    pmaddwd         m4, m0
-    phaddd          m3, m4
-    paddd           m3, m2
-    psrad           m3, INTERP_SHIFT_PP           ; m3 = DWORD[7 6 3 2 5 4 1 0]
-
-    packusdw        m3, m3
-    vpermq          m3, m3, q2020
-    pshufb          xm3, xm6                      ; m3 = WORD[7 6 5 4 3 2 1 0]
-    CLIPW           xm3, xm5, xm7
-    movq            [r2 + r3], xm3
-    pextrd          [r2 + r3 + 8], xm3, 2
-
-    lea             r0, [r0 + r1 * 2]             ; advance both pointers by two rows
-    lea             r2, [r2 + r3 * 2]
-    dec             r4d
-    jnz             .loop
-    RET
-%endmacro
-IPFILTER_CHROMA_avx2_6xN 8
-IPFILTER_CHROMA_avx2_6xN 16
-
-;-------------------------------------------------------------------------------------------------------------
-; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-------------------------------------------------------------------------------------------------------------
-INIT_YMM avx2
-cglobal interp_4tap_horiz_pp_8x2, 5,6,8
-    add             r1, r1                       ; srcStride *= 2; full width so an intptr_t stride is not truncated
-    add             r3, r3                       ; dstStride *= 2; full width, as in the SSE4 IPFILTER_CHROMA macro
-    sub             r0, 2                        ; src -= 2 bytes
-    mov             r4d, r4m                     ; coeffIdx as a 32-bit value
-%ifdef PIC
-    lea             r5, [tab_ChromaCoeff]
-    vpbroadcastq    m0, [r5 + r4 * 8]            ; 8-byte coefficient row replicated into every qword
-%else
-    vpbroadcastq    m0, [tab_ChromaCoeff + r4 * 8]
-%endif
-    mova            m1, [interp8_hpp_shuf]       ; m1 = input window shuffle
-    vpbroadcastd    m2, [pd_32]                  ; m2 = rounding term added before the shift
-    pxor            m5, m5                       ; m5 = 0, lower clamp bound
-    mova            m6, [idct8_shuf2]            ; m6 = shuffle restoring output order after the pack
-    mova            m7, [pw_pixel_max]           ; m7 = upper clamp bound
-; Row 0.
-    vbroadcasti128  m3, [r0]
-    vbroadcasti128  m4, [r0 + 8]
-    pshufb          m3, m1
-    pshufb          m4, m1
-
-    pmaddwd         m3, m0                       ; word products, summed in pairs
-    pmaddwd         m4, m0
-    phaddd          m3, m4                       ; add the pair sums -> one dword per output
-    paddd           m3, m2
-    psrad           m3, INTERP_SHIFT_PP          ; m3 = DWORD[7 6 3 2 5 4 1 0]
-
-    packusdw        m3, m3                       ; dwords -> words, unsigned saturation
-    vpermq          m3, m3,q2020                 ; qwords 0 and 2 into the low lane
-    pshufb          xm3, xm6                     ; m3 = WORD[7 6 5 4 3 2 1 0]
-    CLIPW           xm3, xm5, xm7                ; clamp to [0, pw_pixel_max]
-    movu            [r2], xm3                    ; store 8 outputs
-; Row 1: identical sequence one stride further down.
-    vbroadcasti128  m3, [r0 + r1]
-    vbroadcasti128  m4, [r0 + r1 + 8]
-    pshufb          m3, m1
-    pshufb          m4, m1
-
-    pmaddwd         m3, m0
-    pmaddwd         m4, m0
-    phaddd          m3, m4
-    paddd           m3, m2
-    psrad           m3, INTERP_SHIFT_PP           ; m3 = DWORD[7 6 3 2 5 4 1 0]
-
-    packusdw        m3, m3
-    vpermq          m3, m3,q2020
-    pshufb          xm3, xm6                      ; m3 = WORD[7 6 5 4 3 2 1 0]
-    CLIPW           xm3, xm5, xm7
-    movu            [r2 + r3], xm3
-    RET
-
-;-------------------------------------------------------------------------------------------------------------
-; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-------------------------------------------------------------------------------------------------------------
-INIT_YMM avx2
-cglobal interp_4tap_horiz_pp_8x4, 5,6,8
-    add             r1, r1                      ; srcStride *= 2; full width so an intptr_t stride is not truncated
-    add             r3, r3                      ; dstStride *= 2; full width, as in the SSE4 IPFILTER_CHROMA macro
-    sub             r0, 2                       ; src -= 2 bytes
-    mov             r4d, r4m                    ; coeffIdx as a 32-bit value
-%ifdef PIC
-    lea             r5, [tab_ChromaCoeff]
-    vpbroadcastq    m0, [r5 + r4 * 8]           ; 8-byte coefficient row replicated into every qword
-%else
-    vpbroadcastq    m0, [tab_ChromaCoeff + r4 * 8]
-%endif
-    mova            m1, [interp8_hpp_shuf]      ; m1 = input window shuffle
-    vpbroadcastd    m2, [pd_32]                 ; m2 = rounding term added before the shift
-    pxor            m5, m5                      ; m5 = 0, lower clamp bound
-    mova            m6, [idct8_shuf2]           ; m6 = shuffle restoring output order after the pack
-    mova            m7, [pw_pixel_max]          ; m7 = upper clamp bound
-; Four rows, fully unrolled as two row pairs.
-%rep 2
-    vbroadcasti128  m3, [r0]
-    vbroadcasti128  m4, [r0 + 8]
-    pshufb          m3, m1
-    pshufb          m4, m1
-
-    pmaddwd         m3, m0                      ; word products, summed in pairs
-    pmaddwd         m4, m0
-    phaddd          m3, m4                      ; add the pair sums -> one dword per output
-    paddd           m3, m2
-    psrad           m3, 6                       ; m3 = DWORD[7 6 3 2 5 4 1 0]
-
-    packusdw        m3, m3                      ; dwords -> words, unsigned saturation
-    vpermq          m3, m3,q2020                ; qwords 0 and 2 into the low lane
-    pshufb          xm3, xm6                    ; m3 = WORD[7 6 5 4 3 2 1 0]
-    CLIPW           xm3, xm5, xm7               ; clamp to [0, pw_pixel_max]
-    movu            [r2], xm3                   ; store 8 outputs
-
-    vbroadcasti128  m3, [r0 + r1]               ; second row of the pair
-    vbroadcasti128  m4, [r0 + r1 + 8]
-    pshufb          m3, m1
-    pshufb          m4, m1
-
-    pmaddwd         m3, m0
-    pmaddwd         m4, m0
-    phaddd          m3, m4
-    paddd           m3, m2
-    psrad           m3, 6                       ; m3 = DWORD[7 6 3 2 5 4 1 0]
-
-    packusdw        m3, m3
-    vpermq          m3, m3,q2020
-    pshufb          xm3, xm6                    ; m3 = WORD[7 6 5 4 3 2 1 0]
-    CLIPW           xm3, xm5, xm7
-    movu            [r2 + r3], xm3
-
-    lea             r0, [r0 + r1 * 2]           ; advance both pointers by two rows
-    lea             r2, [r2 + r3 * 2]
-%endrep
-    RET
-
-;-------------------------------------------------------------------------------------------------------------
-; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-------------------------------------------------------------------------------------------------------------
-INIT_YMM avx2
-%macro IPFILTER_CHROMA_avx2_8xN 1       ; %1 = row count; two rows per loop iteration, so %1 must be even
-cglobal interp_4tap_horiz_pp_8x%1, 5,6,8
-    add             r1, r1                        ; srcStride *= 2; full width so an intptr_t stride is not truncated
-    add             r3, r3                        ; dstStride *= 2; full width, as in the SSE4 IPFILTER_CHROMA macro
-    sub             r0, 2                         ; src -= 2 bytes
-    mov             r4d, r4m                      ; coeffIdx as a 32-bit value
-%ifdef PIC
-    lea             r5, [tab_ChromaCoeff]
-    vpbroadcastq    m0, [r5 + r4 * 8]             ; 8-byte coefficient row replicated into every qword
-%else
-    vpbroadcastq    m0, [tab_ChromaCoeff + r4 * 8]
-%endif
-    mova            m1, [interp8_hpp_shuf]        ; m1 = input window shuffle
-    vpbroadcastd    m2, [pd_32]                   ; m2 = rounding term added before the shift
-    pxor            m5, m5                        ; m5 = 0, lower clamp bound
-    mova            m6, [idct8_shuf2]             ; m6 = shuffle restoring output order after the pack
-    mova            m7, [pw_pixel_max]            ; m7 = upper clamp bound
-
-    mov             r4d, %1/2                     ; r4 is free again: loop counter in row pairs
-.loop:
-    vbroadcasti128  m3, [r0]
-    vbroadcasti128  m4, [r0 + 8]
-    pshufb          m3, m1
-    pshufb          m4, m1
-
-    pmaddwd         m3, m0                        ; word products, summed in pairs
-    pmaddwd         m4, m0
-    phaddd          m3, m4                        ; add the pair sums -> one dword per output
-    paddd           m3, m2
-    psrad           m3, 6                         ; m3 = DWORD[7 6 3 2 5 4 1 0]
-
-    packusdw        m3, m3                        ; dwords -> words, unsigned saturation
-    vpermq          m3, m3, q2020                 ; qwords 0 and 2 into the low lane
-    pshufb          xm3, xm6                      ; m3 = WORD[7 6 5 4 3 2 1 0]
-    CLIPW           xm3, xm5, xm7                 ; clamp to [0, pw_pixel_max]
-    movu            [r2], xm3                     ; store 8 outputs
-
-    vbroadcasti128  m3, [r0 + r1]                 ; second row of the pair
-    vbroadcasti128  m4, [r0 + r1 + 8]
-    pshufb          m3, m1
-    pshufb          m4, m1
-
-    pmaddwd         m3, m0
-    pmaddwd         m4, m0
-    phaddd          m3, m4
-    paddd           m3, m2
-    psrad           m3, 6                         ; m3 = DWORD[7 6 3 2 5 4 1 0]
-
-    packusdw        m3, m3
-    vpermq          m3, m3, q2020
-    pshufb          xm3, xm6                      ; m3 = WORD[7 6 5 4 3 2 1 0]
-    CLIPW           xm3, xm5, xm7
-    movu            [r2 + r3], xm3
-
-    lea             r0, [r0 + r1 * 2]             ; advance both pointers by two rows
-    lea             r2, [r2 + r3 * 2]
-    dec             r4d
-    jnz             .loop
-    RET
-%endmacro
-IPFILTER_CHROMA_avx2_8xN 6
-IPFILTER_CHROMA_avx2_8xN 8
-IPFILTER_CHROMA_avx2_8xN 12
-IPFILTER_CHROMA_avx2_8xN 16
-IPFILTER_CHROMA_avx2_8xN 32
-IPFILTER_CHROMA_avx2_8xN 64
-
-;-------------------------------------------------------------------------------------------------------------
-; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-------------------------------------------------------------------------------------------------------------
-INIT_YMM avx2
-%macro IPFILTER_CHROMA_avx2_16xN 1      ; %1 = row count; one row per loop iteration
-%if ARCH_X86_64
-cglobal interp_4tap_horiz_pp_16x%1, 5,6,9
-    add             r1, r1                      ; srcStride *= 2; full width so an intptr_t stride is not truncated
-    add             r3, r3                      ; dstStride *= 2; full width, as in the SSE4 IPFILTER_CHROMA macro
-    sub             r0, 2                       ; src -= 2 bytes
-    mov             r4d, r4m                    ; coeffIdx as a 32-bit value
-%ifdef PIC
-    lea             r5, [tab_ChromaCoeff]
-    vpbroadcastq    m0, [r5 + r4 * 8]           ; 8-byte coefficient row replicated into every qword
-%else
-    vpbroadcastq    m0, [tab_ChromaCoeff + r4 * 8]
-%endif
-    mova            m1, [interp8_hpp_shuf]      ; m1 = input window shuffle
-    vpbroadcastd    m2, [pd_32]                 ; m2 = rounding term added before the shift
-    pxor            m5, m5                      ; m5 = 0, lower clamp bound
-    mova            m6, [idct8_shuf2]           ; m6 = shuffle restoring output order after the pack
-    mova            m7, [pw_pixel_max]          ; m7 = upper clamp bound
-
-    mov             r4d, %1                     ; r4 is free again: row counter
-.loop:
-    vbroadcasti128  m3, [r0]                    ; outputs 0-7
-    vbroadcasti128  m4, [r0 + 8]
-
-    pshufb          m3, m1
-    pshufb          m4, m1
-
-    pmaddwd         m3, m0                      ; word products, summed in pairs
-    pmaddwd         m4, m0
-    phaddd          m3, m4                      ; add the pair sums -> one dword per output
-    paddd           m3, m2
-    psrad           m3, 6                       ; m3 = DWORD[7 6 3 2 5 4 1 0]
-
-    packusdw        m3, m3                      ; dwords -> words, unsigned saturation
-    vpermq          m3, m3, q2020               ; qwords 0 and 2 into the low lane
-    pshufb          xm3, xm6                    ; m3 = WORD[7 6 5 4 3 2 1 0]
-
-    vbroadcasti128  m4, [r0 + 16]               ; outputs 8-15, same sequence in m4/m8
-    vbroadcasti128  m8, [r0 + 24]
-
-    pshufb          m4, m1
-    pshufb          m8, m1
-
-    pmaddwd         m4, m0
-    pmaddwd         m8, m0
-    phaddd          m4, m8
-    paddd           m4, m2
-    psrad           m4, 6                       ; m4 = DWORD[15 14 11 10 13 12 9 8]
-
-    packusdw        m4, m4
-    vpermq          m4, m4, q2020
-    pshufb          xm4, xm6                    ; m4 = WORD[15 14 13 12 11 10 9 8]
-    vinserti128     m3, m3, xm4, 1              ; m3 = outputs 0-7 (low lane) | outputs 8-15 (high lane)
-    CLIPW           m3, m5, m7                  ; clamp to [0, pw_pixel_max]
-    movu            [r2], m3                    ; store 16 outputs
-
-    add             r0, r1                      ; next source row
-    add             r2, r3                      ; next destination row
-    dec             r4d
-    jnz             .loop
-    RET
-%endif
-%endmacro
-IPFILTER_CHROMA_avx2_16xN 4
-IPFILTER_CHROMA_avx2_16xN 8
-IPFILTER_CHROMA_avx2_16xN 12
-IPFILTER_CHROMA_avx2_16xN 16
-IPFILTER_CHROMA_avx2_16xN 24
-IPFILTER_CHROMA_avx2_16xN 32
-IPFILTER_CHROMA_avx2_16xN 64
-
-;-------------------------------------------------------------------------------------------------------------
-; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-------------------------------------------------------------------------------------------------------------
-INIT_YMM avx2
-%macro IPFILTER_CHROMA_avx2_32xN 1      ; %1 = row count; one row per loop iteration
-%if ARCH_X86_64
-cglobal interp_4tap_horiz_pp_32x%1, 5,6,9
-    add             r1, r1                      ; srcStride *= 2; full width so an intptr_t stride is not truncated
-    add             r3, r3                      ; dstStride *= 2; full width, as in the SSE4 IPFILTER_CHROMA macro
-    sub             r0, 2                       ; src -= 2 bytes
-    mov             r4d, r4m                    ; coeffIdx as a 32-bit value
-%ifdef PIC
-    lea             r5, [tab_ChromaCoeff]
-    vpbroadcastq    m0, [r5 + r4 * 8]           ; 8-byte coefficient row replicated into every qword
-%else
-    vpbroadcastq    m0, [tab_ChromaCoeff + r4 * 8]
-%endif
-    mova            m1, [interp8_hpp_shuf]      ; m1 = input window shuffle
-    vpbroadcastd    m2, [pd_32]                 ; m2 = rounding term added before the shift
-    pxor            m5, m5                      ; m5 = 0, lower clamp bound
-    mova            m6, [idct8_shuf2]           ; m6 = shuffle restoring output order after the pack
-    mova            m7, [pw_pixel_max]          ; m7 = upper clamp bound
-
-    mov             r4d, %1                     ; row counter in r4 (free after the coefficient load); only r0-r5 are declared
-.loop:
-%assign x 0
-%rep 2
-    vbroadcasti128  m3, [r0 + x]                ; first 8 outputs of this 32-byte group
-    vbroadcasti128  m4, [r0 + 8 + x]
-    pshufb          m3, m1
-    pshufb          m4, m1
-
-    pmaddwd         m3, m0                      ; word products, summed in pairs
-    pmaddwd         m4, m0
-    phaddd          m3, m4                      ; add the pair sums -> one dword per output
-    paddd           m3, m2
-    psrad           m3, 6                       ; m3 = DWORD[7 6 3 2 5 4 1 0]
-
-    packusdw        m3, m3                      ; dwords -> words, unsigned saturation
-    vpermq          m3, m3, q2020               ; qwords 0 and 2 into the low lane
-    pshufb          xm3, xm6                    ; m3 = WORD[7 6 5 4 3 2 1 0]
-
-    vbroadcasti128  m4, [r0 + 16 + x]           ; next 8 outputs, same sequence in m4/m8
-    vbroadcasti128  m8, [r0 + 24 + x]
-    pshufb          m4, m1
-    pshufb          m8, m1
-
-    pmaddwd         m4, m0
-    pmaddwd         m8, m0
-    phaddd          m4, m8
-    paddd           m4, m2
-    psrad           m4, 6                       ; m4 = DWORD[15 14 11 10 13 12 9 8]
-
-    packusdw        m4, m4
-    vpermq          m4, m4, q2020
-    pshufb          xm4, xm6                    ; m4 = WORD[15 14 13 12 11 10 9 8]
-    vinserti128     m3, m3, xm4, 1              ; m3 = first 8 outputs (low lane) | next 8 (high lane)
-    CLIPW           m3, m5, m7                  ; clamp to [0, pw_pixel_max]
-    movu            [r2 + x], m3                ; store 16 outputs
-    %assign x x+32
-    %endrep
-
-    add             r0, r1                      ; next source row
-    add             r2, r3                      ; next destination row
-    dec             r4d
-    jnz             .loop
-    RET
-%endif
-%endmacro
-IPFILTER_CHROMA_avx2_32xN 8
-IPFILTER_CHROMA_avx2_32xN 16
-IPFILTER_CHROMA_avx2_32xN 24
-IPFILTER_CHROMA_avx2_32xN 32
-IPFILTER_CHROMA_avx2_32xN 48
-IPFILTER_CHROMA_avx2_32xN 64
-
-;-------------------------------------------------------------------------------------------------------------
-; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-------------------------------------------------------------------------------------------------------------
-INIT_YMM avx2
-%macro IPFILTER_CHROMA_avx2_12xN 1      ; %1 = row count; one row per loop iteration
-%if ARCH_X86_64
-cglobal interp_4tap_horiz_pp_12x%1, 5,6,8
-    add             r1, r1                      ; srcStride *= 2; full width so an intptr_t stride is not truncated
-    add             r3, r3                      ; dstStride *= 2; full width, as in the SSE4 IPFILTER_CHROMA macro
-    sub             r0, 2                       ; src -= 2 bytes
-    mov             r4d, r4m                    ; coeffIdx as a 32-bit value
-%ifdef PIC
-    lea             r5, [tab_ChromaCoeff]
-    vpbroadcastq    m0, [r5 + r4 * 8]           ; 8-byte coefficient row replicated into every qword
-%else
-    vpbroadcastq    m0, [tab_ChromaCoeff + r4 * 8]
-%endif
-    mova            m1, [interp8_hpp_shuf]      ; m1 = input window shuffle
-    vpbroadcastd    m2, [pd_32]                 ; m2 = rounding term added before the shift
-    pxor            m5, m5                      ; m5 = 0, lower clamp bound
-    mova            m6, [idct8_shuf2]           ; m6 = shuffle restoring output order after the pack
-    mova            m7, [pw_pixel_max]          ; m7 = upper clamp bound
-
-    mov             r4d, %1                     ; r4 is free again: row counter
-.loop:
-    vbroadcasti128  m3, [r0]                    ; outputs 0-7
-    vbroadcasti128  m4, [r0 + 8]
-    pshufb          m3, m1
-    pshufb          m4, m1
-
-    pmaddwd         m3, m0                      ; word products, summed in pairs
-    pmaddwd         m4, m0
-    phaddd          m3, m4                      ; add the pair sums -> one dword per output
-    paddd           m3, m2
-    psrad           m3, 6                       ; m3 = DWORD[7 6 3 2 5 4 1 0]
-
-    packusdw        m3, m3                      ; dwords -> words, unsigned saturation
-    vpermq          m3, m3, q2020               ; qwords 0 and 2 into the low lane
-    pshufb          xm3, xm6                    ; m3 = WORD[7 6 5 4 3 2 1 0]
-    CLIPW           xm3, xm5, xm7               ; clamp to [0, pw_pixel_max]
-    movu            [r2], xm3                   ; store outputs 0-7
-
-    vbroadcasti128  m3, [r0 + 16]               ; outputs 8-15 are computed, only 8-11 are stored
-    vbroadcasti128  m4, [r0 + 24]
-    pshufb          m3, m1
-    pshufb          m4, m1
-
-    pmaddwd         m3, m0
-    pmaddwd         m4, m0
-    phaddd          m3, m4
-    paddd           m3, m2
-    psrad           m3, 6                       ; m3 = DWORD[15 14 11 10 13 12 9 8]
-
-    packusdw        m3, m3
-    vpermq          m3, m3, q2020
-    pshufb          xm3, xm6                    ; m3 = WORD[15 14 13 12 11 10 9 8]
-    CLIPW           xm3, xm5, xm7
-    movq            [r2 + 16], xm3              ; store the low 4 words -> 12 outputs per row
-
-    add             r0, r1                      ; next source row
-    add             r2, r3                      ; next destination row
-    dec             r4d
-    jnz             .loop
-    RET
-%endif
-%endmacro
-IPFILTER_CHROMA_avx2_12xN 16
-IPFILTER_CHROMA_avx2_12xN 32
-
-;-------------------------------------------------------------------------------------------------------------
-; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-------------------------------------------------------------------------------------------------------------
-INIT_YMM avx2
-%macro IPFILTER_CHROMA_avx2_24xN 1      ; %1 = row count; one row per loop iteration
-%if ARCH_X86_64
-cglobal interp_4tap_horiz_pp_24x%1, 5,6,9
-    add             r1, r1                      ; srcStride *= 2; full width so an intptr_t stride is not truncated
-    add             r3, r3                      ; dstStride *= 2; full width, as in the SSE4 IPFILTER_CHROMA macro
-    sub             r0, 2                       ; src -= 2 bytes
-    mov             r4d, r4m                    ; coeffIdx as a 32-bit value
-%ifdef PIC
-    lea             r5, [tab_ChromaCoeff]
-    vpbroadcastq    m0, [r5 + r4 * 8]           ; 8-byte coefficient row replicated into every qword
-%else
-    vpbroadcastq    m0, [tab_ChromaCoeff + r4 * 8]
-%endif
-    mova            m1, [interp8_hpp_shuf]      ; m1 = input window shuffle
-    vpbroadcastd    m2, [pd_32]                 ; m2 = rounding term added before the shift
-    pxor            m5, m5                      ; m5 = 0, lower clamp bound
-    mova            m6, [idct8_shuf2]           ; m6 = shuffle restoring output order after the pack
-    mova            m7, [pw_pixel_max]          ; m7 = upper clamp bound
-
-    mov             r4d, %1                     ; r4 is free again: row counter
-.loop:
-    vbroadcasti128  m3, [r0]                    ; outputs 0-7 as dwords in m3
-    vbroadcasti128  m4, [r0 + 8]
-    pshufb          m3, m1
-    pshufb          m4, m1
-
-    pmaddwd         m3, m0                      ; word products, summed in pairs
-    pmaddwd         m4, m0
-    phaddd          m3, m4                      ; add the pair sums -> one dword per output
-    paddd           m3, m2
-    psrad           m3, 6
-
-    vbroadcasti128  m4, [r0 + 16]               ; outputs 8-15 as dwords in m4
-    vbroadcasti128  m8, [r0 + 24]
-    pshufb          m4, m1
-    pshufb          m8, m1
-
-    pmaddwd         m4, m0
-    pmaddwd         m8, m0
-    phaddd          m4, m8
-    paddd           m4, m2
-    psrad           m4, 6
-
-    packusdw        m3, m4                      ; both groups -> words, unsigned saturation
-    vpermq          m3, m3, q3120               ; outputs 0-7 into the low lane, 8-15 into the high lane
-    pshufb          m3, m6                      ; restore ascending order inside each lane
-    CLIPW           m3, m5, m7                  ; clamp to [0, pw_pixel_max]
-    movu            [r2], m3                    ; store outputs 0-15
-
-    vbroadcasti128  m3, [r0 + 32]               ; outputs 16-23, 128-bit tail
-    vbroadcasti128  m4, [r0 + 40]
-    pshufb          m3, m1
-    pshufb          m4, m1
-
-    pmaddwd         m3, m0
-    pmaddwd         m4, m0
-    phaddd          m3, m4
-    paddd           m3, m2
-    psrad           m3, 6
-
-    packusdw        m3, m3
-    vpermq          m3, m3, q2020               ; qwords 0 and 2 into the low lane
-    pshufb          xm3, xm6
-    CLIPW           xm3, xm5, xm7
-    movu            [r2 + 32], xm3              ; store outputs 16-23
-
-    add             r0, r1                      ; next source row
-    add             r2, r3                      ; next destination row
-    dec             r4d
-    jnz             .loop
-    RET
-%endif
-%endmacro
-IPFILTER_CHROMA_avx2_24xN 32
-IPFILTER_CHROMA_avx2_24xN 64
-
-;-------------------------------------------------------------------------------------------------------------
-; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-------------------------------------------------------------------------------------------------------------
-INIT_YMM avx2
-%macro IPFILTER_CHROMA_avx2_64xN 1      ; %1 = row count; one row per loop iteration
-%if ARCH_X86_64
-cglobal interp_4tap_horiz_pp_64x%1, 5,6,9
-    add             r1, r1                      ; srcStride *= 2; full width so an intptr_t stride is not truncated
-    add             r3, r3                      ; dstStride *= 2; full width, as in the SSE4 IPFILTER_CHROMA macro
-    sub             r0, 2                       ; src -= 2 bytes
-    mov             r4d, r4m                    ; coeffIdx as a 32-bit value
-%ifdef PIC
-    lea             r5, [tab_ChromaCoeff]
-    vpbroadcastq    m0, [r5 + r4 * 8]           ; 8-byte coefficient row replicated into every qword
-%else
-    vpbroadcastq    m0, [tab_ChromaCoeff + r4 * 8]
-%endif
-    mova            m1, [interp8_hpp_shuf]      ; m1 = input window shuffle
-    vpbroadcastd    m2, [pd_32]                 ; m2 = rounding term added before the shift
-    pxor            m5, m5                      ; m5 = 0, lower clamp bound
-    mova            m6, [idct8_shuf2]           ; m6 = shuffle restoring output order after the pack
-    mova            m7, [pw_pixel_max]          ; m7 = upper clamp bound
-
-    mov             r4d, %1                     ; row counter in r4 (free after the coefficient load); only r0-r5 are declared
-.loop:
-%assign x 0
-%rep 4
-    vbroadcasti128  m3, [r0 + x]                ; first 8 outputs of this 32-byte group, as dwords
-    vbroadcasti128  m4, [r0 + 8 + x]
-    pshufb          m3, m1
-    pshufb          m4, m1
-
-    pmaddwd         m3, m0                      ; word products, summed in pairs
-    pmaddwd         m4, m0
-    phaddd          m3, m4                      ; add the pair sums -> one dword per output
-    paddd           m3, m2
-    psrad           m3, 6
-
-    vbroadcasti128  m4, [r0 + 16 + x]           ; next 8 outputs, as dwords in m4
-    vbroadcasti128  m8, [r0 + 24 + x]
-    pshufb          m4, m1
-    pshufb          m8, m1
-
-    pmaddwd         m4, m0
-    pmaddwd         m8, m0
-    phaddd          m4, m8
-    paddd           m4, m2
-    psrad           m4, 6
-
-    packusdw        m3, m4                      ; both groups -> words, unsigned saturation
-    vpermq          m3, m3, q3120               ; first 8 outputs into the low lane, next 8 into the high lane
-    pshufb          m3, m6                      ; restore ascending order inside each lane
-    CLIPW           m3, m5, m7                  ; clamp to [0, pw_pixel_max]
-    movu            [r2 + x], m3                ; store 16 outputs
-    %assign x x+32
-    %endrep
-
-    add             r0, r1                      ; next source row
-    add             r2, r3                      ; next destination row
-    dec             r4d
-    jnz             .loop
-    RET
-%endif
-%endmacro
-IPFILTER_CHROMA_avx2_64xN 16
-IPFILTER_CHROMA_avx2_64xN 32
-IPFILTER_CHROMA_avx2_64xN 48
-IPFILTER_CHROMA_avx2_64xN 64
-
-;-------------------------------------------------------------------------------------------------------------
-; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-------------------------------------------------------------------------------------------------------------
-INIT_YMM avx2
-%if ARCH_X86_64
-cglobal interp_4tap_horiz_pp_48x64, 5,6,9
-    add             r1, r1                      ; srcStride *= 2; full width so an intptr_t stride is not truncated
-    add             r3, r3                      ; dstStride *= 2; full width, as in the SSE4 IPFILTER_CHROMA macro
-    sub             r0, 2                       ; src -= 2 bytes
-    mov             r4d, r4m                    ; coeffIdx as a 32-bit value
-%ifdef PIC
-    lea             r5, [tab_ChromaCoeff]
-    vpbroadcastq    m0, [r5 + r4 * 8]           ; 8-byte coefficient row replicated into every qword
-%else
-    vpbroadcastq    m0, [tab_ChromaCoeff + r4 * 8]
-%endif
-    mova            m1, [interp8_hpp_shuf]      ; m1 = input window shuffle
-    vpbroadcastd    m2, [pd_32]                 ; m2 = rounding term added before the shift
-    pxor            m5, m5                      ; m5 = 0, lower clamp bound
-    mova            m6, [idct8_shuf2]           ; m6 = shuffle restoring output order after the pack
-    mova            m7, [pw_pixel_max]          ; m7 = upper clamp bound
-
-    mov             r4d, 64                     ; r4 is free again: row counter
-.loop:
-%assign x 0
-%rep 3
-    vbroadcasti128  m3, [r0 + x]                ; first 8 outputs of this 32-byte group, as dwords
-    vbroadcasti128  m4, [r0 + 8 + x]
-    pshufb          m3, m1
-    pshufb          m4, m1
-
-    pmaddwd         m3, m0                      ; word products, summed in pairs
-    pmaddwd         m4, m0
-    phaddd          m3, m4                      ; add the pair sums -> one dword per output
-    paddd           m3, m2
-    psrad           m3, 6
-
-    vbroadcasti128  m4, [r0 + 16 + x]           ; next 8 outputs, as dwords in m4
-    vbroadcasti128  m8, [r0 + 24 + x]
-    pshufb          m4, m1
-    pshufb          m8, m1
-
-    pmaddwd         m4, m0
-    pmaddwd         m8, m0
-    phaddd          m4, m8
-    paddd           m4, m2
-    psrad           m4, 6
-
-    packusdw        m3, m4                      ; both groups -> words, unsigned saturation
-    vpermq          m3, m3, q3120               ; first 8 outputs into the low lane, next 8 into the high lane
-    pshufb          m3, m6                      ; restore ascending order inside each lane
-    CLIPW           m3, m5, m7                  ; clamp to [0, pw_pixel_max]
-    movu            [r2 + x], m3                ; store 16 outputs
-%assign x x+32
-%endrep
-
-    add             r0, r1                      ; next source row
-    add             r2, r3                      ; next destination row
-    dec             r4d
-    jnz             .loop
-    RET
-%endif
-
 ;-----------------------------------------------------------------------------------------------------------------------------
 ;void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
 ;-----------------------------------------------------------------------------------------------------------------------------
@@ -4292,709 +2535,3 @@
     jnz                 .loop0
     RET
 %endif
-%macro IPFILTER_CHROMA_PS_8xN_AVX2 1    ; %1 = row count (without the extra rows added when isRowExt is set)
-INIT_YMM avx2
-%if ARCH_X86_64 == 1
-cglobal interp_4tap_horiz_ps_8x%1, 4, 7, 6
-    add                 r1, r1                  ; srcStride *= 2; full width so an intptr_t stride is not truncated
-    add                 r3, r3                  ; dstStride *= 2; full width, as in the SSE4 IPFILTER_CHROMA macro
-    mov                 r4d, r4m                ; coeffIdx
-    mov                 r5d, r5m                ; isRowExt
-
-%ifdef PIC
-    lea                 r6, [tab_ChromaCoeff]
-    vpbroadcastq        m0, [r6 + r4 * 8]       ; 8-byte coefficient row replicated into every qword
-%else
-    vpbroadcastq        m0, [tab_ChromaCoeff + r4 * 8]
-%endif
-    mova                m3, [interp8_hpp_shuf]
-    vbroadcasti128      m2, [INTERP_OFFSET_PS]
-
-    ; register map
-    ; m0 = coefficients, m2 = INTERP_OFFSET_PS, m3 = input shuffle, m4/m5 = work (m1 unused)
-
-    sub                 r0, 2                   ; src -= 2 bytes
-    test                r5d, r5d
-    mov                 r4d, %1                 ; row counter; mov leaves the flags from `test` intact
-    jz                  .loop0
-    sub                 r0, r1                  ; isRowExt: start one source row earlier ...
-    add                 r4d, 3                  ; ... and produce three extra rows
-
-.loop0:
-    vbroadcasti128      m4, [r0]
-    vbroadcasti128      m5, [r0 + 8]
-    pshufb              m4, m3
-    pshufb              m5, m3
-    pmaddwd             m4, m0                  ; word products, summed in pairs
-    pmaddwd             m5, m0
-    phaddd              m4, m5                  ; add the pair sums -> one dword per output
-    paddd               m4, m2
-    vpermq              m4, m4, q3120           ; swap the middle qwords -> outputs in ascending order
-    psrad               m4, INTERP_SHIFT_PS
-    vextracti128        xm5, m4, 1
-    packssdw            xm4, xm5                ; dwords -> words, signed saturation (no clamp in the ps path)
-    movu                [r2], xm4               ; store 8 outputs
-    add                 r2, r3
-    add                 r0, r1
-    dec                 r4d
-    jnz                 .loop0
-    RET
-%endif
-%endmacro
-
-    IPFILTER_CHROMA_PS_8xN_AVX2 4
-    IPFILTER_CHROMA_PS_8xN_AVX2 8
-    IPFILTER_CHROMA_PS_8xN_AVX2 16
-    IPFILTER_CHROMA_PS_8xN_AVX2 32
-    IPFILTER_CHROMA_PS_8xN_AVX2 6
-    IPFILTER_CHROMA_PS_8xN_AVX2 2
-    IPFILTER_CHROMA_PS_8xN_AVX2 12
-    IPFILTER_CHROMA_PS_8xN_AVX2 64
-
-%macro IPFILTER_CHROMA_PS_16xN_AVX2 1   ; %1 = row count (without the extra rows added when isRowExt is set)
-INIT_YMM avx2
-%if ARCH_X86_64 == 1
-cglobal interp_4tap_horiz_ps_16x%1, 4, 7, 6
-    add                 r1, r1                  ; srcStride *= 2; full width so an intptr_t stride is not truncated
-    add                 r3, r3                  ; dstStride *= 2; full width, as in the SSE4 IPFILTER_CHROMA macro
-    mov                 r4d, r4m                ; coeffIdx
-    mov                 r5d, r5m                ; isRowExt
-
-%ifdef PIC
-    lea                 r6, [tab_ChromaCoeff]
-    vpbroadcastq        m0, [r6 + r4 * 8]       ; 8-byte coefficient row replicated into every qword
-%else
-    vpbroadcastq        m0, [tab_ChromaCoeff + r4 * 8]
-%endif
-    mova                m3, [interp8_hpp_shuf]
-    vbroadcasti128      m2, [INTERP_OFFSET_PS]
-
-    ; register map
-    ; m0 = coefficients, m2 = INTERP_OFFSET_PS, m3 = input shuffle, m4/m5 = work (m1 unused)
-
-    sub                 r0, 2                   ; src -= 2 bytes
-    test                r5d, r5d
-    mov                 r4d, %1                 ; row counter; mov leaves the flags from `test` intact
-    jz                  .loop0
-    sub                 r0, r1                  ; isRowExt: start one source row earlier ...
-    add                 r4d, 3                  ; ... and produce three extra rows
-
-.loop0:
-    vbroadcasti128      m4, [r0]                ; outputs 0-7
-    vbroadcasti128      m5, [r0 + 8]
-    pshufb              m4, m3
-    pshufb              m5, m3
-    pmaddwd             m4, m0                  ; word products, summed in pairs
-    pmaddwd             m5, m0
-    phaddd              m4, m5                  ; add the pair sums -> one dword per output
-    paddd               m4, m2
-    vpermq              m4, m4, q3120           ; swap the middle qwords -> outputs in ascending order
-    psrad               m4, INTERP_SHIFT_PS
-    vextracti128        xm5, m4, 1
-    packssdw            xm4, xm5                ; dwords -> words, signed saturation (no clamp in the ps path)
-    movu                [r2], xm4
-
-    vbroadcasti128      m4, [r0 + 16]           ; outputs 8-15, same sequence
-    vbroadcasti128      m5, [r0 + 24]
-    pshufb              m4, m3
-    pshufb              m5, m3
-    pmaddwd             m4, m0
-    pmaddwd             m5, m0
-    phaddd              m4, m5
-    paddd               m4, m2
-    vpermq              m4, m4, q3120
-    psrad               m4, INTERP_SHIFT_PS
-    vextracti128        xm5, m4, 1
-    packssdw            xm4, xm5
-    movu                [r2 + 16], xm4
-
-    add                 r2, r3                  ; next destination row
-    add                 r0, r1                  ; next source row
-    dec                 r4d
-    jnz                 .loop0
-    RET
-%endif
-%endmacro
-
-IPFILTER_CHROMA_PS_16xN_AVX2 16
-IPFILTER_CHROMA_PS_16xN_AVX2 8
-IPFILTER_CHROMA_PS_16xN_AVX2 32
-IPFILTER_CHROMA_PS_16xN_AVX2 12
-IPFILTER_CHROMA_PS_16xN_AVX2 4
-IPFILTER_CHROMA_PS_16xN_AVX2 64
-IPFILTER_CHROMA_PS_16xN_AVX2 24
-
-%macro IPFILTER_CHROMA_PS_24xN_AVX2 1   ; %1 = row count (without the extra rows added when isRowExt is set)
-INIT_YMM avx2
-%if ARCH_X86_64 == 1
-cglobal interp_4tap_horiz_ps_24x%1, 4, 7, 6
-    add                 r1, r1                  ; srcStride *= 2; full width so an intptr_t stride is not truncated
-    add                 r3, r3                  ; dstStride *= 2; full width, as in the SSE4 IPFILTER_CHROMA macro
-    mov                 r4d, r4m                ; coeffIdx
-    mov                 r5d, r5m                ; isRowExt
-
-%ifdef PIC
-    lea                 r6, [tab_ChromaCoeff]
-    vpbroadcastq        m0, [r6 + r4 * 8]       ; 8-byte coefficient row replicated into every qword
-%else
-    vpbroadcastq        m0, [tab_ChromaCoeff + r4 * 8]
-%endif
-    mova                m3, [interp8_hpp_shuf]
-    vbroadcasti128      m2, [INTERP_OFFSET_PS]
-
-    ; register map
-    ; m0 = coefficients, m2 = INTERP_OFFSET_PS, m3 = input shuffle, m4/m5 = work (m1 unused)
-
-    sub                 r0, 2                   ; src -= 2 bytes
-    test                r5d, r5d
-    mov                 r4d, %1                 ; row counter; mov leaves the flags from `test` intact
-    jz                  .loop0
-    sub                 r0, r1                  ; isRowExt: start one source row earlier ...
-    add                 r4d, 3                  ; ... and produce three extra rows
-
-.loop0:
-    vbroadcasti128      m4, [r0]                ; outputs 0-7
-    vbroadcasti128      m5, [r0 + 8]
-    pshufb              m4, m3
-    pshufb              m5, m3
-    pmaddwd             m4, m0                  ; word products, summed in pairs
-    pmaddwd             m5, m0
-    phaddd              m4, m5                  ; add the pair sums -> one dword per output
-    paddd               m4, m2
-    vpermq              m4, m4, q3120           ; swap the middle qwords -> outputs in ascending order
-    psrad               m4, INTERP_SHIFT_PS
-    vextracti128        xm5, m4, 1
-    packssdw            xm4, xm5                ; dwords -> words, signed saturation (no clamp in the ps path)
-    movu                [r2], xm4
-
-    vbroadcasti128      m4, [r0 + 16]           ; outputs 8-15, same sequence
-    vbroadcasti128      m5, [r0 + 24]
-    pshufb              m4, m3
-    pshufb              m5, m3
-    pmaddwd             m4, m0
-    pmaddwd             m5, m0
-    phaddd              m4, m5
-    paddd               m4, m2
-    vpermq              m4, m4, q3120
-    psrad               m4, INTERP_SHIFT_PS
-    vextracti128        xm5, m4, 1
-    packssdw            xm4, xm5
-    movu                [r2 + 16], xm4
-
-    vbroadcasti128      m4, [r0 + 32]           ; outputs 16-23, same sequence
-    vbroadcasti128      m5, [r0 + 40]
-    pshufb              m4, m3
-    pshufb              m5, m3
-    pmaddwd             m4, m0
-    pmaddwd             m5, m0
-    phaddd              m4, m5
-    paddd               m4, m2
-    vpermq              m4, m4, q3120
-    psrad               m4, INTERP_SHIFT_PS
-    vextracti128        xm5, m4, 1
-    packssdw            xm4, xm5
-    movu                [r2 + 32], xm4
-
-    add                 r2, r3                  ; next destination row
-    add                 r0, r1                  ; next source row
-    dec                 r4d
-    jnz                 .loop0
-    RET
-%endif
-%endmacro
-
-IPFILTER_CHROMA_PS_24xN_AVX2 32
-IPFILTER_CHROMA_PS_24xN_AVX2 64
-
-%macro IPFILTER_CHROMA_PS_12xN_AVX2 1   ; %1 = row count (without the extra rows added when isRowExt is set)
-INIT_YMM avx2
-%if ARCH_X86_64 == 1
-cglobal interp_4tap_horiz_ps_12x%1, 4, 7, 6
-    add                 r1, r1                  ; srcStride *= 2; full width so an intptr_t stride is not truncated
-    add                 r3, r3                  ; dstStride *= 2; full width, as in the SSE4 IPFILTER_CHROMA macro
-    mov                 r4d, r4m                ; coeffIdx
-    mov                 r5d, r5m                ; isRowExt
-
-%ifdef PIC
-    lea                 r6, [tab_ChromaCoeff]
-    vpbroadcastq        m0, [r6 + r4 * 8]       ; 8-byte coefficient row replicated into every qword
-%else
-    vpbroadcastq        m0, [tab_ChromaCoeff + r4 * 8]
-%endif
-    mova                m3, [interp8_hpp_shuf]
-    vbroadcasti128      m2, [INTERP_OFFSET_PS]
-
-    ; register map
-    ; m0 = coefficients, m2 = INTERP_OFFSET_PS, m3 = input shuffle, m4/m5 = work (m1 unused)
-
-    sub                 r0, 2                   ; src -= 2 bytes
-    test                r5d, r5d
-    mov                 r4d, %1                 ; row counter; mov leaves the flags from `test` intact
-    jz                  .loop0
-    sub                 r0, r1                  ; isRowExt: start one source row earlier ...
-    add                 r4d, 3                  ; ... and produce three extra rows
-
-.loop0:
-    vbroadcasti128      m4, [r0]                ; outputs 0-7
-    vbroadcasti128      m5, [r0 + 8]
-    pshufb              m4, m3
-    pshufb              m5, m3
-    pmaddwd             m4, m0                  ; word products, summed in pairs
-    pmaddwd             m5, m0
-    phaddd              m4, m5                  ; add the pair sums -> one dword per output
-    paddd               m4, m2
-    vpermq              m4, m4, q3120           ; swap the middle qwords -> outputs in ascending order
-    psrad               m4, INTERP_SHIFT_PS
-    vextracti128        xm5, m4, 1
-    packssdw            xm4, xm5                ; dwords -> words, signed saturation (no clamp in the ps path)
-    movu                [r2], xm4
-
-    vbroadcasti128      m4, [r0 + 16]           ; outputs 8-11: a single 16-byte load is enough
-    pshufb              m4, m3
-    pmaddwd             m4, m0
-    phaddd              m4, m4                  ; register added with itself: each lane holds its two sums twice
-    paddd               m4, m2
-    vpermq              m4, m4, q3120
-    psrad               m4, INTERP_SHIFT_PS
-    vextracti128        xm5, m4, 1
-    packssdw            xm4, xm5
-    movq                [r2 + 16], xm4          ; store the low 4 words -> 12 outputs per row
-
-    add                 r2, r3                  ; next destination row
-    add                 r0, r1                  ; next source row
-    dec                 r4d
-    jnz                 .loop0
-    RET
-%endif
-%endmacro
-
-IPFILTER_CHROMA_PS_12xN_AVX2 16
-IPFILTER_CHROMA_PS_12xN_AVX2 32
-
-%macro IPFILTER_CHROMA_PS_32xN_AVX2 1
-INIT_YMM avx2
-%if ARCH_X86_64 == 1
-cglobal interp_4tap_horiz_ps_32x%1, 4, 7, 6
-    add                 r1d, r1d
-    add                 r3d, r3d
-    mov                 r4d, r4m
-    mov                 r5d, r5m
-
-%ifdef PIC
-    lea                 r6, [tab_ChromaCoeff]
-    vpbroadcastq        m0, [r6 + r4 * 8]
-%else
-    vpbroadcastq        m0, [tab_ChromaCoeff + r4 * 8]
-%endif
-    mova                m3, [interp8_hpp_shuf]
-    vbroadcasti128      m2, [INTERP_OFFSET_PS]
-
-    ; register map
-    ; m0 , m1 interpolate coeff
-
-    sub                 r0, 2
-    test                r5d, r5d
-    mov                 r4d, %1
-    jz                  .loop0
-    sub                 r0, r1
-    add                 r4d, 3
-
-.loop0:
-    vbroadcasti128      m4, [r0]
-    vbroadcasti128      m5, [r0 + 8]
-    pshufb              m4, m3
-    pshufb              m5, m3
-    pmaddwd             m4, m0
-    pmaddwd             m5, m0
-    phaddd              m4, m5
-    paddd               m4, m2
-    vpermq              m4, m4, q3120
-    psrad               m4, INTERP_SHIFT_PS
-    vextracti128        xm5, m4, 1
-    packssdw            xm4, xm5
-    movu                [r2], xm4
-
-    vbroadcasti128      m4, [r0 + 16]
-    vbroadcasti128      m5, [r0 + 24]
-    pshufb              m4, m3
-    pshufb              m5, m3
-    pmaddwd             m4, m0
-    pmaddwd             m5, m0
-    phaddd              m4, m5
-    paddd               m4, m2
-    vpermq              m4, m4, q3120
-    psrad               m4, INTERP_SHIFT_PS
-    vextracti128        xm5, m4, 1
-    packssdw            xm4, xm5
-    movu                [r2 + 16], xm4
-
-    vbroadcasti128      m4, [r0 + 32]
-    vbroadcasti128      m5, [r0 + 40]
-    pshufb              m4, m3
-    pshufb              m5, m3
-    pmaddwd             m4, m0
-    pmaddwd             m5, m0
-    phaddd              m4, m5
-    paddd               m4, m2
-    vpermq              m4, m4, q3120
-    psrad               m4, INTERP_SHIFT_PS
-    vextracti128        xm5, m4, 1
-    packssdw            xm4, xm5
-    movu                [r2 + 32], xm4
-
-    vbroadcasti128      m4, [r0 + 48]
-    vbroadcasti128      m5, [r0 + 56]
-    pshufb              m4, m3
-    pshufb              m5, m3
-    pmaddwd             m4, m0
-    pmaddwd             m5, m0
-    phaddd              m4, m5
-    paddd               m4, m2
-    vpermq              m4, m4, q3120
-    psrad               m4, INTERP_SHIFT_PS
-    vextracti128        xm5, m4, 1
-    packssdw            xm4, xm5
-    movu                [r2 + 48], xm4
-
-    add                 r2, r3
-    add                 r0, r1
-    dec                 r4d
-    jnz                 .loop0
-    RET
-%endif
-%endmacro
-
-IPFILTER_CHROMA_PS_32xN_AVX2 32
-IPFILTER_CHROMA_PS_32xN_AVX2 16
-IPFILTER_CHROMA_PS_32xN_AVX2 24
-IPFILTER_CHROMA_PS_32xN_AVX2 8
-IPFILTER_CHROMA_PS_32xN_AVX2 64
-IPFILTER_CHROMA_PS_32xN_AVX2 48
-
-
-%macro IPFILTER_CHROMA_PS_64xN_AVX2 1
-INIT_YMM avx2
-%if ARCH_X86_64 == 1
-cglobal interp_4tap_horiz_ps_64x%1, 4, 7, 6
-    add                 r1d, r1d
-    add                 r3d, r3d
-    mov                 r4d, r4m
-    mov                 r5d, r5m
-
-%ifdef PIC
-    lea                 r6, [tab_ChromaCoeff]
-    vpbroadcastq        m0, [r6 + r4 * 8]
-%else
-    vpbroadcastq        m0, [tab_ChromaCoeff + r4 * 8]
-%endif
-    mova                m3, [interp8_hpp_shuf]
-    vbroadcasti128      m2, [INTERP_OFFSET_PS]
-
-    ; register map
-    ; m0 , m1 interpolate coeff
-
-    sub                 r0, 2
-    test                r5d, r5d
-    mov                 r4d, %1
-    jz                  .loop0
-    sub                 r0, r1
-    add                 r4d, 3
-
-.loop0:
-    vbroadcasti128      m4, [r0]
-    vbroadcasti128      m5, [r0 + 8]
-    pshufb              m4, m3
-    pshufb              m5, m3
-    pmaddwd             m4, m0
-    pmaddwd             m5, m0
-    phaddd              m4, m5
-    paddd               m4, m2
-    vpermq              m4, m4, q3120
-    psrad               m4, INTERP_SHIFT_PS
-    vextracti128        xm5, m4, 1
-    packssdw            xm4, xm5
-    movu                [r2], xm4
-
-    vbroadcasti128      m4, [r0 + 16]
-    vbroadcasti128      m5, [r0 + 24]
-    pshufb              m4, m3
-    pshufb              m5, m3
-    pmaddwd             m4, m0
-    pmaddwd             m5, m0
-    phaddd              m4, m5
-    paddd               m4, m2
-    vpermq              m4, m4, q3120
-    psrad               m4, INTERP_SHIFT_PS
-    vextracti128        xm5, m4, 1
-    packssdw            xm4, xm5
-    movu                [r2 + 16], xm4
-
-    vbroadcasti128      m4, [r0 + 32]
-    vbroadcasti128      m5, [r0 + 40]
-    pshufb              m4, m3
-    pshufb              m5, m3
-    pmaddwd             m4, m0
-    pmaddwd             m5, m0
-    phaddd              m4, m5
-    paddd               m4, m2
-    vpermq              m4, m4, q3120
-    psrad               m4, INTERP_SHIFT_PS
-    vextracti128        xm5, m4, 1
-    packssdw            xm4, xm5
-    movu                [r2 + 32], xm4
-
-    vbroadcasti128      m4, [r0 + 48]
-    vbroadcasti128      m5, [r0 + 56]
-    pshufb              m4, m3
-    pshufb              m5, m3
-    pmaddwd             m4, m0
-    pmaddwd             m5, m0
-    phaddd              m4, m5
-    paddd               m4, m2
-    vpermq              m4, m4, q3120
-    psrad               m4, INTERP_SHIFT_PS
-    vextracti128        xm5, m4, 1
-    packssdw            xm4, xm5
-    movu                [r2 + 48], xm4
-
-    vbroadcasti128      m4, [r0 + 64]
-    vbroadcasti128      m5, [r0 + 72]
-    pshufb              m4, m3
-    pshufb              m5, m3
-    pmaddwd             m4, m0
-    pmaddwd             m5, m0
-    phaddd              m4, m5
-    paddd               m4, m2
-    vpermq              m4, m4, q3120
-    psrad               m4, INTERP_SHIFT_PS
-    vextracti128        xm5, m4, 1
-    packssdw            xm4, xm5
-    movu                [r2 + 64], xm4
-
-    vbroadcasti128      m4, [r0 + 80]
-    vbroadcasti128      m5, [r0 + 88]
-    pshufb              m4, m3
-    pshufb              m5, m3
-    pmaddwd             m4, m0
-    pmaddwd             m5, m0
-    phaddd              m4, m5
-    paddd               m4, m2
-    vpermq              m4, m4, q3120
-    psrad               m4, INTERP_SHIFT_PS
-    vextracti128        xm5, m4, 1
-    packssdw            xm4, xm5
-    movu                [r2 + 80], xm4
-
-    vbroadcasti128      m4, [r0 + 96]
-    vbroadcasti128      m5, [r0 + 104]
-    pshufb              m4, m3
-    pshufb              m5, m3
-    pmaddwd             m4, m0
-    pmaddwd             m5, m0
-    phaddd              m4, m5
-    paddd               m4, m2
-    vpermq              m4, m4, q3120
-    psrad               m4, INTERP_SHIFT_PS
-    vextracti128        xm5, m4, 1
-    packssdw            xm4, xm5
-    movu                [r2 + 96], xm4
-
-    vbroadcasti128      m4, [r0 + 112]
-    vbroadcasti128      m5, [r0 + 120]
-    pshufb              m4, m3
-    pshufb              m5, m3
-    pmaddwd             m4, m0
-    pmaddwd             m5, m0
-    phaddd              m4, m5
-    paddd               m4, m2
-    vpermq              m4, m4, q3120
-    psrad               m4, INTERP_SHIFT_PS
-    vextracti128        xm5, m4, 1
-    packssdw            xm4, xm5
-    movu                [r2 + 112], xm4
-
-    add                 r2, r3
-    add                 r0, r1
-    dec                 r4d
-    jnz                 .loop0
-    RET
-%endif
-%endmacro
-
-IPFILTER_CHROMA_PS_64xN_AVX2 64
-IPFILTER_CHROMA_PS_64xN_AVX2 48
-IPFILTER_CHROMA_PS_64xN_AVX2 32
-IPFILTER_CHROMA_PS_64xN_AVX2 16
-
-INIT_YMM avx2
-%if ARCH_X86_64 == 1
-cglobal interp_4tap_horiz_ps_48x64, 4, 7, 6
-    add                 r1d, r1d
-    add                 r3d, r3d
-    mov                 r4d, r4m
-    mov                 r5d, r5m
-
-%ifdef PIC
-    lea                 r6, [tab_ChromaCoeff]
-    vpbroadcastq        m0, [r6 + r4 * 8]
-%else
-    vpbroadcastq        m0, [tab_ChromaCoeff + r4 * 8]
-%endif
-    mova                m3, [interp8_hpp_shuf]
-    vbroadcasti128      m2, [INTERP_OFFSET_PS]
-
-    ; register map
-    ; m0 , m1 interpolate coeff
-
-    sub                 r0, 2
-    test                r5d, r5d
-    mov                 r4d, 64
-    jz                  .loop0
-    sub                 r0, r1
-    add                 r4d, 3
-
-.loop0:
-    vbroadcasti128      m4, [r0]
-    vbroadcasti128      m5, [r0 + 8]
-    pshufb              m4, m3
-    pshufb              m5, m3
-    pmaddwd             m4, m0
-    pmaddwd             m5, m0
-    phaddd              m4, m5
-    paddd               m4, m2
-    vpermq              m4, m4, q3120
-    psrad               m4, INTERP_SHIFT_PS
-    vextracti128        xm5, m4, 1
-    packssdw            xm4, xm5
-    movu                [r2], xm4
-
-    vbroadcasti128      m4, [r0 + 16]
-    vbroadcasti128      m5, [r0 + 24]
-    pshufb              m4, m3
-    pshufb              m5, m3
-    pmaddwd             m4, m0
-    pmaddwd             m5, m0
-    phaddd              m4, m5
-    paddd               m4, m2
-    vpermq              m4, m4, q3120
-    psrad               m4, INTERP_SHIFT_PS
-    vextracti128        xm5, m4, 1
-    packssdw            xm4, xm5
-    movu                [r2 + 16], xm4
-
-    vbroadcasti128      m4, [r0 + 32]
-    vbroadcasti128      m5, [r0 + 40]
-    pshufb              m4, m3
-    pshufb              m5, m3
-    pmaddwd             m4, m0
-    pmaddwd             m5, m0
-    phaddd              m4, m5
-    paddd               m4, m2
-    vpermq              m4, m4, q3120
-    psrad               m4, INTERP_SHIFT_PS
-    vextracti128        xm5, m4, 1
-    packssdw            xm4, xm5
-    movu                [r2 + 32], xm4
-
-    vbroadcasti128      m4, [r0 + 48]
-    vbroadcasti128      m5, [r0 + 56]
-    pshufb              m4, m3
-    pshufb              m5, m3
-    pmaddwd             m4, m0
-    pmaddwd             m5, m0
-    phaddd              m4, m5
-    paddd               m4, m2
-    vpermq              m4, m4, q3120
-    psrad               m4, INTERP_SHIFT_PS
-    vextracti128        xm5, m4, 1
-    packssdw            xm4, xm5
-    movu                [r2 + 48], xm4
-
-    vbroadcasti128      m4, [r0 + 64]
-    vbroadcasti128      m5, [r0 + 72]
-    pshufb              m4, m3
-    pshufb              m5, m3
-    pmaddwd             m4, m0
-    pmaddwd             m5, m0
-    phaddd              m4, m5
-    paddd               m4, m2
-    vpermq              m4, m4, q3120
-    psrad               m4, INTERP_SHIFT_PS
-    vextracti128        xm5, m4, 1
-    packssdw            xm4, xm5
-    movu                [r2 + 64], xm4
-
-    vbroadcasti128      m4, [r0 + 80]
-    vbroadcasti128      m5, [r0 + 88]
-    pshufb              m4, m3
-    pshufb              m5, m3
-    pmaddwd             m4, m0
-    pmaddwd             m5, m0
-    phaddd              m4, m5
-    paddd               m4, m2
-    vpermq              m4, m4, q3120
-    psrad               m4, INTERP_SHIFT_PS
-    vextracti128        xm5, m4, 1
-    packssdw            xm4, xm5
-    movu                [r2 + 80], xm4
-
-    add                 r2, r3
-    add                 r0, r1
-    dec                 r4d
-    jnz                 .loop0
-    RET
-%endif
-
-%macro IPFILTER_CHROMA_PS_6xN_AVX2 1
-INIT_YMM avx2
-%if ARCH_X86_64 == 1
-cglobal interp_4tap_horiz_ps_6x%1, 4, 7, 6
-    add                 r1d, r1d
-    add                 r3d, r3d
-    mov                 r4d, r4m
-    mov                 r5d, r5m
-
-%ifdef PIC
-    lea                 r6, [tab_ChromaCoeff]
-    vpbroadcastq        m0, [r6 + r4 * 8]
-%else
-    vpbroadcastq        m0, [tab_ChromaCoeff + r4 * 8]
-%endif
-    mova                m3, [interp8_hpp_shuf]
-    vbroadcasti128      m2, [INTERP_OFFSET_PS]
-
-    ; register map
-    ; m0 , m1 interpolate coeff
-
-    sub                 r0, 2
-    test                r5d, r5d
-    mov                 r4d, %1
-    jz                  .loop0
-    sub                 r0, r1
-    add                 r4d, 3
-
-.loop0:
-    vbroadcasti128      m4, [r0]
-    vbroadcasti128      m5, [r0 + 8]
-    pshufb              m4, m3
-    pshufb              m5, m3
-    pmaddwd             m4, m0
-    pmaddwd             m5, m0
-    phaddd              m4, m5
-    paddd               m4, m2
-    vpermq              m4, m4, q3120
-    psrad               m4, INTERP_SHIFT_PS
-    vextracti128        xm5, m4, 1
-    packssdw            xm4, xm5
-    movq                [r2], xm4
-    pextrd              [r2 + 8], xm4, 2
-    add                 r2, r3
-    add                 r0, r1
-    dec                 r4d
-    jnz                 .loop0
-    RET
-%endif
-%endmacro
-
-    IPFILTER_CHROMA_PS_6xN_AVX2 8
-    IPFILTER_CHROMA_PS_6xN_AVX2 16
diff -r 2f381d267c11 -r 53ecf4111573 source/common/x86/h4-ipfilter16.asm
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/x86/h4-ipfilter16.asm	Tue Feb 13 00:28:25 2018 -0800
@@ -0,0 +1,2632 @@
+;*****************************************************************************
+;* Copyright (C) 2013-2017 MulticoreWare, Inc
+;*
+;* Authors: Nabajit Deka <nabajit at multicorewareinc.com>
+;*          Murugan Vairavel <murugan at multicorewareinc.com>
+;*          Min Chen <chenm003 at 163.com>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+;*
+;* This program is also available under a commercial proprietary license.
+;* For more information, contact us at license @ x265.com.
+;*****************************************************************************/
+
+%include "x86inc.asm"
+%include "x86util.asm"
+
+
+%define INTERP_OFFSET_PP        pd_32
+%define INTERP_SHIFT_PP         6
+
+%if BIT_DEPTH == 10
+    %define INTERP_SHIFT_PS         2
+    %define INTERP_OFFSET_PS        pd_n32768
+    %define INTERP_SHIFT_SP         10
+    %define INTERP_OFFSET_SP        h4_pd_524800
+%elif BIT_DEPTH == 12
+    %define INTERP_SHIFT_PS         4
+    %define INTERP_OFFSET_PS        pd_n131072
+    %define INTERP_SHIFT_SP         8
+    %define INTERP_OFFSET_SP        pd_524416
+%else
+    %error Unsupport bit depth!
+%endif
+
+
+SECTION_RODATA 32
+
+tab_c_32:         times 8 dd 32                  ; dword 32 x 8; loaded as the additive offset (m1) by the pp paths before their shift by 6
+h4_pd_524800:        times 8 dd 524800           ; dword 524800 x 8; selected as INTERP_OFFSET_SP when BIT_DEPTH == 10
+
+tab_Tm16:         db 0, 1, 2, 3, 4,  5,  6, 7, 2, 3, 4,  5, 6, 7, 8, 9  ; pshufb control: low half = words 0..3, high half = words 1..4
+
+tab_ChromaCoeff:  dw  0, 64,  0,  0              ; 4-tap rows of four 16-bit coefficients (8 bytes each), indexed by coeffIdx; this is row 0
+                  dw -2, 58, 10, -2              ; row 1
+                  dw -4, 54, 16, -2              ; row 2
+                  dw -6, 46, 28, -4              ; row 3
+                  dw -4, 36, 36, -4              ; row 4
+                  dw -4, 28, 46, -6              ; row 5
+                  dw -2, 16, 54, -4              ; row 6
+                  dw -2, 10, 58, -2              ; row 7
+
+const h4_interp8_hpp_shuf,     db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9  ; pshufb control: words 0..3 then words 1..4
+                            db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13  ; second 16 bytes: words 2..5 then words 3..6
+
+SECTION .text
+cextern pd_8
+cextern pd_32
+cextern pw_pixel_max
+cextern pd_524416
+cextern pd_n32768
+cextern pd_n131072
+cextern pw_2000
+cextern idct8_shuf2
+
+%macro FILTERH_W2_4_sse3 2              ; 2 samples x 4 rows; %1 = byte offset within the row, %2 = pp or ps
+    movh        m3,     [r0 + %1]                   ; row 0: words 0..3 -> low half
+    movhps      m3,     [r0 + %1 + 2]               ; row 0: words 1..4 -> high half
+    pmaddwd     m3,     m0                          ; m0 = 4 taps in each half (set by the invoking code); yields 2 partial sums per output sample
+    movh        m4,     [r0 + r1 + %1]              ; row 1, same layout
+    movhps      m4,     [r0 + r1 + %1 + 2]
+    pmaddwd     m4,     m0
+    pshufd      m2,     m3,     q2301               ; swap adjacent dwords ...
+    paddd       m3,     m2                          ; ... so every dword pair now holds the full 4-tap sum
+    pshufd      m2,     m4,     q2301
+    paddd       m4,     m2
+    pshufd      m3,     m3,     q3120               ; gather the two distinct sums into the low qword
+    pshufd      m4,     m4,     q3120
+    punpcklqdq  m3,     m4                          ; m3 = row0[0], row0[1], row1[0], row1[1]
+    paddd       m3,     m1                          ; m1 = rounding/offset constant supplied by the invoking code
+    movh        m5,     [r0 + 2 * r1 + %1]          ; row 2
+    movhps      m5,     [r0 + 2 * r1 + %1 + 2]
+    pmaddwd     m5,     m0
+    movh        m4,     [r0 + r4 + %1]              ; row 3; the invoking code sets r4 = 3 * r1
+    movhps      m4,     [r0 + r4 + %1 + 2]
+    pmaddwd     m4,     m0
+    pshufd      m2,     m5,     q2301
+    paddd       m5,     m2
+    pshufd      m2,     m4,     q2301
+    paddd       m4,     m2
+    pshufd      m5,     m5,     q3120
+    pshufd      m4,     m4,     q3120
+    punpcklqdq  m5,     m4                          ; m5 = row2[0], row2[1], row3[0], row3[1]
+    paddd       m5,     m1
+%ifidn %2, pp
+    psrad       m3,     6
+    psrad       m5,     6
+    packssdw    m3,     m5                          ; 8 words: two per row, rows 0..3 in order
+    CLIPW       m3,     m7,     m6                  ; clamp to [m7, m6]; the invoking code loads m7 = 0 and m6 = pw_pixel_max
+%else
+    psrad       m3,     INTERP_SHIFT_PS
+    psrad       m5,     INTERP_SHIFT_PS
+    packssdw    m3,     m5                          ; ps: signed saturation only, no pixel clamp
+%endif
+    movd        [r2 + %1], m3                       ; store 2 samples of row 0
+    psrldq      m3,     4                           ; shift the next row's pair into the low dword
+    movd        [r2 + r3 + %1], m3                  ; row 1
+    psrldq      m3,     4
+    movd        [r2 + r3 * 2 + %1], m3              ; row 2
+    psrldq      m3,     4
+    movd        [r2 + r5 + %1], m3                  ; row 3; the invoking code sets r5 = 3 * r3
+%endmacro
+
+%macro FILTERH_W2_3_sse3 1              ; 2 samples x 3 rows, ps shift only; %1 = byte offset within the row
+    movh        m3,     [r0 + %1]                   ; row 0: words 0..3 -> low half
+    movhps      m3,     [r0 + %1 + 2]               ; row 0: words 1..4 -> high half
+    pmaddwd     m3,     m0                          ; m0 = 4 taps in each half (set by the invoking code)
+    movh        m4,     [r0 + r1 + %1]              ; row 1
+    movhps      m4,     [r0 + r1 + %1 + 2]
+    pmaddwd     m4,     m0
+    pshufd      m2,     m3,     q2301               ; swap adjacent dwords, then add: full 4-tap sums
+    paddd       m3,     m2
+    pshufd      m2,     m4,     q2301
+    paddd       m4,     m2
+    pshufd      m3,     m3,     q3120
+    pshufd      m4,     m4,     q3120
+    punpcklqdq  m3,     m4                          ; m3 = row0[0], row0[1], row1[0], row1[1]
+    paddd       m3,     m1                          ; m1 = offset constant supplied by the invoking code
+
+    movh        m5,     [r0 + 2 * r1 + %1]          ; row 2
+    movhps      m5,     [r0 + 2 * r1 + %1 + 2]
+    pmaddwd     m5,     m0
+
+    pshufd      m2,     m5,     q2301
+    paddd       m5,     m2
+    pshufd      m5,     m5,     q3120               ; low qword = row2[0], row2[1]; the high qword repeats them and is never stored
+    paddd       m5,     m1
+
+    psrad       m3,     INTERP_SHIFT_PS
+    psrad       m5,     INTERP_SHIFT_PS
+    packssdw    m3,     m5                          ; words: row 0 pair, row 1 pair, row 2 pair, (duplicate)
+
+    movd        [r2 + %1], m3                       ; row 0
+    psrldq      m3,     4
+    movd        [r2 + r3 + %1], m3                  ; row 1
+    psrldq      m3,     4
+    movd        [r2 + r3 * 2 + %1], m3              ; row 2
+%endmacro
+
+%macro FILTERH_W4_2_sse3 2              ; 4 samples x 2 rows; %1 = byte offset within the row, %2 = pp or ps
+    movh        m3,     [r0 + %1]                   ; row 0, inputs for output samples 0 and 1
+    movhps      m3,     [r0 + %1 + 2]
+    pmaddwd     m3,     m0                          ; m0 = 4 taps in each half (set by the invoking code)
+    movh        m4,     [r0 + %1 + 4]               ; row 0, inputs for output samples 2 and 3
+    movhps      m4,     [r0 + %1 + 6]
+    pmaddwd     m4,     m0
+    pshufd      m2,     m3,     q2301               ; swap adjacent dwords, then add: full 4-tap sums
+    paddd       m3,     m2
+    pshufd      m2,     m4,     q2301
+    paddd       m4,     m2
+    pshufd      m3,     m3,     q3120
+    pshufd      m4,     m4,     q3120
+    punpcklqdq  m3,     m4                          ; m3 = row 0 sums 0..3
+    paddd       m3,     m1                          ; m1 = rounding/offset constant supplied by the invoking code
+
+    movh        m5,     [r0 + r1 + %1]              ; row 1, same sequence
+    movhps      m5,     [r0 + r1 + %1 + 2]
+    pmaddwd     m5,     m0
+    movh        m4,     [r0 + r1 + %1 + 4]
+    movhps      m4,     [r0 + r1 + %1 + 6]
+    pmaddwd     m4,     m0
+    pshufd      m2,     m5,     q2301
+    paddd       m5,     m2
+    pshufd      m2,     m4,     q2301
+    paddd       m4,     m2
+    pshufd      m5,     m5,     q3120
+    pshufd      m4,     m4,     q3120
+    punpcklqdq  m5,     m4                          ; m5 = row 1 sums 0..3
+    paddd       m5,     m1
+%ifidn %2, pp
+    psrad       m3,     6
+    psrad       m5,     6
+    packssdw    m3,     m5                          ; low qword = row 0, high qword = row 1
+    CLIPW       m3,     m7,     m6                  ; clamp to [m7, m6]; the invoking code loads m7 = 0 and m6 = pw_pixel_max
+%else
+    psrad       m3,     INTERP_SHIFT_PS
+    psrad       m5,     INTERP_SHIFT_PS
+    packssdw    m3,     m5                          ; ps: signed saturation only, no pixel clamp
+%endif
+    movh        [r2 + %1], m3                       ; row 0: 4 samples
+    movhps      [r2 + r3 + %1], m3                  ; row 1: 4 samples
+%endmacro
+
+%macro FILTERH_W4_1_sse3 1              ; 4 samples of the row at r0 + 2 * r1 (third row), ps shift only; %1 = byte offset
+    movh        m3,     [r0 + 2 * r1 + %1]          ; inputs for output samples 0 and 1
+    movhps      m3,     [r0 + 2 * r1 + %1 + 2]
+    pmaddwd     m3,     m0                          ; m0 = 4 taps in each half (set by the invoking code)
+    movh        m4,     [r0 + 2 * r1 + %1 + 4]      ; inputs for output samples 2 and 3
+    movhps      m4,     [r0 + 2 * r1 + %1 + 6]
+    pmaddwd     m4,     m0
+    pshufd      m2,     m3,     q2301               ; swap adjacent dwords, then add: full 4-tap sums
+    paddd       m3,     m2
+    pshufd      m2,     m4,     q2301
+    paddd       m4,     m2
+    pshufd      m3,     m3,     q3120
+    pshufd      m4,     m4,     q3120
+    punpcklqdq  m3,     m4                          ; m3 = sums 0..3
+    paddd       m3,     m1                          ; m1 = offset constant supplied by the invoking code
+
+    psrad       m3,     INTERP_SHIFT_PS
+    packssdw    m3,     m3                          ; 4 result words in the low qword
+    movh        [r2 + r3 * 2 + %1], m3              ; store to the matching third destination row
+%endmacro
+
+%macro FILTERH_W8_1_sse3 2              ; 8 samples of one row; %1 = byte offset within the row, %2 = pp or ps
+    movh        m3,     [r0 + %1]                   ; inputs for output samples 0 and 1
+    movhps      m3,     [r0 + %1 + 2]
+    pmaddwd     m3,     m0                          ; m0 = 4 taps in each half (set by the invoking code)
+    movh        m4,     [r0 + %1 + 4]               ; inputs for output samples 2 and 3
+    movhps      m4,     [r0 + %1 + 6]
+    pmaddwd     m4,     m0
+    pshufd      m2,     m3,     q2301               ; swap adjacent dwords, then add: full 4-tap sums
+    paddd       m3,     m2
+    pshufd      m2,     m4,     q2301
+    paddd       m4,     m2
+    pshufd      m3,     m3,     q3120
+    pshufd      m4,     m4,     q3120
+    punpcklqdq  m3,     m4                          ; m3 = sums 0..3
+    paddd       m3,     m1                          ; m1 = rounding/offset constant supplied by the invoking code
+
+    movh        m5,     [r0 + %1 + 8]               ; inputs for output samples 4 and 5
+    movhps      m5,     [r0 + %1 + 10]
+    pmaddwd     m5,     m0
+    movh        m4,     [r0 + %1 + 12]              ; inputs for output samples 6 and 7
+    movhps      m4,     [r0 + %1 + 14]
+    pmaddwd     m4,     m0
+    pshufd      m2,     m5,     q2301
+    paddd       m5,     m2
+    pshufd      m2,     m4,     q2301
+    paddd       m4,     m2
+    pshufd      m5,     m5,     q3120
+    pshufd      m4,     m4,     q3120
+    punpcklqdq  m5,     m4                          ; m5 = sums 4..7
+    paddd       m5,     m1
+%ifidn %2, pp
+    psrad       m3,     6
+    psrad       m5,     6
+    packssdw    m3,     m5                          ; 8 result words in order
+    CLIPW       m3,     m7,     m6                  ; clamp to [m7, m6]; the invoking code loads m7 = 0 and m6 = pw_pixel_max
+%else
+    psrad       m3,     INTERP_SHIFT_PS
+    psrad       m5,     INTERP_SHIFT_PS
+    packssdw    m3,     m5                          ; ps: signed saturation only, no pixel clamp
+%endif
+    movdqu      [r2 + %1], m3                       ; unaligned 16-byte store of the 8 samples
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro FILTER_HOR_CHROMA_sse3 3         ; emits one fully unrolled SSE3 kernel; %1 = width, %2 = height, %3 = pp or ps
+INIT_XMM sse3
+cglobal interp_4tap_horiz_%3_%1x%2, 4, 7, 8
+    add         r3,     r3                  ; r3 = 2 * dstStride (presumably elements -> bytes for 16-bit samples)
+    add         r1,     r1                  ; r1 = 2 * srcStride
+    sub         r0,     2                   ; step src back 2 bytes so the 4 taps start one 16-bit sample to the left
+    mov         r4d,    r4m                 ; r4 = coeffIdx
+    add         r4d,    r4d                 ; r4 = 2 * coeffIdx, so "r4 * 4" below is an 8-byte table-row offset
+
+%ifdef PIC
+    lea         r6,     [tab_ChromaCoeff]
+    movddup     m0,     [r6 + r4 * 4]       ; m0 = selected 4-tap row duplicated into both 64-bit halves
+%else
+    movddup     m0,     [tab_ChromaCoeff + r4 * 4]  ; m0 = selected 4-tap row duplicated into both 64-bit halves
+%endif
+
+%ifidn %3, ps
+    mova        m1,     [INTERP_OFFSET_PS]  ; m1 = ps offset added before the shift
+    cmp         r5m,    byte 0              ; sixth argument (absent from the prototype comment above this macro); zero skips the extra rows
+%if %1 <= 6
+    lea         r4,     [r1 * 3]            ; r4 = 3 source rows; lea leaves the flags from cmp intact for the je below
+    lea         r5,     [r3 * 3]            ; r5 = 3 destination rows
+%endif
+    je          .skip
+    sub         r0,     r1                  ; extra-row mode: begin one source row earlier
+%if %1 <= 6
+%assign y 1
+%else
+%assign y 3
+%endif
+%assign z 0
+%rep y
+%assign x 0
+%rep %1/8
+    FILTERH_W8_1_sse3 x, %3                 ; one row, 8 samples per invocation
+%assign x x+16
+%endrep
+%if %1 == 4 || (%1 == 6 && z == 0) || (%1 == 12 && z == 0)
+    FILTERH_W4_2_sse3 x, %3                 ; 4-wide column: rows 0 and 1 of the 3 extra rows
+    FILTERH_W4_1_sse3 x                     ; 4-wide column: row 2
+%assign x x+8
+%endif
+%if %1 == 2 || (%1 == 6 && z == 0)
+    FILTERH_W2_3_sse3 x                     ; 2-wide column: all 3 extra rows
+%endif
+%if %1 <= 6
+    lea         r0,     [r0 + r4]           ; narrow widths: the single pass covered 3 rows
+    lea         r2,     [r2 + r5]
+%else
+    lea         r0,     [r0 + r1]           ; wider widths: one row per pass, 3 passes
+    lea         r2,     [r2 + r3]
+%endif
+%assign z z+1
+%endrep
+.skip:
+%elifidn %3, pp
+    pxor        m7,     m7                  ; m7 = 0, lower clamp bound
+    mova        m6,     [pw_pixel_max]      ; m6 = upper clamp bound
+    mova        m1,     [tab_c_32]          ; m1 = 32, added before the shift by 6
+%if %1 == 2 || %1 == 6
+    lea         r4,     [r1 * 3]            ; 3-row offsets used by FILTERH_W2_4_sse3
+    lea         r5,     [r3 * 3]
+%endif
+%endif
+
+%if %1 == 2
+%assign y %2/4
+%elif %1 <= 6
+%assign y %2/2
+%else
+%assign y %2
+%endif
+%assign z 0
+%rep y
+%assign x 0
+%rep %1/8
+    FILTERH_W8_1_sse3 x, %3                 ; one row, 8 samples per invocation
+%assign x x+16
+%endrep
+%if %1 == 4 || %1 == 6 || (%1 == 12 && (z % 2) == 0)
+    FILTERH_W4_2_sse3 x, %3                 ; 4-wide column, 2 rows (width 12: only on every other one-row pass)
+%assign x x+8
+%endif
+%if %1 == 2 || (%1 == 6 && (z % 2) == 0)
+    FILTERH_W2_4_sse3 x, %3                 ; 2-wide column, 4 rows (width 6: only on every other two-row pass)
+%endif
+%assign z z+1
+%if z < y
+%if %1 == 2
+    lea         r0,     [r0 + 4 * r1]       ; width 2: each pass consumed 4 rows
+    lea         r2,     [r2 + 4 * r3]
+%elif %1 <= 6
+    lea         r0,     [r0 + 2 * r1]       ; width 4 and 6: each pass advances 2 rows
+    lea         r2,     [r2 + 2 * r3]
+%else
+    lea         r0,     [r0 + r1]           ; width 8 and up: one row per pass
+    lea         r2,     [r2 + r3]
+%endif
+%endif ;z < y
+%endrep
+
+    RET
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+
+FILTER_HOR_CHROMA_sse3 2, 4, pp
+FILTER_HOR_CHROMA_sse3 2, 8, pp
+FILTER_HOR_CHROMA_sse3 2, 16, pp
+FILTER_HOR_CHROMA_sse3 4, 2, pp
+FILTER_HOR_CHROMA_sse3 4, 4, pp
+FILTER_HOR_CHROMA_sse3 4, 8, pp
+FILTER_HOR_CHROMA_sse3 4, 16, pp
+FILTER_HOR_CHROMA_sse3 4, 32, pp
+FILTER_HOR_CHROMA_sse3 6, 8, pp
+FILTER_HOR_CHROMA_sse3 6, 16, pp
+FILTER_HOR_CHROMA_sse3 8, 2, pp
+FILTER_HOR_CHROMA_sse3 8, 4, pp
+FILTER_HOR_CHROMA_sse3 8, 6, pp
+FILTER_HOR_CHROMA_sse3 8, 8, pp
+FILTER_HOR_CHROMA_sse3 8, 12, pp
+FILTER_HOR_CHROMA_sse3 8, 16, pp
+FILTER_HOR_CHROMA_sse3 8, 32, pp
+FILTER_HOR_CHROMA_sse3 8, 64, pp
+FILTER_HOR_CHROMA_sse3 12, 16, pp
+FILTER_HOR_CHROMA_sse3 12, 32, pp
+FILTER_HOR_CHROMA_sse3 16, 4, pp
+FILTER_HOR_CHROMA_sse3 16, 8, pp
+FILTER_HOR_CHROMA_sse3 16, 12, pp
+FILTER_HOR_CHROMA_sse3 16, 16, pp
+FILTER_HOR_CHROMA_sse3 16, 24, pp
+FILTER_HOR_CHROMA_sse3 16, 32, pp
+FILTER_HOR_CHROMA_sse3 16, 64, pp
+FILTER_HOR_CHROMA_sse3 24, 32, pp
+FILTER_HOR_CHROMA_sse3 24, 64, pp
+FILTER_HOR_CHROMA_sse3 32, 8, pp
+FILTER_HOR_CHROMA_sse3 32, 16, pp
+FILTER_HOR_CHROMA_sse3 32, 24, pp
+FILTER_HOR_CHROMA_sse3 32, 32, pp
+FILTER_HOR_CHROMA_sse3 32, 48, pp
+FILTER_HOR_CHROMA_sse3 32, 64, pp
+FILTER_HOR_CHROMA_sse3 48, 64, pp
+FILTER_HOR_CHROMA_sse3 64, 16, pp
+FILTER_HOR_CHROMA_sse3 64, 32, pp
+FILTER_HOR_CHROMA_sse3 64, 48, pp
+FILTER_HOR_CHROMA_sse3 64, 64, pp
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+;-----------------------------------------------------------------------------
+
+FILTER_HOR_CHROMA_sse3 2, 4, ps
+FILTER_HOR_CHROMA_sse3 2, 8, ps
+FILTER_HOR_CHROMA_sse3 2, 16, ps
+FILTER_HOR_CHROMA_sse3 4, 2, ps
+FILTER_HOR_CHROMA_sse3 4, 4, ps
+FILTER_HOR_CHROMA_sse3 4, 8, ps
+FILTER_HOR_CHROMA_sse3 4, 16, ps
+FILTER_HOR_CHROMA_sse3 4, 32, ps
+FILTER_HOR_CHROMA_sse3 6, 8, ps
+FILTER_HOR_CHROMA_sse3 6, 16, ps
+FILTER_HOR_CHROMA_sse3 8, 2, ps
+FILTER_HOR_CHROMA_sse3 8, 4, ps
+FILTER_HOR_CHROMA_sse3 8, 6, ps
+FILTER_HOR_CHROMA_sse3 8, 8, ps
+FILTER_HOR_CHROMA_sse3 8, 12, ps
+FILTER_HOR_CHROMA_sse3 8, 16, ps
+FILTER_HOR_CHROMA_sse3 8, 32, ps
+FILTER_HOR_CHROMA_sse3 8, 64, ps
+FILTER_HOR_CHROMA_sse3 12, 16, ps
+FILTER_HOR_CHROMA_sse3 12, 32, ps
+FILTER_HOR_CHROMA_sse3 16, 4, ps
+FILTER_HOR_CHROMA_sse3 16, 8, ps
+FILTER_HOR_CHROMA_sse3 16, 12, ps
+FILTER_HOR_CHROMA_sse3 16, 16, ps
+FILTER_HOR_CHROMA_sse3 16, 24, ps
+FILTER_HOR_CHROMA_sse3 16, 32, ps
+FILTER_HOR_CHROMA_sse3 16, 64, ps
+FILTER_HOR_CHROMA_sse3 24, 32, ps
+FILTER_HOR_CHROMA_sse3 24, 64, ps
+FILTER_HOR_CHROMA_sse3 32, 8, ps
+FILTER_HOR_CHROMA_sse3 32, 16, ps
+FILTER_HOR_CHROMA_sse3 32, 24, ps
+FILTER_HOR_CHROMA_sse3 32, 32, ps
+FILTER_HOR_CHROMA_sse3 32, 48, ps
+FILTER_HOR_CHROMA_sse3 32, 64, ps
+FILTER_HOR_CHROMA_sse3 48, 64, ps
+FILTER_HOR_CHROMA_sse3 64, 16, ps
+FILTER_HOR_CHROMA_sse3 64, 32, ps
+FILTER_HOR_CHROMA_sse3 64, 48, ps
+FILTER_HOR_CHROMA_sse3 64, 64, ps
+
+%macro FILTER_W2_2 1                    ; 2 samples x 2 rows at r0 -> r2; %1 = pp or ps
+    movu        m3,         [r0]                ; row 0: 8 consecutive words
+    pshufb      m3,         m3, m2              ; m2 = shuffle control from the invoking code (FILTER_CHROMA_H loads tab_Tm16): words 0..3 | words 1..4
+    pmaddwd     m3,         m0                  ; m0 = 4 taps in each half; two partial sums per output sample
+    movu        m4,         [r0 + r1]           ; row 1
+    pshufb      m4,         m4, m2
+    pmaddwd     m4,         m0
+    phaddd      m3,         m4                  ; m3 = row0[0], row0[1], row1[0], row1[1]
+    paddd       m3,         m1                  ; m1 = rounding/offset constant supplied by the invoking code
+%ifidn %1, pp
+    psrad       m3,         INTERP_SHIFT_PP
+    packusdw    m3,         m3
+    CLIPW       m3,         m7,    m6           ; clamp to [m7, m6]; FILTER_CHROMA_H loads m7 = 0 and m6 = pw_pixel_max
+%else
+    psrad       m3,         INTERP_SHIFT_PS
+    packssdw    m3,         m3                  ; ps: signed saturation only, no pixel clamp
+%endif
+    movd        [r2],       m3                  ; row 0: dword 0 = 2 samples
+    pextrd      [r2 + r3],  m3, 1               ; row 1: dword 1 = 2 samples
+%endmacro
+
+%macro FILTER_W4_2 1                    ; 4 samples x 2 rows at r0 -> r2; %1 = pp or ps
+    movu        m3,         [r0]                ; row 0, inputs for output samples 0 and 1
+    pshufb      m3,         m3, m2              ; m2 = shuffle control from the invoking code (FILTER_CHROMA_H loads tab_Tm16): words 0..3 | words 1..4
+    pmaddwd     m3,         m0                  ; m0 = 4 taps in each half
+    movu        m4,         [r0 + 4]            ; row 0, inputs for output samples 2 and 3
+    pshufb      m4,         m4, m2
+    pmaddwd     m4,         m0
+    phaddd      m3,         m4                  ; m3 = row 0 sums 0..3
+    paddd       m3,         m1                  ; m1 = rounding/offset constant supplied by the invoking code
+
+    movu        m5,         [r0 + r1]           ; row 1, same sequence
+    pshufb      m5,         m5, m2
+    pmaddwd     m5,         m0
+    movu        m4,         [r0 + r1 + 4]
+    pshufb      m4,         m4, m2
+    pmaddwd     m4,         m0
+    phaddd      m5,         m4                  ; m5 = row 1 sums 0..3
+    paddd       m5,         m1
+%ifidn %1, pp
+    psrad       m3,         INTERP_SHIFT_PP
+    psrad       m5,         INTERP_SHIFT_PP
+    packusdw    m3,         m5                  ; low qword = row 0, high qword = row 1
+    CLIPW       m3,         m7,    m6           ; clamp to [m7, m6]; FILTER_CHROMA_H loads m7 = 0 and m6 = pw_pixel_max
+%else
+    psrad       m3,         INTERP_SHIFT_PS
+    psrad       m5,         INTERP_SHIFT_PS
+    packssdw    m3,         m5                  ; ps: signed saturation only, no pixel clamp
+%endif
+    movh        [r2],       m3                  ; row 0: 4 samples
+    movhps      [r2 + r3],  m3                  ; row 1: 4 samples
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro FILTER_CHROMA_H 6                ; SSE4 kernel for widths 2 and 4; %1 = width, %2 = height, %3 = pp or ps, %4/%5 = register counts passed to cglobal, %6 = index of the register holding the table address under PIC
+INIT_XMM sse4
+cglobal interp_4tap_horiz_%3_%1x%2, 4, %4, %5
+
+    add         r3,       r3                ; r3 = 2 * dstStride (presumably elements -> bytes for 16-bit samples)
+    add         r1,       r1                ; r1 = 2 * srcStride
+    sub         r0,       2                 ; step src back 2 bytes so the 4 taps start one 16-bit sample to the left
+    mov         r4d,      r4m               ; r4 = coeffIdx
+    add         r4d,      r4d               ; r4 = 2 * coeffIdx, so "r4 * 4" below is an 8-byte table-row offset
+
+%ifdef PIC
+    lea         r%6,      [tab_ChromaCoeff]
+    movh        m0,       [r%6 + r4 * 4]    ; low qword of m0 = selected 4-tap row
+%else
+    movh        m0,       [tab_ChromaCoeff + r4 * 4]  ; low qword of m0 = selected 4-tap row
+%endif
+
+    punpcklqdq  m0,       m0                ; duplicate the taps into the high qword
+    mova        m2,       [tab_Tm16]        ; shuffle control consumed by FILTER_W2_2 / FILTER_W4_2
+
+%ifidn %3, ps
+    mova        m1,       [INTERP_OFFSET_PS]  ; m1 = ps offset added before the shift
+    cmp         r5m, byte 0                 ; sixth argument (absent from the prototype comment above this macro); zero skips the 3 extra rows
+    je          .skip
+    sub         r0,       r1                ; extra-row mode: begin one source row earlier
+    movu        m3,       [r0]              ; first extra row is filtered on its own
+    pshufb      m3,       m3, m2
+    pmaddwd     m3,       m0
+
+  %if %1 == 4
+    movu        m4,       [r0 + 4]          ; width 4: inputs for output samples 2 and 3
+    pshufb      m4,       m4, m2
+    pmaddwd     m4,       m0
+    phaddd      m3,       m4
+  %else
+    phaddd      m3,       m3                ; width 2: only two sums are needed
+  %endif
+
+    paddd       m3,       m1
+    psrad       m3,       INTERP_SHIFT_PS
+    packssdw    m3,       m3
+
+  %if %1 == 2
+    movd        [r2],     m3                ; 2 samples
+  %else
+    movh        [r2],     m3                ; 4 samples
+  %endif
+
+    add         r0,       r1
+    add         r2,       r3
+    FILTER_W%1_2 %3                         ; second and third extra rows
+    lea         r0,       [r0 + 2 * r1]
+    lea         r2,       [r2 + 2 * r3]
+
+.skip:
+
+%else     ;%ifidn %3, ps
+    pxor        m7,       m7                ; m7 = 0, lower clamp bound
+    mova        m6,       [pw_pixel_max]    ; m6 = upper clamp bound
+    mova        m1,       [tab_c_32]        ; m1 = 32, added before the shift by INTERP_SHIFT_PP
+%endif    ;%ifidn %3, ps
+
+    FILTER_W%1_2 %3                         ; first row pair of the block proper
+
+%rep (%2/2) - 1
+    lea         r0,       [r0 + 2 * r1]     ; advance two rows, then filter the next pair
+    lea         r2,       [r2 + 2 * r3]
+    FILTER_W%1_2 %3
+%endrep
+    RET
+%endmacro
+
+FILTER_CHROMA_H 2, 4, pp, 6, 8, 5
+FILTER_CHROMA_H 2, 8, pp, 6, 8, 5
+FILTER_CHROMA_H 4, 2, pp, 6, 8, 5
+FILTER_CHROMA_H 4, 4, pp, 6, 8, 5
+FILTER_CHROMA_H 4, 8, pp, 6, 8, 5
+FILTER_CHROMA_H 4, 16, pp, 6, 8, 5
+
+FILTER_CHROMA_H 2, 4, ps, 7, 5, 6
+FILTER_CHROMA_H 2, 8, ps, 7, 5, 6
+FILTER_CHROMA_H 4, 2, ps, 7, 6, 6
+FILTER_CHROMA_H 4, 4, ps, 7, 6, 6
+FILTER_CHROMA_H 4, 8, ps, 7, 6, 6
+FILTER_CHROMA_H 4, 16, ps, 7, 6, 6
+
+FILTER_CHROMA_H 2, 16, pp, 6, 8, 5
+FILTER_CHROMA_H 4, 32, pp, 6, 8, 5
+FILTER_CHROMA_H 2, 16, ps, 7, 5, 6
+FILTER_CHROMA_H 4, 32, ps, 7, 6, 6
+
+
+%macro FILTER_W6_1 1                    ; 6 samples of one row at r0 -> r2; %1 = pp or ps
+    movu        m3,         [r0]                ; inputs for output samples 0 and 1
+    pshufb      m3,         m3, m2              ; m2 = shuffle control supplied by the invoking code
+    pmaddwd     m3,         m0                  ; m0 = filter taps supplied by the invoking code
+    movu        m4,         [r0 + 4]            ; inputs for output samples 2 and 3
+    pshufb      m4,         m4, m2
+    pmaddwd     m4,         m0
+    phaddd      m3,         m4                  ; m3 = sums 0..3
+    paddd       m3,         m1                  ; m1 = rounding/offset constant supplied by the invoking code
+
+    movu        m4,         [r0 + 8]            ; inputs for output samples 4 and 5
+    pshufb      m4,         m4, m2
+    pmaddwd     m4,         m0
+    phaddd      m4,         m4                  ; m4 = sums 4, 5 (repeated in the upper dwords)
+    paddd       m4,         m1
+%ifidn %1, pp
+    psrad       m3,         INTERP_SHIFT_PP
+    psrad       m4,         INTERP_SHIFT_PP
+    packusdw    m3,         m4
+    CLIPW       m3,         m6,    m7           ; NOTE(review): bound order is m6, m7 here but m7, m6 in FILTER_W2_2/FILTER_W4_2; presumably the caller loads m6 = 0 and m7 = max - confirm
+%else
+    psrad       m3,         INTERP_SHIFT_PS
+    psrad       m4,         INTERP_SHIFT_PS
+    packssdw    m3,         m4                  ; ps: signed saturation only, no pixel clamp
+%endif
+    movh        [r2],       m3                  ; samples 0..3
+    pextrd      [r2 + 8],   m3, 2               ; samples 4 and 5 (dword 2)
+%endmacro
+
+cglobal chroma_filter_pp_6x1_internal
+    FILTER_W6_1 pp
+    ret
+
+cglobal chroma_filter_ps_6x1_internal
+    FILTER_W6_1 ps
+    ret
+
+%macro FILTER_W8_1 1                                ; %1 = pp | ps; filters one row of 8 samples from [r0] into [r2]
+    movu        m3,         [r0]                    ; 8 source words starting at sample 0
+    pshufb      m3,         m3, m2                  ; m2 = tab_Tm16 (loaded by the caller): words 0..3 | words 1..4
+    pmaddwd     m3,         m0                      ; m0 = 4 coefficients duplicated in both qwords (set by the caller)
+    movu        m4,         [r0 + 4]
+    pshufb      m4,         m4, m2
+    pmaddwd     m4,         m0
+    phaddd      m3,         m4                      ; m3 = 4-tap sums for samples 0..3
+    paddd       m3,         m1                      ; m1 = offset constant loaded by the caller
+
+    movu        m5,         [r0 + 8]
+    pshufb      m5,         m5, m2
+    pmaddwd     m5,         m0
+    movu        m4,         [r0 + 12]
+    pshufb      m4,         m4, m2
+    pmaddwd     m4,         m0
+    phaddd      m5,         m4                      ; m5 = 4-tap sums for samples 4..7
+    paddd       m5,         m1
+%ifidn %1, pp
+    psrad       m3,         INTERP_SHIFT_PP
+    psrad       m5,         INTERP_SHIFT_PP
+    packusdw    m3,         m5                      ; unsigned-saturating pack to 8 words
+    CLIPW       m3,         m6,    m7               ; CLIPW is defined elsewhere; caller sets m6 = 0, m7 = pw_pixel_max
+%else
+    psrad       m3,         INTERP_SHIFT_PS
+    psrad       m5,         INTERP_SHIFT_PS
+    packssdw    m3,         m5                      ; signed-saturating pack, no clipping for ps
+%endif
+    movh        [r2],       m3                      ; samples 0..3
+    movhps      [r2 + 8],   m3                      ; samples 4..7
+%endmacro
+
+cglobal chroma_filter_pp_8x1_internal               ; no prologue: relies on r0/r2 and m0-m2, m6, m7 set up by IPFILTER_CHROMA
+    FILTER_W8_1 pp
+    ret
+
+cglobal chroma_filter_ps_8x1_internal               ; no prologue: relies on r0/r2 and m0-m2 set up by IPFILTER_CHROMA
+    FILTER_W8_1 ps
+    ret
+
+%macro FILTER_W12_1 1                               ; %1 = pp | ps; filters one row of 12 samples from [r0] into [r2]
+    movu        m3,         [r0]                    ; 8 source words starting at sample 0
+    pshufb      m3,         m3, m2                  ; m2 = tab_Tm16 (loaded by the caller): words 0..3 | words 1..4
+    pmaddwd     m3,         m0                      ; m0 = 4 coefficients duplicated in both qwords (set by the caller)
+    movu        m4,         [r0 + 4]
+    pshufb      m4,         m4, m2
+    pmaddwd     m4,         m0
+    phaddd      m3,         m4                      ; m3 = 4-tap sums for samples 0..3
+    paddd       m3,         m1                      ; m1 = offset constant loaded by the caller
+
+    movu        m5,         [r0 + 8]
+    pshufb      m5,         m5, m2
+    pmaddwd     m5,         m0
+    movu        m4,         [r0 + 12]
+    pshufb      m4,         m4, m2
+    pmaddwd     m4,         m0
+    phaddd      m5,         m4                      ; m5 = 4-tap sums for samples 4..7
+    paddd       m5,         m1
+%ifidn %1, pp
+    psrad       m3,         INTERP_SHIFT_PP
+    psrad       m5,         INTERP_SHIFT_PP
+    packusdw    m3,         m5
+    CLIPW       m3,         m6,    m7               ; CLIPW is defined elsewhere; caller sets m6 = 0, m7 = pw_pixel_max
+%else
+    psrad       m3,         INTERP_SHIFT_PS
+    psrad       m5,         INTERP_SHIFT_PS
+    packssdw    m3,         m5
+%endif
+    movh        [r2],       m3                      ; samples 0..3
+    movhps      [r2 + 8],   m3                      ; samples 4..7
+
+    movu        m3,         [r0 + 16]               ; remaining four samples, 8..11
+    pshufb      m3,         m3, m2
+    pmaddwd     m3,         m0
+    movu        m4,         [r0 + 20]
+    pshufb      m4,         m4, m2
+    pmaddwd     m4,         m0
+    phaddd      m3,         m4                      ; m3 = 4-tap sums for samples 8..11
+    paddd       m3,         m1
+
+%ifidn %1, pp
+    psrad       m3,         INTERP_SHIFT_PP
+    packusdw    m3,         m3                      ; only the low four words are stored below
+    CLIPW       m3,         m6, m7
+%else
+    psrad       m3,         INTERP_SHIFT_PS
+    packssdw    m3,         m3
+%endif
+    movh        [r2 + 16],  m3                      ; samples 8..11
+%endmacro
+
+cglobal chroma_filter_pp_12x1_internal              ; no prologue: relies on r0/r2 and m0-m2, m6, m7 set up by IPFILTER_CHROMA
+    FILTER_W12_1 pp
+    ret
+
+cglobal chroma_filter_ps_12x1_internal              ; no prologue: relies on r0/r2 and m0-m2 set up by IPFILTER_CHROMA
+    FILTER_W12_1 ps
+    ret
+
+%macro FILTER_W16_1 1                               ; %1 = pp | ps; filters one row of 16 samples from [r0] into [r2]
+    movu        m3,         [r0]                    ; 8 source words starting at sample 0
+    pshufb      m3,         m3, m2                  ; m2 = tab_Tm16 (loaded by the caller): words 0..3 | words 1..4
+    pmaddwd     m3,         m0                      ; m0 = 4 coefficients duplicated in both qwords (set by the caller)
+    movu        m4,         [r0 + 4]
+    pshufb      m4,         m4, m2
+    pmaddwd     m4,         m0
+    phaddd      m3,         m4                      ; m3 = 4-tap sums for samples 0..3
+    paddd       m3,         m1                      ; m1 = offset constant loaded by the caller
+
+    movu        m5,         [r0 + 8]
+    pshufb      m5,         m5, m2
+    pmaddwd     m5,         m0
+    movu        m4,         [r0 + 12]
+    pshufb      m4,         m4, m2
+    pmaddwd     m4,         m0
+    phaddd      m5,         m4                      ; m5 = 4-tap sums for samples 4..7
+    paddd       m5,         m1
+%ifidn %1, pp
+    psrad       m3,         INTERP_SHIFT_PP
+    psrad       m5,         INTERP_SHIFT_PP
+    packusdw    m3,         m5
+    CLIPW       m3,         m6,    m7               ; CLIPW is defined elsewhere; caller sets m6 = 0, m7 = pw_pixel_max
+%else
+    psrad       m3,         INTERP_SHIFT_PS
+    psrad       m5,         INTERP_SHIFT_PS
+    packssdw    m3,         m5
+%endif
+    movh        [r2],       m3                      ; samples 0..3
+    movhps      [r2 + 8],   m3                      ; samples 4..7
+
+    movu        m3,         [r0 + 16]               ; second half of the row: samples 8..15
+    pshufb      m3,         m3, m2
+    pmaddwd     m3,         m0
+    movu        m4,         [r0 + 20]
+    pshufb      m4,         m4, m2
+    pmaddwd     m4,         m0
+    phaddd      m3,         m4                      ; m3 = sums for samples 8..11
+    paddd       m3,         m1
+
+    movu        m5,         [r0 + 24]
+    pshufb      m5,         m5, m2
+    pmaddwd     m5,         m0
+    movu        m4,         [r0 + 28]
+    pshufb      m4,         m4, m2
+    pmaddwd     m4,         m0
+    phaddd      m5,         m4                      ; m5 = sums for samples 12..15
+    paddd       m5,         m1
+%ifidn %1, pp
+    psrad       m3,         INTERP_SHIFT_PP
+    psrad       m5,         INTERP_SHIFT_PP
+    packusdw    m3,         m5
+    CLIPW       m3,         m6,    m7
+%else
+    psrad       m3,         INTERP_SHIFT_PS
+    psrad       m5,         INTERP_SHIFT_PS
+    packssdw    m3,         m5
+%endif
+    movh        [r2 + 16],  m3                      ; samples 8..11
+    movhps      [r2 + 24],  m3                      ; samples 12..15
+%endmacro
+
+cglobal chroma_filter_pp_16x1_internal              ; no prologue: relies on r0/r2 and m0-m2, m6, m7 set up by IPFILTER_CHROMA
+    FILTER_W16_1 pp
+    ret
+
+cglobal chroma_filter_ps_16x1_internal              ; no prologue: relies on r0/r2 and m0-m2 set up by IPFILTER_CHROMA
+    FILTER_W16_1 ps
+    ret
+
+%macro FILTER_W24_1 1                               ; %1 = pp | ps; filters one row of 24 samples from [r0] into [r2]
+    movu        m3,         [r0]                    ; 8 source words starting at sample 0
+    pshufb      m3,         m3, m2                  ; m2 = tab_Tm16 (loaded by the caller): words 0..3 | words 1..4
+    pmaddwd     m3,         m0                      ; m0 = 4 coefficients duplicated in both qwords (set by the caller)
+    movu        m4,         [r0 + 4]
+    pshufb      m4,         m4, m2
+    pmaddwd     m4,         m0
+    phaddd      m3,         m4                      ; m3 = 4-tap sums for samples 0..3
+    paddd       m3,         m1                      ; m1 = offset constant loaded by the caller
+
+    movu        m5,         [r0 + 8]
+    pshufb      m5,         m5, m2
+    pmaddwd     m5,         m0
+    movu        m4,         [r0 + 12]
+    pshufb      m4,         m4, m2
+    pmaddwd     m4,         m0
+    phaddd      m5,         m4                      ; m5 = 4-tap sums for samples 4..7
+    paddd       m5,         m1
+%ifidn %1, pp
+    psrad       m3,         INTERP_SHIFT_PP
+    psrad       m5,         INTERP_SHIFT_PP
+    packusdw    m3,         m5
+    CLIPW       m3,         m6,    m7               ; CLIPW is defined elsewhere; caller sets m6 = 0, m7 = pw_pixel_max
+%else
+    psrad       m3,         INTERP_SHIFT_PS
+    psrad       m5,         INTERP_SHIFT_PS
+    packssdw    m3,         m5
+%endif
+    movh        [r2],       m3                      ; samples 0..3
+    movhps      [r2 + 8],   m3                      ; samples 4..7
+
+    movu        m3,         [r0 + 16]               ; second group of eight: samples 8..15
+    pshufb      m3,         m3, m2
+    pmaddwd     m3,         m0
+    movu        m4,         [r0 + 20]
+    pshufb      m4,         m4, m2
+    pmaddwd     m4,         m0
+    phaddd      m3,         m4
+    paddd       m3,         m1
+
+    movu        m5,         [r0 + 24]
+    pshufb      m5,         m5, m2
+    pmaddwd     m5,         m0
+    movu        m4,         [r0 + 28]
+    pshufb      m4,         m4, m2
+    pmaddwd     m4,         m0
+    phaddd      m5,         m4
+    paddd       m5,         m1
+%ifidn %1, pp
+    psrad       m3,         INTERP_SHIFT_PP
+    psrad       m5,         INTERP_SHIFT_PP
+    packusdw    m3,         m5
+    CLIPW       m3,         m6,    m7
+%else
+    psrad       m3,         INTERP_SHIFT_PS
+    psrad       m5,         INTERP_SHIFT_PS
+    packssdw    m3,         m5
+%endif
+    movh        [r2 + 16],  m3                      ; samples 8..11
+    movhps      [r2 + 24],  m3                      ; samples 12..15
+
+    movu        m3,         [r0 + 32]               ; third group of eight: samples 16..23
+    pshufb      m3,         m3, m2
+    pmaddwd     m3,         m0
+    movu        m4,         [r0 + 36]
+    pshufb      m4,         m4, m2
+    pmaddwd     m4,         m0
+    phaddd      m3,         m4
+    paddd       m3,         m1
+
+    movu        m5,         [r0 + 40]
+    pshufb      m5,         m5, m2
+    pmaddwd     m5,         m0
+    movu        m4,         [r0 + 44]
+    pshufb      m4,         m4, m2
+    pmaddwd     m4,         m0
+    phaddd      m5,         m4
+    paddd       m5,         m1
+%ifidn %1, pp
+    psrad       m3,         INTERP_SHIFT_PP
+    psrad       m5,         INTERP_SHIFT_PP
+    packusdw    m3,         m5
+    CLIPW       m3,         m6,    m7
+%else
+    psrad       m3,         INTERP_SHIFT_PS
+    psrad       m5,         INTERP_SHIFT_PS
+    packssdw    m3,         m5
+%endif
+    movh        [r2 + 32],  m3                      ; samples 16..19
+    movhps      [r2 + 40],  m3                      ; samples 20..23
+%endmacro
+
+cglobal chroma_filter_pp_24x1_internal              ; no prologue: relies on r0/r2 and m0-m2, m6, m7 set up by IPFILTER_CHROMA
+    FILTER_W24_1 pp
+    ret
+
+cglobal chroma_filter_ps_24x1_internal              ; no prologue: relies on r0/r2 and m0-m2 set up by IPFILTER_CHROMA
+    FILTER_W24_1 ps
+    ret
+
+%macro FILTER_W32_1 1                               ; %1 = pp | ps; filters one row of 32 samples from [r0] into [r2]
+    movu        m3,         [r0]                    ; 8 source words starting at sample 0
+    pshufb      m3,         m3, m2                  ; m2 = tab_Tm16 (loaded by the caller): words 0..3 | words 1..4
+    pmaddwd     m3,         m0                      ; m0 = 4 coefficients duplicated in both qwords (set by the caller)
+    movu        m4,         [r0 + 4]
+    pshufb      m4,         m4, m2
+    pmaddwd     m4,         m0
+    phaddd      m3,         m4                      ; m3 = 4-tap sums for samples 0..3
+    paddd       m3,         m1                      ; m1 = offset constant loaded by the caller
+
+    movu        m5,         [r0 + 8]
+    pshufb      m5,         m5, m2
+    pmaddwd     m5,         m0
+    movu        m4,         [r0 + 12]
+    pshufb      m4,         m4, m2
+    pmaddwd     m4,         m0
+    phaddd      m5,         m4                      ; m5 = 4-tap sums for samples 4..7
+    paddd       m5,         m1
+%ifidn %1, pp
+    psrad       m3,         INTERP_SHIFT_PP
+    psrad       m5,         INTERP_SHIFT_PP
+    packusdw    m3,         m5
+    CLIPW       m3,         m6,    m7               ; CLIPW is defined elsewhere; caller sets m6 = 0, m7 = pw_pixel_max
+%else
+    psrad       m3,         INTERP_SHIFT_PS
+    psrad       m5,         INTERP_SHIFT_PS
+    packssdw    m3,         m5
+%endif
+    movh        [r2],       m3                      ; samples 0..3
+    movhps      [r2 + 8],   m3                      ; samples 4..7
+
+    movu        m3,         [r0 + 16]               ; second group of eight: samples 8..15
+    pshufb      m3,         m3, m2
+    pmaddwd     m3,         m0
+    movu        m4,         [r0 + 20]
+    pshufb      m4,         m4, m2
+    pmaddwd     m4,         m0
+    phaddd      m3,         m4
+    paddd       m3,         m1
+
+    movu        m5,         [r0 + 24]
+    pshufb      m5,         m5, m2
+    pmaddwd     m5,         m0
+    movu        m4,         [r0 + 28]
+    pshufb      m4,         m4, m2
+    pmaddwd     m4,         m0
+    phaddd      m5,         m4
+    paddd       m5,         m1
+%ifidn %1, pp
+    psrad       m3,         INTERP_SHIFT_PP
+    psrad       m5,         INTERP_SHIFT_PP
+    packusdw    m3,         m5
+    CLIPW       m3,         m6,    m7
+%else
+    psrad       m3,         INTERP_SHIFT_PS
+    psrad       m5,         INTERP_SHIFT_PS
+    packssdw    m3,         m5
+%endif
+    movh        [r2 + 16],  m3                      ; samples 8..11
+    movhps      [r2 + 24],  m3                      ; samples 12..15
+
+    movu        m3,         [r0 + 32]               ; third group of eight: samples 16..23
+    pshufb      m3,         m3, m2
+    pmaddwd     m3,         m0
+    movu        m4,         [r0 + 36]
+    pshufb      m4,         m4, m2
+    pmaddwd     m4,         m0
+    phaddd      m3,         m4
+    paddd       m3,         m1
+
+    movu        m5,         [r0 + 40]
+    pshufb      m5,         m5, m2
+    pmaddwd     m5,         m0
+    movu        m4,         [r0 + 44]
+    pshufb      m4,         m4, m2
+    pmaddwd     m4,         m0
+    phaddd      m5,         m4
+    paddd       m5,         m1
+%ifidn %1, pp
+    psrad       m3,         INTERP_SHIFT_PP
+    psrad       m5,         INTERP_SHIFT_PP
+    packusdw    m3,         m5
+    CLIPW       m3,         m6,    m7
+%else
+    psrad       m3,         INTERP_SHIFT_PS
+    psrad       m5,         INTERP_SHIFT_PS
+    packssdw    m3,         m5
+%endif
+    movh        [r2 + 32],  m3                      ; samples 16..19
+    movhps      [r2 + 40],  m3                      ; samples 20..23
+
+    movu        m3,         [r0 + 48]               ; fourth group of eight: samples 24..31
+    pshufb      m3,         m3, m2
+    pmaddwd     m3,         m0
+    movu        m4,         [r0 + 52]
+    pshufb      m4,         m4, m2
+    pmaddwd     m4,         m0
+    phaddd      m3,         m4
+    paddd       m3,         m1
+
+    movu        m5,         [r0 + 56]
+    pshufb      m5,         m5, m2
+    pmaddwd     m5,         m0
+    movu        m4,         [r0 + 60]
+    pshufb      m4,         m4, m2
+    pmaddwd     m4,         m0
+    phaddd      m5,         m4
+    paddd       m5,         m1
+%ifidn %1, pp
+    psrad       m3,         INTERP_SHIFT_PP
+    psrad       m5,         INTERP_SHIFT_PP
+    packusdw    m3,         m5
+    CLIPW       m3,         m6,    m7
+%else
+    psrad       m3,         INTERP_SHIFT_PS
+    psrad       m5,         INTERP_SHIFT_PS
+    packssdw    m3,         m5
+%endif
+    movh        [r2 + 48],  m3                      ; samples 24..27
+    movhps      [r2 + 56],  m3                      ; samples 28..31
+%endmacro
+
+cglobal chroma_filter_pp_32x1_internal              ; no prologue: relies on r0/r2 and m0-m2, m6, m7 set up by IPFILTER_CHROMA
+    FILTER_W32_1 pp
+    ret
+
+cglobal chroma_filter_ps_32x1_internal              ; no prologue: relies on r0/r2 and m0-m2 set up by IPFILTER_CHROMA
+    FILTER_W32_1 ps
+    ret
+
+%macro FILTER_W8o_1 2                               ; %1 = pp | ps, %2 = byte offset applied to both [r0] and [r2]; filters 8 samples
+    movu        m3,         [r0 + %2]               ; 8 source words starting at the given offset
+    pshufb      m3,         m3, m2                  ; m2 = tab_Tm16 (loaded by the caller): words 0..3 | words 1..4
+    pmaddwd     m3,         m0                      ; m0 = 4 coefficients duplicated in both qwords (set by the caller)
+    movu        m4,         [r0 + %2 + 4]
+    pshufb      m4,         m4, m2
+    pmaddwd     m4,         m0
+    phaddd      m3,         m4                      ; m3 = 4-tap sums for the first four samples
+    paddd       m3,         m1                      ; m1 = offset constant loaded by the caller
+
+    movu        m5,         [r0 + %2 + 8]
+    pshufb      m5,         m5, m2
+    pmaddwd     m5,         m0
+    movu        m4,         [r0 + %2 + 12]
+    pshufb      m4,         m4, m2
+    pmaddwd     m4,         m0
+    phaddd      m5,         m4                      ; m5 = 4-tap sums for the next four samples
+    paddd       m5,         m1
+%ifidn %1, pp
+    psrad       m3,         INTERP_SHIFT_PP
+    psrad       m5,         INTERP_SHIFT_PP
+    packusdw    m3,         m5
+    CLIPW       m3,         m6,    m7               ; CLIPW is defined elsewhere; caller sets m6 = 0, m7 = pw_pixel_max
+%else
+    psrad       m3,         INTERP_SHIFT_PS
+    psrad       m5,         INTERP_SHIFT_PS
+    packssdw    m3,         m5
+%endif
+    movh        [r2 + %2],       m3                 ; source and destination use the same byte offset (both are 16-bit)
+    movhps      [r2 + %2 + 8],   m3
+%endmacro
+
+%macro FILTER_W48_1 1                   ; %1 = pp | ps; one row of 48 samples as six 8-sample groups
+    FILTER_W8o_1 %1, 0                  ; byte offsets step by 16 = 8 samples of 2 bytes
+    FILTER_W8o_1 %1, 16
+    FILTER_W8o_1 %1, 32
+    FILTER_W8o_1 %1, 48
+    FILTER_W8o_1 %1, 64
+    FILTER_W8o_1 %1, 80
+%endmacro
+
+cglobal chroma_filter_pp_48x1_internal  ; no prologue: relies on r0/r2 and m0-m2, m6, m7 set up by IPFILTER_CHROMA
+    FILTER_W48_1 pp
+    ret
+
+cglobal chroma_filter_ps_48x1_internal  ; no prologue: relies on r0/r2 and m0-m2 set up by IPFILTER_CHROMA
+    FILTER_W48_1 ps
+    ret
+
+%macro FILTER_W64_1 1                   ; %1 = pp | ps; one row of 64 samples as eight 8-sample groups
+    FILTER_W8o_1 %1, 0                  ; byte offsets step by 16 = 8 samples of 2 bytes
+    FILTER_W8o_1 %1, 16
+    FILTER_W8o_1 %1, 32
+    FILTER_W8o_1 %1, 48
+    FILTER_W8o_1 %1, 64
+    FILTER_W8o_1 %1, 80
+    FILTER_W8o_1 %1, 96
+    FILTER_W8o_1 %1, 112
+%endmacro
+
+cglobal chroma_filter_pp_64x1_internal  ; no prologue: relies on r0/r2 and m0-m2, m6, m7 set up by IPFILTER_CHROMA
+    FILTER_W64_1 pp
+    ret
+
+cglobal chroma_filter_ps_64x1_internal  ; no prologue: relies on r0/r2 and m0-m2 set up by IPFILTER_CHROMA
+    FILTER_W64_1 ps
+    ret
+;-----------------------------------------------------------------------------
+; void interp_4tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+
+INIT_XMM sse4
+%macro IPFILTER_CHROMA 6                ; %1 = width, %2 = height, %3 = pp|ps, %4 = GPR index for the PIC table pointer, %5 = GPR count, %6 = XMM count
+cglobal interp_4tap_horiz_%3_%1x%2, 4, %5, %6
+
+    add         r3,        r3           ; dstStride in bytes (16-bit samples)
+    add         r1,        r1           ; srcStride in bytes
+    sub         r0,         2           ; start one sample (2 bytes) left of src
+    mov         r4d,        r4m         ; coeffIdx
+    add         r4d,        r4d         ; doubled here, scaled by 4 below: 8 bytes per tab_ChromaCoeff row
+
+%ifdef PIC
+    lea         r%4,       [tab_ChromaCoeff]
+    movh        m0,       [r%4 + r4 * 4]
+%else
+    movh        m0,       [tab_ChromaCoeff + r4 * 4]
+%endif
+
+    punpcklqdq  m0,       m0            ; same four coefficients in both qwords
+    mova        m2,       [tab_Tm16]    ; shuffle mask used by the FILTER_W*_1 row kernels
+
+%ifidn %3, ps
+    mova        m1,       [INTERP_OFFSET_PS]
+    cmp         r5m, byte 0             ; 6th argument (not listed in the prototype comment above); zero skips the extra rows
+    je          .skip
+    sub         r0, r1                  ; extra rows start one row above src
+    call chroma_filter_%3_%1x1_internal
+    add         r0, r1
+    add         r2, r3
+    call chroma_filter_%3_%1x1_internal
+    add         r0, r1
+    add         r2, r3
+    call chroma_filter_%3_%1x1_internal
+    add         r0, r1
+    add         r2, r3                  ; three extra rows written, so %2 + 3 rows in total on this path
+.skip:
+%else
+    mova        m1,         [tab_c_32]
+    pxor        m6,         m6          ; m6/m7 are the operands the row kernels pass to CLIPW
+    mova        m7,         [pw_pixel_max]
+%endif
+
+    call chroma_filter_%3_%1x1_internal ; first of the %2 regular rows
+%rep %2 - 1
+    add         r0,       r1
+    add         r2,       r3
+    call chroma_filter_%3_%1x1_internal
+%endrep
+RET
+%endmacro
+IPFILTER_CHROMA 6, 8, pp, 5, 6, 8      ; width, height, type, PIC table register index, GPR count, XMM count
+IPFILTER_CHROMA 8, 2, pp, 5, 6, 8
+IPFILTER_CHROMA 8, 4, pp, 5, 6, 8
+IPFILTER_CHROMA 8, 6, pp, 5, 6, 8
+IPFILTER_CHROMA 8, 8, pp, 5, 6, 8
+IPFILTER_CHROMA 8, 16, pp, 5, 6, 8
+IPFILTER_CHROMA 8, 32, pp, 5, 6, 8
+IPFILTER_CHROMA 12, 16, pp, 5, 6, 8
+IPFILTER_CHROMA 16, 4, pp, 5, 6, 8
+IPFILTER_CHROMA 16, 8, pp, 5, 6, 8
+IPFILTER_CHROMA 16, 12, pp, 5, 6, 8
+IPFILTER_CHROMA 16, 16, pp, 5, 6, 8
+IPFILTER_CHROMA 16, 32, pp, 5, 6, 8
+IPFILTER_CHROMA 24, 32, pp, 5, 6, 8
+IPFILTER_CHROMA 32, 8, pp, 5, 6, 8
+IPFILTER_CHROMA 32, 16, pp, 5, 6, 8
+IPFILTER_CHROMA 32, 24, pp, 5, 6, 8
+IPFILTER_CHROMA 32, 32, pp, 5, 6, 8
+
+IPFILTER_CHROMA 6, 8, ps, 6, 7, 6      ; ps: table pointer in r6, 7 GPRs, 6 XMM registers (m6/m7 are only set on the pp path)
+IPFILTER_CHROMA 8, 2, ps, 6, 7, 6
+IPFILTER_CHROMA 8, 4, ps, 6, 7, 6
+IPFILTER_CHROMA 8, 6, ps, 6, 7, 6
+IPFILTER_CHROMA 8, 8, ps, 6, 7, 6
+IPFILTER_CHROMA 8, 16, ps, 6, 7, 6
+IPFILTER_CHROMA 8, 32, ps, 6, 7, 6
+IPFILTER_CHROMA 12, 16, ps, 6, 7, 6
+IPFILTER_CHROMA 16, 4, ps, 6, 7, 6
+IPFILTER_CHROMA 16, 8, ps, 6, 7, 6
+IPFILTER_CHROMA 16, 12, ps, 6, 7, 6
+IPFILTER_CHROMA 16, 16, ps, 6, 7, 6
+IPFILTER_CHROMA 16, 32, ps, 6, 7, 6
+IPFILTER_CHROMA 24, 32, ps, 6, 7, 6
+IPFILTER_CHROMA 32, 8, ps, 6, 7, 6
+IPFILTER_CHROMA 32, 16, ps, 6, 7, 6
+IPFILTER_CHROMA 32, 24, ps, 6, 7, 6
+IPFILTER_CHROMA 32, 32, ps, 6, 7, 6
+
+IPFILTER_CHROMA 6, 16, pp, 5, 6, 8     ; additional block sizes, pp
+IPFILTER_CHROMA 8, 12, pp, 5, 6, 8
+IPFILTER_CHROMA 8, 64, pp, 5, 6, 8
+IPFILTER_CHROMA 12, 32, pp, 5, 6, 8
+IPFILTER_CHROMA 16, 24, pp, 5, 6, 8
+IPFILTER_CHROMA 16, 64, pp, 5, 6, 8
+IPFILTER_CHROMA 24, 64, pp, 5, 6, 8
+IPFILTER_CHROMA 32, 48, pp, 5, 6, 8
+IPFILTER_CHROMA 32, 64, pp, 5, 6, 8
+IPFILTER_CHROMA 6, 16, ps, 6, 7, 6     ; additional block sizes, ps
+IPFILTER_CHROMA 8, 12, ps, 6, 7, 6
+IPFILTER_CHROMA 8, 64, ps, 6, 7, 6
+IPFILTER_CHROMA 12, 32, ps, 6, 7, 6
+IPFILTER_CHROMA 16, 24, ps, 6, 7, 6
+IPFILTER_CHROMA 16, 64, ps, 6, 7, 6
+IPFILTER_CHROMA 24, 64, ps, 6, 7, 6
+IPFILTER_CHROMA 32, 48, ps, 6, 7, 6
+IPFILTER_CHROMA 32, 64, ps, 6, 7, 6
+
+IPFILTER_CHROMA 48, 64, pp, 5, 6, 8    ; 48- and 64-wide blocks use the FILTER_W8o_1 based row kernels
+IPFILTER_CHROMA 64, 48, pp, 5, 6, 8
+IPFILTER_CHROMA 64, 64, pp, 5, 6, 8
+IPFILTER_CHROMA 64, 32, pp, 5, 6, 8
+IPFILTER_CHROMA 64, 16, pp, 5, 6, 8
+IPFILTER_CHROMA 48, 64, ps, 6, 7, 6
+IPFILTER_CHROMA 64, 48, ps, 6, 7, 6
+IPFILTER_CHROMA 64, 64, ps, 6, 7, 6
+IPFILTER_CHROMA 64, 32, ps, 6, 7, 6
+IPFILTER_CHROMA 64, 16, ps, 6, 7, 6
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_4tap_horiz_pp_6xN(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+INIT_YMM avx2
+%macro IPFILTER_CHROMA_avx2_6xN 1                 ; %1 = block height; two rows are produced per loop iteration
+cglobal interp_4tap_horiz_pp_6x%1, 5,6,8
+    add             r1, r1                        ; srcStride in bytes; full-width add keeps the intptr_t stride intact
+    add             r3, r3                        ; dstStride in bytes
+    sub             r0, 2                         ; start one sample (2 bytes) left of src
+    mov             r4d, r4m                      ; coeffIdx
+%ifdef PIC
+    lea             r5, [tab_ChromaCoeff]
+    vpbroadcastq    m0, [r5 + r4 * 8]             ; one 8-byte coefficient row replicated to every qword
+%else
+    vpbroadcastq    m0, [tab_ChromaCoeff + r4 * 8]
+%endif
+    mova            m1, [h4_interp8_hpp_shuf]     ; source shuffle mask (defined elsewhere)
+    vpbroadcastd    m2, [pd_32]                   ; rounding constant added before the shift
+    pxor            m5, m5                        ; lower clip bound
+    mova            m6, [idct8_shuf2]             ; word reorder mask applied after packing
+    mova            m7, [pw_pixel_max]            ; upper clip bound
+
+    mov             r4d, %1/2                     ; r4 is free again: loop counter in row pairs
+.loop:
+    vbroadcasti128  m3, [r0]
+    vbroadcasti128  m4, [r0 + 8]
+    pshufb          m3, m1
+    pshufb          m4, m1
+
+    pmaddwd         m3, m0
+    pmaddwd         m4, m0
+    phaddd          m3, m4
+    paddd           m3, m2
+    psrad           m3, INTERP_SHIFT_PP           ; m3 = DWORD[7 6 3 2 5 4 1 0]
+
+    packusdw        m3, m3
+    vpermq          m3, m3, q2020
+    pshufb          xm3, xm6                      ; m3 = WORD[7 6 5 4 3 2 1 0]
+    CLIPW           xm3, xm5, xm7
+    movq            [r2], xm3                     ; samples 0..3
+    pextrd          [r2 + 8], xm3, 2              ; samples 4,5
+
+    vbroadcasti128  m3, [r0 + r1]                 ; second row of the pair
+    vbroadcasti128  m4, [r0 + r1 + 8]
+    pshufb          m3, m1
+    pshufb          m4, m1
+
+    pmaddwd         m3, m0
+    pmaddwd         m4, m0
+    phaddd          m3, m4
+    paddd           m3, m2
+    psrad           m3, INTERP_SHIFT_PP           ; m3 = DWORD[7 6 3 2 5 4 1 0]
+
+    packusdw        m3, m3
+    vpermq          m3, m3, q2020
+    pshufb          xm3, xm6                      ; m3 = WORD[7 6 5 4 3 2 1 0]
+    CLIPW           xm3, xm5, xm7
+    movq            [r2 + r3], xm3
+    pextrd          [r2 + r3 + 8], xm3, 2
+
+    lea             r0, [r0 + r1 * 2]             ; advance both pointers by two rows
+    lea             r2, [r2 + r3 * 2]
+    dec             r4d
+    jnz             .loop
+    RET
+%endmacro
+IPFILTER_CHROMA_avx2_6xN 8
+IPFILTER_CHROMA_avx2_6xN 16
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_4tap_horiz_pp_8x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal interp_4tap_horiz_pp_8x2, 5,6,8
+    add             r1, r1                       ; srcStride in bytes; full-width add keeps the intptr_t stride intact
+    add             r3, r3                       ; dstStride in bytes
+    sub             r0, 2                        ; start one sample (2 bytes) left of src
+    mov             r4d, r4m                     ; coeffIdx
+%ifdef PIC
+    lea             r5, [tab_ChromaCoeff]
+    vpbroadcastq    m0, [r5 + r4 * 8]            ; one 8-byte coefficient row replicated to every qword
+%else
+    vpbroadcastq    m0, [tab_ChromaCoeff + r4 * 8]
+%endif
+    mova            m1, [h4_interp8_hpp_shuf]    ; source shuffle mask (defined elsewhere)
+    vpbroadcastd    m2, [pd_32]                  ; rounding constant added before the shift
+    pxor            m5, m5                       ; lower clip bound
+    mova            m6, [idct8_shuf2]            ; word reorder mask applied after packing
+    mova            m7, [pw_pixel_max]           ; upper clip bound
+
+    vbroadcasti128  m3, [r0]                     ; row 0
+    vbroadcasti128  m4, [r0 + 8]
+    pshufb          m3, m1
+    pshufb          m4, m1
+
+    pmaddwd         m3, m0
+    pmaddwd         m4, m0
+    phaddd          m3, m4
+    paddd           m3, m2
+    psrad           m3, INTERP_SHIFT_PP          ; m3 = DWORD[7 6 3 2 5 4 1 0]
+
+    packusdw        m3, m3
+    vpermq          m3, m3,q2020
+    pshufb          xm3, xm6                     ; m3 = WORD[7 6 5 4 3 2 1 0]
+    CLIPW           xm3, xm5, xm7
+    movu            [r2], xm3                    ; 8 output samples
+
+    vbroadcasti128  m3, [r0 + r1]                ; row 1
+    vbroadcasti128  m4, [r0 + r1 + 8]
+    pshufb          m3, m1
+    pshufb          m4, m1
+
+    pmaddwd         m3, m0
+    pmaddwd         m4, m0
+    phaddd          m3, m4
+    paddd           m3, m2
+    psrad           m3, INTERP_SHIFT_PP           ; m3 = DWORD[7 6 3 2 5 4 1 0]
+
+    packusdw        m3, m3
+    vpermq          m3, m3,q2020
+    pshufb          xm3, xm6                      ; m3 = WORD[7 6 5 4 3 2 1 0]
+    CLIPW           xm3, xm5, xm7
+    movu            [r2 + r3], xm3
+    RET
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_4tap_horiz_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal interp_4tap_horiz_pp_8x4, 5,6,8
+    add             r1, r1                      ; srcStride in bytes; full-width add keeps the intptr_t stride intact
+    add             r3, r3                      ; dstStride in bytes
+    sub             r0, 2                       ; start one sample (2 bytes) left of src
+    mov             r4d, r4m                    ; coeffIdx
+%ifdef PIC
+    lea             r5, [tab_ChromaCoeff]
+    vpbroadcastq    m0, [r5 + r4 * 8]           ; one 8-byte coefficient row replicated to every qword
+%else
+    vpbroadcastq    m0, [tab_ChromaCoeff + r4 * 8]
+%endif
+    mova            m1, [h4_interp8_hpp_shuf]   ; source shuffle mask (defined elsewhere)
+    vpbroadcastd    m2, [pd_32]                 ; rounding constant added before the shift
+    pxor            m5, m5                      ; lower clip bound
+    mova            m6, [idct8_shuf2]           ; word reorder mask applied after packing
+    mova            m7, [pw_pixel_max]          ; upper clip bound
+
+%rep 2
+    vbroadcasti128  m3, [r0]
+    vbroadcasti128  m4, [r0 + 8]
+    pshufb          m3, m1
+    pshufb          m4, m1
+
+    pmaddwd         m3, m0
+    pmaddwd         m4, m0
+    phaddd          m3, m4
+    paddd           m3, m2
+    psrad           m3, INTERP_SHIFT_PP         ; m3 = DWORD[7 6 3 2 5 4 1 0]
+
+    packusdw        m3, m3
+    vpermq          m3, m3,q2020
+    pshufb          xm3, xm6                    ; m3 = WORD[7 6 5 4 3 2 1 0]
+    CLIPW           xm3, xm5, xm7
+    movu            [r2], xm3
+
+    vbroadcasti128  m3, [r0 + r1]               ; second row of the pair
+    vbroadcasti128  m4, [r0 + r1 + 8]
+    pshufb          m3, m1
+    pshufb          m4, m1
+
+    pmaddwd         m3, m0
+    pmaddwd         m4, m0
+    phaddd          m3, m4
+    paddd           m3, m2
+    psrad           m3, INTERP_SHIFT_PP         ; m3 = DWORD[7 6 3 2 5 4 1 0]
+
+    packusdw        m3, m3
+    vpermq          m3, m3,q2020
+    pshufb          xm3, xm6                    ; m3 = WORD[7 6 5 4 3 2 1 0]
+    CLIPW           xm3, xm5, xm7
+    movu            [r2 + r3], xm3
+
+    lea             r0, [r0 + r1 * 2]           ; advance both pointers by two rows
+    lea             r2, [r2 + r3 * 2]
+%endrep
+    RET
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_4tap_horiz_pp_8xN(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+INIT_YMM avx2
+%macro IPFILTER_CHROMA_avx2_8xN 1                 ; %1 = block height; two rows are produced per loop iteration
+cglobal interp_4tap_horiz_pp_8x%1, 5,6,8
+    add             r1, r1                        ; srcStride in bytes; full-width add keeps the intptr_t stride intact
+    add             r3, r3                        ; dstStride in bytes
+    sub             r0, 2                         ; start one sample (2 bytes) left of src
+    mov             r4d, r4m                      ; coeffIdx
+%ifdef PIC
+    lea             r5, [tab_ChromaCoeff]
+    vpbroadcastq    m0, [r5 + r4 * 8]             ; one 8-byte coefficient row replicated to every qword
+%else
+    vpbroadcastq    m0, [tab_ChromaCoeff + r4 * 8]
+%endif
+    mova            m1, [h4_interp8_hpp_shuf]     ; source shuffle mask (defined elsewhere)
+    vpbroadcastd    m2, [pd_32]                   ; rounding constant added before the shift
+    pxor            m5, m5                        ; lower clip bound
+    mova            m6, [idct8_shuf2]             ; word reorder mask applied after packing
+    mova            m7, [pw_pixel_max]            ; upper clip bound
+
+    mov             r4d, %1/2                     ; r4 is free again: loop counter in row pairs
+.loop:
+    vbroadcasti128  m3, [r0]
+    vbroadcasti128  m4, [r0 + 8]
+    pshufb          m3, m1
+    pshufb          m4, m1
+
+    pmaddwd         m3, m0
+    pmaddwd         m4, m0
+    phaddd          m3, m4
+    paddd           m3, m2
+    psrad           m3, INTERP_SHIFT_PP           ; m3 = DWORD[7 6 3 2 5 4 1 0]
+
+    packusdw        m3, m3
+    vpermq          m3, m3, q2020
+    pshufb          xm3, xm6                      ; m3 = WORD[7 6 5 4 3 2 1 0]
+    CLIPW           xm3, xm5, xm7
+    movu            [r2], xm3
+
+    vbroadcasti128  m3, [r0 + r1]                 ; second row of the pair
+    vbroadcasti128  m4, [r0 + r1 + 8]
+    pshufb          m3, m1
+    pshufb          m4, m1
+
+    pmaddwd         m3, m0
+    pmaddwd         m4, m0
+    phaddd          m3, m4
+    paddd           m3, m2
+    psrad           m3, INTERP_SHIFT_PP           ; m3 = DWORD[7 6 3 2 5 4 1 0]
+
+    packusdw        m3, m3
+    vpermq          m3, m3, q2020
+    pshufb          xm3, xm6                      ; m3 = WORD[7 6 5 4 3 2 1 0]
+    CLIPW           xm3, xm5, xm7
+    movu            [r2 + r3], xm3
+
+    lea             r0, [r0 + r1 * 2]             ; advance both pointers by two rows
+    lea             r2, [r2 + r3 * 2]
+    dec             r4d
+    jnz             .loop
+    RET
+%endmacro
+IPFILTER_CHROMA_avx2_8xN 6
+IPFILTER_CHROMA_avx2_8xN 8
+IPFILTER_CHROMA_avx2_8xN 12
+IPFILTER_CHROMA_avx2_8xN 16
+IPFILTER_CHROMA_avx2_8xN 32
+IPFILTER_CHROMA_avx2_8xN 64
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_4tap_horiz_pp_16xN(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+INIT_YMM avx2
+%macro IPFILTER_CHROMA_avx2_16xN 1               ; %1 = block height; one row is produced per loop iteration
+%if ARCH_X86_64
+cglobal interp_4tap_horiz_pp_16x%1, 5,6,9
+    add             r1, r1                       ; srcStride in bytes; full-width add keeps the intptr_t stride intact
+    add             r3, r3                       ; dstStride in bytes
+    sub             r0, 2                        ; start one sample (2 bytes) left of src
+    mov             r4d, r4m                     ; coeffIdx
+%ifdef PIC
+    lea             r5, [tab_ChromaCoeff]
+    vpbroadcastq    m0, [r5 + r4 * 8]            ; one 8-byte coefficient row replicated to every qword
+%else
+    vpbroadcastq    m0, [tab_ChromaCoeff + r4 * 8]
+%endif
+    mova            m1, [h4_interp8_hpp_shuf]    ; source shuffle mask (defined elsewhere)
+    vpbroadcastd    m2, [pd_32]                  ; rounding constant added before the shift
+    pxor            m5, m5                       ; lower clip bound
+    mova            m6, [idct8_shuf2]            ; word reorder mask applied after packing
+    mova            m7, [pw_pixel_max]           ; upper clip bound
+
+    mov             r4d, %1                      ; r4 is free again: row counter
+.loop:
+    vbroadcasti128  m3, [r0]                     ; samples 0..7
+    vbroadcasti128  m4, [r0 + 8]
+
+    pshufb          m3, m1
+    pshufb          m4, m1
+
+    pmaddwd         m3, m0
+    pmaddwd         m4, m0
+    phaddd          m3, m4
+    paddd           m3, m2
+    psrad           m3, INTERP_SHIFT_PP         ; m3 = DWORD[7 6 3 2 5 4 1 0]
+
+    packusdw        m3, m3
+    vpermq          m3, m3, q2020
+    pshufb          xm3, xm6                     ; m3 = WORD[7 6 5 4 3 2 1 0]
+
+    vbroadcasti128  m4, [r0 + 16]                ; samples 8..15
+    vbroadcasti128  m8, [r0 + 24]
+
+    pshufb          m4, m1
+    pshufb          m8, m1
+
+    pmaddwd         m4, m0
+    pmaddwd         m8, m0
+    phaddd          m4, m8
+    paddd           m4, m2
+    psrad           m4, INTERP_SHIFT_PP         ; m4 = DWORD[7 6 3 2 5 4 1 0]
+
+    packusdw        m4, m4
+    vpermq          m4, m4, q2020
+    pshufb          xm4, xm6                    ; m4 = WORD[7 6 5 4 3 2 1 0]
+    vinserti128     m3, m3, xm4, 1              ; low lane = samples 0..7, high lane = samples 8..15
+    CLIPW           m3, m5, m7
+    movu            [r2], m3
+
+    add             r0, r1
+    add             r2, r3
+    dec             r4d
+    jnz             .loop
+    RET
+%endif
+%endmacro
+IPFILTER_CHROMA_avx2_16xN 4
+IPFILTER_CHROMA_avx2_16xN 8
+IPFILTER_CHROMA_avx2_16xN 12
+IPFILTER_CHROMA_avx2_16xN 16
+IPFILTER_CHROMA_avx2_16xN 24
+IPFILTER_CHROMA_avx2_16xN 32
+IPFILTER_CHROMA_avx2_16xN 64
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_4tap_horiz_pp_32xN(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+INIT_YMM avx2
+%macro IPFILTER_CHROMA_avx2_32xN 1               ; %1 = block height; one row is produced per loop iteration
+%if ARCH_X86_64
+cglobal interp_4tap_horiz_pp_32x%1, 5,6,9
+    add             r1, r1                       ; srcStride in bytes; full-width add keeps the intptr_t stride intact
+    add             r3, r3                       ; dstStride in bytes
+    sub             r0, 2                        ; start one sample (2 bytes) left of src
+    mov             r4d, r4m                     ; coeffIdx
+%ifdef PIC
+    lea             r5, [tab_ChromaCoeff]
+    vpbroadcastq    m0, [r5 + r4 * 8]            ; one 8-byte coefficient row replicated to every qword
+%else
+    vpbroadcastq    m0, [tab_ChromaCoeff + r4 * 8]
+%endif
+    mova            m1, [h4_interp8_hpp_shuf]    ; source shuffle mask (defined elsewhere)
+    vpbroadcastd    m2, [pd_32]                  ; rounding constant added before the shift
+    pxor            m5, m5                       ; lower clip bound
+    mova            m6, [idct8_shuf2]            ; word reorder mask applied after packing
+    mova            m7, [pw_pixel_max]           ; upper clip bound
+
+    mov             r4d, %1                      ; row counter in r4 (free after the coefficient load); r6 is not declared in cglobal
+.loop:
+%assign x 0
+%rep 2
+    vbroadcasti128  m3, [r0 + x]                 ; x = byte offset of the current 16-sample half
+    vbroadcasti128  m4, [r0 + 8 + x]
+    pshufb          m3, m1
+    pshufb          m4, m1
+
+    pmaddwd         m3, m0
+    pmaddwd         m4, m0
+    phaddd          m3, m4
+    paddd           m3, m2
+    psrad           m3, INTERP_SHIFT_PP         ; m3 = DWORD[7 6 3 2 5 4 1 0]
+
+    packusdw        m3, m3
+    vpermq          m3, m3, q2020
+    pshufb          xm3, xm6                     ; m3 = WORD[7 6 5 4 3 2 1 0]
+
+    vbroadcasti128  m4, [r0 + 16 + x]
+    vbroadcasti128  m8, [r0 + 24 + x]
+    pshufb          m4, m1
+    pshufb          m8, m1
+
+    pmaddwd         m4, m0
+    pmaddwd         m8, m0
+    phaddd          m4, m8
+    paddd           m4, m2
+    psrad           m4, INTERP_SHIFT_PP         ; m4 = DWORD[7 6 3 2 5 4 1 0]
+
+    packusdw        m4, m4
+    vpermq          m4, m4, q2020
+    pshufb          xm4, xm6                    ; m4 = WORD[7 6 5 4 3 2 1 0]
+    vinserti128     m3, m3, xm4, 1              ; low lane = first 8 samples, high lane = next 8
+    CLIPW           m3, m5, m7
+    movu            [r2 + x], m3
+    %assign x x+32
+    %endrep
+
+    add             r0, r1
+    add             r2, r3
+    dec             r4d
+    jnz             .loop
+    RET
+%endif
+%endmacro
+IPFILTER_CHROMA_avx2_32xN 8
+IPFILTER_CHROMA_avx2_32xN 16
+IPFILTER_CHROMA_avx2_32xN 24
+IPFILTER_CHROMA_avx2_32xN 32
+IPFILTER_CHROMA_avx2_32xN 48
+IPFILTER_CHROMA_avx2_32xN 64
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_4tap_horiz_pp_12xN(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+INIT_YMM avx2
+%macro IPFILTER_CHROMA_avx2_12xN 1               ; %1 = block height; one row is produced per loop iteration
+%if ARCH_X86_64
+cglobal interp_4tap_horiz_pp_12x%1, 5,6,8
+    add             r1, r1                       ; srcStride in bytes; full-width add keeps the intptr_t stride intact
+    add             r3, r3                       ; dstStride in bytes
+    sub             r0, 2                        ; start one sample (2 bytes) left of src
+    mov             r4d, r4m                     ; coeffIdx
+%ifdef PIC
+    lea             r5, [tab_ChromaCoeff]
+    vpbroadcastq    m0, [r5 + r4 * 8]            ; one 8-byte coefficient row replicated to every qword
+%else
+    vpbroadcastq    m0, [tab_ChromaCoeff + r4 * 8]
+%endif
+    mova            m1, [h4_interp8_hpp_shuf]    ; source shuffle mask (defined elsewhere)
+    vpbroadcastd    m2, [pd_32]                  ; rounding constant added before the shift
+    pxor            m5, m5                       ; lower clip bound
+    mova            m6, [idct8_shuf2]            ; word reorder mask applied after packing
+    mova            m7, [pw_pixel_max]           ; upper clip bound
+
+    mov             r4d, %1                      ; r4 is free again: row counter
+.loop:
+    vbroadcasti128  m3, [r0]                     ; samples 0..7
+    vbroadcasti128  m4, [r0 + 8]
+    pshufb          m3, m1
+    pshufb          m4, m1
+
+    pmaddwd         m3, m0
+    pmaddwd         m4, m0
+    phaddd          m3, m4
+    paddd           m3, m2
+    psrad           m3, INTERP_SHIFT_PP         ; m3 = DWORD[7 6 3 2 5 4 1 0]
+
+    packusdw        m3, m3
+    vpermq          m3, m3, q2020
+    pshufb          xm3, xm6                     ; m3 = WORD[7 6 5 4 3 2 1 0]
+    CLIPW           xm3, xm5, xm7
+    movu            [r2], xm3
+
+    vbroadcasti128  m3, [r0 + 16]                ; samples 8..15 are computed, only 8..11 are stored
+    vbroadcasti128  m4, [r0 + 24]
+    pshufb          m3, m1
+    pshufb          m4, m1
+
+    pmaddwd         m3, m0
+    pmaddwd         m4, m0
+    phaddd          m3, m4
+    paddd           m3, m2
+    psrad           m3, INTERP_SHIFT_PP         ; m3 = DWORD[7 6 3 2 5 4 1 0]
+
+    packusdw        m3, m3
+    vpermq          m3, m3, q2020
+    pshufb          xm3, xm6                    ; m3 = WORD[7 6 5 4 3 2 1 0]
+    CLIPW           xm3, xm5, xm7
+    movq            [r2 + 16], xm3              ; samples 8..11
+
+    add             r0, r1
+    add             r2, r3
+    dec             r4d
+    jnz             .loop
+    RET
+%endif
+%endmacro
+IPFILTER_CHROMA_avx2_12xN 16
+IPFILTER_CHROMA_avx2_12xN 32
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;
+; AVX2 horizontal 4-tap filter, 24 samples per row, %1 rows (x86-64 only).
+; Each row is produced as one 32-byte store (samples 0..15) followed by one
+; 16-byte store (samples 16..23).  Results are rounded with pd_32, shifted
+; right by 6 and clamped to [0, pw_pixel_max].
+;-------------------------------------------------------------------------------------------------------------
+INIT_YMM avx2
+%macro IPFILTER_CHROMA_avx2_24xN 1
+%if ARCH_X86_64
+cglobal interp_4tap_horiz_pp_24x%1, 5,6,9
+    ; double both strides (presumably element -> byte units for 16-bit samples)
+    add             r1d, r1d
+    add             r3d, r3d
+    ; start reading 2 bytes (one 16-bit sample) before src
+    sub             r0, 2
+    mov             r4d, r4m
+    ; m0 = the four 16-bit taps of row coeffIdx (8 bytes per row), broadcast to every qword
+%ifdef PIC
+    lea             r5, [tab_ChromaCoeff]
+    vpbroadcastq    m0, [r5 + r4 * 8]
+%else
+    vpbroadcastq    m0, [tab_ChromaCoeff + r4 * 8]
+%endif
+    mova            m1, [h4_interp8_hpp_shuf]   ; m1 = byte shuffle applied to every loaded source vector
+    vpbroadcastd    m2, [pd_32]                 ; m2 = rounding term added before the >> 6
+    pxor            m5, m5                      ; m5 = 0, lower clip bound
+    mova            m6, [idct8_shuf2]           ; m6 = shuffle that restores sample order after the pack
+    mova            m7, [pw_pixel_max]          ; m7 = upper clip bound
+
+    ; r4 no longer holds coeffIdx; reuse it as the row counter
+    mov             r4d, %1
+.loop:
+    ; ---- samples 0..7 -> m3 (dwords) ----
+    vbroadcasti128  m3, [r0]
+    vbroadcasti128  m4, [r0 + 8]
+    pshufb          m3, m1
+    pshufb          m4, m1
+
+    pmaddwd         m3, m0
+    pmaddwd         m4, m0
+    phaddd          m3, m4
+    paddd           m3, m2
+    psrad           m3, 6
+
+    ; ---- samples 8..15 -> m4 (dwords), m8 is the second temporary ----
+    vbroadcasti128  m4, [r0 + 16]
+    vbroadcasti128  m8, [r0 + 24]
+    pshufb          m4, m1
+    pshufb          m8, m1
+
+    pmaddwd         m4, m0
+    pmaddwd         m8, m0
+    phaddd          m4, m8
+    paddd           m4, m2
+    psrad           m4, 6
+
+    ; narrow both dword vectors to words, reorder, clip and store 32 bytes
+    packusdw        m3, m4
+    vpermq          m3, m3, q3120
+    pshufb          m3, m6
+    CLIPW           m3, m5, m7
+    movu            [r2], m3
+
+    ; ---- samples 16..23: single 128-bit result ----
+    vbroadcasti128  m3, [r0 + 32]
+    vbroadcasti128  m4, [r0 + 40]
+    pshufb          m3, m1
+    pshufb          m4, m1
+
+    pmaddwd         m3, m0
+    pmaddwd         m4, m0
+    phaddd          m3, m4
+    paddd           m3, m2
+    psrad           m3, 6
+
+    packusdw        m3, m3
+    vpermq          m3, m3, q2020
+    pshufb          xm3, xm6
+    CLIPW           xm3, xm5, xm7
+    movu            [r2 + 32], xm3
+
+    ; advance to the next source / destination row
+    add             r0, r1
+    add             r2, r3
+    dec             r4d
+    jnz             .loop
+    RET
+%endif
+%endmacro
+IPFILTER_CHROMA_avx2_24xN 32
+IPFILTER_CHROMA_avx2_24xN 64
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;
+; AVX2 horizontal 4-tap filter, 64 samples per row, %1 rows (x86-64 only).
+; Every row is produced as four 32-byte stores of 16 samples each.  Results
+; are rounded with pd_32, shifted right by 6 and clamped to [0, pw_pixel_max].
+;
+; The row counter lives in r4 (free once the coefficients are loaded) so that
+; the function only touches the six general purpose registers it declares in
+; cglobal, matching the 12xN / 24xN / 48x64 variants.
+;-------------------------------------------------------------------------------------------------------------
+INIT_YMM avx2
+%macro IPFILTER_CHROMA_avx2_64xN 1
+%if ARCH_X86_64
+cglobal interp_4tap_horiz_pp_64x%1, 5,6,9
+    ; double both strides (presumably element -> byte units for 16-bit samples)
+    add             r1d, r1d
+    add             r3d, r3d
+    ; start reading 2 bytes (one 16-bit sample) before src
+    sub             r0, 2
+    mov             r4d, r4m
+    ; m0 = the four 16-bit taps of row coeffIdx (8 bytes per row), broadcast to every qword
+%ifdef PIC
+    lea             r5, [tab_ChromaCoeff]
+    vpbroadcastq    m0, [r5 + r4 * 8]
+%else
+    vpbroadcastq    m0, [tab_ChromaCoeff + r4 * 8]
+%endif
+    mova            m1, [h4_interp8_hpp_shuf]   ; m1 = byte shuffle applied to every loaded source vector
+    vpbroadcastd    m2, [pd_32]                 ; m2 = rounding term added before the >> 6
+    pxor            m5, m5                      ; m5 = 0, lower clip bound
+    mova            m6, [idct8_shuf2]           ; m6 = shuffle that restores sample order after the pack
+    mova            m7, [pw_pixel_max]          ; m7 = upper clip bound
+
+    ; r4 no longer holds coeffIdx; reuse it as the row counter
+    mov             r4d, %1
+.loop:
+%assign x 0
+%rep 4
+    ; ---- samples (x/2)..(x/2 + 7) -> m3 (dwords) ----
+    vbroadcasti128  m3, [r0 + x]
+    vbroadcasti128  m4, [r0 + 8 + x]
+    pshufb          m3, m1
+    pshufb          m4, m1
+
+    pmaddwd         m3, m0
+    pmaddwd         m4, m0
+    phaddd          m3, m4
+    paddd           m3, m2
+    psrad           m3, 6
+
+    ; ---- next eight samples -> m4 (dwords), m8 is the second temporary ----
+    vbroadcasti128  m4, [r0 + 16 + x]
+    vbroadcasti128  m8, [r0 + 24 + x]
+    pshufb          m4, m1
+    pshufb          m8, m1
+
+    pmaddwd         m4, m0
+    pmaddwd         m8, m0
+    phaddd          m4, m8
+    paddd           m4, m2
+    psrad           m4, 6
+
+    ; narrow both dword vectors to words, reorder, clip and store 32 bytes
+    packusdw        m3, m4
+    vpermq          m3, m3, q3120
+    pshufb          m3, m6
+    CLIPW           m3, m5, m7
+    movu            [r2 + x], m3
+    %assign x x+32
+    %endrep
+
+    ; advance to the next source / destination row
+    add             r0, r1
+    add             r2, r3
+    dec             r4d
+    jnz             .loop
+    RET
+%endif
+%endmacro
+IPFILTER_CHROMA_avx2_64xN 16
+IPFILTER_CHROMA_avx2_64xN 32
+IPFILTER_CHROMA_avx2_64xN 48
+IPFILTER_CHROMA_avx2_64xN 64
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_4tap_horiz_pp(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;
+; AVX2 horizontal 4-tap filter for the 48x64 block (x86-64 only).
+; Every row is produced as three 32-byte stores of 16 samples each.  Results
+; are rounded with pd_32, shifted right by 6 and clamped to [0, pw_pixel_max].
+;-------------------------------------------------------------------------------------------------------------
+INIT_YMM avx2
+%if ARCH_X86_64
+cglobal interp_4tap_horiz_pp_48x64, 5,6,9
+    ; double both strides (presumably element -> byte units for 16-bit samples)
+    add             r1d, r1d
+    add             r3d, r3d
+    ; start reading 2 bytes (one 16-bit sample) before src
+    sub             r0, 2
+    mov             r4d, r4m
+    ; m0 = the four 16-bit taps of row coeffIdx (8 bytes per row), broadcast to every qword
+%ifdef PIC
+    lea             r5, [tab_ChromaCoeff]
+    vpbroadcastq    m0, [r5 + r4 * 8]
+%else
+    vpbroadcastq    m0, [tab_ChromaCoeff + r4 * 8]
+%endif
+    mova            m1, [h4_interp8_hpp_shuf]   ; m1 = byte shuffle applied to every loaded source vector
+    vpbroadcastd    m2, [pd_32]                 ; m2 = rounding term added before the >> 6
+    pxor            m5, m5                      ; m5 = 0, lower clip bound
+    mova            m6, [idct8_shuf2]           ; m6 = shuffle that restores sample order after the pack
+    mova            m7, [pw_pixel_max]          ; m7 = upper clip bound
+
+    ; r4 no longer holds coeffIdx; reuse it as the row counter (64 rows)
+    mov             r4d, 64
+.loop:
+%assign x 0
+%rep 3
+    ; ---- first eight samples of this 16-sample group -> m3 (dwords) ----
+    vbroadcasti128  m3, [r0 + x]
+    vbroadcasti128  m4, [r0 + 8 + x]
+    pshufb          m3, m1
+    pshufb          m4, m1
+
+    pmaddwd         m3, m0
+    pmaddwd         m4, m0
+    phaddd          m3, m4
+    paddd           m3, m2
+    psrad           m3, 6
+
+    ; ---- second eight samples -> m4 (dwords), m8 is the second temporary ----
+    vbroadcasti128  m4, [r0 + 16 + x]
+    vbroadcasti128  m8, [r0 + 24 + x]
+    pshufb          m4, m1
+    pshufb          m8, m1
+
+    pmaddwd         m4, m0
+    pmaddwd         m8, m0
+    phaddd          m4, m8
+    paddd           m4, m2
+    psrad           m4, 6
+
+    ; narrow both dword vectors to words, reorder, clip and store 32 bytes
+    packusdw        m3, m4
+    vpermq          m3, m3, q3120
+    pshufb          m3, m6
+    CLIPW           m3, m5, m7
+    movu            [r2 + x], m3
+%assign x x+32
+%endrep
+
+    ; advance to the next source / destination row
+    add             r0, r1
+    add             r2, r3
+    dec             r4d
+    jnz             .loop
+    RET
+%endif
+
+; interp_4tap_horiz_ps_8xN, AVX2, x86-64 only; N is the macro argument %1.
+; Horizontal 4-tap filter producing eight signed 16-bit results per row: the
+; sums get INTERP_OFFSET_PS added, are shifted right by INTERP_SHIFT_PS and
+; are packed with signed saturation (no pixel clipping).
+; Arguments 5 and 6 are fetched through r4m / r5m:
+;   r4 - index of the 8-byte coefficient row in tab_ChromaCoeff
+;   r5 - when non-zero, filtering starts one source row earlier and runs for
+;        three additional rows (presumably the "isRowExt" flag - confirm
+;        against the C prototype)
+%macro IPFILTER_CHROMA_PS_8xN_AVX2 1
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_4tap_horiz_ps_8x%1, 4, 7, 6
+    ; double both strides (presumably element -> byte units for 16-bit samples)
+    add                 r1d, r1d
+    add                 r3d, r3d
+    mov                 r4d, r4m
+    mov                 r5d, r5m
+
+%ifdef PIC
+    lea                 r6, [tab_ChromaCoeff]
+    vpbroadcastq        m0, [r6 + r4 * 8]
+%else
+    vpbroadcastq        m0, [tab_ChromaCoeff + r4 * 8]
+%endif
+    mova                m3, [h4_interp8_hpp_shuf]
+    vbroadcasti128      m2, [INTERP_OFFSET_PS]
+
+    ; register map
+    ; m0      - filter coefficients (broadcast)
+    ; m2      - rounding offset
+    ; m3      - source shuffle mask
+    ; m4, m5  - temporaries (m1 is not used)
+
+    ; start reading 2 bytes (one 16-bit sample) before src
+    sub                 r0, 2
+    test                r5d, r5d
+    mov                 r4d, %1                 ; row counter; mov leaves the flags from test intact
+    jz                  .loop0
+    ; extended mode: back up one source row and process three more rows
+    sub                 r0, r1
+    add                 r4d, 3
+
+.loop0:
+    vbroadcasti128      m4, [r0]
+    vbroadcasti128      m5, [r0 + 8]
+    pshufb              m4, m3
+    pshufb              m5, m3
+    pmaddwd             m4, m0
+    pmaddwd             m5, m0
+    phaddd              m4, m5
+    paddd               m4, m2
+    vpermq              m4, m4, q3120
+    psrad               m4, INTERP_SHIFT_PS
+    ; pack the two 128-bit halves of dwords into eight signed words
+    vextracti128        xm5, m4, 1
+    packssdw            xm4, xm5
+    movu                [r2], xm4
+    add                 r2, r3
+    add                 r0, r1
+    dec                 r4d
+    jnz                 .loop0
+    RET
+%endif
+%endmacro
+
+    IPFILTER_CHROMA_PS_8xN_AVX2 4
+    IPFILTER_CHROMA_PS_8xN_AVX2 8
+    IPFILTER_CHROMA_PS_8xN_AVX2 16
+    IPFILTER_CHROMA_PS_8xN_AVX2 32
+    IPFILTER_CHROMA_PS_8xN_AVX2 6
+    IPFILTER_CHROMA_PS_8xN_AVX2 2
+    IPFILTER_CHROMA_PS_8xN_AVX2 12
+    IPFILTER_CHROMA_PS_8xN_AVX2 64
+
+; interp_4tap_horiz_ps_16xN, AVX2, x86-64 only; N is the macro argument %1.
+; Each row is filtered as two groups of eight samples; every group adds
+; INTERP_OFFSET_PS, shifts right by INTERP_SHIFT_PS and stores eight signed
+; 16-bit results (16 bytes).  A non-zero sixth argument makes the function
+; start one source row earlier and process three additional rows.
+%macro IPFILTER_CHROMA_PS_16xN_AVX2 1
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_4tap_horiz_ps_16x%1, 4, 7, 6
+    add                 r1d, r1d                ; source stride doubled
+    add                 r3d, r3d                ; destination stride doubled
+    sub                 r0, 2                   ; begin 2 bytes before src
+    mov                 r4d, r4m                ; coefficient row index
+    mov                 r5d, r5m                ; row-extension flag
+
+    ; m0 = selected 8-byte coefficient row, broadcast
+%ifdef PIC
+    lea                 r6, [tab_ChromaCoeff]
+    vpbroadcastq        m0, [r6 + r4 * 8]
+%else
+    vpbroadcastq        m0, [tab_ChromaCoeff + r4 * 8]
+%endif
+    vbroadcasti128      m2, [INTERP_OFFSET_PS]  ; m2 = rounding offset
+    mova                m3, [h4_interp8_hpp_shuf] ; m3 = source shuffle mask
+
+    mov                 r4d, %1                 ; r4 becomes the row counter
+    test                r5d, r5d
+    jz                  .row
+    sub                 r0, r1                  ; extended: one row up ...
+    add                 r4d, 3                  ; ... and three rows more
+
+.row:
+%assign x 0
+%rep 2
+    ; eight results for byte offset x of the current row
+    vbroadcasti128      m4, [r0 + x]
+    vbroadcasti128      m5, [r0 + x + 8]
+    pshufb              m4, m3
+    pshufb              m5, m3
+    pmaddwd             m4, m0
+    pmaddwd             m5, m0
+    phaddd              m4, m5
+    paddd               m4, m2
+    vpermq              m4, m4, q3120
+    psrad               m4, INTERP_SHIFT_PS
+    vextracti128        xm5, m4, 1
+    packssdw            xm4, xm5
+    movu                [r2 + x], xm4
+%assign x x+16
+%endrep
+
+    add                 r2, r3
+    add                 r0, r1
+    dec                 r4d
+    jnz                 .row
+    RET
+%endif
+%endmacro
+
+IPFILTER_CHROMA_PS_16xN_AVX2 16
+IPFILTER_CHROMA_PS_16xN_AVX2 8
+IPFILTER_CHROMA_PS_16xN_AVX2 32
+IPFILTER_CHROMA_PS_16xN_AVX2 12
+IPFILTER_CHROMA_PS_16xN_AVX2 4
+IPFILTER_CHROMA_PS_16xN_AVX2 64
+IPFILTER_CHROMA_PS_16xN_AVX2 24
+
+; interp_4tap_horiz_ps_24xN, AVX2, x86-64 only; N is the macro argument %1.
+; Each row is filtered as three groups of eight samples; every group adds
+; INTERP_OFFSET_PS, shifts right by INTERP_SHIFT_PS and stores eight signed
+; 16-bit results (16 bytes).  A non-zero sixth argument makes the function
+; start one source row earlier and process three additional rows.
+%macro IPFILTER_CHROMA_PS_24xN_AVX2 1
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_4tap_horiz_ps_24x%1, 4, 7, 6
+    add                 r1d, r1d                ; source stride doubled
+    add                 r3d, r3d                ; destination stride doubled
+    sub                 r0, 2                   ; begin 2 bytes before src
+    mov                 r4d, r4m                ; coefficient row index
+    mov                 r5d, r5m                ; row-extension flag
+
+    ; m0 = selected 8-byte coefficient row, broadcast
+%ifdef PIC
+    lea                 r6, [tab_ChromaCoeff]
+    vpbroadcastq        m0, [r6 + r4 * 8]
+%else
+    vpbroadcastq        m0, [tab_ChromaCoeff + r4 * 8]
+%endif
+    vbroadcasti128      m2, [INTERP_OFFSET_PS]  ; m2 = rounding offset
+    mova                m3, [h4_interp8_hpp_shuf] ; m3 = source shuffle mask
+
+    mov                 r4d, %1                 ; r4 becomes the row counter
+    test                r5d, r5d
+    jz                  .row
+    sub                 r0, r1                  ; extended: one row up ...
+    add                 r4d, 3                  ; ... and three rows more
+
+.row:
+%assign x 0
+%rep 3
+    ; eight results for byte offset x of the current row
+    vbroadcasti128      m4, [r0 + x]
+    vbroadcasti128      m5, [r0 + x + 8]
+    pshufb              m4, m3
+    pshufb              m5, m3
+    pmaddwd             m4, m0
+    pmaddwd             m5, m0
+    phaddd              m4, m5
+    paddd               m4, m2
+    vpermq              m4, m4, q3120
+    psrad               m4, INTERP_SHIFT_PS
+    vextracti128        xm5, m4, 1
+    packssdw            xm4, xm5
+    movu                [r2 + x], xm4
+%assign x x+16
+%endrep
+
+    add                 r2, r3
+    add                 r0, r1
+    dec                 r4d
+    jnz                 .row
+    RET
+%endif
+%endmacro
+
+IPFILTER_CHROMA_PS_24xN_AVX2 32
+IPFILTER_CHROMA_PS_24xN_AVX2 64
+
+; interp_4tap_horiz_ps_12xN, AVX2, x86-64 only; N is the macro argument %1.
+; Each row is written as eight signed 16-bit results (16 bytes) followed by
+; four more (8 bytes).  Sums get INTERP_OFFSET_PS added, are shifted right by
+; INTERP_SHIFT_PS and are packed with signed saturation (no pixel clipping).
+; Arguments 5 and 6 are fetched through r4m / r5m:
+;   r4 - index of the 8-byte coefficient row in tab_ChromaCoeff
+;   r5 - when non-zero, filtering starts one source row earlier and runs for
+;        three additional rows (presumably the "isRowExt" flag - confirm
+;        against the C prototype)
+%macro IPFILTER_CHROMA_PS_12xN_AVX2 1
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_4tap_horiz_ps_12x%1, 4, 7, 6
+    ; double both strides (presumably element -> byte units for 16-bit samples)
+    add                 r1d, r1d
+    add                 r3d, r3d
+    mov                 r4d, r4m
+    mov                 r5d, r5m
+
+%ifdef PIC
+    lea                 r6, [tab_ChromaCoeff]
+    vpbroadcastq        m0, [r6 + r4 * 8]
+%else
+    vpbroadcastq        m0, [tab_ChromaCoeff + r4 * 8]
+%endif
+    mova                m3, [h4_interp8_hpp_shuf]
+    vbroadcasti128      m2, [INTERP_OFFSET_PS]
+
+    ; register map
+    ; m0      - filter coefficients (broadcast)
+    ; m2      - rounding offset
+    ; m3      - source shuffle mask
+    ; m4, m5  - temporaries (m1 is not used)
+
+    ; start reading 2 bytes (one 16-bit sample) before src
+    sub                 r0, 2
+    test                r5d, r5d
+    mov                 r4d, %1                 ; row counter; mov leaves the flags from test intact
+    jz                  .loop0
+    ; extended mode: back up one source row and process three more rows
+    sub                 r0, r1
+    add                 r4d, 3
+
+.loop0:
+    ; ---- results 0..7 ----
+    vbroadcasti128      m4, [r0]
+    vbroadcasti128      m5, [r0 + 8]
+    pshufb              m4, m3
+    pshufb              m5, m3
+    pmaddwd             m4, m0
+    pmaddwd             m5, m0
+    phaddd              m4, m5
+    paddd               m4, m2
+    vpermq              m4, m4, q3120
+    psrad               m4, INTERP_SHIFT_PS
+    vextracti128        xm5, m4, 1
+    packssdw            xm4, xm5
+    movu                [r2], xm4
+
+    ; ---- results 8..11: a single source vector, horizontally added with
+    ; itself, is enough because only four results are stored (movq) ----
+    vbroadcasti128      m4, [r0 + 16]
+    pshufb              m4, m3
+    pmaddwd             m4, m0
+    phaddd              m4, m4
+    paddd               m4, m2
+    vpermq              m4, m4, q3120
+    psrad               m4, INTERP_SHIFT_PS
+    vextracti128        xm5, m4, 1
+    packssdw            xm4, xm5
+    movq                [r2 + 16], xm4
+
+    add                 r2, r3
+    add                 r0, r1
+    dec                 r4d
+    jnz                 .loop0
+    RET
+%endif
+%endmacro
+
+IPFILTER_CHROMA_PS_12xN_AVX2 16
+IPFILTER_CHROMA_PS_12xN_AVX2 32
+
+; interp_4tap_horiz_ps_32xN, AVX2, x86-64 only; N is the macro argument %1.
+; Each row is filtered as four groups of eight samples; every group adds
+; INTERP_OFFSET_PS, shifts right by INTERP_SHIFT_PS and stores eight signed
+; 16-bit results (16 bytes).  A non-zero sixth argument makes the function
+; start one source row earlier and process three additional rows.
+%macro IPFILTER_CHROMA_PS_32xN_AVX2 1
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_4tap_horiz_ps_32x%1, 4, 7, 6
+    add                 r1d, r1d                ; source stride doubled
+    add                 r3d, r3d                ; destination stride doubled
+    sub                 r0, 2                   ; begin 2 bytes before src
+    mov                 r4d, r4m                ; coefficient row index
+    mov                 r5d, r5m                ; row-extension flag
+
+    ; m0 = selected 8-byte coefficient row, broadcast
+%ifdef PIC
+    lea                 r6, [tab_ChromaCoeff]
+    vpbroadcastq        m0, [r6 + r4 * 8]
+%else
+    vpbroadcastq        m0, [tab_ChromaCoeff + r4 * 8]
+%endif
+    vbroadcasti128      m2, [INTERP_OFFSET_PS]  ; m2 = rounding offset
+    mova                m3, [h4_interp8_hpp_shuf] ; m3 = source shuffle mask
+
+    mov                 r4d, %1                 ; r4 becomes the row counter
+    test                r5d, r5d
+    jz                  .row
+    sub                 r0, r1                  ; extended: one row up ...
+    add                 r4d, 3                  ; ... and three rows more
+
+.row:
+%assign x 0
+%rep 4
+    ; eight results for byte offset x of the current row
+    vbroadcasti128      m4, [r0 + x]
+    vbroadcasti128      m5, [r0 + x + 8]
+    pshufb              m4, m3
+    pshufb              m5, m3
+    pmaddwd             m4, m0
+    pmaddwd             m5, m0
+    phaddd              m4, m5
+    paddd               m4, m2
+    vpermq              m4, m4, q3120
+    psrad               m4, INTERP_SHIFT_PS
+    vextracti128        xm5, m4, 1
+    packssdw            xm4, xm5
+    movu                [r2 + x], xm4
+%assign x x+16
+%endrep
+
+    add                 r2, r3
+    add                 r0, r1
+    dec                 r4d
+    jnz                 .row
+    RET
+%endif
+%endmacro
+
+IPFILTER_CHROMA_PS_32xN_AVX2 32
+IPFILTER_CHROMA_PS_32xN_AVX2 16
+IPFILTER_CHROMA_PS_32xN_AVX2 24
+IPFILTER_CHROMA_PS_32xN_AVX2 8
+IPFILTER_CHROMA_PS_32xN_AVX2 64
+IPFILTER_CHROMA_PS_32xN_AVX2 48
+
+
+; interp_4tap_horiz_ps_64xN, AVX2, x86-64 only; N is the macro argument %1.
+; Each row is filtered as eight groups of eight samples; every group adds
+; INTERP_OFFSET_PS, shifts right by INTERP_SHIFT_PS and stores eight signed
+; 16-bit results (16 bytes).  A non-zero sixth argument makes the function
+; start one source row earlier and process three additional rows.
+%macro IPFILTER_CHROMA_PS_64xN_AVX2 1
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_4tap_horiz_ps_64x%1, 4, 7, 6
+    add                 r1d, r1d                ; source stride doubled
+    add                 r3d, r3d                ; destination stride doubled
+    sub                 r0, 2                   ; begin 2 bytes before src
+    mov                 r4d, r4m                ; coefficient row index
+    mov                 r5d, r5m                ; row-extension flag
+
+    ; m0 = selected 8-byte coefficient row, broadcast
+%ifdef PIC
+    lea                 r6, [tab_ChromaCoeff]
+    vpbroadcastq        m0, [r6 + r4 * 8]
+%else
+    vpbroadcastq        m0, [tab_ChromaCoeff + r4 * 8]
+%endif
+    vbroadcasti128      m2, [INTERP_OFFSET_PS]  ; m2 = rounding offset
+    mova                m3, [h4_interp8_hpp_shuf] ; m3 = source shuffle mask
+
+    mov                 r4d, %1                 ; r4 becomes the row counter
+    test                r5d, r5d
+    jz                  .row
+    sub                 r0, r1                  ; extended: one row up ...
+    add                 r4d, 3                  ; ... and three rows more
+
+.row:
+%assign x 0
+%rep 8
+    ; eight results for byte offset x of the current row
+    vbroadcasti128      m4, [r0 + x]
+    vbroadcasti128      m5, [r0 + x + 8]
+    pshufb              m4, m3
+    pshufb              m5, m3
+    pmaddwd             m4, m0
+    pmaddwd             m5, m0
+    phaddd              m4, m5
+    paddd               m4, m2
+    vpermq              m4, m4, q3120
+    psrad               m4, INTERP_SHIFT_PS
+    vextracti128        xm5, m4, 1
+    packssdw            xm4, xm5
+    movu                [r2 + x], xm4
+%assign x x+16
+%endrep
+
+    add                 r2, r3
+    add                 r0, r1
+    dec                 r4d
+    jnz                 .row
+    RET
+%endif
+%endmacro
+
+IPFILTER_CHROMA_PS_64xN_AVX2 64
+IPFILTER_CHROMA_PS_64xN_AVX2 48
+IPFILTER_CHROMA_PS_64xN_AVX2 32
+IPFILTER_CHROMA_PS_64xN_AVX2 16
+
+; interp_4tap_horiz_ps_48x64, AVX2, x86-64 only.
+; Each of the 64 rows is filtered as six groups of eight samples; every group
+; adds INTERP_OFFSET_PS, shifts right by INTERP_SHIFT_PS and stores eight
+; signed 16-bit results (16 bytes).  A non-zero sixth argument makes the
+; function start one source row earlier and process three additional rows.
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_4tap_horiz_ps_48x64, 4, 7, 6
+    add                 r1d, r1d                ; source stride doubled
+    add                 r3d, r3d                ; destination stride doubled
+    sub                 r0, 2                   ; begin 2 bytes before src
+    mov                 r4d, r4m                ; coefficient row index
+    mov                 r5d, r5m                ; row-extension flag
+
+    ; m0 = selected 8-byte coefficient row, broadcast
+%ifdef PIC
+    lea                 r6, [tab_ChromaCoeff]
+    vpbroadcastq        m0, [r6 + r4 * 8]
+%else
+    vpbroadcastq        m0, [tab_ChromaCoeff + r4 * 8]
+%endif
+    vbroadcasti128      m2, [INTERP_OFFSET_PS]  ; m2 = rounding offset
+    mova                m3, [h4_interp8_hpp_shuf] ; m3 = source shuffle mask
+
+    mov                 r4d, 64                 ; r4 becomes the row counter
+    test                r5d, r5d
+    jz                  .row
+    sub                 r0, r1                  ; extended: one row up ...
+    add                 r4d, 3                  ; ... and three rows more
+
+.row:
+%assign x 0
+%rep 6
+    ; eight results for byte offset x of the current row
+    vbroadcasti128      m4, [r0 + x]
+    vbroadcasti128      m5, [r0 + x + 8]
+    pshufb              m4, m3
+    pshufb              m5, m3
+    pmaddwd             m4, m0
+    pmaddwd             m5, m0
+    phaddd              m4, m5
+    paddd               m4, m2
+    vpermq              m4, m4, q3120
+    psrad               m4, INTERP_SHIFT_PS
+    vextracti128        xm5, m4, 1
+    packssdw            xm4, xm5
+    movu                [r2 + x], xm4
+%assign x x+16
+%endrep
+
+    add                 r2, r3
+    add                 r0, r1
+    dec                 r4d
+    jnz                 .row
+    RET
+%endif
+
+; interp_4tap_horiz_ps_6xN, AVX2, x86-64 only; N is the macro argument %1.
+; Eight results are computed per row but only six signed 16-bit values are
+; written: four with movq and two more with pextrd.  Sums get
+; INTERP_OFFSET_PS added, are shifted right by INTERP_SHIFT_PS and are packed
+; with signed saturation (no pixel clipping).
+; Arguments 5 and 6 are fetched through r4m / r5m:
+;   r4 - index of the 8-byte coefficient row in tab_ChromaCoeff
+;   r5 - when non-zero, filtering starts one source row earlier and runs for
+;        three additional rows (presumably the "isRowExt" flag - confirm
+;        against the C prototype)
+%macro IPFILTER_CHROMA_PS_6xN_AVX2 1
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal interp_4tap_horiz_ps_6x%1, 4, 7, 6
+    ; double both strides (presumably element -> byte units for 16-bit samples)
+    add                 r1d, r1d
+    add                 r3d, r3d
+    mov                 r4d, r4m
+    mov                 r5d, r5m
+
+%ifdef PIC
+    lea                 r6, [tab_ChromaCoeff]
+    vpbroadcastq        m0, [r6 + r4 * 8]
+%else
+    vpbroadcastq        m0, [tab_ChromaCoeff + r4 * 8]
+%endif
+    mova                m3, [h4_interp8_hpp_shuf]
+    vbroadcasti128      m2, [INTERP_OFFSET_PS]
+
+    ; register map
+    ; m0      - filter coefficients (broadcast)
+    ; m2      - rounding offset
+    ; m3      - source shuffle mask
+    ; m4, m5  - temporaries (m1 is not used)
+
+    ; start reading 2 bytes (one 16-bit sample) before src
+    sub                 r0, 2
+    test                r5d, r5d
+    mov                 r4d, %1                 ; row counter; mov leaves the flags from test intact
+    jz                  .loop0
+    ; extended mode: back up one source row and process three more rows
+    sub                 r0, r1
+    add                 r4d, 3
+
+.loop0:
+    vbroadcasti128      m4, [r0]
+    vbroadcasti128      m5, [r0 + 8]
+    pshufb              m4, m3
+    pshufb              m5, m3
+    pmaddwd             m4, m0
+    pmaddwd             m5, m0
+    phaddd              m4, m5
+    paddd               m4, m2
+    vpermq              m4, m4, q3120
+    psrad               m4, INTERP_SHIFT_PS
+    vextracti128        xm5, m4, 1
+    packssdw            xm4, xm5
+    ; words 0..3 (8 bytes), then dword 2 = words 4..5 (4 bytes) at byte offset 8
+    movq                [r2], xm4
+    pextrd              [r2 + 8], xm4, 2
+    add                 r2, r3
+    add                 r0, r1
+    dec                 r4d
+    jnz                 .loop0
+    RET
+%endif
+%endmacro
+
+    IPFILTER_CHROMA_PS_6xN_AVX2 8
+    IPFILTER_CHROMA_PS_6xN_AVX2 16


More information about the x265-devel mailing list