[x265] [PATCH] luma_hpp[8x8, 8x16, 8x32] avx2 asm code: improve 657c->567c, 1192c->1074c, 2602c->2113c
divya at multicorewareinc.com
Tue Nov 11 09:27:19 CET 2014
# HG changeset patch
# User Divya Manivannan
# Date 1415694311 -19800
# Tue Nov 11 13:55:11 2014 +0530
# Node ID 6adafe6ef2868b28b74c66f1ef82cf3cec6bb2a7
# Parent a4c68926ff170d619c26bb78c7a988fa5ad715db
luma_hpp[8x8, 8x16, 8x32] avx2 asm code: improve 657c->567c, 1192c->1074c, 2602c->2113c
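The gain comes from the IPFILTER_LUMA_AVX2 loop now producing four output rows
per iteration instead of two (the loop counter changes from %2 / 2 to %2 / 4 and
moves to r4d), with the two row pairs interleaved via punpcklwd/punpckhwd rather
than a pshufb through the shuf1 table, so shuf1 is dropped from the read-only
data section.

For reference, a minimal scalar sketch (not part of the patch) of what the 8-bit
luma horizontal pp filter computes; the helper names luma_hpp_ref and clipPixel
are illustrative only, and pixel is assumed to be uint8_t:

#include <stdint.h>

/* Same rows as tab_LumaCoeff in ipfilter8.asm */
static const int8_t lumaCoeff[4][8] = {
    {  0, 0,   0, 64,  0,   0, 0,  0 },
    { -1, 4, -10, 58, 17,  -5, 1,  0 },
    { -1, 4, -11, 40, 40, -11, 4, -1 },
    {  0, 1,  -5, 17, 58, -10, 4, -1 },
};

static uint8_t clipPixel(int v) { return v < 0 ? 0 : (v > 255 ? 255 : (uint8_t)v); }

static void luma_hpp_ref(const uint8_t *src, intptr_t srcStride,
                         uint8_t *dst, intptr_t dstStride,
                         int width, int height, int coeffIdx)
{
    const int8_t *c = lumaCoeff[coeffIdx];
    for (int y = 0; y < height; y++)
    {
        for (int x = 0; x < width; x++)
        {
            int sum = 0;
            for (int k = 0; k < 8; k++)           /* 8 taps, window starts at x - 3 */
                sum += c[k] * src[x + k - 3];
            dst[x] = clipPixel((sum + 32) >> 6);  /* pmulhrsw by pw_512 == (sum + 32) >> 6 */
        }
        src += srcStride;
        dst += dstStride;
    }
}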
diff -r a4c68926ff17 -r 6adafe6ef286 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Mon Nov 10 12:28:06 2014 +0530
+++ b/source/common/x86/ipfilter8.asm Tue Nov 11 13:55:11 2014 +0530
@@ -1,862 +1,803 @@
-;*****************************************************************************
-;* Copyright (C) 2013 x265 project
-;*
-;* Authors: Min Chen <chenm003 at 163.com>
-;* Nabajit Deka <nabajit at multicorewareinc.com>
-;* Praveen Kumar Tiwari <praveen at multicorewareinc.com>
-;*
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* (at your option) any later version.
-;*
-;* This program is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-;* GNU General Public License for more details.
-;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
-;*
-;* This program is also available under a commercial proprietary license.
-;* For more information, contact us at license @ x265.com.
-;*****************************************************************************/
-
-%include "x86inc.asm"
-%include "x86util.asm"
-
-SECTION_RODATA 32
-tab_Tm: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
- db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
- db 8, 9,10,11, 9,10,11,12,10,11,12,13,11,12,13, 14
-
-ALIGN 32
-tab_Lm: db 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8
- db 2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10
- db 4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12
- db 6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14
-
-tab_Vm: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
- db 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3
-
-tab_Cm: db 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3
-
-tab_c_526336: times 4 dd 8192*64+2048
-
-tab_ChromaCoeff: db 0, 64, 0, 0
- db -2, 58, 10, -2
- db -4, 54, 16, -2
- db -6, 46, 28, -4
- db -4, 36, 36, -4
- db -4, 28, 46, -6
- db -2, 16, 54, -4
- db -2, 10, 58, -2
-
-tab_ChromaCoeffV: times 4 dw 0, 64
- times 4 dw 0, 0
-
- times 4 dw -2, 58
- times 4 dw 10, -2
-
- times 4 dw -4, 54
- times 4 dw 16, -2
-
- times 4 dw -6, 46
- times 4 dw 28, -4
-
- times 4 dw -4, 36
- times 4 dw 36, -4
-
- times 4 dw -4, 28
- times 4 dw 46, -6
-
- times 4 dw -2, 16
- times 4 dw 54, -4
-
- times 4 dw -2, 10
- times 4 dw 58, -2
-
-tab_LumaCoeff: db 0, 0, 0, 64, 0, 0, 0, 0
- db -1, 4, -10, 58, 17, -5, 1, 0
- db -1, 4, -11, 40, 40, -11, 4, -1
- db 0, 1, -5, 17, 58, -10, 4, -1
-
-tab_LumaCoeffV: times 4 dw 0, 0
- times 4 dw 0, 64
- times 4 dw 0, 0
- times 4 dw 0, 0
-
- times 4 dw -1, 4
- times 4 dw -10, 58
- times 4 dw 17, -5
- times 4 dw 1, 0
-
- times 4 dw -1, 4
- times 4 dw -11, 40
- times 4 dw 40, -11
- times 4 dw 4, -1
-
- times 4 dw 0, 1
- times 4 dw -5, 17
- times 4 dw 58, -10
- times 4 dw 4, -1
-
-tab_LumaCoeffVer: times 8 db 0, 0
- times 8 db 0, 64
- times 8 db 0, 0
- times 8 db 0, 0
-
- times 8 db -1, 4
- times 8 db -10, 58
- times 8 db 17, -5
- times 8 db 1, 0
-
- times 8 db -1, 4
- times 8 db -11, 40
- times 8 db 40, -11
- times 8 db 4, -1
-
- times 8 db 0, 1
- times 8 db -5, 17
- times 8 db 58, -10
- times 8 db 4, -1
-
-tab_c_64_n64: times 8 db 64, -64
-
-shuf1: times 2 db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
-
-SECTION .text
-
-cextern idct4_shuf1
-cextern pb_128
-cextern pw_1
-cextern pw_512
-cextern pw_2000
-
-%macro FILTER_H4_w2_2 3
- movh %2, [srcq - 1]
- pshufb %2, %2, Tm0
- movh %1, [srcq + srcstrideq - 1]
- pshufb %1, %1, Tm0
- punpcklqdq %2, %1
- pmaddubsw %2, coef2
- phaddw %2, %2
- pmulhrsw %2, %3
- packuswb %2, %2
- movd r4, %2
- mov [dstq], r4w
- shr r4, 16
- mov [dstq + dststrideq], r4w
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void interp_4tap_horiz_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal interp_4tap_horiz_pp_2x4, 4, 6, 5, src, srcstride, dst, dststride
-%define coef2 m4
-%define Tm0 m3
-%define t2 m2
-%define t1 m1
-%define t0 m0
-
-mov r4d, r4m
-
-%ifdef PIC
-lea r5, [tab_ChromaCoeff]
-movd coef2, [r5 + r4 * 4]
-%else
-movd coef2, [tab_ChromaCoeff + r4 * 4]
-%endif
-
-pshufd coef2, coef2, 0
-mova t2, [pw_512]
-mova Tm0, [tab_Tm]
-
-%rep 2
-FILTER_H4_w2_2 t0, t1, t2
-lea srcq, [srcq + srcstrideq * 2]
-lea dstq, [dstq + dststrideq * 2]
-%endrep
-
-RET
-
-;-----------------------------------------------------------------------------
-; void interp_4tap_horiz_pp_2x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal interp_4tap_horiz_pp_2x8, 4, 6, 5, src, srcstride, dst, dststride
-%define coef2 m4
-%define Tm0 m3
-%define t2 m2
-%define t1 m1
-%define t0 m0
-
-mov r4d, r4m
-
-%ifdef PIC
-lea r5, [tab_ChromaCoeff]
-movd coef2, [r5 + r4 * 4]
-%else
-movd coef2, [tab_ChromaCoeff + r4 * 4]
-%endif
-
-pshufd coef2, coef2, 0
-mova t2, [pw_512]
-mova Tm0, [tab_Tm]
-
-%rep 4
-FILTER_H4_w2_2 t0, t1, t2
-lea srcq, [srcq + srcstrideq * 2]
-lea dstq, [dstq + dststrideq * 2]
-%endrep
-
-RET
-
-;-----------------------------------------------------------------------------
-; void interp_4tap_horiz_pp_2x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal interp_4tap_horiz_pp_2x16, 4, 6, 5, src, srcstride, dst, dststride
-%define coef2 m4
-%define Tm0 m3
-%define t2 m2
-%define t1 m1
-%define t0 m0
-
-mov r4d, r4m
-
-%ifdef PIC
-lea r5, [tab_ChromaCoeff]
-movd coef2, [r5 + r4 * 4]
-%else
-movd coef2, [tab_ChromaCoeff + r4 * 4]
-%endif
-
-pshufd coef2, coef2, 0
-mova t2, [pw_512]
-mova Tm0, [tab_Tm]
-
-mov r5d, 16/2
-
-.loop:
-FILTER_H4_w2_2 t0, t1, t2
-lea srcq, [srcq + srcstrideq * 2]
-lea dstq, [dstq + dststrideq * 2]
-dec r5d
-jnz .loop
-
-RET
-
-%macro FILTER_H4_w4_2 3
- movh %2, [srcq - 1]
- pshufb %2, %2, Tm0
- pmaddubsw %2, coef2
- movh %1, [srcq + srcstrideq - 1]
- pshufb %1, %1, Tm0
- pmaddubsw %1, coef2
- phaddw %2, %1
- pmulhrsw %2, %3
- packuswb %2, %2
- movd [dstq], %2
- palignr %2, %2, 4
- movd [dstq + dststrideq], %2
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void interp_4tap_horiz_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal interp_4tap_horiz_pp_4x2, 4, 6, 5, src, srcstride, dst, dststride
-%define coef2 m4
-%define Tm0 m3
-%define t2 m2
-%define t1 m1
-%define t0 m0
-
-mov r4d, r4m
-
-%ifdef PIC
-lea r5, [tab_ChromaCoeff]
-movd coef2, [r5 + r4 * 4]
-%else
-movd coef2, [tab_ChromaCoeff + r4 * 4]
-%endif
-
-pshufd coef2, coef2, 0
-mova t2, [pw_512]
-mova Tm0, [tab_Tm]
-
-FILTER_H4_w4_2 t0, t1, t2
-
-RET
-
-;-----------------------------------------------------------------------------
-; void interp_4tap_horiz_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal interp_4tap_horiz_pp_4x4, 4, 6, 5, src, srcstride, dst, dststride
-%define coef2 m4
-%define Tm0 m3
-%define t2 m2
-%define t1 m1
-%define t0 m0
-
-mov r4d, r4m
-
-%ifdef PIC
-lea r5, [tab_ChromaCoeff]
-movd coef2, [r5 + r4 * 4]
-%else
-movd coef2, [tab_ChromaCoeff + r4 * 4]
-%endif
-
-pshufd coef2, coef2, 0
-mova t2, [pw_512]
-mova Tm0, [tab_Tm]
-
-%rep 2
-FILTER_H4_w4_2 t0, t1, t2
-lea srcq, [srcq + srcstrideq * 2]
-lea dstq, [dstq + dststrideq * 2]
-%endrep
-
-RET
-
-;-----------------------------------------------------------------------------
-; void interp_4tap_horiz_pp_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal interp_4tap_horiz_pp_4x8, 4, 6, 5, src, srcstride, dst, dststride
-%define coef2 m4
-%define Tm0 m3
-%define t2 m2
-%define t1 m1
-%define t0 m0
-
-mov r4d, r4m
-
-%ifdef PIC
-lea r5, [tab_ChromaCoeff]
-movd coef2, [r5 + r4 * 4]
-%else
-movd coef2, [tab_ChromaCoeff + r4 * 4]
-%endif
-
-pshufd coef2, coef2, 0
-mova t2, [pw_512]
-mova Tm0, [tab_Tm]
-
-%rep 4
-FILTER_H4_w4_2 t0, t1, t2
-lea srcq, [srcq + srcstrideq * 2]
-lea dstq, [dstq + dststrideq * 2]
-%endrep
-
-RET
-
-;-----------------------------------------------------------------------------
-; void interp_4tap_horiz_pp_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal interp_4tap_horiz_pp_4x16, 4, 6, 5, src, srcstride, dst, dststride
-%define coef2 m4
-%define Tm0 m3
-%define t2 m2
-%define t1 m1
-%define t0 m0
-
-mov r4d, r4m
-
-%ifdef PIC
-lea r5, [tab_ChromaCoeff]
-movd coef2, [r5 + r4 * 4]
-%else
-movd coef2, [tab_ChromaCoeff + r4 * 4]
-%endif
-
-pshufd coef2, coef2, 0
-mova t2, [pw_512]
-mova Tm0, [tab_Tm]
-
-%rep 8
-FILTER_H4_w4_2 t0, t1, t2
-lea srcq, [srcq + srcstrideq * 2]
-lea dstq, [dstq + dststrideq * 2]
-%endrep
-
-RET
-
-;-----------------------------------------------------------------------------
-; void interp_4tap_horiz_pp_4x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal interp_4tap_horiz_pp_4x32, 4, 6, 5, src, srcstride, dst, dststride
-%define coef2 m4
-%define Tm0 m3
-%define t2 m2
-%define t1 m1
-%define t0 m0
-
-mov r4d, r4m
-
-%ifdef PIC
-lea r5, [tab_ChromaCoeff]
-movd coef2, [r5 + r4 * 4]
-%else
-movd coef2, [tab_ChromaCoeff + r4 * 4]
-%endif
-
-pshufd coef2, coef2, 0
-mova t2, [pw_512]
-mova Tm0, [tab_Tm]
-
-mov r5d, 32/2
-
-.loop:
-FILTER_H4_w4_2 t0, t1, t2
-lea srcq, [srcq + srcstrideq * 2]
-lea dstq, [dstq + dststrideq * 2]
-dec r5d
-jnz .loop
-
-RET
-
-
-%macro FILTER_H4_w6 3
- movu %1, [srcq - 1]
- pshufb %2, %1, Tm0
- pmaddubsw %2, coef2
- pshufb %1, %1, Tm1
- pmaddubsw %1, coef2
- phaddw %2, %1
- pmulhrsw %2, %3
- packuswb %2, %2
- movd [dstq], %2
- pextrw [dstq + 4], %2, 2
-%endmacro
-
-%macro FILTER_H4_w8 3
- movu %1, [srcq - 1]
- pshufb %2, %1, Tm0
- pmaddubsw %2, coef2
- pshufb %1, %1, Tm1
- pmaddubsw %1, coef2
- phaddw %2, %1
- pmulhrsw %2, %3
- packuswb %2, %2
- movh [dstq], %2
-%endmacro
-
-%macro FILTER_H4_w12 3
- movu %1, [srcq - 1]
- pshufb %2, %1, Tm0
- pmaddubsw %2, coef2
- pshufb %1, %1, Tm1
- pmaddubsw %1, coef2
- phaddw %2, %1
- pmulhrsw %2, %3
- movu %1, [srcq - 1 + 8]
- pshufb %1, %1, Tm0
- pmaddubsw %1, coef2
- phaddw %1, %1
- pmulhrsw %1, %3
- packuswb %2, %1
- movh [dstq], %2
- pextrd [dstq + 8], %2, 2
-%endmacro
-
-%macro FILTER_H4_w16 4
- movu %1, [srcq - 1]
- pshufb %2, %1, Tm0
- pmaddubsw %2, coef2
- pshufb %1, %1, Tm1
- pmaddubsw %1, coef2
- phaddw %2, %1
- movu %1, [srcq - 1 + 8]
- pshufb %4, %1, Tm0
- pmaddubsw %4, coef2
- pshufb %1, %1, Tm1
- pmaddubsw %1, coef2
- phaddw %4, %1
- pmulhrsw %2, %3
- pmulhrsw %4, %3
- packuswb %2, %4
- movu [dstq], %2
-%endmacro
-
-%macro FILTER_H4_w24 4
- movu %1, [srcq - 1]
- pshufb %2, %1, Tm0
- pmaddubsw %2, coef2
- pshufb %1, %1, Tm1
- pmaddubsw %1, coef2
- phaddw %2, %1
- movu %1, [srcq - 1 + 8]
- pshufb %4, %1, Tm0
- pmaddubsw %4, coef2
- pshufb %1, %1, Tm1
- pmaddubsw %1, coef2
- phaddw %4, %1
- pmulhrsw %2, %3
- pmulhrsw %4, %3
- packuswb %2, %4
- movu [dstq], %2
- movu %1, [srcq - 1 + 16]
- pshufb %2, %1, Tm0
- pmaddubsw %2, coef2
- pshufb %1, %1, Tm1
- pmaddubsw %1, coef2
- phaddw %2, %1
- pmulhrsw %2, %3
- packuswb %2, %2
- movh [dstq + 16], %2
-%endmacro
-
-%macro FILTER_H4_w32 4
- movu %1, [srcq - 1]
- pshufb %2, %1, Tm0
- pmaddubsw %2, coef2
- pshufb %1, %1, Tm1
- pmaddubsw %1, coef2
- phaddw %2, %1
- movu %1, [srcq - 1 + 8]
- pshufb %4, %1, Tm0
- pmaddubsw %4, coef2
- pshufb %1, %1, Tm1
- pmaddubsw %1, coef2
- phaddw %4, %1
- pmulhrsw %2, %3
- pmulhrsw %4, %3
- packuswb %2, %4
- movu [dstq], %2
- movu %1, [srcq - 1 + 16]
- pshufb %2, %1, Tm0
- pmaddubsw %2, coef2
- pshufb %1, %1, Tm1
- pmaddubsw %1, coef2
- phaddw %2, %1
- movu %1, [srcq - 1 + 24]
- pshufb %4, %1, Tm0
- pmaddubsw %4, coef2
- pshufb %1, %1, Tm1
- pmaddubsw %1, coef2
- phaddw %4, %1
- pmulhrsw %2, %3
- pmulhrsw %4, %3
- packuswb %2, %4
- movu [dstq + 16], %2
-%endmacro
-
-%macro FILTER_H4_w16o 5
- movu %1, [srcq + %5 - 1]
- pshufb %2, %1, Tm0
- pmaddubsw %2, coef2
- pshufb %1, %1, Tm1
- pmaddubsw %1, coef2
- phaddw %2, %1
- movu %1, [srcq + %5 - 1 + 8]
- pshufb %4, %1, Tm0
- pmaddubsw %4, coef2
- pshufb %1, %1, Tm1
- pmaddubsw %1, coef2
- phaddw %4, %1
- pmulhrsw %2, %3
- pmulhrsw %4, %3
- packuswb %2, %4
- movu [dstq + %5], %2
-%endmacro
-
-%macro FILTER_H4_w48 4
- FILTER_H4_w16o %1, %2, %3, %4, 0
- FILTER_H4_w16o %1, %2, %3, %4, 16
- FILTER_H4_w16o %1, %2, %3, %4, 32
-%endmacro
-
-%macro FILTER_H4_w64 4
- FILTER_H4_w16o %1, %2, %3, %4, 0
- FILTER_H4_w16o %1, %2, %3, %4, 16
- FILTER_H4_w16o %1, %2, %3, %4, 32
- FILTER_H4_w16o %1, %2, %3, %4, 48
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-%macro IPFILTER_CHROMA 2
-INIT_XMM sse4
-cglobal interp_4tap_horiz_pp_%1x%2, 4, 6, 6, src, srcstride, dst, dststride
-%define coef2 m5
-%define Tm0 m4
-%define Tm1 m3
-%define t2 m2
-%define t1 m1
-%define t0 m0
-
-mov r4d, r4m
-
-%ifdef PIC
-lea r5, [tab_ChromaCoeff]
-movd coef2, [r5 + r4 * 4]
-%else
-movd coef2, [tab_ChromaCoeff + r4 * 4]
-%endif
-
-mov r5d, %2
-
-pshufd coef2, coef2, 0
-mova t2, [pw_512]
-mova Tm0, [tab_Tm]
-mova Tm1, [tab_Tm + 16]
-
-.loop:
-FILTER_H4_w%1 t0, t1, t2
-add srcq, srcstrideq
-add dstq, dststrideq
-
-dec r5d
-jnz .loop
-
-RET
-%endmacro
-
-
-IPFILTER_CHROMA 6, 8
-IPFILTER_CHROMA 8, 2
-IPFILTER_CHROMA 8, 4
-IPFILTER_CHROMA 8, 6
-IPFILTER_CHROMA 8, 8
-IPFILTER_CHROMA 8, 16
-IPFILTER_CHROMA 8, 32
-IPFILTER_CHROMA 12, 16
-
-IPFILTER_CHROMA 6, 16
-IPFILTER_CHROMA 8, 12
-IPFILTER_CHROMA 8, 64
-IPFILTER_CHROMA 12, 32
-
-;-----------------------------------------------------------------------------
-; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-%macro IPFILTER_CHROMA_W 2
-INIT_XMM sse4
-cglobal interp_4tap_horiz_pp_%1x%2, 4, 6, 7, src, srcstride, dst, dststride
-%define coef2 m6
-%define Tm0 m5
-%define Tm1 m4
-%define t3 m3
-%define t2 m2
-%define t1 m1
-%define t0 m0
-
-mov r4d, r4m
-
-%ifdef PIC
-lea r5, [tab_ChromaCoeff]
-movd coef2, [r5 + r4 * 4]
-%else
-movd coef2, [tab_ChromaCoeff + r4 * 4]
-%endif
-
-mov r5d, %2
-
-pshufd coef2, coef2, 0
-mova t2, [pw_512]
-mova Tm0, [tab_Tm]
-mova Tm1, [tab_Tm + 16]
-
-.loop:
-FILTER_H4_w%1 t0, t1, t2, t3
-add srcq, srcstrideq
-add dstq, dststrideq
-
-dec r5d
-jnz .loop
-
-RET
-%endmacro
-
-IPFILTER_CHROMA_W 16, 4
-IPFILTER_CHROMA_W 16, 8
-IPFILTER_CHROMA_W 16, 12
-IPFILTER_CHROMA_W 16, 16
-IPFILTER_CHROMA_W 16, 32
-IPFILTER_CHROMA_W 32, 8
-IPFILTER_CHROMA_W 32, 16
-IPFILTER_CHROMA_W 32, 24
-IPFILTER_CHROMA_W 24, 32
-IPFILTER_CHROMA_W 32, 32
-
-IPFILTER_CHROMA_W 16, 24
-IPFILTER_CHROMA_W 16, 64
-IPFILTER_CHROMA_W 32, 48
-IPFILTER_CHROMA_W 24, 64
-IPFILTER_CHROMA_W 32, 64
-
-IPFILTER_CHROMA_W 64, 64
-IPFILTER_CHROMA_W 64, 32
-IPFILTER_CHROMA_W 64, 48
-IPFILTER_CHROMA_W 48, 64
-IPFILTER_CHROMA_W 64, 16
-
-
-%macro FILTER_H8_W8 7-8 ; t0, t1, t2, t3, coef, c512, src, dst
- movu %1, %7
- pshufb %2, %1, [tab_Lm + 0]
- pmaddubsw %2, %5
- pshufb %3, %1, [tab_Lm + 16]
- pmaddubsw %3, %5
- phaddw %2, %3
- pshufb %4, %1, [tab_Lm + 32]
- pmaddubsw %4, %5
- pshufb %1, %1, [tab_Lm + 48]
- pmaddubsw %1, %5
- phaddw %4, %1
- phaddw %2, %4
- %if %0 == 8
- pmulhrsw %2, %6
- packuswb %2, %2
- movh %8, %2
- %endif
-%endmacro
-
-%macro FILTER_H8_W4 2
- movu %1, [r0 - 3 + r5]
- pshufb %2, %1, [tab_Lm]
- pmaddubsw %2, m3
- pshufb m7, %1, [tab_Lm + 16]
- pmaddubsw m7, m3
- phaddw %2, m7
- phaddw %2, %2
-%endmacro
-
-;----------------------------------------------------------------------------------------------------------------------------
-; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
-;----------------------------------------------------------------------------------------------------------------------------
-%macro IPFILTER_LUMA 3
-INIT_XMM sse4
-cglobal interp_8tap_horiz_%3_%1x%2, 4,7,8
-
- mov r4d, r4m
-
-%ifdef PIC
- lea r6, [tab_LumaCoeff]
- movh m3, [r6 + r4 * 8]
-%else
- movh m3, [tab_LumaCoeff + r4 * 8]
-%endif
- punpcklqdq m3, m3
-
-%ifidn %3, pp
- mova m2, [pw_512]
-%else
- mova m2, [pw_2000]
-%endif
-
- mov r4d, %2
-%ifidn %3, ps
- add r3, r3
- cmp r5m, byte 0
- je .loopH
- lea r6, [r1 + 2 * r1]
- sub r0, r6
- add r4d, 7
-%endif
-
-.loopH:
- xor r5, r5
-%rep %1 / 8
- %ifidn %3, pp
- FILTER_H8_W8 m0, m1, m4, m5, m3, m2, [r0 - 3 + r5], [r2 + r5]
- %else
- FILTER_H8_W8 m0, m1, m4, m5, m3, UNUSED, [r0 - 3 + r5]
- psubw m1, m2
- movu [r2 + 2 * r5], m1
- %endif
- add r5, 8
-%endrep
-
-%rep (%1 % 8) / 4
- FILTER_H8_W4 m0, m1
- %ifidn %3, pp
- pmulhrsw m1, m2
- packuswb m1, m1
- movd [r2 + r5], m1
- %else
- psubw m1, m2
- movh [r2 + 2 * r5], m1
- %endif
-%endrep
-
- add r0, r1
- add r2, r3
-
- dec r4d
- jnz .loopH
- RET
-%endmacro
-
-
-INIT_YMM avx2
-cglobal interp_8tap_horiz_pp_4x4, 4,6,6
- mov r4d, r4m
-
-%ifdef PIC
- lea r5, [tab_LumaCoeff]
- vpbroadcastq m0, [r5 + r4 * 8]
-%else
- vpbroadcastq m0, [tab_LumaCoeff + r4 * 8]
-%endif
-
- mova m1, [tab_Lm]
- vpbroadcastd m2, [pw_1]
-
- ; register map
- ; m0 - interpolate coeff
- ; m1 - shuffle order table
- ; m2 - constant word 1
-
- sub r0, 3
- ; Row 0-1
- vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
- pshufb m3, m1
- pmaddubsw m3, m0
- pmaddwd m3, m2
- vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
- pshufb m4, m1
- pmaddubsw m4, m0
- pmaddwd m4, m2
- phaddd m3, m4 ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A]
-
- ; Row 2-3
- lea r0, [r0 + r1 * 2]
- vbroadcasti128 m4, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
- pshufb m4, m1
- pmaddubsw m4, m0
- pmaddwd m4, m2
- vbroadcasti128 m5, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
- pshufb m5, m1
- pmaddubsw m5, m0
- pmaddwd m5, m2
- phaddd m4, m5 ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A]
-
- packssdw m3, m4 ; WORD [R3D R3C R2D R2C R1D R1C R0D R0C R3B R3A R2B R2A R1B R1A R0B R0A]
- pmulhrsw m3, [pw_512]
- vextracti128 xm4, m3, 1
- packuswb xm3, xm4 ; BYTE [R3D R3C R2D R2C R1D R1C R0D R0C R3B R3A R2B R2A R1B R1A R0B R0A]
- pshufb xm3, [idct4_shuf1] ; [row3 row1 row2 row0]
-
- lea r0, [r3 * 3]
- movd [r2], xm3
- pextrd [r2+r3], xm3, 2
- pextrd [r2+r3*2], xm3, 1
- pextrd [r2+r0], xm3, 3
- RET
-
-%macro IPFILTER_LUMA_AVX2 2
+;*****************************************************************************
+;* Copyright (C) 2013 x265 project
+;*
+;* Authors: Min Chen <chenm003 at 163.com>
+;* Nabajit Deka <nabajit at multicorewareinc.com>
+;* Praveen Kumar Tiwari <praveen at multicorewareinc.com>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+;*
+;* This program is also available under a commercial proprietary license.
+;* For more information, contact us at license @ x265.com.
+;*****************************************************************************/
+
+%include "x86inc.asm"
+%include "x86util.asm"
+
+SECTION_RODATA 32
+tab_Tm: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
+ db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
+ db 8, 9,10,11, 9,10,11,12,10,11,12,13,11,12,13, 14
+
+ALIGN 32
+tab_Lm: db 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8
+ db 2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10
+ db 4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12
+ db 6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14
+
+tab_Vm: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+ db 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3
+
+tab_Cm: db 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3
+
+tab_c_526336: times 4 dd 8192*64+2048
+
+tab_ChromaCoeff: db 0, 64, 0, 0
+ db -2, 58, 10, -2
+ db -4, 54, 16, -2
+ db -6, 46, 28, -4
+ db -4, 36, 36, -4
+ db -4, 28, 46, -6
+ db -2, 16, 54, -4
+ db -2, 10, 58, -2
+
+tab_ChromaCoeffV: times 4 dw 0, 64
+ times 4 dw 0, 0
+
+ times 4 dw -2, 58
+ times 4 dw 10, -2
+
+ times 4 dw -4, 54
+ times 4 dw 16, -2
+
+ times 4 dw -6, 46
+ times 4 dw 28, -4
+
+ times 4 dw -4, 36
+ times 4 dw 36, -4
+
+ times 4 dw -4, 28
+ times 4 dw 46, -6
+
+ times 4 dw -2, 16
+ times 4 dw 54, -4
+
+ times 4 dw -2, 10
+ times 4 dw 58, -2
+
+tab_LumaCoeff: db 0, 0, 0, 64, 0, 0, 0, 0
+ db -1, 4, -10, 58, 17, -5, 1, 0
+ db -1, 4, -11, 40, 40, -11, 4, -1
+ db 0, 1, -5, 17, 58, -10, 4, -1
+
+tab_LumaCoeffV: times 4 dw 0, 0
+ times 4 dw 0, 64
+ times 4 dw 0, 0
+ times 4 dw 0, 0
+
+ times 4 dw -1, 4
+ times 4 dw -10, 58
+ times 4 dw 17, -5
+ times 4 dw 1, 0
+
+ times 4 dw -1, 4
+ times 4 dw -11, 40
+ times 4 dw 40, -11
+ times 4 dw 4, -1
+
+ times 4 dw 0, 1
+ times 4 dw -5, 17
+ times 4 dw 58, -10
+ times 4 dw 4, -1
+
+tab_LumaCoeffVer: times 8 db 0, 0
+ times 8 db 0, 64
+ times 8 db 0, 0
+ times 8 db 0, 0
+
+ times 8 db -1, 4
+ times 8 db -10, 58
+ times 8 db 17, -5
+ times 8 db 1, 0
+
+ times 8 db -1, 4
+ times 8 db -11, 40
+ times 8 db 40, -11
+ times 8 db 4, -1
+
+ times 8 db 0, 1
+ times 8 db -5, 17
+ times 8 db 58, -10
+ times 8 db 4, -1
+
+tab_c_64_n64: times 8 db 64, -64
+
+SECTION .text
+
+cextern idct4_shuf1
+cextern pb_128
+cextern pw_1
+cextern pw_512
+cextern pw_2000
+
+%macro FILTER_H4_w2_2 3
+ movh %2, [srcq - 1]
+ pshufb %2, %2, Tm0
+ movh %1, [srcq + srcstrideq - 1]
+ pshufb %1, %1, Tm0
+ punpcklqdq %2, %1
+ pmaddubsw %2, coef2
+ phaddw %2, %2
+ pmulhrsw %2, %3
+ packuswb %2, %2
+ movd r4, %2
+ mov [dstq], r4w
+ shr r4, 16
+ mov [dstq + dststrideq], r4w
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_horiz_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal interp_4tap_horiz_pp_2x4, 4, 6, 5, src, srcstride, dst, dststride
+%define coef2 m4
+%define Tm0 m3
+%define t2 m2
+%define t1 m1
+%define t0 m0
+
+mov r4d, r4m
+
+%ifdef PIC
+lea r5, [tab_ChromaCoeff]
+movd coef2, [r5 + r4 * 4]
+%else
+movd coef2, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+pshufd coef2, coef2, 0
+mova t2, [pw_512]
+mova Tm0, [tab_Tm]
+
+%rep 2
+FILTER_H4_w2_2 t0, t1, t2
+lea srcq, [srcq + srcstrideq * 2]
+lea dstq, [dstq + dststrideq * 2]
+%endrep
+
+RET
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_horiz_pp_2x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal interp_4tap_horiz_pp_2x8, 4, 6, 5, src, srcstride, dst, dststride
+%define coef2 m4
+%define Tm0 m3
+%define t2 m2
+%define t1 m1
+%define t0 m0
+
+mov r4d, r4m
+
+%ifdef PIC
+lea r5, [tab_ChromaCoeff]
+movd coef2, [r5 + r4 * 4]
+%else
+movd coef2, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+pshufd coef2, coef2, 0
+mova t2, [pw_512]
+mova Tm0, [tab_Tm]
+
+%rep 4
+FILTER_H4_w2_2 t0, t1, t2
+lea srcq, [srcq + srcstrideq * 2]
+lea dstq, [dstq + dststrideq * 2]
+%endrep
+
+RET
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_horiz_pp_2x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal interp_4tap_horiz_pp_2x16, 4, 6, 5, src, srcstride, dst, dststride
+%define coef2 m4
+%define Tm0 m3
+%define t2 m2
+%define t1 m1
+%define t0 m0
+
+mov r4d, r4m
+
+%ifdef PIC
+lea r5, [tab_ChromaCoeff]
+movd coef2, [r5 + r4 * 4]
+%else
+movd coef2, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+pshufd coef2, coef2, 0
+mova t2, [pw_512]
+mova Tm0, [tab_Tm]
+
+mov r5d, 16/2
+
+.loop:
+FILTER_H4_w2_2 t0, t1, t2
+lea srcq, [srcq + srcstrideq * 2]
+lea dstq, [dstq + dststrideq * 2]
+dec r5d
+jnz .loop
+
+RET
+
+%macro FILTER_H4_w4_2 3
+ movh %2, [srcq - 1]
+ pshufb %2, %2, Tm0
+ pmaddubsw %2, coef2
+ movh %1, [srcq + srcstrideq - 1]
+ pshufb %1, %1, Tm0
+ pmaddubsw %1, coef2
+ phaddw %2, %1
+ pmulhrsw %2, %3
+ packuswb %2, %2
+ movd [dstq], %2
+ palignr %2, %2, 4
+ movd [dstq + dststrideq], %2
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_horiz_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal interp_4tap_horiz_pp_4x2, 4, 6, 5, src, srcstride, dst, dststride
+%define coef2 m4
+%define Tm0 m3
+%define t2 m2
+%define t1 m1
+%define t0 m0
+
+mov r4d, r4m
+
+%ifdef PIC
+lea r5, [tab_ChromaCoeff]
+movd coef2, [r5 + r4 * 4]
+%else
+movd coef2, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+pshufd coef2, coef2, 0
+mova t2, [pw_512]
+mova Tm0, [tab_Tm]
+
+FILTER_H4_w4_2 t0, t1, t2
+
+RET
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_horiz_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal interp_4tap_horiz_pp_4x4, 4, 6, 5, src, srcstride, dst, dststride
+%define coef2 m4
+%define Tm0 m3
+%define t2 m2
+%define t1 m1
+%define t0 m0
+
+mov r4d, r4m
+
+%ifdef PIC
+lea r5, [tab_ChromaCoeff]
+movd coef2, [r5 + r4 * 4]
+%else
+movd coef2, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+pshufd coef2, coef2, 0
+mova t2, [pw_512]
+mova Tm0, [tab_Tm]
+
+%rep 2
+FILTER_H4_w4_2 t0, t1, t2
+lea srcq, [srcq + srcstrideq * 2]
+lea dstq, [dstq + dststrideq * 2]
+%endrep
+
+RET
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_horiz_pp_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal interp_4tap_horiz_pp_4x8, 4, 6, 5, src, srcstride, dst, dststride
+%define coef2 m4
+%define Tm0 m3
+%define t2 m2
+%define t1 m1
+%define t0 m0
+
+mov r4d, r4m
+
+%ifdef PIC
+lea r5, [tab_ChromaCoeff]
+movd coef2, [r5 + r4 * 4]
+%else
+movd coef2, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+pshufd coef2, coef2, 0
+mova t2, [pw_512]
+mova Tm0, [tab_Tm]
+
+%rep 4
+FILTER_H4_w4_2 t0, t1, t2
+lea srcq, [srcq + srcstrideq * 2]
+lea dstq, [dstq + dststrideq * 2]
+%endrep
+
+RET
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_horiz_pp_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal interp_4tap_horiz_pp_4x16, 4, 6, 5, src, srcstride, dst, dststride
+%define coef2 m4
+%define Tm0 m3
+%define t2 m2
+%define t1 m1
+%define t0 m0
+
+mov r4d, r4m
+
+%ifdef PIC
+lea r5, [tab_ChromaCoeff]
+movd coef2, [r5 + r4 * 4]
+%else
+movd coef2, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+pshufd coef2, coef2, 0
+mova t2, [pw_512]
+mova Tm0, [tab_Tm]
+
+%rep 8
+FILTER_H4_w4_2 t0, t1, t2
+lea srcq, [srcq + srcstrideq * 2]
+lea dstq, [dstq + dststrideq * 2]
+%endrep
+
+RET
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_horiz_pp_4x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal interp_4tap_horiz_pp_4x32, 4, 6, 5, src, srcstride, dst, dststride
+%define coef2 m4
+%define Tm0 m3
+%define t2 m2
+%define t1 m1
+%define t0 m0
+
+mov r4d, r4m
+
+%ifdef PIC
+lea r5, [tab_ChromaCoeff]
+movd coef2, [r5 + r4 * 4]
+%else
+movd coef2, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+pshufd coef2, coef2, 0
+mova t2, [pw_512]
+mova Tm0, [tab_Tm]
+
+mov r5d, 32/2
+
+.loop:
+FILTER_H4_w4_2 t0, t1, t2
+lea srcq, [srcq + srcstrideq * 2]
+lea dstq, [dstq + dststrideq * 2]
+dec r5d
+jnz .loop
+
+RET
+
+
+%macro FILTER_H4_w6 3
+ movu %1, [srcq - 1]
+ pshufb %2, %1, Tm0
+ pmaddubsw %2, coef2
+ pshufb %1, %1, Tm1
+ pmaddubsw %1, coef2
+ phaddw %2, %1
+ pmulhrsw %2, %3
+ packuswb %2, %2
+ movd [dstq], %2
+ pextrw [dstq + 4], %2, 2
+%endmacro
+
+%macro FILTER_H4_w8 3
+ movu %1, [srcq - 1]
+ pshufb %2, %1, Tm0
+ pmaddubsw %2, coef2
+ pshufb %1, %1, Tm1
+ pmaddubsw %1, coef2
+ phaddw %2, %1
+ pmulhrsw %2, %3
+ packuswb %2, %2
+ movh [dstq], %2
+%endmacro
+
+%macro FILTER_H4_w12 3
+ movu %1, [srcq - 1]
+ pshufb %2, %1, Tm0
+ pmaddubsw %2, coef2
+ pshufb %1, %1, Tm1
+ pmaddubsw %1, coef2
+ phaddw %2, %1
+ pmulhrsw %2, %3
+ movu %1, [srcq - 1 + 8]
+ pshufb %1, %1, Tm0
+ pmaddubsw %1, coef2
+ phaddw %1, %1
+ pmulhrsw %1, %3
+ packuswb %2, %1
+ movh [dstq], %2
+ pextrd [dstq + 8], %2, 2
+%endmacro
+
+%macro FILTER_H4_w16 4
+ movu %1, [srcq - 1]
+ pshufb %2, %1, Tm0
+ pmaddubsw %2, coef2
+ pshufb %1, %1, Tm1
+ pmaddubsw %1, coef2
+ phaddw %2, %1
+ movu %1, [srcq - 1 + 8]
+ pshufb %4, %1, Tm0
+ pmaddubsw %4, coef2
+ pshufb %1, %1, Tm1
+ pmaddubsw %1, coef2
+ phaddw %4, %1
+ pmulhrsw %2, %3
+ pmulhrsw %4, %3
+ packuswb %2, %4
+ movu [dstq], %2
+%endmacro
+
+%macro FILTER_H4_w24 4
+ movu %1, [srcq - 1]
+ pshufb %2, %1, Tm0
+ pmaddubsw %2, coef2
+ pshufb %1, %1, Tm1
+ pmaddubsw %1, coef2
+ phaddw %2, %1
+ movu %1, [srcq - 1 + 8]
+ pshufb %4, %1, Tm0
+ pmaddubsw %4, coef2
+ pshufb %1, %1, Tm1
+ pmaddubsw %1, coef2
+ phaddw %4, %1
+ pmulhrsw %2, %3
+ pmulhrsw %4, %3
+ packuswb %2, %4
+ movu [dstq], %2
+ movu %1, [srcq - 1 + 16]
+ pshufb %2, %1, Tm0
+ pmaddubsw %2, coef2
+ pshufb %1, %1, Tm1
+ pmaddubsw %1, coef2
+ phaddw %2, %1
+ pmulhrsw %2, %3
+ packuswb %2, %2
+ movh [dstq + 16], %2
+%endmacro
+
+%macro FILTER_H4_w32 4
+ movu %1, [srcq - 1]
+ pshufb %2, %1, Tm0
+ pmaddubsw %2, coef2
+ pshufb %1, %1, Tm1
+ pmaddubsw %1, coef2
+ phaddw %2, %1
+ movu %1, [srcq - 1 + 8]
+ pshufb %4, %1, Tm0
+ pmaddubsw %4, coef2
+ pshufb %1, %1, Tm1
+ pmaddubsw %1, coef2
+ phaddw %4, %1
+ pmulhrsw %2, %3
+ pmulhrsw %4, %3
+ packuswb %2, %4
+ movu [dstq], %2
+ movu %1, [srcq - 1 + 16]
+ pshufb %2, %1, Tm0
+ pmaddubsw %2, coef2
+ pshufb %1, %1, Tm1
+ pmaddubsw %1, coef2
+ phaddw %2, %1
+ movu %1, [srcq - 1 + 24]
+ pshufb %4, %1, Tm0
+ pmaddubsw %4, coef2
+ pshufb %1, %1, Tm1
+ pmaddubsw %1, coef2
+ phaddw %4, %1
+ pmulhrsw %2, %3
+ pmulhrsw %4, %3
+ packuswb %2, %4
+ movu [dstq + 16], %2
+%endmacro
+
+%macro FILTER_H4_w16o 5
+ movu %1, [srcq + %5 - 1]
+ pshufb %2, %1, Tm0
+ pmaddubsw %2, coef2
+ pshufb %1, %1, Tm1
+ pmaddubsw %1, coef2
+ phaddw %2, %1
+ movu %1, [srcq + %5 - 1 + 8]
+ pshufb %4, %1, Tm0
+ pmaddubsw %4, coef2
+ pshufb %1, %1, Tm1
+ pmaddubsw %1, coef2
+ phaddw %4, %1
+ pmulhrsw %2, %3
+ pmulhrsw %4, %3
+ packuswb %2, %4
+ movu [dstq + %5], %2
+%endmacro
+
+%macro FILTER_H4_w48 4
+ FILTER_H4_w16o %1, %2, %3, %4, 0
+ FILTER_H4_w16o %1, %2, %3, %4, 16
+ FILTER_H4_w16o %1, %2, %3, %4, 32
+%endmacro
+
+%macro FILTER_H4_w64 4
+ FILTER_H4_w16o %1, %2, %3, %4, 0
+ FILTER_H4_w16o %1, %2, %3, %4, 16
+ FILTER_H4_w16o %1, %2, %3, %4, 32
+ FILTER_H4_w16o %1, %2, %3, %4, 48
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro IPFILTER_CHROMA 2
+INIT_XMM sse4
+cglobal interp_4tap_horiz_pp_%1x%2, 4, 6, 6, src, srcstride, dst, dststride
+%define coef2 m5
+%define Tm0 m4
+%define Tm1 m3
+%define t2 m2
+%define t1 m1
+%define t0 m0
+
+mov r4d, r4m
+
+%ifdef PIC
+lea r5, [tab_ChromaCoeff]
+movd coef2, [r5 + r4 * 4]
+%else
+movd coef2, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+mov r5d, %2
+
+pshufd coef2, coef2, 0
+mova t2, [pw_512]
+mova Tm0, [tab_Tm]
+mova Tm1, [tab_Tm + 16]
+
+.loop:
+FILTER_H4_w%1 t0, t1, t2
+add srcq, srcstrideq
+add dstq, dststrideq
+
+dec r5d
+jnz .loop
+
+RET
+%endmacro
+
+
+IPFILTER_CHROMA 6, 8
+IPFILTER_CHROMA 8, 2
+IPFILTER_CHROMA 8, 4
+IPFILTER_CHROMA 8, 6
+IPFILTER_CHROMA 8, 8
+IPFILTER_CHROMA 8, 16
+IPFILTER_CHROMA 8, 32
+IPFILTER_CHROMA 12, 16
+
+IPFILTER_CHROMA 6, 16
+IPFILTER_CHROMA 8, 12
+IPFILTER_CHROMA 8, 64
+IPFILTER_CHROMA 12, 32
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro IPFILTER_CHROMA_W 2
+INIT_XMM sse4
+cglobal interp_4tap_horiz_pp_%1x%2, 4, 6, 7, src, srcstride, dst, dststride
+%define coef2 m6
+%define Tm0 m5
+%define Tm1 m4
+%define t3 m3
+%define t2 m2
+%define t1 m1
+%define t0 m0
+
+mov r4d, r4m
+
+%ifdef PIC
+lea r5, [tab_ChromaCoeff]
+movd coef2, [r5 + r4 * 4]
+%else
+movd coef2, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+mov r5d, %2
+
+pshufd coef2, coef2, 0
+mova t2, [pw_512]
+mova Tm0, [tab_Tm]
+mova Tm1, [tab_Tm + 16]
+
+.loop:
+FILTER_H4_w%1 t0, t1, t2, t3
+add srcq, srcstrideq
+add dstq, dststrideq
+
+dec r5d
+jnz .loop
+
+RET
+%endmacro
+
+IPFILTER_CHROMA_W 16, 4
+IPFILTER_CHROMA_W 16, 8
+IPFILTER_CHROMA_W 16, 12
+IPFILTER_CHROMA_W 16, 16
+IPFILTER_CHROMA_W 16, 32
+IPFILTER_CHROMA_W 32, 8
+IPFILTER_CHROMA_W 32, 16
+IPFILTER_CHROMA_W 32, 24
+IPFILTER_CHROMA_W 24, 32
+IPFILTER_CHROMA_W 32, 32
+
+IPFILTER_CHROMA_W 16, 24
+IPFILTER_CHROMA_W 16, 64
+IPFILTER_CHROMA_W 32, 48
+IPFILTER_CHROMA_W 24, 64
+IPFILTER_CHROMA_W 32, 64
+
+IPFILTER_CHROMA_W 64, 64
+IPFILTER_CHROMA_W 64, 32
+IPFILTER_CHROMA_W 64, 48
+IPFILTER_CHROMA_W 48, 64
+IPFILTER_CHROMA_W 64, 16
+
+
+%macro FILTER_H8_W8 7-8 ; t0, t1, t2, t3, coef, c512, src, dst
+ movu %1, %7
+ pshufb %2, %1, [tab_Lm + 0]
+ pmaddubsw %2, %5
+ pshufb %3, %1, [tab_Lm + 16]
+ pmaddubsw %3, %5
+ phaddw %2, %3
+ pshufb %4, %1, [tab_Lm + 32]
+ pmaddubsw %4, %5
+ pshufb %1, %1, [tab_Lm + 48]
+ pmaddubsw %1, %5
+ phaddw %4, %1
+ phaddw %2, %4
+ %if %0 == 8
+ pmulhrsw %2, %6
+ packuswb %2, %2
+ movh %8, %2
+ %endif
+%endmacro
+
+%macro FILTER_H8_W4 2
+ movu %1, [r0 - 3 + r5]
+ pshufb %2, %1, [tab_Lm]
+ pmaddubsw %2, m3
+ pshufb m7, %1, [tab_Lm + 16]
+ pmaddubsw m7, m3
+ phaddw %2, m7
+ phaddw %2, %2
+%endmacro
+
+;----------------------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+;----------------------------------------------------------------------------------------------------------------------------
+%macro IPFILTER_LUMA 3
+INIT_XMM sse4
+cglobal interp_8tap_horiz_%3_%1x%2, 4,7,8
+
+ mov r4d, r4m
+
+%ifdef PIC
+ lea r6, [tab_LumaCoeff]
+ movh m3, [r6 + r4 * 8]
+%else
+ movh m3, [tab_LumaCoeff + r4 * 8]
+%endif
+ punpcklqdq m3, m3
+
+%ifidn %3, pp
+ mova m2, [pw_512]
+%else
+ mova m2, [pw_2000]
+%endif
+
+ mov r4d, %2
+%ifidn %3, ps
+ add r3, r3
+ cmp r5m, byte 0
+ je .loopH
+ lea r6, [r1 + 2 * r1]
+ sub r0, r6
+ add r4d, 7
+%endif
+
+.loopH:
+ xor r5, r5
+%rep %1 / 8
+ %ifidn %3, pp
+ FILTER_H8_W8 m0, m1, m4, m5, m3, m2, [r0 - 3 + r5], [r2 + r5]
+ %else
+ FILTER_H8_W8 m0, m1, m4, m5, m3, UNUSED, [r0 - 3 + r5]
+ psubw m1, m2
+ movu [r2 + 2 * r5], m1
+ %endif
+ add r5, 8
+%endrep
+
+%rep (%1 % 8) / 4
+ FILTER_H8_W4 m0, m1
+ %ifidn %3, pp
+ pmulhrsw m1, m2
+ packuswb m1, m1
+ movd [r2 + r5], m1
+ %else
+ psubw m1, m2
+ movh [r2 + 2 * r5], m1
+ %endif
+%endrep
+
+ add r0, r1
+ add r2, r3
+
+ dec r4d
+ jnz .loopH
+ RET
+%endmacro
+
+
INIT_YMM avx2
-cglobal interp_8tap_horiz_pp_%1x%2, 4,6,6
+cglobal interp_8tap_horiz_pp_4x4, 4,6,6
mov r4d, r4m
%ifdef PIC
@@ -867,6 +808,63 @@
%endif
mova m1, [tab_Lm]
+ vpbroadcastd m2, [pw_1]
+
+ ; register map
+ ; m0 - interpolate coeff
+ ; m1 - shuffle order table
+ ; m2 - constant word 1
+
+ sub r0, 3
+ ; Row 0-1
+ vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m3, m1
+ pmaddubsw m3, m0
+ pmaddwd m3, m2
+ vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m4, m1
+ pmaddubsw m4, m0
+ pmaddwd m4, m2
+ phaddd m3, m4 ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A]
+
+ ; Row 2-3
+ lea r0, [r0 + r1 * 2]
+ vbroadcasti128 m4, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m4, m1
+ pmaddubsw m4, m0
+ pmaddwd m4, m2
+ vbroadcasti128 m5, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m5, m1
+ pmaddubsw m5, m0
+ pmaddwd m5, m2
+ phaddd m4, m5 ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A]
+
+ packssdw m3, m4 ; WORD [R3D R3C R2D R2C R1D R1C R0D R0C R3B R3A R2B R2A R1B R1A R0B R0A]
+ pmulhrsw m3, [pw_512]
+ vextracti128 xm4, m3, 1
+ packuswb xm3, xm4 ; BYTE [R3D R3C R2D R2C R1D R1C R0D R0C R3B R3A R2B R2A R1B R1A R0B R0A]
+ pshufb xm3, [idct4_shuf1] ; [row3 row1 row2 row0]
+
+ lea r0, [r3 * 3]
+ movd [r2], xm3
+ pextrd [r2+r3], xm3, 2
+ pextrd [r2+r3*2], xm3, 1
+ pextrd [r2+r0], xm3, 3
+ RET
+
+%macro IPFILTER_LUMA_AVX2 2
+INIT_YMM avx2
+cglobal interp_8tap_horiz_pp_%1x%2, 4,6,7
+ mov r4d, r4m
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeff]
+ vpbroadcastq m0, [r5 + r4 * 8]
+%else
+ vpbroadcastq m0, [tab_LumaCoeff + r4 * 8]
+%endif
+
+ mova m1, [tab_Lm]
mova m2, [tab_Lm + 32]
; register map
@@ -874,7 +872,7 @@
; m1, m2 - shuffle order table
sub r0, 3
- mov r4, %2 / 2
+ mov r4d, %2 / 4
.loop:
; Row 0
vbroadcasti128 m3, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
@@ -893,4844 +891,4870 @@
phaddw m3, m4 ; WORD [R1H R1G R1D R1C R0H R0G R0D R0C R1F R1E R1B R1A R0F R0E R0B R0A]
pmulhrsw m3, [pw_512]
+
+ ; Row 2
+ lea r0, [r0 + r1 * 2]
+ vbroadcasti128 m4, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m5, m4, m2
+ pshufb m4, m1
+ pmaddubsw m4, m0
+ pmaddubsw m5, m0
+ phaddw m4, m5
+ ; Row 3
+ vbroadcasti128 m5, [r0 + r1] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m6, m5, m2
+ pshufb m5, m1
+ pmaddubsw m5, m0
+ pmaddubsw m6, m0
+ phaddw m5, m6
+
+ phaddw m4, m5 ; WORD [R3H R3G R3D R3C R2H R2G R2D R2C R3F R3E R3B R3A R2F R2E R2B R2A]
+ pmulhrsw m4, [pw_512]
+
+ packuswb m3, m4
vextracti128 xm4, m3, 1
- packuswb xm3, xm4
- pshufb xm3, [shuf1]
-
- movq [r2], xm3
- movhps [r2 + r3], xm3
-
+ punpcklwd xm5, xm3, xm4
+
+ movq [r2], xm5
+ movhps [r2 + r3], xm5
+
+ punpckhwd xm5, xm3, xm4
lea r2, [r2 + r3 * 2]
+
+ movq [r2], xm5
+ movhps [r2 + r3], xm5
+
lea r0, [r0 + r1 * 2]
- dec r4
+ lea r2, [r2 + r3 * 2]
+ dec r4d
jnz .loop
RET
%endmacro
-
-;--------------------------------------------------------------------------------------------------------------
-; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;--------------------------------------------------------------------------------------------------------------
- IPFILTER_LUMA 4, 4, pp
- IPFILTER_LUMA 4, 8, pp
- IPFILTER_LUMA 12, 16, pp
- IPFILTER_LUMA 4, 16, pp
+
+;--------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;--------------------------------------------------------------------------------------------------------------
+ IPFILTER_LUMA 4, 4, pp
+ IPFILTER_LUMA 4, 8, pp
+ IPFILTER_LUMA 12, 16, pp
+ IPFILTER_LUMA 4, 16, pp
IPFILTER_LUMA_AVX2 8, 8
IPFILTER_LUMA_AVX2 8, 16
IPFILTER_LUMA_AVX2 8, 32
-
-;--------------------------------------------------------------------------------------------------------------
-; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;--------------------------------------------------------------------------------------------------------------
-%macro IPFILTER_LUMA_PP_W8 2
-INIT_XMM sse4
-cglobal interp_8tap_horiz_pp_%1x%2, 4,6,7
- mov r4d, r4m
-
-%ifdef PIC
- lea r5, [tab_LumaCoeff]
- movh m3, [r5 + r4 * 8]
-%else
- movh m3, [tab_LumaCoeff + r4 * 8]
-%endif
- pshufd m0, m3, 0 ; m0 = coeff-L
- pshufd m1, m3, 0x55 ; m1 = coeff-H
- lea r5, [tab_Tm] ; r5 = shuffle
- mova m2, [pw_512] ; m2 = 512
-
- mov r4d, %2
-.loopH:
-%assign x 0
-%rep %1 / 8
- movu m3, [r0 - 3 + x] ; m3 = [F E D C B A 9 8 7 6 5 4 3 2 1 0]
- pshufb m4, m3, [r5 + 0*16] ; m4 = [6 5 4 3 5 4 3 2 4 3 2 1 3 2 1 0]
- pshufb m5, m3, [r5 + 1*16] ; m5 = [A 9 8 7 9 8 7 6 8 7 6 5 7 6 5 4]
- pshufb m3, [r5 + 2*16] ; m3 = [E D C B D C B A C B A 9 B A 9 8]
- pmaddubsw m4, m0
- pmaddubsw m6, m5, m1
- pmaddubsw m5, m0
- pmaddubsw m3, m1
- paddw m4, m6
- paddw m5, m3
- phaddw m4, m5
- pmulhrsw m4, m2
- packuswb m4, m4
- movh [r2 + x], m4
-%assign x x+8
-%endrep
-
- add r0, r1
- add r2, r3
-
- dec r4d
- jnz .loopH
- RET
-%endmacro
-
-IPFILTER_LUMA_PP_W8 8, 4
-IPFILTER_LUMA_PP_W8 8, 8
-IPFILTER_LUMA_PP_W8 8, 16
-IPFILTER_LUMA_PP_W8 8, 32
-IPFILTER_LUMA_PP_W8 16, 4
-IPFILTER_LUMA_PP_W8 16, 8
-IPFILTER_LUMA_PP_W8 16, 12
-IPFILTER_LUMA_PP_W8 16, 16
-IPFILTER_LUMA_PP_W8 16, 32
-IPFILTER_LUMA_PP_W8 16, 64
-IPFILTER_LUMA_PP_W8 24, 32
-IPFILTER_LUMA_PP_W8 32, 8
-IPFILTER_LUMA_PP_W8 32, 16
-IPFILTER_LUMA_PP_W8 32, 24
-IPFILTER_LUMA_PP_W8 32, 32
-IPFILTER_LUMA_PP_W8 32, 64
-IPFILTER_LUMA_PP_W8 48, 64
-IPFILTER_LUMA_PP_W8 64, 16
-IPFILTER_LUMA_PP_W8 64, 32
-IPFILTER_LUMA_PP_W8 64, 48
-IPFILTER_LUMA_PP_W8 64, 64
-
-;----------------------------------------------------------------------------------------------------------------------------
-; void interp_8tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
-;----------------------------------------------------------------------------------------------------------------------------
- IPFILTER_LUMA 4, 4, ps
- IPFILTER_LUMA 8, 8, ps
- IPFILTER_LUMA 8, 4, ps
- IPFILTER_LUMA 4, 8, ps
- IPFILTER_LUMA 16, 16, ps
- IPFILTER_LUMA 16, 8, ps
- IPFILTER_LUMA 8, 16, ps
- IPFILTER_LUMA 16, 12, ps
- IPFILTER_LUMA 12, 16, ps
- IPFILTER_LUMA 16, 4, ps
- IPFILTER_LUMA 4, 16, ps
- IPFILTER_LUMA 32, 32, ps
- IPFILTER_LUMA 32, 16, ps
- IPFILTER_LUMA 16, 32, ps
- IPFILTER_LUMA 32, 24, ps
- IPFILTER_LUMA 24, 32, ps
- IPFILTER_LUMA 32, 8, ps
- IPFILTER_LUMA 8, 32, ps
- IPFILTER_LUMA 64, 64, ps
- IPFILTER_LUMA 64, 32, ps
- IPFILTER_LUMA 32, 64, ps
- IPFILTER_LUMA 64, 48, ps
- IPFILTER_LUMA 48, 64, ps
- IPFILTER_LUMA 64, 16, ps
- IPFILTER_LUMA 16, 64, ps
-
-;-----------------------------------------------------------------------------
-; Interpolate HV
-;-----------------------------------------------------------------------------
-%macro FILTER_HV8_START 7 ; (t0, t1, t2, t3, t4, off_src, off_coeff) -> (t3, t5), (t4, t1), [2]
- mova %5, [r0 + (%6 + 0) * 16]
- mova %1, [r0 + (%6 + 1) * 16]
- mova %2, [r0 + (%6 + 2) * 16]
- punpcklwd %3, %5, %1
- punpckhwd %5, %1
- pmaddwd %3, [r5 + (%7) * 16] ; R3 = L[0+1] -- Row 0
- pmaddwd %5, [r5 + (%7) * 16] ; R0 = H[0+1]
- punpcklwd %4, %1, %2
- punpckhwd %1, %2
- pmaddwd %4, [r5 + (%7) * 16] ; R4 = L[1+2] -- Row 1
- pmaddwd %1, [r5 + (%7) * 16] ; R1 = H[1+2]
-%endmacro ; FILTER_HV8_START
-
-%macro FILTER_HV8_MID 10 ; (Row3, prevRow, sum0L, sum1L, sum0H, sum1H, t6, t7, off_src, off_coeff) -> [6]
- mova %8, [r0 + (%9 + 0) * 16]
- mova %1, [r0 + (%9 + 1) * 16]
- punpcklwd %7, %2, %8
- punpckhwd %2, %8
- pmaddwd %7, [r5 + %10 * 16]
- pmaddwd %2, [r5 + %10 * 16]
- paddd %3, %7 ; R3 = L[0+1+2+3] -- Row 0
- paddd %5, %2 ; R0 = H[0+1+2+3]
- punpcklwd %7, %8, %1
- punpckhwd %8, %1
- pmaddwd %7, [r5 + %10 * 16]
- pmaddwd %8, [r5 + %10 * 16]
- paddd %4, %7 ; R4 = L[1+2+3+4] -- Row 1
- paddd %6, %8 ; R1 = H[1+2+3+4]
-%endmacro ; FILTER_HV8_MID
-
-; Round and Saturate
-%macro FILTER_HV8_END 4 ; output in [1, 3]
- paddd %1, [tab_c_526336]
- paddd %2, [tab_c_526336]
- paddd %3, [tab_c_526336]
- paddd %4, [tab_c_526336]
- psrad %1, 12
- psrad %2, 12
- psrad %3, 12
- psrad %4, 12
- packssdw %1, %2
- packssdw %3, %4
-
- ; TODO: is merge better? I think this way is short dependency link
- packuswb %1, %3
-%endmacro ; FILTER_HV8_END
-
-;-----------------------------------------------------------------------------
-; void interp_8tap_hv_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY)
-;-----------------------------------------------------------------------------
-INIT_XMM ssse3
-cglobal interp_8tap_hv_pp_8x8, 4, 7, 8, 0-15*16
-%define coef m7
-%define stk_buf rsp
-
- mov r4d, r4m
- mov r5d, r5m
-
-%ifdef PIC
- lea r6, [tab_LumaCoeff]
- movh coef, [r6 + r4 * 8]
-%else
- movh coef, [tab_LumaCoeff + r4 * 8]
-%endif
- punpcklqdq coef, coef
-
- ; move to row -3
- lea r6, [r1 + r1 * 2]
- sub r0, r6
-
- xor r6, r6
- mov r4, rsp
-
-.loopH:
- FILTER_H8_W8 m0, m1, m2, m3, coef, [pw_512], [r0 - 3]
- psubw m1, [pw_2000]
- mova [r4], m1
-
- add r0, r1
- add r4, 16
- inc r6
- cmp r6, 8+7
- jnz .loopH
-
- ; ready to phase V
- ; Here all of mN is free
-
- ; load coeff table
- shl r5, 6
- lea r6, [tab_LumaCoeffV]
- lea r5, [r5 + r6]
-
- ; load intermedia buffer
- mov r0, stk_buf
-
- ; register mapping
- ; r0 - src
- ; r5 - coeff
- ; r6 - loop_i
-
- ; let's go
- xor r6, r6
-
- ; TODO: this loop have more than 70 instructions, I think it is more than Intel loop decode cache
-.loopV:
-
- FILTER_HV8_START m1, m2, m3, m4, m0, 0, 0
- FILTER_HV8_MID m6, m2, m3, m4, m0, m1, m7, m5, 3, 1
- FILTER_HV8_MID m5, m6, m3, m4, m0, m1, m7, m2, 5, 2
- FILTER_HV8_MID m6, m5, m3, m4, m0, m1, m7, m2, 7, 3
- FILTER_HV8_END m3, m0, m4, m1
-
- movh [r2], m3
- movhps [r2 + r3], m3
-
- lea r0, [r0 + 16 * 2]
- lea r2, [r2 + r3 * 2]
-
- inc r6
- cmp r6, 8/2
- jnz .loopV
-
- RET
-
-;-----------------------------------------------------------------------------
-;void interp_4tap_vert_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal interp_4tap_vert_pp_2x4, 4, 6, 8
-
-mov r4d, r4m
-sub r0, r1
-
-%ifdef PIC
-lea r5, [tab_ChromaCoeff]
-movd m0, [r5 + r4 * 4]
-%else
-movd m0, [tab_ChromaCoeff + r4 * 4]
-%endif
-lea r4, [r1 * 3]
-lea r5, [r0 + 4 * r1]
-pshufb m0, [tab_Cm]
-mova m1, [pw_512]
-
-movd m2, [r0]
-movd m3, [r0 + r1]
-movd m4, [r0 + 2 * r1]
-movd m5, [r0 + r4]
-
-punpcklbw m2, m3
-punpcklbw m6, m4, m5
-punpcklbw m2, m6
-
-pmaddubsw m2, m0
-
-movd m6, [r5]
-
-punpcklbw m3, m4
-punpcklbw m7, m5, m6
-punpcklbw m3, m7
-
-pmaddubsw m3, m0
-
-phaddw m2, m3
-
-pmulhrsw m2, m1
-
-movd m7, [r5 + r1]
-
-punpcklbw m4, m5
-punpcklbw m3, m6, m7
-punpcklbw m4, m3
-
-pmaddubsw m4, m0
-
-movd m3, [r5 + 2 * r1]
-
-punpcklbw m5, m6
-punpcklbw m7, m3
-punpcklbw m5, m7
-
-pmaddubsw m5, m0
-
-phaddw m4, m5
-
-pmulhrsw m4, m1
-packuswb m2, m4
-
-pextrw [r2], m2, 0
-pextrw [r2 + r3], m2, 2
-lea r2, [r2 + 2 * r3]
-pextrw [r2], m2, 4
-pextrw [r2 + r3], m2, 6
-
-RET
-
-;-----------------------------------------------------------------------------
-; void interp_4tap_vert_pp_2x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-%macro FILTER_V4_W2_H4 2
-INIT_XMM sse4
-cglobal interp_4tap_vert_pp_2x%2, 4, 6, 8
-
-mov r4d, r4m
-sub r0, r1
-
-%ifdef PIC
-lea r5, [tab_ChromaCoeff]
-movd m0, [r5 + r4 * 4]
-%else
-movd m0, [tab_ChromaCoeff + r4 * 4]
-%endif
-
-pshufb m0, [tab_Cm]
-
-mova m1, [pw_512]
-
-mov r4d, %2
-lea r5, [3 * r1]
-
-.loop:
-movd m2, [r0]
-movd m3, [r0 + r1]
-movd m4, [r0 + 2 * r1]
-movd m5, [r0 + r5]
-
-punpcklbw m2, m3
-punpcklbw m6, m4, m5
-punpcklbw m2, m6
-
-pmaddubsw m2, m0
-
-lea r0, [r0 + 4 * r1]
-movd m6, [r0]
-
-punpcklbw m3, m4
-punpcklbw m7, m5, m6
-punpcklbw m3, m7
-
-pmaddubsw m3, m0
-
-phaddw m2, m3
-
-pmulhrsw m2, m1
-
-movd m7, [r0 + r1]
-
-punpcklbw m4, m5
-punpcklbw m3, m6, m7
-punpcklbw m4, m3
-
-pmaddubsw m4, m0
-
-movd m3, [r0 + 2 * r1]
-
-punpcklbw m5, m6
-punpcklbw m7, m3
-punpcklbw m5, m7
-
-pmaddubsw m5, m0
-
-phaddw m4, m5
-
-pmulhrsw m4, m1
-packuswb m2, m4
-
-pextrw [r2], m2, 0
-pextrw [r2 + r3], m2, 2
-lea r2, [r2 + 2 * r3]
-pextrw [r2], m2, 4
-pextrw [r2 + r3], m2, 6
-
-lea r2, [r2 + 2 * r3]
-
-sub r4, 4
-jnz .loop
-RET
-%endmacro
-
-FILTER_V4_W2_H4 2, 8
-
-FILTER_V4_W2_H4 2, 16
-
-;-----------------------------------------------------------------------------
-; void interp_4tap_vert_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal interp_4tap_vert_pp_4x2, 4, 6, 6
-
-mov r4d, r4m
-sub r0, r1
-
-%ifdef PIC
-lea r5, [tab_ChromaCoeff]
-movd m0, [r5 + r4 * 4]
-%else
-movd m0, [tab_ChromaCoeff + r4 * 4]
-%endif
-
-pshufb m0, [tab_Cm]
-lea r5, [r0 + 2 * r1]
-
-movd m2, [r0]
-movd m3, [r0 + r1]
-movd m4, [r5]
-movd m5, [r5 + r1]
-
-punpcklbw m2, m3
-punpcklbw m1, m4, m5
-punpcklbw m2, m1
-
-pmaddubsw m2, m0
-
-movd m1, [r0 + 4 * r1]
-
-punpcklbw m3, m4
-punpcklbw m5, m1
-punpcklbw m3, m5
-
-pmaddubsw m3, m0
-
-phaddw m2, m3
-
-pmulhrsw m2, [pw_512]
-packuswb m2, m2
-movd [r2], m2
-pextrd [r2 + r3], m2, 1
-
-RET
-
-;-----------------------------------------------------------------------------
-; void interp_4tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal interp_4tap_vert_pp_4x4, 4, 6, 8
-
-mov r4d, r4m
-sub r0, r1
-
-%ifdef PIC
-lea r5, [tab_ChromaCoeff]
-movd m0, [r5 + r4 * 4]
-%else
-movd m0, [tab_ChromaCoeff + r4 * 4]
-%endif
-
-pshufb m0, [tab_Cm]
-mova m1, [pw_512]
-lea r5, [r0 + 4 * r1]
-lea r4, [r1 * 3]
-
-movd m2, [r0]
-movd m3, [r0 + r1]
-movd m4, [r0 + 2 * r1]
-movd m5, [r0 + r4]
-
-punpcklbw m2, m3
-punpcklbw m6, m4, m5
-punpcklbw m2, m6
-
-pmaddubsw m2, m0
-
-movd m6, [r5]
-
-punpcklbw m3, m4
-punpcklbw m7, m5, m6
-punpcklbw m3, m7
-
-pmaddubsw m3, m0
-
-phaddw m2, m3
-
-pmulhrsw m2, m1
-
-movd m7, [r5 + r1]
-
-punpcklbw m4, m5
-punpcklbw m3, m6, m7
-punpcklbw m4, m3
-
-pmaddubsw m4, m0
-
-movd m3, [r5 + 2 * r1]
-
-punpcklbw m5, m6
-punpcklbw m7, m3
-punpcklbw m5, m7
-
-pmaddubsw m5, m0
-
-phaddw m4, m5
-
-pmulhrsw m4, m1
-
-packuswb m2, m4
-movd [r2], m2
-pextrd [r2 + r3], m2, 1
-lea r2, [r2 + 2 * r3]
-pextrd [r2], m2, 2
-pextrd [r2 + r3], m2, 3
-
-RET
-
-;-----------------------------------------------------------------------------
-; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-%macro FILTER_V4_W4_H4 2
-INIT_XMM sse4
-cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8
-
-mov r4d, r4m
-sub r0, r1
-
-%ifdef PIC
-lea r5, [tab_ChromaCoeff]
-movd m0, [r5 + r4 * 4]
-%else
-movd m0, [tab_ChromaCoeff + r4 * 4]
-%endif
-
-pshufb m0, [tab_Cm]
-
-mova m1, [pw_512]
-
-mov r4d, %2
-
-lea r5, [3 * r1]
-
-.loop:
-movd m2, [r0]
-movd m3, [r0 + r1]
-movd m4, [r0 + 2 * r1]
-movd m5, [r0 + r5]
-
-punpcklbw m2, m3
-punpcklbw m6, m4, m5
-punpcklbw m2, m6
-
-pmaddubsw m2, m0
-
-lea r0, [r0 + 4 * r1]
-movd m6, [r0]
-
-punpcklbw m3, m4
-punpcklbw m7, m5, m6
-punpcklbw m3, m7
-
-pmaddubsw m3, m0
-
-phaddw m2, m3
-
-pmulhrsw m2, m1
-
-movd m7, [r0 + r1]
-
-punpcklbw m4, m5
-punpcklbw m3, m6, m7
-punpcklbw m4, m3
-
-pmaddubsw m4, m0
-
-movd m3, [r0 + 2 * r1]
-
-punpcklbw m5, m6
-punpcklbw m7, m3
-punpcklbw m5, m7
-
-pmaddubsw m5, m0
-
-phaddw m4, m5
-
-pmulhrsw m4, m1
-packuswb m2, m4
-movd [r2], m2
-pextrd [r2 + r3], m2, 1
-lea r2, [r2 + 2 * r3]
-pextrd [r2], m2, 2
-pextrd [r2 + r3], m2, 3
-
-lea r2, [r2 + 2 * r3]
-
-sub r4, 4
-jnz .loop
-RET
-%endmacro
-
-FILTER_V4_W4_H4 4, 8
-FILTER_V4_W4_H4 4, 16
-
-FILTER_V4_W4_H4 4, 32
-
-%macro FILTER_V4_W8_H2 0
-punpcklbw m1, m2
-punpcklbw m7, m3, m0
-
-pmaddubsw m1, m6
-pmaddubsw m7, m5
-
-paddw m1, m7
-
-pmulhrsw m1, m4
-packuswb m1, m1
-%endmacro
-
-%macro FILTER_V4_W8_H3 0
-punpcklbw m2, m3
-punpcklbw m7, m0, m1
-
-pmaddubsw m2, m6
-pmaddubsw m7, m5
-
-paddw m2, m7
-
-pmulhrsw m2, m4
-packuswb m2, m2
-%endmacro
-
-%macro FILTER_V4_W8_H4 0
-punpcklbw m3, m0
-punpcklbw m7, m1, m2
-
-pmaddubsw m3, m6
-pmaddubsw m7, m5
-
-paddw m3, m7
-
-pmulhrsw m3, m4
-packuswb m3, m3
-%endmacro
-
-%macro FILTER_V4_W8_H5 0
-punpcklbw m0, m1
-punpcklbw m7, m2, m3
-
-pmaddubsw m0, m6
-pmaddubsw m7, m5
-
-paddw m0, m7
-
-pmulhrsw m0, m4
-packuswb m0, m0
-%endmacro
-
-%macro FILTER_V4_W8_8x2 2
-FILTER_V4_W8 %1, %2
-movq m0, [r0 + 4 * r1]
-
-FILTER_V4_W8_H2
-
-movh [r2 + r3], m1
-%endmacro
-
-%macro FILTER_V4_W8_8x4 2
-FILTER_V4_W8_8x2 %1, %2
-;8x3
-lea r6, [r0 + 4 * r1]
-movq m1, [r6 + r1]
-
-FILTER_V4_W8_H3
-
-movh [r2 + 2 * r3], m2
-
-;8x4
-movq m2, [r6 + 2 * r1]
-
-FILTER_V4_W8_H4
-
-lea r5, [r2 + 2 * r3]
-movh [r5 + r3], m3
-%endmacro
-
-%macro FILTER_V4_W8_8x6 2
-FILTER_V4_W8_8x4 %1, %2
-;8x5
-lea r6, [r6 + 2 * r1]
-movq m3, [r6 + r1]
-
-FILTER_V4_W8_H5
-
-movh [r2 + 4 * r3], m0
-
-;8x6
-movq m0, [r0 + 8 * r1]
-
-FILTER_V4_W8_H2
-
-lea r5, [r2 + 4 * r3]
-movh [r5 + r3], m1
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-%macro FILTER_V4_W8 2
-INIT_XMM sse4
-cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8
-
-mov r4d, r4m
-
-sub r0, r1
-movq m0, [r0]
-movq m1, [r0 + r1]
-movq m2, [r0 + 2 * r1]
-lea r5, [r0 + 2 * r1]
-movq m3, [r5 + r1]
-
-punpcklbw m0, m1
-punpcklbw m4, m2, m3
-
-%ifdef PIC
-lea r6, [tab_ChromaCoeff]
-movd m5, [r6 + r4 * 4]
-%else
-movd m5, [tab_ChromaCoeff + r4 * 4]
-%endif
-
-pshufb m6, m5, [tab_Vm]
-pmaddubsw m0, m6
-
-pshufb m5, [tab_Vm + 16]
-pmaddubsw m4, m5
-
-paddw m0, m4
-
-mova m4, [pw_512]
-
-pmulhrsw m0, m4
-packuswb m0, m0
-movh [r2], m0
-%endmacro
-
-;-----------------------------------------------------------------------------
-; void interp_4tap_vert_pp_8x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-FILTER_V4_W8_8x2 8, 2
-
-RET
-
-;-----------------------------------------------------------------------------
-; void interp_4tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-FILTER_V4_W8_8x4 8, 4
-
-RET
-
-;-----------------------------------------------------------------------------
-; void interp_4tap_vert_pp_8x6(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-FILTER_V4_W8_8x6 8, 6
-
-RET
-
-;-------------------------------------------------------------------------------------------------------------
-; void interp_4tap_vert_ps_4x2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
-;-------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal interp_4tap_vert_ps_4x2, 4, 6, 6
-
-mov r4d, r4m
-sub r0, r1
-add r3d, r3d
-
-%ifdef PIC
-lea r5, [tab_ChromaCoeff]
-movd m0, [r5 + r4 * 4]
-%else
-movd m0, [tab_ChromaCoeff + r4 * 4]
-%endif
-
-pshufb m0, [tab_Cm]
-
-movd m2, [r0]
-movd m3, [r0 + r1]
-lea r5, [r0 + 2 * r1]
-movd m4, [r5]
-movd m5, [r5 + r1]
-
-punpcklbw m2, m3
-punpcklbw m1, m4, m5
-punpcklbw m2, m1
-
-pmaddubsw m2, m0
-
-movd m1, [r0 + 4 * r1]
-
-punpcklbw m3, m4
-punpcklbw m5, m1
-punpcklbw m3, m5
-
-pmaddubsw m3, m0
-
-phaddw m2, m3
-
-psubw m2, [pw_2000]
-movh [r2], m2
-movhps [r2 + r3], m2
-
-RET
-
-;-------------------------------------------------------------------------------------------------------------
-; void interp_4tap_vert_ps_4x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
-;-------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal interp_4tap_vert_ps_4x4, 4, 6, 7
-
- mov r4d, r4m
- sub r0, r1
- add r3d, r3d
-
-%ifdef PIC
- lea r5, [tab_ChromaCoeff]
- movd m0, [r5 + r4 * 4]
-%else
- movd m0, [tab_ChromaCoeff + r4 * 4]
-%endif
-
- pshufb m0, [tab_Cm]
-
- lea r4, [r1 * 3]
- lea r5, [r0 + 4 * r1]
-
- movd m2, [r0]
- movd m3, [r0 + r1]
- movd m4, [r0 + 2 * r1]
- movd m5, [r0 + r4]
-
- punpcklbw m2, m3
- punpcklbw m6, m4, m5
- punpcklbw m2, m6
-
- pmaddubsw m2, m0
-
- movd m6, [r5]
-
- punpcklbw m3, m4
- punpcklbw m1, m5, m6
- punpcklbw m3, m1
-
- pmaddubsw m3, m0
-
- phaddw m2, m3
-
- mova m1, [pw_2000]
-
- psubw m2, m1
- movh [r2], m2
- movhps [r2 + r3], m2
-
- movd m2, [r5 + r1]
-
- punpcklbw m4, m5
- punpcklbw m3, m6, m2
- punpcklbw m4, m3
-
- pmaddubsw m4, m0
-
- movd m3, [r5 + 2 * r1]
-
- punpcklbw m5, m6
- punpcklbw m2, m3
- punpcklbw m5, m2
-
- pmaddubsw m5, m0
-
- phaddw m4, m5
-
- psubw m4, m1
- lea r2, [r2 + 2 * r3]
- movh [r2], m4
- movhps [r2 + r3], m4
-
- RET
-
-;---------------------------------------------------------------------------------------------------------------
-; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
-;---------------------------------------------------------------------------------------------------------------
-%macro FILTER_V_PS_W4_H4 2
-INIT_XMM sse4
-cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8
-
- mov r4d, r4m
- sub r0, r1
- add r3d, r3d
-
-%ifdef PIC
- lea r5, [tab_ChromaCoeff]
- movd m0, [r5 + r4 * 4]
-%else
- movd m0, [tab_ChromaCoeff + r4 * 4]
-%endif
-
- pshufb m0, [tab_Cm]
-
- mova m1, [pw_2000]
-
- mov r4d, %2/4
- lea r5, [3 * r1]
-
-.loop:
- movd m2, [r0]
- movd m3, [r0 + r1]
- movd m4, [r0 + 2 * r1]
- movd m5, [r0 + r5]
-
- punpcklbw m2, m3
- punpcklbw m6, m4, m5
- punpcklbw m2, m6
-
- pmaddubsw m2, m0
-
- lea r0, [r0 + 4 * r1]
- movd m6, [r0]
-
- punpcklbw m3, m4
- punpcklbw m7, m5, m6
- punpcklbw m3, m7
-
- pmaddubsw m3, m0
-
- phaddw m2, m3
-
- psubw m2, m1
- movh [r2], m2
- movhps [r2 + r3], m2
-
- movd m2, [r0 + r1]
-
- punpcklbw m4, m5
- punpcklbw m3, m6, m2
- punpcklbw m4, m3
-
- pmaddubsw m4, m0
-
- movd m3, [r0 + 2 * r1]
-
- punpcklbw m5, m6
- punpcklbw m2, m3
- punpcklbw m5, m2
-
- pmaddubsw m5, m0
-
- phaddw m4, m5
-
- psubw m4, m1
- lea r2, [r2 + 2 * r3]
- movh [r2], m4
- movhps [r2 + r3], m4
-
- lea r2, [r2 + 2 * r3]
-
- dec r4d
- jnz .loop
- RET
-%endmacro
-
-FILTER_V_PS_W4_H4 4, 8
-FILTER_V_PS_W4_H4 4, 16
-
-FILTER_V_PS_W4_H4 4, 32
-
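The ps variants differ only in the final step: for 8-bit input there is no rounding shift, the raw 4-tap sum just has the 8192 internal offset (pw_2000) subtracted and is stored as int16_t. A rough sketch, with the same illustrative naming as above:

    #include <stdint.h>

    /* vertical 4-tap, pixel in / int16_t out (ps) */
    static void vert_ps_4tap_ref(const uint8_t *src, intptr_t srcStride,
                                 int16_t *dst, intptr_t dstStride,
                                 int width, int height, const int8_t c[4])
    {
        src -= srcStride;
        for (int y = 0; y < height; y++, src += srcStride, dst += dstStride)
            for (int x = 0; x < width; x++)
            {
                int sum = c[0] * src[x]
                        + c[1] * src[x + 1 * srcStride]
                        + c[2] * src[x + 2 * srcStride]
                        + c[3] * src[x + 3 * srcStride];
                dst[x] = (int16_t)(sum - 8192);  /* psubw with [pw_2000] */
            }
    }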
-;--------------------------------------------------------------------------------------------------------------
-; void interp_4tap_vert_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
-;--------------------------------------------------------------------------------------------------------------
-%macro FILTER_V_PS_W8_H8_H16_H2 2
-INIT_XMM sse4
-cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 7
-
- mov r4d, r4m
- sub r0, r1
- add r3d, r3d
-
-%ifdef PIC
- lea r5, [tab_ChromaCoeff]
- movd m5, [r5 + r4 * 4]
-%else
- movd m5, [tab_ChromaCoeff + r4 * 4]
-%endif
-
- pshufb m6, m5, [tab_Vm]
- pshufb m5, [tab_Vm + 16]
- mova m4, [pw_2000]
-
- mov r4d, %2/2
- lea r5, [3 * r1]
-
-.loopH:
- movq m0, [r0]
- movq m1, [r0 + r1]
- movq m2, [r0 + 2 * r1]
- movq m3, [r0 + r5]
-
- punpcklbw m0, m1
- punpcklbw m1, m2
- punpcklbw m2, m3
-
- pmaddubsw m0, m6
- pmaddubsw m2, m5
-
- paddw m0, m2
-
- psubw m0, m4
- movu [r2], m0
-
- movq m0, [r0 + 4 * r1]
-
- punpcklbw m3, m0
-
- pmaddubsw m1, m6
- pmaddubsw m3, m5
-
- paddw m1, m3
- psubw m1, m4
-
- movu [r2 + r3], m1
-
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
-
- dec r4d
- jnz .loopH
-
- RET
-%endmacro
-
-FILTER_V_PS_W8_H8_H16_H2 8, 2
-FILTER_V_PS_W8_H8_H16_H2 8, 4
-FILTER_V_PS_W8_H8_H16_H2 8, 6
-
-FILTER_V_PS_W8_H8_H16_H2 8, 12
-FILTER_V_PS_W8_H8_H16_H2 8, 64
-
-;--------------------------------------------------------------------------------------------------------------
-; void interp_4tap_vert_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
-;--------------------------------------------------------------------------------------------------------------
-%macro FILTER_V_PS_W8_H8_H16_H32 2
-INIT_XMM sse4
-cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8
-
- mov r4d, r4m
- sub r0, r1
- add r3d, r3d
-
-%ifdef PIC
- lea r5, [tab_ChromaCoeff]
- movd m5, [r5 + r4 * 4]
-%else
- movd m5, [tab_ChromaCoeff + r4 * 4]
-%endif
-
- pshufb m6, m5, [tab_Vm]
- pshufb m5, [tab_Vm + 16]
- mova m4, [pw_2000]
-
- mov r4d, %2/4
- lea r5, [3 * r1]
-
-.loop:
- movq m0, [r0]
- movq m1, [r0 + r1]
- movq m2, [r0 + 2 * r1]
- movq m3, [r0 + r5]
-
- punpcklbw m0, m1
- punpcklbw m1, m2
- punpcklbw m2, m3
-
- pmaddubsw m0, m6
- pmaddubsw m7, m2, m5
-
- paddw m0, m7
-
- psubw m0, m4
- movu [r2], m0
-
- lea r0, [r0 + 4 * r1]
- movq m0, [r0]
-
- punpcklbw m3, m0
-
- pmaddubsw m1, m6
- pmaddubsw m7, m3, m5
-
- paddw m1, m7
-
- psubw m1, m4
- movu [r2 + r3], m1
-
- movq m1, [r0 + r1]
-
- punpcklbw m0, m1
-
- pmaddubsw m2, m6
- pmaddubsw m0, m5
-
- paddw m2, m0
-
- psubw m2, m4
- lea r2, [r2 + 2 * r3]
- movu [r2], m2
-
- movq m2, [r0 + 2 * r1]
-
- punpcklbw m1, m2
-
- pmaddubsw m3, m6
- pmaddubsw m1, m5
-
- paddw m3, m1
- psubw m3, m4
-
- movu [r2 + r3], m3
-
- lea r2, [r2 + 2 * r3]
-
- dec r4d
- jnz .loop
- RET
-%endmacro
-
-FILTER_V_PS_W8_H8_H16_H32 8, 8
-FILTER_V_PS_W8_H8_H16_H32 8, 16
-FILTER_V_PS_W8_H8_H16_H32 8, 32
-
-;------------------------------------------------------------------------------------------------------------
-;void interp_4tap_vert_ps_6x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
-;------------------------------------------------------------------------------------------------------------
-%macro FILTER_V_PS_W6 2
-INIT_XMM sse4
-cglobal interp_4tap_vert_ps_6x%2, 4, 6, 8
-
- mov r4d, r4m
- sub r0, r1
- add r3d, r3d
-
-%ifdef PIC
- lea r5, [tab_ChromaCoeff]
- movd m5, [r5 + r4 * 4]
-%else
- movd m5, [tab_ChromaCoeff + r4 * 4]
-%endif
-
- pshufb m6, m5, [tab_Vm]
- pshufb m5, [tab_Vm + 16]
- mova m4, [pw_2000]
- lea r5, [3 * r1]
- mov r4d, %2/4
-
-.loop:
- movq m0, [r0]
- movq m1, [r0 + r1]
- movq m2, [r0 + 2 * r1]
- movq m3, [r0 + r5]
-
- punpcklbw m0, m1
- punpcklbw m1, m2
- punpcklbw m2, m3
-
- pmaddubsw m0, m6
- pmaddubsw m7, m2, m5
-
- paddw m0, m7
- psubw m0, m4
-
- movh [r2], m0
- pshufd m0, m0, 2
- movd [r2 + 8], m0
-
- lea r0, [r0 + 4 * r1]
- movq m0, [r0]
- punpcklbw m3, m0
-
- pmaddubsw m1, m6
- pmaddubsw m7, m3, m5
-
- paddw m1, m7
- psubw m1, m4
-
- movh [r2 + r3], m1
- pshufd m1, m1, 2
- movd [r2 + r3 + 8], m1
-
- movq m1, [r0 + r1]
- punpcklbw m0, m1
-
- pmaddubsw m2, m6
- pmaddubsw m0, m5
-
- paddw m2, m0
- psubw m2, m4
-
- lea r2,[r2 + 2 * r3]
- movh [r2], m2
- pshufd m2, m2, 2
- movd [r2 + 8], m2
-
- movq m2,[r0 + 2 * r1]
- punpcklbw m1, m2
-
- pmaddubsw m3, m6
- pmaddubsw m1, m5
-
- paddw m3, m1
- psubw m3, m4
-
- movh [r2 + r3], m3
- pshufd m3, m3, 2
- movd [r2 + r3 + 8], m3
-
- lea r2, [r2 + 2 * r3]
-
- dec r4d
- jnz .loop
- RET
-%endmacro
-
-FILTER_V_PS_W6 6, 8
-FILTER_V_PS_W6 6, 16
-
-;---------------------------------------------------------------------------------------------------------------
-; void interp_4tap_vert_ps_12x16(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
-;---------------------------------------------------------------------------------------------------------------
-%macro FILTER_V_PS_W12 2
-INIT_XMM sse4
-cglobal interp_4tap_vert_ps_12x%2, 4, 6, 8
-
- mov r4d, r4m
- sub r0, r1
- add r3d, r3d
-
-%ifdef PIC
- lea r5, [tab_ChromaCoeff]
- movd m0, [r5 + r4 * 4]
-%else
- movd m0, [tab_ChromaCoeff + r4 * 4]
-%endif
-
- pshufb m1, m0, [tab_Vm]
- pshufb m0, [tab_Vm + 16]
-
- mov r4d, %2/2
-
-.loop:
- movu m2, [r0]
- movu m3, [r0 + r1]
-
- punpcklbw m4, m2, m3
- punpckhbw m2, m3
-
- pmaddubsw m4, m1
- pmaddubsw m2, m1
-
- lea r0, [r0 + 2 * r1]
- movu m5, [r0]
- movu m7, [r0 + r1]
-
- punpcklbw m6, m5, m7
- pmaddubsw m6, m0
- paddw m4, m6
-
- punpckhbw m6, m5, m7
- pmaddubsw m6, m0
- paddw m2, m6
-
- mova m6, [pw_2000]
-
- psubw m4, m6
- psubw m2, m6
-
- movu [r2], m4
- movh [r2 + 16], m2
-
- punpcklbw m4, m3, m5
- punpckhbw m3, m5
-
- pmaddubsw m4, m1
- pmaddubsw m3, m1
-
- movu m2, [r0 + 2 * r1]
-
- punpcklbw m5, m7, m2
- punpckhbw m7, m2
-
- pmaddubsw m5, m0
- pmaddubsw m7, m0
-
- paddw m4, m5
- paddw m3, m7
-
- psubw m4, m6
- psubw m3, m6
-
- movu [r2 + r3], m4
- movh [r2 + r3 + 16], m3
-
- lea r2, [r2 + 2 * r3]
-
- dec r4d
- jnz .loop
- RET
-%endmacro
-
-FILTER_V_PS_W12 12, 16
-FILTER_V_PS_W12 12, 32
-
-;---------------------------------------------------------------------------------------------------------------
-; void interp_4tap_vert_ps_16x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
-;---------------------------------------------------------------------------------------------------------------
-%macro FILTER_V_PS_W16 2
-INIT_XMM sse4
-cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8
-
- mov r4d, r4m
- sub r0, r1
- add r3d, r3d
-
-%ifdef PIC
- lea r5, [tab_ChromaCoeff]
- movd m0, [r5 + r4 * 4]
-%else
- movd m0, [tab_ChromaCoeff + r4 * 4]
-%endif
-
- pshufb m1, m0, [tab_Vm]
- pshufb m0, [tab_Vm + 16]
- mov r4d, %2/2
-
-.loop:
- movu m2, [r0]
- movu m3, [r0 + r1]
-
- punpcklbw m4, m2, m3
- punpckhbw m2, m3
-
- pmaddubsw m4, m1
- pmaddubsw m2, m1
-
- lea r0, [r0 + 2 * r1]
- movu m5, [r0]
- movu m7, [r0 + r1]
-
- punpcklbw m6, m5, m7
- pmaddubsw m6, m0
- paddw m4, m6
-
- punpckhbw m6, m5, m7
- pmaddubsw m6, m0
- paddw m2, m6
-
- mova m6, [pw_2000]
-
- psubw m4, m6
- psubw m2, m6
-
- movu [r2], m4
- movu [r2 + 16], m2
-
- punpcklbw m4, m3, m5
- punpckhbw m3, m5
-
- pmaddubsw m4, m1
- pmaddubsw m3, m1
-
- movu m5, [r0 + 2 * r1]
-
- punpcklbw m2, m7, m5
- punpckhbw m7, m5
-
- pmaddubsw m2, m0
- pmaddubsw m7, m0
-
- paddw m4, m2
- paddw m3, m7
-
- psubw m4, m6
- psubw m3, m6
-
- movu [r2 + r3], m4
- movu [r2 + r3 + 16], m3
-
- lea r2, [r2 + 2 * r3]
-
- dec r4d
- jnz .loop
- RET
-%endmacro
-
-FILTER_V_PS_W16 16, 4
-FILTER_V_PS_W16 16, 8
-FILTER_V_PS_W16 16, 12
-FILTER_V_PS_W16 16, 16
-FILTER_V_PS_W16 16, 32
-
-FILTER_V_PS_W16 16, 24
-FILTER_V_PS_W16 16, 64
-
-;--------------------------------------------------------------------------------------------------------------
-;void interp_4tap_vert_ps_24x32(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
-;--------------------------------------------------------------------------------------------------------------
-%macro FILTER_V4_PS_W24 2
-INIT_XMM sse4
-cglobal interp_4tap_vert_ps_24x%2, 4, 6, 8
-
- mov r4d, r4m
- sub r0, r1
- add r3d, r3d
-
-%ifdef PIC
- lea r5, [tab_ChromaCoeff]
- movd m0, [r5 + r4 * 4]
-%else
- movd m0, [tab_ChromaCoeff + r4 * 4]
-%endif
-
- pshufb m1, m0, [tab_Vm]
- pshufb m0, [tab_Vm + 16]
-
- mov r4d, %2/2
-
-.loop:
- movu m2, [r0]
- movu m3, [r0 + r1]
-
- punpcklbw m4, m2, m3
- punpckhbw m2, m3
-
- pmaddubsw m4, m1
- pmaddubsw m2, m1
-
- lea r5, [r0 + 2 * r1]
-
- movu m5, [r5]
- movu m7, [r5 + r1]
-
- punpcklbw m6, m5, m7
- pmaddubsw m6, m0
- paddw m4, m6
-
- punpckhbw m6, m5, m7
- pmaddubsw m6, m0
- paddw m2, m6
-
- mova m6, [pw_2000]
-
- psubw m4, m6
- psubw m2, m6
-
- movu [r2], m4
- movu [r2 + 16], m2
-
- punpcklbw m4, m3, m5
- punpckhbw m3, m5
-
- pmaddubsw m4, m1
- pmaddubsw m3, m1
-
- movu m2, [r5 + 2 * r1]
-
- punpcklbw m5, m7, m2
- punpckhbw m7, m2
-
- pmaddubsw m5, m0
- pmaddubsw m7, m0
-
- paddw m4, m5
- paddw m3, m7
-
- psubw m4, m6
- psubw m3, m6
-
- movu [r2 + r3], m4
- movu [r2 + r3 + 16], m3
-
- movq m2, [r0 + 16]
- movq m3, [r0 + r1 + 16]
- movq m4, [r5 + 16]
- movq m5, [r5 + r1 + 16]
-
- punpcklbw m2, m3
- punpcklbw m7, m4, m5
-
- pmaddubsw m2, m1
- pmaddubsw m7, m0
-
- paddw m2, m7
- psubw m2, m6
-
- movu [r2 + 32], m2
-
- movq m2, [r5 + 2 * r1 + 16]
-
- punpcklbw m3, m4
- punpcklbw m5, m2
-
- pmaddubsw m3, m1
- pmaddubsw m5, m0
-
- paddw m3, m5
- psubw m3, m6
-
- movu [r2 + r3 + 32], m3
-
- mov r0, r5
- lea r2, [r2 + 2 * r3]
-
- dec r4d
- jnz .loop
- RET
-%endmacro
-
-FILTER_V4_PS_W24 24, 32
-
-FILTER_V4_PS_W24 24, 64
-
-;---------------------------------------------------------------------------------------------------------------
-; void interp_4tap_vert_ps_32x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
-;---------------------------------------------------------------------------------------------------------------
-%macro FILTER_V_PS_W32 2
-INIT_XMM sse4
-cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8
-
- mov r4d, r4m
- sub r0, r1
- add r3d, r3d
-
-%ifdef PIC
- lea r5, [tab_ChromaCoeff]
- movd m0, [r5 + r4 * 4]
-%else
- movd m0, [tab_ChromaCoeff + r4 * 4]
-%endif
-
- pshufb m1, m0, [tab_Vm]
- pshufb m0, [tab_Vm + 16]
-
- mova m7, [pw_2000]
-
- mov r4d, %2
-
-.loop:
- movu m2, [r0]
- movu m3, [r0 + r1]
-
- punpcklbw m4, m2, m3
- punpckhbw m2, m3
-
- pmaddubsw m4, m1
- pmaddubsw m2, m1
-
- lea r5, [r0 + 2 * r1]
- movu m3, [r5]
- movu m5, [r5 + r1]
-
- punpcklbw m6, m3, m5
- punpckhbw m3, m5
-
- pmaddubsw m6, m0
- pmaddubsw m3, m0
-
- paddw m4, m6
- paddw m2, m3
-
- psubw m4, m7
- psubw m2, m7
-
- movu [r2], m4
- movu [r2 + 16], m2
-
- movu m2, [r0 + 16]
- movu m3, [r0 + r1 + 16]
-
- punpcklbw m4, m2, m3
- punpckhbw m2, m3
-
- pmaddubsw m4, m1
- pmaddubsw m2, m1
-
- movu m3, [r5 + 16]
- movu m5, [r5 + r1 + 16]
-
- punpcklbw m6, m3, m5
- punpckhbw m3, m5
-
- pmaddubsw m6, m0
- pmaddubsw m3, m0
-
- paddw m4, m6
- paddw m2, m3
-
- psubw m4, m7
- psubw m2, m7
-
- movu [r2 + 32], m4
- movu [r2 + 48], m2
-
- lea r0, [r0 + r1]
- lea r2, [r2 + r3]
-
- dec r4d
- jnz .loop
- RET
-%endmacro
-
-FILTER_V_PS_W32 32, 8
-FILTER_V_PS_W32 32, 16
-FILTER_V_PS_W32 32, 24
-FILTER_V_PS_W32 32, 32
-
-FILTER_V_PS_W32 32, 48
-FILTER_V_PS_W32 32, 64
-
-;-----------------------------------------------------------------------------
-; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-%macro FILTER_V4_W8_H8_H16_H32 2
-INIT_XMM sse4
-cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8
-
-mov r4d, r4m
-sub r0, r1
-
-%ifdef PIC
-lea r5, [tab_ChromaCoeff]
-movd m5, [r5 + r4 * 4]
-%else
-movd m5, [tab_ChromaCoeff + r4 * 4]
-%endif
-
-pshufb m6, m5, [tab_Vm]
-pshufb m5, [tab_Vm + 16]
-mova m4, [pw_512]
-lea r5, [r1 * 3]
-
-mov r4d, %2
-
-.loop:
-movq m0, [r0]
-movq m1, [r0 + r1]
-movq m2, [r0 + 2 * r1]
-movq m3, [r0 + r5]
-
-punpcklbw m0, m1
-punpcklbw m1, m2
-punpcklbw m2, m3
-
-pmaddubsw m0, m6
-pmaddubsw m7, m2, m5
-
-paddw m0, m7
-
-pmulhrsw m0, m4
-packuswb m0, m0
-movh [r2], m0
-
-lea r0, [r0 + 4 * r1]
-movq m0, [r0]
-
-punpcklbw m3, m0
-
-pmaddubsw m1, m6
-pmaddubsw m7, m3, m5
-
-paddw m1, m7
-
-pmulhrsw m1, m4
-packuswb m1, m1
-movh [r2 + r3], m1
-
-movq m1, [r0 + r1]
-
-punpcklbw m0, m1
-
-pmaddubsw m2, m6
-pmaddubsw m0, m5
-
-paddw m2, m0
-
-pmulhrsw m2, m4
-
-movq m7, [r0 + 2 * r1]
-punpcklbw m1, m7
-
-pmaddubsw m3, m6
-pmaddubsw m1, m5
-
-paddw m3, m1
-
-pmulhrsw m3, m4
-packuswb m2, m3
-
-lea r2, [r2 + 2 * r3]
-movh [r2], m2
-movhps [r2 + r3], m2
-
-lea r2, [r2 + 2 * r3]
-
-sub r4, 4
-jnz .loop
-RET
-%endmacro
-
-FILTER_V4_W8_H8_H16_H32 8, 8
-FILTER_V4_W8_H8_H16_H32 8, 16
-FILTER_V4_W8_H8_H16_H32 8, 32
-
-FILTER_V4_W8_H8_H16_H32 8, 12
-FILTER_V4_W8_H8_H16_H32 8, 64
-
-
-;-----------------------------------------------------------------------------
-;void interp_4tap_vert_pp_6x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-%macro FILTER_V4_W6_H4 2
-INIT_XMM sse4
-cglobal interp_4tap_vert_pp_6x%2, 4, 6, 8
-
-mov r4d, r4m
-sub r0, r1
-
-%ifdef PIC
-lea r5, [tab_ChromaCoeff]
-movd m5, [r5 + r4 * 4]
-%else
-movd m5, [tab_ChromaCoeff + r4 * 4]
-%endif
-
-pshufb m6, m5, [tab_Vm]
-pshufb m5, [tab_Vm + 16]
-mova m4, [pw_512]
-
-mov r4d, %2
-lea r5, [3 * r1]
-
-.loop:
-movq m0, [r0]
-movq m1, [r0 + r1]
-movq m2, [r0 + 2 * r1]
-movq m3, [r0 + r5]
-
-punpcklbw m0, m1
-punpcklbw m1, m2
-punpcklbw m2, m3
-
-pmaddubsw m0, m6
-pmaddubsw m7, m2, m5
-
-paddw m0, m7
-
-pmulhrsw m0, m4
-packuswb m0, m0
-movd [r2], m0
-pextrw [r2 + 4], m0, 2
-
-lea r0, [r0 + 4 * r1]
-
-movq m0, [r0]
-punpcklbw m3, m0
-
-pmaddubsw m1, m6
-pmaddubsw m7, m3, m5
-
-paddw m1, m7
-
-pmulhrsw m1, m4
-packuswb m1, m1
-movd [r2 + r3], m1
-pextrw [r2 + r3 + 4], m1, 2
-
-movq m1, [r0 + r1]
-punpcklbw m7, m0, m1
-
-pmaddubsw m2, m6
-pmaddubsw m7, m5
-
-paddw m2, m7
-
-pmulhrsw m2, m4
-packuswb m2, m2
-lea r2, [r2 + 2 * r3]
-movd [r2], m2
-pextrw [r2 + 4], m2, 2
-
-movq m2, [r0 + 2 * r1]
-punpcklbw m1, m2
-
-pmaddubsw m3, m6
-pmaddubsw m1, m5
-
-paddw m3, m1
-
-pmulhrsw m3, m4
-packuswb m3, m3
-
-movd [r2 + r3], m3
-pextrw [r2 + r3 + 4], m3, 2
-
-lea r2, [r2 + 2 * r3]
-
-sub r4, 4
-jnz .loop
-RET
-%endmacro
-
-FILTER_V4_W6_H4 6, 8
-
-FILTER_V4_W6_H4 6, 16
-
-;-----------------------------------------------------------------------------
-; void interp_4tap_vert_pp_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-%macro FILTER_V4_W12_H2 2
-INIT_XMM sse4
-cglobal interp_4tap_vert_pp_12x%2, 4, 6, 8
-
-mov r4d, r4m
-sub r0, r1
-
-%ifdef PIC
-lea r5, [tab_ChromaCoeff]
-movd m0, [r5 + r4 * 4]
-%else
-movd m0, [tab_ChromaCoeff + r4 * 4]
-%endif
-
-pshufb m1, m0, [tab_Vm]
-pshufb m0, [tab_Vm + 16]
-
-mov r4d, %2
-
-.loop:
-movu m2, [r0]
-movu m3, [r0 + r1]
-
-punpcklbw m4, m2, m3
-punpckhbw m2, m3
-
-pmaddubsw m4, m1
-pmaddubsw m2, m1
-
-lea r0, [r0 + 2 * r1]
-movu m5, [r0]
-movu m7, [r0 + r1]
-
-punpcklbw m6, m5, m7
-pmaddubsw m6, m0
-paddw m4, m6
-
-punpckhbw m6, m5, m7
-pmaddubsw m6, m0
-paddw m2, m6
-
-mova m6, [pw_512]
-
-pmulhrsw m4, m6
-pmulhrsw m2, m6
-
-packuswb m4, m2
-
-movh [r2], m4
-pextrd [r2 + 8], m4, 2
-
-punpcklbw m4, m3, m5
-punpckhbw m3, m5
-
-pmaddubsw m4, m1
-pmaddubsw m3, m1
-
-movu m5, [r0 + 2 * r1]
-
-punpcklbw m2, m7, m5
-punpckhbw m7, m5
-
-pmaddubsw m2, m0
-pmaddubsw m7, m0
-
-paddw m4, m2
-paddw m3, m7
-
-pmulhrsw m4, m6
-pmulhrsw m3, m6
-
-packuswb m4, m3
-
-movh [r2 + r3], m4
-pextrd [r2 + r3 + 8], m4, 2
-
-lea r2, [r2 + 2 * r3]
-
-sub r4, 2
-jnz .loop
-RET
-%endmacro
-
-FILTER_V4_W12_H2 12, 16
-
-FILTER_V4_W12_H2 12, 32
-
-;-----------------------------------------------------------------------------
-; void interp_4tap_vert_pp_16x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-%macro FILTER_V4_W16_H2 2
-INIT_XMM sse4
-cglobal interp_4tap_vert_pp_16x%2, 4, 6, 8
-
-mov r4d, r4m
-sub r0, r1
-
-%ifdef PIC
-lea r5, [tab_ChromaCoeff]
-movd m0, [r5 + r4 * 4]
-%else
-movd m0, [tab_ChromaCoeff + r4 * 4]
-%endif
-
-pshufb m1, m0, [tab_Vm]
-pshufb m0, [tab_Vm + 16]
-
-mov r4d, %2/2
-
-.loop:
-movu m2, [r0]
-movu m3, [r0 + r1]
-
-punpcklbw m4, m2, m3
-punpckhbw m2, m3
-
-pmaddubsw m4, m1
-pmaddubsw m2, m1
-
-lea r0, [r0 + 2 * r1]
-movu m5, [r0]
-movu m6, [r0 + r1]
-
-punpckhbw m7, m5, m6
-pmaddubsw m7, m0
-paddw m2, m7
-
-punpcklbw m7, m5, m6
-pmaddubsw m7, m0
-paddw m4, m7
-
-mova m7, [pw_512]
-
-pmulhrsw m4, m7
-pmulhrsw m2, m7
-
-packuswb m4, m2
-
-movu [r2], m4
-
-punpcklbw m4, m3, m5
-punpckhbw m3, m5
-
-pmaddubsw m4, m1
-pmaddubsw m3, m1
-
-movu m5, [r0 + 2 * r1]
-
-punpcklbw m2, m6, m5
-punpckhbw m6, m5
-
-pmaddubsw m2, m0
-pmaddubsw m6, m0
-
-paddw m4, m2
-paddw m3, m6
-
-pmulhrsw m4, m7
-pmulhrsw m3, m7
-
-packuswb m4, m3
-
-movu [r2 + r3], m4
-
-lea r2, [r2 + 2 * r3]
-
-dec r4d
-jnz .loop
-RET
-%endmacro
-
-FILTER_V4_W16_H2 16, 4
-FILTER_V4_W16_H2 16, 8
-FILTER_V4_W16_H2 16, 12
-FILTER_V4_W16_H2 16, 16
-FILTER_V4_W16_H2 16, 32
-
-FILTER_V4_W16_H2 16, 24
-FILTER_V4_W16_H2 16, 64
-
-;-----------------------------------------------------------------------------
-;void interp_4tap_vert_pp_24x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-%macro FILTER_V4_W24 2
-INIT_XMM sse4
-cglobal interp_4tap_vert_pp_24x%2, 4, 6, 8
-
-mov r4d, r4m
-sub r0, r1
-
-%ifdef PIC
-lea r5, [tab_ChromaCoeff]
-movd m0, [r5 + r4 * 4]
-%else
-movd m0, [tab_ChromaCoeff + r4 * 4]
-%endif
-
-pshufb m1, m0, [tab_Vm]
-pshufb m0, [tab_Vm + 16]
-
-mov r4d, %2
-
-.loop:
-movu m2, [r0]
-movu m3, [r0 + r1]
-
-punpcklbw m4, m2, m3
-punpckhbw m2, m3
-
-pmaddubsw m4, m1
-pmaddubsw m2, m1
-
-lea r5, [r0 + 2 * r1]
-movu m5, [r5]
-movu m7, [r5 + r1]
-
-punpcklbw m6, m5, m7
-pmaddubsw m6, m0
-paddw m4, m6
-
-punpckhbw m6, m5, m7
-pmaddubsw m6, m0
-paddw m2, m6
-
-mova m6, [pw_512]
-
-pmulhrsw m4, m6
-pmulhrsw m2, m6
-
-packuswb m4, m2
-
-movu [r2], m4
-
-punpcklbw m4, m3, m5
-punpckhbw m3, m5
-
-pmaddubsw m4, m1
-pmaddubsw m3, m1
-
-movu m2, [r5 + 2 * r1]
-
-punpcklbw m5, m7, m2
-punpckhbw m7, m2
-
-pmaddubsw m5, m0
-pmaddubsw m7, m0
-
-paddw m4, m5
-paddw m3, m7
-
-pmulhrsw m4, m6
-pmulhrsw m3, m6
-
-packuswb m4, m3
-
-movu [r2 + r3], m4
-
-movq m2, [r0 + 16]
-movq m3, [r0 + r1 + 16]
-movq m4, [r5 + 16]
-movq m5, [r5 + r1 + 16]
-
-punpcklbw m2, m3
-punpcklbw m4, m5
-
-pmaddubsw m2, m1
-pmaddubsw m4, m0
-
-paddw m2, m4
-
-pmulhrsw m2, m6
-
-movq m3, [r0 + r1 + 16]
-movq m4, [r5 + 16]
-movq m5, [r5 + r1 + 16]
-movq m7, [r5 + 2 * r1 + 16]
-
-punpcklbw m3, m4
-punpcklbw m5, m7
-
-pmaddubsw m3, m1
-pmaddubsw m5, m0
-
-paddw m3, m5
-
-pmulhrsw m3, m6
-packuswb m2, m3
-
-movh [r2 + 16], m2
-movhps [r2 + r3 + 16], m2
-
-mov r0, r5
-lea r2, [r2 + 2 * r3]
-
-sub r4, 2
-jnz .loop
-RET
-%endmacro
-
-FILTER_V4_W24 24, 32
-
-FILTER_V4_W24 24, 64
-
-;-----------------------------------------------------------------------------
-; void interp_4tap_vert_pp_32x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-%macro FILTER_V4_W32 2
-INIT_XMM sse4
-cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8
-
-mov r4d, r4m
-sub r0, r1
-
-%ifdef PIC
-lea r5, [tab_ChromaCoeff]
-movd m0, [r5 + r4 * 4]
-%else
-movd m0, [tab_ChromaCoeff + r4 * 4]
-%endif
-
-pshufb m1, m0, [tab_Vm]
-pshufb m0, [tab_Vm + 16]
-
-mova m7, [pw_512]
-
-mov r4d, %2
-
-.loop:
-movu m2, [r0]
-movu m3, [r0 + r1]
-
-punpcklbw m4, m2, m3
-punpckhbw m2, m3
-
-pmaddubsw m4, m1
-pmaddubsw m2, m1
-
-lea r5, [r0 + 2 * r1]
-movu m3, [r5]
-movu m5, [r5 + r1]
-
-punpcklbw m6, m3, m5
-punpckhbw m3, m5
-
-pmaddubsw m6, m0
-pmaddubsw m3, m0
-
-paddw m4, m6
-paddw m2, m3
-
-pmulhrsw m4, m7
-pmulhrsw m2, m7
-
-packuswb m4, m2
-
-movu [r2], m4
-
-movu m2, [r0 + 16]
-movu m3, [r0 + r1 + 16]
-
-punpcklbw m4, m2, m3
-punpckhbw m2, m3
-
-pmaddubsw m4, m1
-pmaddubsw m2, m1
-
-movu m3, [r5 + 16]
-movu m5, [r5 + r1 + 16]
-
-punpcklbw m6, m3, m5
-punpckhbw m3, m5
-
-pmaddubsw m6, m0
-pmaddubsw m3, m0
-
-paddw m4, m6
-paddw m2, m3
-
-pmulhrsw m4, m7
-pmulhrsw m2, m7
-
-packuswb m4, m2
-
-movu [r2 + 16], m4
-
-lea r0, [r0 + r1]
-lea r2, [r2 + r3]
-
-dec r4
-jnz .loop
-RET
-%endmacro
-
-FILTER_V4_W32 32, 8
-FILTER_V4_W32 32, 16
-FILTER_V4_W32 32, 24
-FILTER_V4_W32 32, 32
-
-FILTER_V4_W32 32, 48
-FILTER_V4_W32 32, 64
-
-
-;-----------------------------------------------------------------------------
-; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-%macro FILTER_V4_W16n_H2 2
-INIT_XMM sse4
-cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8
-
-mov r4d, r4m
-sub r0, r1
-
-%ifdef PIC
-lea r5, [tab_ChromaCoeff]
-movd m0, [r5 + r4 * 4]
-%else
-movd m0, [tab_ChromaCoeff + r4 * 4]
-%endif
-
-pshufb m1, m0, [tab_Vm]
-pshufb m0, [tab_Vm + 16]
-
-mov r4d, %2/2
-
-.loop:
-
-mov r6d, %1/16
-
-.loopW:
-
-movu m2, [r0]
-movu m3, [r0 + r1]
-
-punpcklbw m4, m2, m3
-punpckhbw m2, m3
-
-pmaddubsw m4, m1
-pmaddubsw m2, m1
-
-lea r5, [r0 + 2 * r1]
-movu m5, [r5]
-movu m6, [r5 + r1]
-
-punpckhbw m7, m5, m6
-pmaddubsw m7, m0
-paddw m2, m7
-
-punpcklbw m7, m5, m6
-pmaddubsw m7, m0
-paddw m4, m7
-
-mova m7, [pw_512]
-
-pmulhrsw m4, m7
-pmulhrsw m2, m7
-
-packuswb m4, m2
-
-movu [r2], m4
-
-punpcklbw m4, m3, m5
-punpckhbw m3, m5
-
-pmaddubsw m4, m1
-pmaddubsw m3, m1
-
-movu m5, [r5 + 2 * r1]
-
-punpcklbw m2, m6, m5
-punpckhbw m6, m5
-
-pmaddubsw m2, m0
-pmaddubsw m6, m0
-
-paddw m4, m2
-paddw m3, m6
-
-pmulhrsw m4, m7
-pmulhrsw m3, m7
-
-packuswb m4, m3
-
-movu [r2 + r3], m4
-
-add r0, 16
-add r2, 16
-dec r6d
-jnz .loopW
-
-lea r0, [r0 + r1 * 2 - %1]
-lea r2, [r2 + r3 * 2 - %1]
-
-dec r4d
-jnz .loop
-RET
-%endmacro
-
-FILTER_V4_W16n_H2 64, 64
-FILTER_V4_W16n_H2 64, 32
-FILTER_V4_W16n_H2 64, 48
-FILTER_V4_W16n_H2 48, 64
-FILTER_V4_W16n_H2 64, 16
-
-
-;-----------------------------------------------------------------------------
-; void filterConvertPelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
-;-----------------------------------------------------------------------------
-INIT_XMM ssse3
-cglobal luma_p2s, 3, 7, 6
-
- ; load width and height
- mov r3d, r3m
- mov r4d, r4m
-
- ; load constant
- mova m4, [pb_128]
- mova m5, [tab_c_64_n64]
-
-.loopH:
-
- xor r5d, r5d
-.loopW:
- lea r6, [r0 + r5]
-
- movh m0, [r6]
- punpcklbw m0, m4
- pmaddubsw m0, m5
-
- movh m1, [r6 + r1]
- punpcklbw m1, m4
- pmaddubsw m1, m5
-
- movh m2, [r6 + r1 * 2]
- punpcklbw m2, m4
- pmaddubsw m2, m5
-
- lea r6, [r6 + r1 * 2]
- movh m3, [r6 + r1]
- punpcklbw m3, m4
- pmaddubsw m3, m5
-
- add r5, 8
- cmp r5, r3
- jg .width4
- movu [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0
- movu [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1
- movu [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2
- movu [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3
- je .nextH
- jmp .loopW
-
-.width4:
- movh [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0
- movh [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1
- movh [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2
- movh [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3
-
-.nextH:
- lea r0, [r0 + r1 * 4]
- add r2, FENC_STRIDE * 8
-
- sub r4d, 4
- jnz .loopH
-
- RET
-
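luma_p2s above converts pixels to the 16-bit internal representation: punpcklbw against pb_128 followed by pmaddubsw with tab_c_64_n64 ({64, -64} pairs) yields src*64 + 128*(-64), i.e. src*64 - 8192. A simplified sketch (not part of this patch; the real routine writes into a FENC_STRIDE-pitched block, the flat dstStride below is illustrative):

    #include <stdint.h>

    /* pixel -> 16-bit internal format: dst = src * 64 - 8192 */
    static void p2s_ref(const uint8_t *src, intptr_t srcStride,
                        int16_t *dst, intptr_t dstStride, int width, int height)
    {
        for (int y = 0; y < height; y++, src += srcStride, dst += dstStride)
            for (int x = 0; x < width; x++)
                dst[x] = (int16_t)(src[x] * 64 - 8192);
    }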
-%macro PROCESS_LUMA_W4_4R 0
- movd m0, [r0]
- movd m1, [r0 + r1]
- punpcklbw m2, m0, m1 ; m2=[0 1]
-
- lea r0, [r0 + 2 * r1]
- movd m0, [r0]
- punpcklbw m1, m0 ; m1=[1 2]
- punpcklqdq m2, m1 ; m2=[0 1 1 2]
- pmaddubsw m4, m2, [r6 + 0 * 16] ; m4=[0+1 1+2]
-
- movd m1, [r0 + r1]
- punpcklbw m5, m0, m1 ; m5=[2 3]
- lea r0, [r0 + 2 * r1]
- movd m0, [r0]
- punpcklbw m1, m0 ; m1=[3 4]
- punpcklqdq m5, m1 ; m5=[2 3 3 4]
- pmaddubsw m2, m5, [r6 + 1 * 16] ; m2=[2+3 3+4]
- paddw m4, m2 ; m4=[0+1+2+3 1+2+3+4] Row1-2
- pmaddubsw m5, [r6 + 0 * 16] ; m5=[2+3 3+4] Row3-4
-
- movd m1, [r0 + r1]
- punpcklbw m2, m0, m1 ; m2=[4 5]
- lea r0, [r0 + 2 * r1]
- movd m0, [r0]
- punpcklbw m1, m0 ; m1=[5 6]
- punpcklqdq m2, m1 ; m2=[4 5 5 6]
- pmaddubsw m1, m2, [r6 + 2 * 16] ; m1=[4+5 5+6]
- paddw m4, m1 ; m4=[0+1+2+3+4+5 1+2+3+4+5+6] Row1-2
- pmaddubsw m2, [r6 + 1 * 16] ; m2=[4+5 5+6]
- paddw m5, m2 ; m5=[2+3+4+5 3+4+5+6] Row3-4
-
- movd m1, [r0 + r1]
- punpcklbw m2, m0, m1 ; m2=[6 7]
- lea r0, [r0 + 2 * r1]
- movd m0, [r0]
- punpcklbw m1, m0 ; m1=[7 8]
- punpcklqdq m2, m1 ; m2=[6 7 7 8]
- pmaddubsw m1, m2, [r6 + 3 * 16] ; m1=[6+7 7+8]
- paddw m4, m1 ; m4=[0+1+2+3+4+5+6+7 1+2+3+4+5+6+7+8] Row1-2 end
- pmaddubsw m2, [r6 + 2 * 16] ; m2=[6+7 7+8]
- paddw m5, m2 ; m5=[2+3+4+5+6+7 3+4+5+6+7+8] Row3-4
-
- movd m1, [r0 + r1]
- punpcklbw m2, m0, m1 ; m2=[8 9]
- movd m0, [r0 + 2 * r1]
- punpcklbw m1, m0 ; m1=[9 10]
- punpcklqdq m2, m1 ; m2=[8 9 9 10]
- pmaddubsw m2, [r6 + 3 * 16] ; m2=[8+9 9+10]
- paddw m5, m2 ; m5=[2+3+4+5+6+7+8+9 3+4+5+6+7+8+9+10] Row3-4 end
-%endmacro
-
-%macro PROCESS_LUMA_W8_4R 0
- movq m0, [r0]
- movq m1, [r0 + r1]
- punpcklbw m0, m1
- pmaddubsw m7, m0, [r6 + 0 *16] ;m7=[0+1] Row1
-
- lea r0, [r0 + 2 * r1]
- movq m0, [r0]
- punpcklbw m1, m0
- pmaddubsw m6, m1, [r6 + 0 *16] ;m6=[1+2] Row2
-
- movq m1, [r0 + r1]
- punpcklbw m0, m1
- pmaddubsw m5, m0, [r6 + 0 *16] ;m5=[2+3] Row3
- pmaddubsw m0, [r6 + 1 * 16]
- paddw m7, m0 ;m7=[0+1+2+3] Row1
-
- lea r0, [r0 + 2 * r1]
- movq m0, [r0]
- punpcklbw m1, m0
- pmaddubsw m4, m1, [r6 + 0 *16] ;m4=[3+4] Row4
- pmaddubsw m1, [r6 + 1 * 16]
- paddw m6, m1 ;m6 = [1+2+3+4] Row2
-
- movq m1, [r0 + r1]
- punpcklbw m0, m1
- pmaddubsw m2, m0, [r6 + 1 * 16]
- pmaddubsw m0, [r6 + 2 * 16]
- paddw m7, m0 ;m7=[0+1+2+3+4+5] Row1
- paddw m5, m2 ;m5=[2+3+4+5] Row3
-
- lea r0, [r0 + 2 * r1]
- movq m0, [r0]
- punpcklbw m1, m0
- pmaddubsw m2, m1, [r6 + 1 * 16]
- pmaddubsw m1, [r6 + 2 * 16]
- paddw m6, m1 ;m6=[1+2+3+4+5+6] Row2
- paddw m4, m2 ;m4=[3+4+5+6] Row4
-
- movq m1, [r0 + r1]
- punpcklbw m0, m1
- pmaddubsw m2, m0, [r6 + 2 * 16]
- pmaddubsw m0, [r6 + 3 * 16]
- paddw m7, m0 ;m7=[0+1+2+3+4+5+6+7] Row1 end
- paddw m5, m2 ;m5=[2+3+4+5+6+7] Row3
-
- lea r0, [r0 + 2 * r1]
- movq m0, [r0]
- punpcklbw m1, m0
- pmaddubsw m2, m1, [r6 + 2 * 16]
- pmaddubsw m1, [r6 + 3 * 16]
- paddw m6, m1 ;m6=[1+2+3+4+5+6+7+8] Row2 end
- paddw m4, m2 ;m4=[3+4+5+6+7+8] Row4
-
- movq m1, [r0 + r1]
- punpcklbw m0, m1
- pmaddubsw m0, [r6 + 3 * 16]
- paddw m5, m0 ;m5=[2+3+4+5+6+7+8+9] Row3 end
-
- movq m0, [r0 + 2 * r1]
- punpcklbw m1, m0
- pmaddubsw m1, [r6 + 3 * 16]
- paddw m4, m1 ;m4=[3+4+5+6+7+8+9+10] Row4 end
-%endmacro
-
-;-------------------------------------------------------------------------------------------------------------
-; void interp_8tap_vert_%3_4x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-------------------------------------------------------------------------------------------------------------
-%macro FILTER_VER_LUMA_4xN 3
-INIT_XMM sse4
-cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 6
- lea r5, [3 * r1]
- sub r0, r5
- shl r4d, 6
-%ifidn %3,ps
- add r3d, r3d
-%endif
-
-%ifdef PIC
- lea r5, [tab_LumaCoeffVer]
- lea r6, [r5 + r4]
-%else
- lea r6, [tab_LumaCoeffVer + r4]
-%endif
-
-%ifidn %3,pp
- mova m3, [pw_512]
-%else
- mova m3, [pw_2000]
-%endif
-
- mov r4d, %2/4
- lea r5, [4 * r1]
-
-.loopH:
- PROCESS_LUMA_W4_4R
-
-%ifidn %3,pp
- pmulhrsw m4, m3
- pmulhrsw m5, m3
-
- packuswb m4, m5
-
- movd [r2], m4
- pextrd [r2 + r3], m4, 1
- lea r2, [r2 + 2 * r3]
- pextrd [r2], m4, 2
- pextrd [r2 + r3], m4, 3
-%else
- psubw m4, m3
- psubw m5, m3
-
- movlps [r2], m4
- movhps [r2 + r3], m4
- lea r2, [r2 + 2 * r3]
- movlps [r2], m5
- movhps [r2 + r3], m5
-%endif
-
- sub r0, r5
- lea r2, [r2 + 2 * r3]
-
- dec r4d
- jnz .loopH
-
- RET
-%endmacro
-
-
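The PROCESS_LUMA_W*_4R macros above and the FILTER_VER_LUMA_* wrappers implement the 8-tap vertical luma filter. For reference (not part of this patch, illustrative names), the pp flavour computes roughly:

    #include <stdint.h>

    static uint8_t clip8(int v) { return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v; }

    /* vertical 8-tap, pixel in / pixel out (pp); c[] is the selected
       tab_LumaCoeff row */
    static void vert_pp_8tap_ref(const uint8_t *src, intptr_t srcStride,
                                 uint8_t *dst, intptr_t dstStride,
                                 int width, int height, const int8_t c[8])
    {
        src -= 3 * srcStride;                 /* lea r5, [3 * r1]; sub r0, r5 */
        for (int y = 0; y < height; y++, src += srcStride, dst += dstStride)
            for (int x = 0; x < width; x++)
            {
                int sum = 0;
                for (int i = 0; i < 8; i++)
                    sum += c[i] * src[x + i * srcStride];
                dst[x] = clip8((sum + 32) >> 6);   /* pw_512 rounding */
            }
    }

The ps flavour again replaces the last line with dst[x] = (int16_t)(sum - 8192).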
-INIT_YMM avx2
-cglobal interp_8tap_vert_pp_4x4, 4,6,8
- mov r4d, r4m
- lea r5, [r1 * 3]
- sub r0, r5
-
- ; TODO: VPGATHERDD
- movd xm1, [r0] ; m1 = row0
- movd xm2, [r0 + r1] ; m2 = row1
- punpcklbw xm1, xm2 ; m1 = [13 03 12 02 11 01 10 00]
-
- movd xm3, [r0 + r1 * 2] ; m3 = row2
- punpcklbw xm2, xm3 ; m2 = [23 13 22 12 21 11 20 10]
- movd xm4, [r0 + r5]
- punpcklbw xm3, xm4 ; m3 = [33 23 32 22 31 21 30 20]
- punpcklwd xm1, xm3 ; m1 = [33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00]
-
- lea r0, [r0 + r1 * 4]
- movd xm5, [r0] ; m5 = row4
- punpcklbw xm4, xm5 ; m4 = [43 33 42 32 41 31 40 30]
- punpcklwd xm2, xm4 ; m2 = [43 33 23 13 42 32 22 12 41 31 21 11 40 30 20 10]
- vinserti128 m1, m1, xm2, 1 ; m1 = [43 33 23 13 42 32 22 12 41 31 21 11 40 30 20 10] - [33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00]
- movd xm2, [r0 + r1] ; m2 = row5
- punpcklbw xm5, xm2 ; m5 = [53 43 52 42 51 41 50 40]
- punpcklwd xm3, xm5 ; m3 = [53 43 33 23 52 42 32 22 51 41 31 21 50 40 30 20]
- movd xm6, [r0 + r1 * 2] ; m6 = row6
- punpcklbw xm2, xm6 ; m2 = [63 53 62 52 61 51 60 50]
- punpcklwd xm4, xm2 ; m4 = [63 53 43 33 62 52 42 32 61 51 41 31 60 50 40 30]
- vinserti128 m3, m3, xm4, 1 ; m3 = [63 53 43 33 62 52 42 32 61 51 41 31 60 50 40 30] - [53 43 33 23 52 42 32 22 51 41 31 21 50 40 30 20]
- movd xm4, [r0 + r5] ; m4 = row7
- punpcklbw xm6, xm4 ; m6 = [73 63 72 62 71 61 70 60]
- punpcklwd xm5, xm6 ; m5 = [73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40]
-
- lea r0, [r0 + r1 * 4]
- movd xm7, [r0] ; m7 = row8
- punpcklbw xm4, xm7 ; m4 = [83 73 82 72 81 71 80 70]
- punpcklwd xm2, xm4 ; m2 = [83 73 63 53 82 72 62 52 81 71 61 51 80 70 60 50]
- vinserti128 m5, m5, xm2, 1 ; m5 = [83 73 63 53 82 72 62 52 81 71 61 51 80 70 60 50] - [73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40]
- movd xm2, [r0 + r1] ; m2 = row9
- punpcklbw xm7, xm2 ; m7 = [93 83 92 82 91 81 90 80]
- punpcklwd xm6, xm7 ; m6 = [93 83 73 63 92 82 72 62 91 81 71 61 90 80 70 60]
- movd xm7, [r0 + r1 * 2] ; m7 = rowA
- punpcklbw xm2, xm7 ; m2 = [A3 93 A2 92 A1 91 A0 90]
- punpcklwd xm4, xm2 ; m4 = [A3 93 83 73 A2 92 82 72 A1 91 81 71 A0 90 80 70]
- vinserti128 m6, m6, xm4, 1 ; m6 = [A3 93 83 73 A2 92 82 72 A1 91 81 71 A0 90 80 70] - [93 83 73 63 92 82 72 62 91 81 71 61 90 80 70 60]
-
- ; load filter coeff
-%ifdef PIC
- lea r5, [tab_LumaCoeff]
- vpbroadcastd m0, [r5 + r4 * 8 + 0]
- vpbroadcastd m2, [r5 + r4 * 8 + 4]
-%else
- vpbroadcastd m0, [tab_LumaCoeff + r4 * 8 + 0]
- vpbroadcastd m2, [tab_LumaCoeff + r4 * 8 + 4]
-%endif
-
- pmaddubsw m1, m0
- pmaddubsw m3, m0
- pmaddubsw m5, m2
- pmaddubsw m6, m2
- vbroadcasti128 m0, [pw_1]
- pmaddwd m1, m0
- pmaddwd m3, m0
- pmaddwd m5, m0
- pmaddwd m6, m0
- paddd m1, m5 ; m1 = DQWORD ROW[1 0]
- paddd m3, m6 ; m3 = DQWORD ROW[3 2]
- packssdw m1, m3 ; m1 = QWORD ROW[3 1 2 0]
-
- ; TODO: does it overflow?
- pmulhrsw m1, [pw_512]
- vextracti128 xm2, m1, 1
- packuswb xm1, xm2 ; m1 = DWORD ROW[3 1 2 0]
- movd [r2], xm1
- pextrd [r2 + r3], xm1, 2
- pextrd [r2 + r3 * 2], xm1, 1
- lea r4, [r3 * 3]
- pextrd [r2 + r4], xm1, 3
- RET
-
-
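Regarding the "does it overflow?" TODO above: after packssdw the lanes hold raw 8-tap sums, and with 8-bit pixels the magnitude is bounded (the largest positive-tap sum of the HEVC luma filters is 88, so at most 88 * 255 = 22440), so the int16 lanes do not saturate. The pmulhrsw against pw_512 then performs the usual pp rounding; a tiny self-check of that identity (not part of this patch):

    #include <assert.h>
    #include <stdint.h>

    /* pmulhrsw(a, 512) == ((a * 512 + (1 << 14)) >> 15) == (a + 32) >> 6 */
    static int16_t pmulhrsw_lane(int16_t a, int16_t b)
    {
        return (int16_t)(((int32_t)a * b + (1 << 14)) >> 15);
    }

    int main(void)
    {
        for (int a = -32768; a <= 32767; a++)
            assert(pmulhrsw_lane((int16_t)a, 512) == ((a + 32) >> 6));
        return 0;
    }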
-;-------------------------------------------------------------------------------------------------------------
-; void interp_8tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-------------------------------------------------------------------------------------------------------------
-FILTER_VER_LUMA_4xN 4, 4, pp
-
-;-------------------------------------------------------------------------------------------------------------
-; void interp_8tap_vert_pp_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-------------------------------------------------------------------------------------------------------------
-FILTER_VER_LUMA_4xN 4, 8, pp
-
-;-------------------------------------------------------------------------------------------------------------
-; void interp_8tap_vert_pp_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-------------------------------------------------------------------------------------------------------------
-FILTER_VER_LUMA_4xN 4, 16, pp
-
-;-------------------------------------------------------------------------------------------------------------
-; void interp_8tap_vert_ps_4x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
-;-------------------------------------------------------------------------------------------------------------
-FILTER_VER_LUMA_4xN 4, 4, ps
-
-;-------------------------------------------------------------------------------------------------------------
-; void interp_8tap_vert_ps_4x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
-;-------------------------------------------------------------------------------------------------------------
-FILTER_VER_LUMA_4xN 4, 8, ps
-
-;-------------------------------------------------------------------------------------------------------------
-; void interp_8tap_vert_ps_4x16(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
-;-------------------------------------------------------------------------------------------------------------
-FILTER_VER_LUMA_4xN 4, 16, ps
-
-;-------------------------------------------------------------------------------------------------------------
-; void interp_8tap_vert_%3_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-------------------------------------------------------------------------------------------------------------
-%macro FILTER_VER_LUMA_8xN 3
-INIT_XMM sse4
-cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8
- lea r5, [3 * r1]
- sub r0, r5
- shl r4d, 6
-
-%ifidn %3,ps
- add r3d, r3d
-%endif
-
-%ifdef PIC
- lea r5, [tab_LumaCoeffVer]
- lea r6, [r5 + r4]
-%else
- lea r6, [tab_LumaCoeffVer + r4]
-%endif
-
- %ifidn %3,pp
- mova m3, [pw_512]
-%else
- mova m3, [pw_2000]
-%endif
-
- mov r4d, %2/4
- lea r5, [4 * r1]
-
-.loopH:
- PROCESS_LUMA_W8_4R
-
-%ifidn %3,pp
- pmulhrsw m7, m3
- pmulhrsw m6, m3
- pmulhrsw m5, m3
- pmulhrsw m4, m3
-
- packuswb m7, m6
- packuswb m5, m4
-
- movlps [r2], m7
- movhps [r2 + r3], m7
- lea r2, [r2 + 2 * r3]
- movlps [r2], m5
- movhps [r2 + r3], m5
-%else
- psubw m7, m3
- psubw m6, m3
- psubw m5, m3
- psubw m4, m3
-
- movu [r2], m7
- movu [r2 + r3], m6
- lea r2, [r2 + 2 * r3]
- movu [r2], m5
- movu [r2 + r3], m4
-%endif
-
- sub r0, r5
- lea r2, [r2 + 2 * r3]
-
- dec r4d
- jnz .loopH
-
- RET
-%endmacro
-
-;-------------------------------------------------------------------------------------------------------------
-; void interp_8tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-------------------------------------------------------------------------------------------------------------
-FILTER_VER_LUMA_8xN 8, 4, pp
-
-;-------------------------------------------------------------------------------------------------------------
-; void interp_8tap_vert_pp_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-------------------------------------------------------------------------------------------------------------
-FILTER_VER_LUMA_8xN 8, 8, pp
-
-;-------------------------------------------------------------------------------------------------------------
-; void interp_8tap_vert_pp_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-------------------------------------------------------------------------------------------------------------
-FILTER_VER_LUMA_8xN 8, 16, pp
-
-;-------------------------------------------------------------------------------------------------------------
-; void interp_8tap_vert_pp_8x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-------------------------------------------------------------------------------------------------------------
-FILTER_VER_LUMA_8xN 8, 32, pp
-
-;-------------------------------------------------------------------------------------------------------------
-; void interp_8tap_vert_ps_8x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
-;-------------------------------------------------------------------------------------------------------------
-FILTER_VER_LUMA_8xN 8, 4, ps
-
-;-------------------------------------------------------------------------------------------------------------
-; void interp_8tap_vert_ps_8x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
-;-------------------------------------------------------------------------------------------------------------
-FILTER_VER_LUMA_8xN 8, 8, ps
-
-;-------------------------------------------------------------------------------------------------------------
-; void interp_8tap_vert_ps_8x16(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
-;-------------------------------------------------------------------------------------------------------------
-FILTER_VER_LUMA_8xN 8, 16, ps
-
-;-------------------------------------------------------------------------------------------------------------
-; void interp_8tap_vert_ps_8x32(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
-;-------------------------------------------------------------------------------------------------------------
-FILTER_VER_LUMA_8xN 8, 32, ps
-
-;-------------------------------------------------------------------------------------------------------------
-; void interp_8tap_vert_%3_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-------------------------------------------------------------------------------------------------------------
-%macro FILTER_VER_LUMA_12xN 3
-INIT_XMM sse4
-cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8
- lea r5, [3 * r1]
- sub r0, r5
- shl r4d, 6
-%ifidn %3,ps
- add r3d, r3d
-%endif
-
-%ifdef PIC
- lea r5, [tab_LumaCoeffVer]
- lea r6, [r5 + r4]
-%else
- lea r6, [tab_LumaCoeffVer + r4]
-%endif
-
- %ifidn %3,pp
- mova m3, [pw_512]
-%else
- mova m3, [pw_2000]
-%endif
-
- mov r4d, %2/4
-
-.loopH:
- PROCESS_LUMA_W8_4R
-
-%ifidn %3,pp
- pmulhrsw m7, m3
- pmulhrsw m6, m3
- pmulhrsw m5, m3
- pmulhrsw m4, m3
-
- packuswb m7, m6
- packuswb m5, m4
-
- movlps [r2], m7
- movhps [r2 + r3], m7
- lea r5, [r2 + 2 * r3]
- movlps [r5], m5
- movhps [r5 + r3], m5
-%else
- psubw m7, m3
- psubw m6, m3
- psubw m5, m3
- psubw m4, m3
-
- movu [r2], m7
- movu [r2 + r3], m6
- lea r5, [r2 + 2 * r3]
- movu [r5], m5
- movu [r5 + r3], m4
-%endif
-
- lea r5, [8 * r1 - 8]
- sub r0, r5
-%ifidn %3,pp
- add r2, 8
-%else
- add r2, 16
-%endif
-
- PROCESS_LUMA_W4_4R
-
-%ifidn %3,pp
- pmulhrsw m4, m3
- pmulhrsw m5, m3
-
- packuswb m4, m5
-
- movd [r2], m4
- pextrd [r2 + r3], m4, 1
- lea r5, [r2 + 2 * r3]
- pextrd [r5], m4, 2
- pextrd [r5 + r3], m4, 3
-%else
- psubw m4, m3
- psubw m5, m3
-
- movlps [r2], m4
- movhps [r2 + r3], m4
- lea r5, [r2 + 2 * r3]
- movlps [r5], m5
- movhps [r5 + r3], m5
-%endif
-
- lea r5, [4 * r1 + 8]
- sub r0, r5
-%ifidn %3,pp
- lea r2, [r2 + 4 * r3 - 8]
-%else
- lea r2, [r2 + 4 * r3 - 16]
-%endif
-
- dec r4d
- jnz .loopH
-
- RET
-%endmacro
-
-;-------------------------------------------------------------------------------------------------------------
-; void interp_8tap_vert_pp_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-------------------------------------------------------------------------------------------------------------
-FILTER_VER_LUMA_12xN 12, 16, pp
-
-;-------------------------------------------------------------------------------------------------------------
-; void interp_8tap_vert_ps_12x16(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
-;-------------------------------------------------------------------------------------------------------------
-FILTER_VER_LUMA_12xN 12, 16, ps
-
-;-------------------------------------------------------------------------------------------------------------
-; void interp_8tap_vert_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-------------------------------------------------------------------------------------------------------------
-%macro FILTER_VER_LUMA 3
-INIT_XMM sse4
-cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8 ,0-gprsize
- lea r5, [3 * r1]
- sub r0, r5
- shl r4d, 6
-%ifidn %3,ps
- add r3d, r3d
-%endif
-
-%ifdef PIC
- lea r5, [tab_LumaCoeffVer]
- lea r6, [r5 + r4]
-%else
- lea r6, [tab_LumaCoeffVer + r4]
-%endif
-
-%ifidn %3,pp
- mova m3, [pw_512]
-%else
- mova m3, [pw_2000]
-%endif
- mov dword [rsp], %2/4
-
-.loopH:
- mov r4d, (%1/8)
-.loopW:
- PROCESS_LUMA_W8_4R
-%ifidn %3,pp
- pmulhrsw m7, m3
- pmulhrsw m6, m3
- pmulhrsw m5, m3
- pmulhrsw m4, m3
-
- packuswb m7, m6
- packuswb m5, m4
-
- movlps [r2], m7
- movhps [r2 + r3], m7
- lea r5, [r2 + 2 * r3]
- movlps [r5], m5
- movhps [r5 + r3], m5
-%else
- psubw m7, m3
- psubw m6, m3
- psubw m5, m3
- psubw m4, m3
-
- movu [r2], m7
- movu [r2 + r3], m6
- lea r5, [r2 + 2 * r3]
- movu [r5], m5
- movu [r5 + r3], m4
-%endif
-
- lea r5, [8 * r1 - 8]
- sub r0, r5
-%ifidn %3,pp
- add r2, 8
-%else
- add r2, 16
-%endif
- dec r4d
- jnz .loopW
-
- lea r0, [r0 + 4 * r1 - %1]
-%ifidn %3,pp
- lea r2, [r2 + 4 * r3 - %1]
-%else
- lea r2, [r2 + 4 * r3 - 2 * %1]
-%endif
-
- dec dword [rsp]
- jnz .loopH
-
- RET
-%endmacro
-
-FILTER_VER_LUMA 16, 4, pp
-FILTER_VER_LUMA 16, 8, pp
-FILTER_VER_LUMA 16, 12, pp
-FILTER_VER_LUMA 16, 16, pp
-FILTER_VER_LUMA 16, 32, pp
-FILTER_VER_LUMA 16, 64, pp
-FILTER_VER_LUMA 24, 32, pp
-FILTER_VER_LUMA 32, 8, pp
-FILTER_VER_LUMA 32, 16, pp
-FILTER_VER_LUMA 32, 24, pp
-FILTER_VER_LUMA 32, 32, pp
-FILTER_VER_LUMA 32, 64, pp
-FILTER_VER_LUMA 48, 64, pp
-FILTER_VER_LUMA 64, 16, pp
-FILTER_VER_LUMA 64, 32, pp
-FILTER_VER_LUMA 64, 48, pp
-FILTER_VER_LUMA 64, 64, pp
-
-FILTER_VER_LUMA 16, 4, ps
-FILTER_VER_LUMA 16, 8, ps
-FILTER_VER_LUMA 16, 12, ps
-FILTER_VER_LUMA 16, 16, ps
-FILTER_VER_LUMA 16, 32, ps
-FILTER_VER_LUMA 16, 64, ps
-FILTER_VER_LUMA 24, 32, ps
-FILTER_VER_LUMA 32, 8, ps
-FILTER_VER_LUMA 32, 16, ps
-FILTER_VER_LUMA 32, 24, ps
-FILTER_VER_LUMA 32, 32, ps
-FILTER_VER_LUMA 32, 64, ps
-FILTER_VER_LUMA 48, 64, ps
-FILTER_VER_LUMA 64, 16, ps
-FILTER_VER_LUMA 64, 32, ps
-FILTER_VER_LUMA 64, 48, ps
-FILTER_VER_LUMA 64, 64, ps
-
-%macro PROCESS_LUMA_SP_W4_4R 0
- movq m0, [r0]
- movq m1, [r0 + r1]
- punpcklwd m0, m1 ;m0=[0 1]
- pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1
-
- lea r0, [r0 + 2 * r1]
- movq m4, [r0]
- punpcklwd m1, m4 ;m1=[1 2]
- pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2
-
- movq m5, [r0 + r1]
- punpcklwd m4, m5 ;m4=[2 3]
- pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3
- pmaddwd m4, [r6 + 1 * 16]
- paddd m0, m4 ;m0=[0+1+2+3] Row1
-
- lea r0, [r0 + 2 * r1]
- movq m4, [r0]
- punpcklwd m5, m4 ;m5=[3 4]
- pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4
- pmaddwd m5, [r6 + 1 * 16]
- paddd m1, m5 ;m1 = [1+2+3+4] Row2
-
- movq m5, [r0 + r1]
- punpcklwd m4, m5 ;m4=[4 5]
- pmaddwd m6, m4, [r6 + 1 * 16]
- paddd m2, m6 ;m2=[2+3+4+5] Row3
- pmaddwd m4, [r6 + 2 * 16]
- paddd m0, m4 ;m0=[0+1+2+3+4+5] Row1
-
- lea r0, [r0 + 2 * r1]
- movq m4, [r0]
- punpcklwd m5, m4 ;m5=[5 6]
- pmaddwd m6, m5, [r6 + 1 * 16]
- paddd m3, m6 ;m3=[3+4+5+6] Row4
- pmaddwd m5, [r6 + 2 * 16]
- paddd m1, m5 ;m1=[1+2+3+4+5+6] Row2
-
- movq m5, [r0 + r1]
- punpcklwd m4, m5 ;m4=[6 7]
- pmaddwd m6, m4, [r6 + 2 * 16]
- paddd m2, m6 ;m2=[2+3+4+5+6+7] Row3
- pmaddwd m4, [r6 + 3 * 16]
- paddd m0, m4 ;m0=[0+1+2+3+4+5+6+7] Row1 end
-
- lea r0, [r0 + 2 * r1]
- movq m4, [r0]
- punpcklwd m5, m4 ;m5=[7 8]
- pmaddwd m6, m5, [r6 + 2 * 16]
- paddd m3, m6 ;m3=[3+4+5+6+7+8] Row4
- pmaddwd m5, [r6 + 3 * 16]
- paddd m1, m5 ;m1=[1+2+3+4+5+6+7+8] Row2 end
-
- movq m5, [r0 + r1]
- punpcklwd m4, m5 ;m4=[8 9]
- pmaddwd m4, [r6 + 3 * 16]
- paddd m2, m4 ;m2=[2+3+4+5+6+7+8+9] Row3 end
-
- movq m4, [r0 + 2 * r1]
- punpcklwd m5, m4 ;m5=[9 10]
- pmaddwd m5, [r6 + 3 * 16]
- paddd m3, m5 ;m3=[3+4+5+6+7+8+9+10] Row4 end
-%endmacro
-
-;--------------------------------------------------------------------------------------------------------------
-; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;--------------------------------------------------------------------------------------------------------------
-%macro FILTER_VER_LUMA_SP 2
-INIT_XMM sse4
-cglobal interp_8tap_vert_sp_%1x%2, 5, 7, 8 ,0-gprsize
-
- add r1d, r1d
- lea r5, [r1 + 2 * r1]
- sub r0, r5
- shl r4d, 6
-
-%ifdef PIC
- lea r5, [tab_LumaCoeffV]
- lea r6, [r5 + r4]
-%else
- lea r6, [tab_LumaCoeffV + r4]
-%endif
-
- mova m7, [tab_c_526336]
-
- mov dword [rsp], %2/4
-.loopH:
- mov r4d, (%1/4)
-.loopW:
- PROCESS_LUMA_SP_W4_4R
-
- paddd m0, m7
- paddd m1, m7
- paddd m2, m7
- paddd m3, m7
-
- psrad m0, 12
- psrad m1, 12
- psrad m2, 12
- psrad m3, 12
-
- packssdw m0, m1
- packssdw m2, m3
-
- packuswb m0, m2
-
- movd [r2], m0
- pextrd [r2 + r3], m0, 1
- lea r5, [r2 + 2 * r3]
- pextrd [r5], m0, 2
- pextrd [r5 + r3], m0, 3
-
- lea r5, [8 * r1 - 2 * 4]
- sub r0, r5
- add r2, 4
-
- dec r4d
- jnz .loopW
-
- lea r0, [r0 + 4 * r1 - 2 * %1]
- lea r2, [r2 + 4 * r3 - %1]
-
- dec dword [rsp]
- jnz .loopH
-
- RET
-%endmacro
-
-;--------------------------------------------------------------------------------------------------------------
-; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;--------------------------------------------------------------------------------------------------------------
- FILTER_VER_LUMA_SP 4, 4
- FILTER_VER_LUMA_SP 8, 8
- FILTER_VER_LUMA_SP 8, 4
- FILTER_VER_LUMA_SP 4, 8
- FILTER_VER_LUMA_SP 16, 16
- FILTER_VER_LUMA_SP 16, 8
- FILTER_VER_LUMA_SP 8, 16
- FILTER_VER_LUMA_SP 16, 12
- FILTER_VER_LUMA_SP 12, 16
- FILTER_VER_LUMA_SP 16, 4
- FILTER_VER_LUMA_SP 4, 16
- FILTER_VER_LUMA_SP 32, 32
- FILTER_VER_LUMA_SP 32, 16
- FILTER_VER_LUMA_SP 16, 32
- FILTER_VER_LUMA_SP 32, 24
- FILTER_VER_LUMA_SP 24, 32
- FILTER_VER_LUMA_SP 32, 8
- FILTER_VER_LUMA_SP 8, 32
- FILTER_VER_LUMA_SP 64, 64
- FILTER_VER_LUMA_SP 64, 32
- FILTER_VER_LUMA_SP 32, 64
- FILTER_VER_LUMA_SP 64, 48
- FILTER_VER_LUMA_SP 48, 64
- FILTER_VER_LUMA_SP 64, 16
- FILTER_VER_LUMA_SP 16, 64
-
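For the sp kernels above and below, the int16_t inputs already carry the +8192 internal offset; after an N-tap sum (coefficients add to 64) that bias becomes 8192 * 64, which with the 2048 rounding term explains tab_c_526336 = 8192*64 + 2048 followed by psrad 12. A rough C sketch (names illustrative, not x265 API):

    #include <stdint.h>

    static uint8_t clip8(int v) { return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v; }

    /* vertical N-tap, int16_t in / pixel out (sp) */
    static void vert_sp_ref(const int16_t *src, intptr_t srcStride,
                            uint8_t *dst, intptr_t dstStride,
                            int width, int height, const int16_t *coeff, int nTaps)
    {
        src -= (nTaps / 2 - 1) * srcStride;
        for (int y = 0; y < height; y++, src += srcStride, dst += dstStride)
            for (int x = 0; x < width; x++)
            {
                int sum = 0;
                for (int i = 0; i < nTaps; i++)
                    sum += coeff[i] * src[x + i * srcStride];
                dst[x] = clip8((sum + 526336) >> 12);  /* tab_c_526336; psrad 12 */
            }
    }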
-; TODO: combining U and V would give better performance, but needs more registers
-; TODO: using two paths (height aligned to 4 vs. otherwise) may improve performance by ~10%, but the code is more complex, so it is disabled
-INIT_XMM ssse3
-cglobal chroma_p2s, 3, 7, 4
-
- ; load width and height
- mov r3d, r3m
- mov r4d, r4m
-
- ; load constant
- mova m2, [pb_128]
- mova m3, [tab_c_64_n64]
-
-.loopH:
-
- xor r5d, r5d
-.loopW:
- lea r6, [r0 + r5]
-
- movh m0, [r6]
- punpcklbw m0, m2
- pmaddubsw m0, m3
-
- movh m1, [r6 + r1]
- punpcklbw m1, m2
- pmaddubsw m1, m3
-
- add r5d, 8
- cmp r5d, r3d
- lea r6, [r2 + r5 * 2]
- jg .width4
- movu [r6 + FENC_STRIDE / 2 * 0 - 16], m0
- movu [r6 + FENC_STRIDE / 2 * 2 - 16], m1
- je .nextH
- jmp .loopW
-
-.width4:
- test r3d, 4
- jz .width2
- test r3d, 2
- movh [r6 + FENC_STRIDE / 2 * 0 - 16], m0
- movh [r6 + FENC_STRIDE / 2 * 2 - 16], m1
- lea r6, [r6 + 8]
- pshufd m0, m0, 2
- pshufd m1, m1, 2
- jz .nextH
-
-.width2:
- movd [r6 + FENC_STRIDE / 2 * 0 - 16], m0
- movd [r6 + FENC_STRIDE / 2 * 2 - 16], m1
-
-.nextH:
- lea r0, [r0 + r1 * 2]
- add r2, FENC_STRIDE / 2 * 4
-
- sub r4d, 2
- jnz .loopH
-
- RET
-
-%macro PROCESS_CHROMA_SP_W4_4R 0
- movq m0, [r0]
- movq m1, [r0 + r1]
- punpcklwd m0, m1 ;m0=[0 1]
- pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1
-
- lea r0, [r0 + 2 * r1]
- movq m4, [r0]
- punpcklwd m1, m4 ;m1=[1 2]
- pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2
-
- movq m5, [r0 + r1]
- punpcklwd m4, m5 ;m4=[2 3]
- pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3
- pmaddwd m4, [r6 + 1 * 16]
- paddd m0, m4 ;m0=[0+1+2+3] Row1 done
-
- lea r0, [r0 + 2 * r1]
- movq m4, [r0]
- punpcklwd m5, m4 ;m5=[3 4]
- pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4
- pmaddwd m5, [r6 + 1 * 16]
- paddd m1, m5 ;m1 = [1+2+3+4] Row2
-
- movq m5, [r0 + r1]
- punpcklwd m4, m5 ;m4=[4 5]
- pmaddwd m4, [r6 + 1 * 16]
- paddd m2, m4 ;m2=[2+3+4+5] Row3
-
- movq m4, [r0 + 2 * r1]
- punpcklwd m5, m4 ;m5=[5 6]
- pmaddwd m5, [r6 + 1 * 16]
- paddd m3, m5 ;m3=[3+4+5+6] Row4
-%endmacro
-
-;--------------------------------------------------------------------------------------------------------------
-; void interp_4tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;--------------------------------------------------------------------------------------------------------------
-%macro FILTER_VER_CHROMA_SP 2
-INIT_XMM sse4
-cglobal interp_4tap_vert_sp_%1x%2, 5, 7, 7 ,0-gprsize
-
- add r1d, r1d
- sub r0, r1
- shl r4d, 5
-
-%ifdef PIC
- lea r5, [tab_ChromaCoeffV]
- lea r6, [r5 + r4]
-%else
- lea r6, [tab_ChromaCoeffV + r4]
-%endif
-
- mova m6, [tab_c_526336]
-
- mov dword [rsp], %2/4
-
-.loopH:
- mov r4d, (%1/4)
-.loopW:
- PROCESS_CHROMA_SP_W4_4R
-
- paddd m0, m6
- paddd m1, m6
- paddd m2, m6
- paddd m3, m6
-
- psrad m0, 12
- psrad m1, 12
- psrad m2, 12
- psrad m3, 12
-
- packssdw m0, m1
- packssdw m2, m3
-
- packuswb m0, m2
-
- movd [r2], m0
- pextrd [r2 + r3], m0, 1
- lea r5, [r2 + 2 * r3]
- pextrd [r5], m0, 2
- pextrd [r5 + r3], m0, 3
-
- lea r5, [4 * r1 - 2 * 4]
- sub r0, r5
- add r2, 4
-
- dec r4d
- jnz .loopW
-
- lea r0, [r0 + 4 * r1 - 2 * %1]
- lea r2, [r2 + 4 * r3 - %1]
-
- dec dword [rsp]
- jnz .loopH
-
- RET
-%endmacro
-
- FILTER_VER_CHROMA_SP 4, 4
- FILTER_VER_CHROMA_SP 4, 8
- FILTER_VER_CHROMA_SP 16, 16
- FILTER_VER_CHROMA_SP 16, 8
- FILTER_VER_CHROMA_SP 16, 12
- FILTER_VER_CHROMA_SP 12, 16
- FILTER_VER_CHROMA_SP 16, 4
- FILTER_VER_CHROMA_SP 4, 16
- FILTER_VER_CHROMA_SP 32, 32
- FILTER_VER_CHROMA_SP 32, 16
- FILTER_VER_CHROMA_SP 16, 32
- FILTER_VER_CHROMA_SP 32, 24
- FILTER_VER_CHROMA_SP 24, 32
- FILTER_VER_CHROMA_SP 32, 8
-
- FILTER_VER_CHROMA_SP 16, 24
- FILTER_VER_CHROMA_SP 16, 64
- FILTER_VER_CHROMA_SP 12, 32
- FILTER_VER_CHROMA_SP 4, 32
- FILTER_VER_CHROMA_SP 32, 64
- FILTER_VER_CHROMA_SP 32, 48
- FILTER_VER_CHROMA_SP 24, 64
-
- FILTER_VER_CHROMA_SP 64, 64
- FILTER_VER_CHROMA_SP 64, 32
- FILTER_VER_CHROMA_SP 64, 48
- FILTER_VER_CHROMA_SP 48, 64
- FILTER_VER_CHROMA_SP 64, 16
-
-
-%macro PROCESS_CHROMA_SP_W2_4R 1
- movd m0, [r0]
- movd m1, [r0 + r1]
- punpcklwd m0, m1 ;m0=[0 1]
-
- lea r0, [r0 + 2 * r1]
- movd m2, [r0]
- punpcklwd m1, m2 ;m1=[1 2]
- punpcklqdq m0, m1 ;m0=[0 1 1 2]
- pmaddwd m0, [%1 + 0 *16] ;m0=[0+1 1+2] Row 1-2
-
- movd m1, [r0 + r1]
- punpcklwd m2, m1 ;m2=[2 3]
-
- lea r0, [r0 + 2 * r1]
- movd m3, [r0]
- punpcklwd m1, m3 ;m1=[3 4]
- punpcklqdq m2, m1 ;m2=[2 3 3 4]
-
- pmaddwd m4, m2, [%1 + 1 * 16] ;m4=[2+3 3+4] Row 1-2
- pmaddwd m2, [%1 + 0 * 16] ;m2=[2+3 3+4] Row 3-4
- paddd m0, m4 ;m0=[0+1+2+3 1+2+3+4] Row 1-2
-
- movd m1, [r0 + r1]
- punpcklwd m3, m1 ;m3=[4 5]
-
- movd m4, [r0 + 2 * r1]
- punpcklwd m1, m4 ;m1=[5 6]
- punpcklqdq m3, m1 ;m3=[4 5 5 6]
- pmaddwd m3, [%1 + 1 * 16] ;m3=[4+5 5+6] Row 3-4
- paddd m2, m3 ;m2=[2+3+4+5 3+4+5+6] Row 3-4
-%endmacro
-
-;-------------------------------------------------------------------------------------------------------------------
-; void interp_4tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-------------------------------------------------------------------------------------------------------------------
-%macro FILTER_VER_CHROMA_SP_W2_4R 2
-INIT_XMM sse4
-cglobal interp_4tap_vert_sp_%1x%2, 5, 6, 6
-
- add r1d, r1d
- sub r0, r1
- shl r4d, 5
-
-%ifdef PIC
- lea r5, [tab_ChromaCoeffV]
- lea r5, [r5 + r4]
-%else
- lea r5, [tab_ChromaCoeffV + r4]
-%endif
-
- mova m5, [tab_c_526336]
-
- mov r4d, (%2/4)
-
-.loopH:
- PROCESS_CHROMA_SP_W2_4R r5
-
- paddd m0, m5
- paddd m2, m5
-
- psrad m0, 12
- psrad m2, 12
-
- packssdw m0, m2
- packuswb m0, m0
-
- pextrw [r2], m0, 0
- pextrw [r2 + r3], m0, 1
- lea r2, [r2 + 2 * r3]
- pextrw [r2], m0, 2
- pextrw [r2 + r3], m0, 3
-
- lea r2, [r2 + 2 * r3]
-
- dec r4d
- jnz .loopH
-
- RET
-%endmacro
-
-FILTER_VER_CHROMA_SP_W2_4R 2, 4
-FILTER_VER_CHROMA_SP_W2_4R 2, 8
-
-FILTER_VER_CHROMA_SP_W2_4R 2, 16
-
-;--------------------------------------------------------------------------------------------------------------
-; void interp_4tap_vert_sp_4x2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;--------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal interp_4tap_vert_sp_4x2, 5, 6, 5
-
- add r1d, r1d
- sub r0, r1
- shl r4d, 5
-
-%ifdef PIC
- lea r5, [tab_ChromaCoeffV]
- lea r5, [r5 + r4]
-%else
- lea r5, [tab_ChromaCoeffV + r4]
-%endif
-
- mova m4, [tab_c_526336]
-
- movq m0, [r0]
- movq m1, [r0 + r1]
- punpcklwd m0, m1 ;m0=[0 1]
- pmaddwd m0, [r5 + 0 *16] ;m0=[0+1] Row1
-
- lea r0, [r0 + 2 * r1]
- movq m2, [r0]
- punpcklwd m1, m2 ;m1=[1 2]
- pmaddwd m1, [r5 + 0 *16] ;m1=[1+2] Row2
-
- movq m3, [r0 + r1]
- punpcklwd m2, m3 ;m2=[2 3]
- pmaddwd m2, [r5 + 1 * 16]
- paddd m0, m2 ;m0=[0+1+2+3] Row1 done
- paddd m0, m4
- psrad m0, 12
-
- movq m2, [r0 + 2 * r1]
- punpcklwd m3, m2 ;m3=[3 4]
- pmaddwd m3, [r5 + 1 * 16]
- paddd m1, m3 ;m1 = [1+2+3+4] Row2 done
- paddd m1, m4
- psrad m1, 12
-
- packssdw m0, m1
- packuswb m0, m0
-
- movd [r2], m0
- pextrd [r2 + r3], m0, 1
-
- RET
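All of the FILTER_VER_CHROMA_SP variants above share the same per-pixel arithmetic; a minimal C sketch of it (illustrative names, not x265's reference code) looks like this:

    #include <stdint.h>

    /* 4-tap vertical "sp" filter: 16-bit intermediates in, 8-bit pixels out.
     * tab_c_526336 = 8192*64 + 2048: the four coefficients sum to 64, so
     * 8192*64 cancels the -8192 bias carried by the ps intermediates and
     * 2048 rounds the final >> 12. */
    static uint8_t vert_sp_4tap_sample(const int16_t *src, intptr_t stride,
                                       const int16_t coeff[4])
    {
        int sum = coeff[0] * src[-stride] + coeff[1] * src[0] +
                  coeff[2] * src[stride]  + coeff[3] * src[2 * stride];
        sum = (sum + 526336) >> 12;                 /* paddd tab_c_526336; psrad 12 */
        return (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));   /* packssdw + packuswb */
    }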
-
-;-------------------------------------------------------------------------------------------------------------------
-; void interp_4tap_vertical_sp_6x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-------------------------------------------------------------------------------------------------------------------
-%macro FILTER_VER_CHROMA_SP_W6_H4 2
-INIT_XMM sse4
-cglobal interp_4tap_vert_sp_6x%2, 5, 7, 7
-
- add r1d, r1d
- sub r0, r1
- shl r4d, 5
-
-%ifdef PIC
- lea r5, [tab_ChromaCoeffV]
- lea r6, [r5 + r4]
-%else
- lea r6, [tab_ChromaCoeffV + r4]
-%endif
-
- mova m6, [tab_c_526336]
-
- mov r4d, %2/4
-
-.loopH:
- PROCESS_CHROMA_SP_W4_4R
-
- paddd m0, m6
- paddd m1, m6
- paddd m2, m6
- paddd m3, m6
-
- psrad m0, 12
- psrad m1, 12
- psrad m2, 12
- psrad m3, 12
-
- packssdw m0, m1
- packssdw m2, m3
-
- packuswb m0, m2
-
- movd [r2], m0
- pextrd [r2 + r3], m0, 1
- lea r5, [r2 + 2 * r3]
- pextrd [r5], m0, 2
- pextrd [r5 + r3], m0, 3
-
- lea r5, [4 * r1 - 2 * 4]
- sub r0, r5
- add r2, 4
-
- PROCESS_CHROMA_SP_W2_4R r6
-
- paddd m0, m6
- paddd m2, m6
-
- psrad m0, 12
- psrad m2, 12
-
- packssdw m0, m2
- packuswb m0, m0
-
- pextrw [r2], m0, 0
- pextrw [r2 + r3], m0, 1
- lea r2, [r2 + 2 * r3]
- pextrw [r2], m0, 2
- pextrw [r2 + r3], m0, 3
-
- sub r0, 2 * 4
- lea r2, [r2 + 2 * r3 - 4]
-
- dec r4d
- jnz .loopH
-
- RET
-%endmacro
-
-FILTER_VER_CHROMA_SP_W6_H4 6, 8
-
-FILTER_VER_CHROMA_SP_W6_H4 6, 16
-
-%macro PROCESS_CHROMA_SP_W8_2R 0
- movu m1, [r0]
- movu m3, [r0 + r1]
- punpcklwd m0, m1, m3
- pmaddwd m0, [r5 + 0 * 16] ;m0 = [0l+1l] Row1l
- punpckhwd m1, m3
- pmaddwd m1, [r5 + 0 * 16] ;m1 = [0h+1h] Row1h
-
- movu m4, [r0 + 2 * r1]
- punpcklwd m2, m3, m4
- pmaddwd m2, [r5 + 0 * 16] ;m2 = [1l+2l] Row2l
- punpckhwd m3, m4
- pmaddwd m3, [r5 + 0 * 16] ;m3 = [1h+2h] Row2h
-
- lea r0, [r0 + 2 * r1]
- movu m5, [r0 + r1]
- punpcklwd m6, m4, m5
- pmaddwd m6, [r5 + 1 * 16] ;m6 = [2l+3l] Row1l
- paddd m0, m6 ;m0 = [0l+1l+2l+3l] Row1l sum
- punpckhwd m4, m5
- pmaddwd m4, [r5 + 1 * 16] ;m6 = [2h+3h] Row1h
- paddd m1, m4 ;m1 = [0h+1h+2h+3h] Row1h sum
-
- movu m4, [r0 + 2 * r1]
- punpcklwd m6, m5, m4
- pmaddwd m6, [r5 + 1 * 16] ;m6 = [3l+4l] Row2l
- paddd m2, m6 ;m2 = [1l+2l+3l+4l] Row2l sum
- punpckhwd m5, m4
- pmaddwd m5, [r5 + 1 * 16] ;m1 = [3h+4h] Row2h
- paddd m3, m5 ;m3 = [1h+2h+3h+4h] Row2h sum
-%endmacro
-
-;--------------------------------------------------------------------------------------------------------------
-; void interp_4tap_vert_sp_8x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;--------------------------------------------------------------------------------------------------------------
-%macro FILTER_VER_CHROMA_SP_W8_H2 2
-INIT_XMM sse2
-cglobal interp_4tap_vert_sp_%1x%2, 5, 6, 8
-
- add r1d, r1d
- sub r0, r1
- shl r4d, 5
-
-%ifdef PIC
- lea r5, [tab_ChromaCoeffV]
- lea r5, [r5 + r4]
-%else
- lea r5, [tab_ChromaCoeffV + r4]
-%endif
-
- mova m7, [tab_c_526336]
-
- mov r4d, %2/2
-.loopH:
- PROCESS_CHROMA_SP_W8_2R
-
- paddd m0, m7
- paddd m1, m7
- paddd m2, m7
- paddd m3, m7
-
- psrad m0, 12
- psrad m1, 12
- psrad m2, 12
- psrad m3, 12
-
- packssdw m0, m1
- packssdw m2, m3
-
- packuswb m0, m2
-
- movlps [r2], m0
- movhps [r2 + r3], m0
-
- lea r2, [r2 + 2 * r3]
-
- dec r4d
- jnz .loopH
-
- RET
-%endmacro
-
-FILTER_VER_CHROMA_SP_W8_H2 8, 2
-FILTER_VER_CHROMA_SP_W8_H2 8, 4
-FILTER_VER_CHROMA_SP_W8_H2 8, 6
-FILTER_VER_CHROMA_SP_W8_H2 8, 8
-FILTER_VER_CHROMA_SP_W8_H2 8, 16
-FILTER_VER_CHROMA_SP_W8_H2 8, 32
-
-FILTER_VER_CHROMA_SP_W8_H2 8, 12
-FILTER_VER_CHROMA_SP_W8_H2 8, 64
-
-
-;-----------------------------------------------------------------------------------------------------------------------------
-; void interp_4tap_horiz_ps_2x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
-;-----------------------------------------------------------------------------------------------------------------------------
-%macro FILTER_HORIZ_CHROMA_2xN 2
-INIT_XMM sse4
-cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 4, src, srcstride, dst, dststride
-%define coef2 m3
-%define Tm0 m2
-%define t1 m1
-%define t0 m0
-
- dec srcq
- mov r4d, r4m
- add dststrided, dststrided
-
-%ifdef PIC
- lea r6, [tab_ChromaCoeff]
- movd coef2, [r6 + r4 * 4]
-%else
- movd coef2, [tab_ChromaCoeff + r4 * 4]
-%endif
-
- pshufd coef2, coef2, 0
- mova t1, [pw_2000]
- mova Tm0, [tab_Tm]
-
- mov r4d, %2
- cmp r5m, byte 0
- je .loopH
- sub srcq, srcstrideq
- add r4d, 3
-
-.loopH:
- movh t0, [srcq]
- pshufb t0, t0, Tm0
- pmaddubsw t0, coef2
- phaddw t0, t0
- psubw t0, t1
- movd [dstq], t0
-
- lea srcq, [srcq + srcstrideq]
- lea dstq, [dstq + dststrideq]
-
- dec r4d
- jnz .loopH
-
- RET
-%endmacro
-
-FILTER_HORIZ_CHROMA_2xN 2, 4
-FILTER_HORIZ_CHROMA_2xN 2, 8
-
-FILTER_HORIZ_CHROMA_2xN 2, 16
-
-;-----------------------------------------------------------------------------------------------------------------------------
-; void interp_4tap_horiz_ps_4x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
-;-----------------------------------------------------------------------------------------------------------------------------
-%macro FILTER_HORIZ_CHROMA_4xN 2
-INIT_XMM sse4
-cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 4, src, srcstride, dst, dststride
-%define coef2 m3
-%define Tm0 m2
-%define t1 m1
-%define t0 m0
-
- dec srcq
- mov r4d, r4m
- add dststrided, dststrided
-
-%ifdef PIC
- lea r6, [tab_ChromaCoeff]
- movd coef2, [r6 + r4 * 4]
-%else
- movd coef2, [tab_ChromaCoeff + r4 * 4]
-%endif
-
- pshufd coef2, coef2, 0
- mova t1, [pw_2000]
- mova Tm0, [tab_Tm]
-
- mov r4d, %2
- cmp r5m, byte 0
- je .loopH
- sub srcq, srcstrideq
- add r4d, 3
-
-.loopH:
- movh t0, [srcq]
- pshufb t0, t0, Tm0
- pmaddubsw t0, coef2
- phaddw t0, t0
- psubw t0, t1
- movlps [dstq], t0
-
- lea srcq, [srcq + srcstrideq]
- lea dstq, [dstq + dststrideq]
-
- dec r4d
- jnz .loopH
- RET
-%endmacro
-
-FILTER_HORIZ_CHROMA_4xN 4, 2
-FILTER_HORIZ_CHROMA_4xN 4, 4
-FILTER_HORIZ_CHROMA_4xN 4, 8
-FILTER_HORIZ_CHROMA_4xN 4, 16
-
-FILTER_HORIZ_CHROMA_4xN 4, 32
-
-%macro PROCESS_CHROMA_W6 3
- movu %1, [srcq]
- pshufb %2, %1, Tm0
- pmaddubsw %2, coef2
- pshufb %1, %1, Tm1
- pmaddubsw %1, coef2
- phaddw %2, %1
- psubw %2, %3
- movh [dstq], %2
- pshufd %2, %2, 2
- movd [dstq + 8], %2
-%endmacro
-
-%macro PROCESS_CHROMA_W12 3
- movu %1, [srcq]
- pshufb %2, %1, Tm0
- pmaddubsw %2, coef2
- pshufb %1, %1, Tm1
- pmaddubsw %1, coef2
- phaddw %2, %1
- psubw %2, %3
- movu [dstq], %2
- movu %1, [srcq + 8]
- pshufb %1, %1, Tm0
- pmaddubsw %1, coef2
- phaddw %1, %1
- psubw %1, %3
- movh [dstq + 16], %1
-%endmacro
-
-;-----------------------------------------------------------------------------------------------------------------------------
-; void interp_4tap_horiz_ps_6x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
-;-----------------------------------------------------------------------------------------------------------------------------
-%macro FILTER_HORIZ_CHROMA 2
-INIT_XMM sse4
-cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 6, src, srcstride, dst, dststride
-%define coef2 m5
-%define Tm0 m4
-%define Tm1 m3
-%define t2 m2
-%define t1 m1
-%define t0 m0
-
- dec srcq
- mov r4d, r4m
- add dststrided, dststrided
-
-%ifdef PIC
- lea r6, [tab_ChromaCoeff]
- movd coef2, [r6 + r4 * 4]
-%else
- movd coef2, [tab_ChromaCoeff + r4 * 4]
-%endif
-
- pshufd coef2, coef2, 0
- mova t2, [pw_2000]
- mova Tm0, [tab_Tm]
- mova Tm1, [tab_Tm + 16]
-
- mov r4d, %2
- cmp r5m, byte 0
- je .loopH
- sub srcq, srcstrideq
- add r4d, 3
-
-.loopH:
- PROCESS_CHROMA_W%1 t0, t1, t2
- add srcq, srcstrideq
- add dstq, dststrideq
-
- dec r4d
- jnz .loopH
-
- RET
-%endmacro
-
-FILTER_HORIZ_CHROMA 6, 8
-FILTER_HORIZ_CHROMA 12, 16
-
-FILTER_HORIZ_CHROMA 6, 16
-FILTER_HORIZ_CHROMA 12, 32
-
-%macro PROCESS_CHROMA_W8 3
- movu %1, [srcq]
- pshufb %2, %1, Tm0
- pmaddubsw %2, coef2
- pshufb %1, %1, Tm1
- pmaddubsw %1, coef2
- phaddw %2, %1
- psubw %2, %3
- movu [dstq], %2
-%endmacro
-
-;-----------------------------------------------------------------------------------------------------------------------------
-; void interp_4tap_horiz_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
-;-----------------------------------------------------------------------------------------------------------------------------
-%macro FILTER_HORIZ_CHROMA_8xN 2
-INIT_XMM sse4
-cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 6, src, srcstride, dst, dststride
-%define coef2 m5
-%define Tm0 m4
-%define Tm1 m3
-%define t2 m2
-%define t1 m1
-%define t0 m0
-
- dec srcq
- mov r4d, r4m
- add dststrided, dststrided
-
-%ifdef PIC
- lea r6, [tab_ChromaCoeff]
- movd coef2, [r6 + r4 * 4]
-%else
- movd coef2, [tab_ChromaCoeff + r4 * 4]
-%endif
-
- pshufd coef2, coef2, 0
- mova t2, [pw_2000]
- mova Tm0, [tab_Tm]
- mova Tm1, [tab_Tm + 16]
-
- mov r4d, %2
- cmp r5m, byte 0
- je .loopH
- sub srcq, srcstrideq
- add r4d, 3
-
-.loopH:
- PROCESS_CHROMA_W8 t0, t1, t2
- add srcq, srcstrideq
- add dstq, dststrideq
-
- dec r4d
- jnz .loopH
-
- RET
-%endmacro
-
-FILTER_HORIZ_CHROMA_8xN 8, 2
-FILTER_HORIZ_CHROMA_8xN 8, 4
-FILTER_HORIZ_CHROMA_8xN 8, 6
-FILTER_HORIZ_CHROMA_8xN 8, 8
-FILTER_HORIZ_CHROMA_8xN 8, 16
-FILTER_HORIZ_CHROMA_8xN 8, 32
-
-FILTER_HORIZ_CHROMA_8xN 8, 12
-FILTER_HORIZ_CHROMA_8xN 8, 64
-
-%macro PROCESS_CHROMA_W16 4
- movu %1, [srcq]
- pshufb %2, %1, Tm0
- pmaddubsw %2, coef2
- pshufb %1, %1, Tm1
- pmaddubsw %1, coef2
- phaddw %2, %1
- movu %1, [srcq + 8]
- pshufb %4, %1, Tm0
- pmaddubsw %4, coef2
- pshufb %1, %1, Tm1
- pmaddubsw %1, coef2
- phaddw %4, %1
- psubw %2, %3
- psubw %4, %3
- movu [dstq], %2
- movu [dstq + 16], %4
-%endmacro
-
-%macro PROCESS_CHROMA_W24 4
- movu %1, [srcq]
- pshufb %2, %1, Tm0
- pmaddubsw %2, coef2
- pshufb %1, %1, Tm1
- pmaddubsw %1, coef2
- phaddw %2, %1
- movu %1, [srcq + 8]
- pshufb %4, %1, Tm0
- pmaddubsw %4, coef2
- pshufb %1, %1, Tm1
- pmaddubsw %1, coef2
- phaddw %4, %1
- psubw %2, %3
- psubw %4, %3
- movu [dstq], %2
- movu [dstq + 16], %4
- movu %1, [srcq + 16]
- pshufb %2, %1, Tm0
- pmaddubsw %2, coef2
- pshufb %1, %1, Tm1
- pmaddubsw %1, coef2
- phaddw %2, %1
- psubw %2, %3
- movu [dstq + 32], %2
-%endmacro
-
-%macro PROCESS_CHROMA_W32 4
- movu %1, [srcq]
- pshufb %2, %1, Tm0
- pmaddubsw %2, coef2
- pshufb %1, %1, Tm1
- pmaddubsw %1, coef2
- phaddw %2, %1
- movu %1, [srcq + 8]
- pshufb %4, %1, Tm0
- pmaddubsw %4, coef2
- pshufb %1, %1, Tm1
- pmaddubsw %1, coef2
- phaddw %4, %1
- psubw %2, %3
- psubw %4, %3
- movu [dstq], %2
- movu [dstq + 16], %4
- movu %1, [srcq + 16]
- pshufb %2, %1, Tm0
- pmaddubsw %2, coef2
- pshufb %1, %1, Tm1
- pmaddubsw %1, coef2
- phaddw %2, %1
- movu %1, [srcq + 24]
- pshufb %4, %1, Tm0
- pmaddubsw %4, coef2
- pshufb %1, %1, Tm1
- pmaddubsw %1, coef2
- phaddw %4, %1
- psubw %2, %3
- psubw %4, %3
- movu [dstq + 32], %2
- movu [dstq + 48], %4
-%endmacro
-
-%macro PROCESS_CHROMA_W16o 5
- movu %1, [srcq + %5]
- pshufb %2, %1, Tm0
- pmaddubsw %2, coef2
- pshufb %1, %1, Tm1
- pmaddubsw %1, coef2
- phaddw %2, %1
- movu %1, [srcq + %5 + 8]
- pshufb %4, %1, Tm0
- pmaddubsw %4, coef2
- pshufb %1, %1, Tm1
- pmaddubsw %1, coef2
- phaddw %4, %1
- psubw %2, %3
- psubw %4, %3
- movu [dstq + %5 * 2], %2
- movu [dstq + %5 * 2 + 16], %4
-%endmacro
-
-%macro PROCESS_CHROMA_W48 4
- PROCESS_CHROMA_W16o %1, %2, %3, %4, 0
- PROCESS_CHROMA_W16o %1, %2, %3, %4, 16
- PROCESS_CHROMA_W16o %1, %2, %3, %4, 32
-%endmacro
-
-%macro PROCESS_CHROMA_W64 4
- PROCESS_CHROMA_W16o %1, %2, %3, %4, 0
- PROCESS_CHROMA_W16o %1, %2, %3, %4, 16
- PROCESS_CHROMA_W16o %1, %2, %3, %4, 32
- PROCESS_CHROMA_W16o %1, %2, %3, %4, 48
-%endmacro
-
-;------------------------------------------------------------------------------------------------------------------------------
-; void interp_4tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
-;------------------------------------------------------------------------------------------------------------------------------
-%macro FILTER_HORIZ_CHROMA_WxN 2
-INIT_XMM sse4
-cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 7, src, srcstride, dst, dststride
-%define coef2 m6
-%define Tm0 m5
-%define Tm1 m4
-%define t3 m3
-%define t2 m2
-%define t1 m1
-%define t0 m0
-
- dec srcq
- mov r4d, r4m
- add dststrided, dststrided
-
-%ifdef PIC
- lea r6, [tab_ChromaCoeff]
- movd coef2, [r6 + r4 * 4]
-%else
- movd coef2, [tab_ChromaCoeff + r4 * 4]
-%endif
-
- pshufd coef2, coef2, 0
- mova t2, [pw_2000]
- mova Tm0, [tab_Tm]
- mova Tm1, [tab_Tm + 16]
-
- mov r4d, %2
- cmp r5m, byte 0
- je .loopH
- sub srcq, srcstrideq
- add r4d, 3
-
-.loopH:
- PROCESS_CHROMA_W%1 t0, t1, t2, t3
- add srcq, srcstrideq
- add dstq, dststrideq
-
- dec r4d
- jnz .loopH
-
- RET
-%endmacro
-
-FILTER_HORIZ_CHROMA_WxN 16, 4
-FILTER_HORIZ_CHROMA_WxN 16, 8
-FILTER_HORIZ_CHROMA_WxN 16, 12
-FILTER_HORIZ_CHROMA_WxN 16, 16
-FILTER_HORIZ_CHROMA_WxN 16, 32
-FILTER_HORIZ_CHROMA_WxN 24, 32
-FILTER_HORIZ_CHROMA_WxN 32, 8
-FILTER_HORIZ_CHROMA_WxN 32, 16
-FILTER_HORIZ_CHROMA_WxN 32, 24
-FILTER_HORIZ_CHROMA_WxN 32, 32
-
-FILTER_HORIZ_CHROMA_WxN 16, 24
-FILTER_HORIZ_CHROMA_WxN 16, 64
-FILTER_HORIZ_CHROMA_WxN 24, 64
-FILTER_HORIZ_CHROMA_WxN 32, 48
-FILTER_HORIZ_CHROMA_WxN 32, 64
-
-FILTER_HORIZ_CHROMA_WxN 64, 64
-FILTER_HORIZ_CHROMA_WxN 64, 32
-FILTER_HORIZ_CHROMA_WxN 64, 48
-FILTER_HORIZ_CHROMA_WxN 48, 64
-FILTER_HORIZ_CHROMA_WxN 64, 16
-
-
-;---------------------------------------------------------------------------------------------------------------
-; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
-;---------------------------------------------------------------------------------------------------------------
-%macro FILTER_V_PS_W16n 2
-INIT_XMM sse4
-cglobal interp_4tap_vert_ps_%1x%2, 4, 7, 8
-
- mov r4d, r4m
- sub r0, r1
- add r3d, r3d
-
-%ifdef PIC
- lea r5, [tab_ChromaCoeff]
- movd m0, [r5 + r4 * 4]
-%else
- movd m0, [tab_ChromaCoeff + r4 * 4]
-%endif
-
- pshufb m1, m0, [tab_Vm]
- pshufb m0, [tab_Vm + 16]
- mov r4d, %2/2
-
-.loop:
-
- mov r6d, %1/16
-
-.loopW:
-
- movu m2, [r0]
- movu m3, [r0 + r1]
-
- punpcklbw m4, m2, m3
- punpckhbw m2, m3
-
- pmaddubsw m4, m1
- pmaddubsw m2, m1
-
- lea r5, [r0 + 2 * r1]
- movu m5, [r5]
- movu m7, [r5 + r1]
-
- punpcklbw m6, m5, m7
- pmaddubsw m6, m0
- paddw m4, m6
-
- punpckhbw m6, m5, m7
- pmaddubsw m6, m0
- paddw m2, m6
-
- mova m6, [pw_2000]
-
- psubw m4, m6
- psubw m2, m6
-
- movu [r2], m4
- movu [r2 + 16], m2
-
- punpcklbw m4, m3, m5
- punpckhbw m3, m5
-
- pmaddubsw m4, m1
- pmaddubsw m3, m1
-
- movu m5, [r5 + 2 * r1]
-
- punpcklbw m2, m7, m5
- punpckhbw m7, m5
-
- pmaddubsw m2, m0
- pmaddubsw m7, m0
-
- paddw m4, m2
- paddw m3, m7
-
- psubw m4, m6
- psubw m3, m6
-
- movu [r2 + r3], m4
- movu [r2 + r3 + 16], m3
-
- add r0, 16
- add r2, 32
- dec r6d
- jnz .loopW
-
- lea r0, [r0 + r1 * 2 - %1]
- lea r2, [r2 + r3 * 2 - %1 * 2]
-
- dec r4d
- jnz .loop
- RET
-%endmacro
-
-FILTER_V_PS_W16n 64, 64
-FILTER_V_PS_W16n 64, 32
-FILTER_V_PS_W16n 64, 48
-FILTER_V_PS_W16n 48, 64
-FILTER_V_PS_W16n 64, 16
-
-
-;------------------------------------------------------------------------------------------------------------
-;void interp_4tap_vert_ps_2x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
-;------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal interp_4tap_vert_ps_2x4, 4, 6, 7
-
- mov r4d, r4m
- sub r0, r1
- add r3d, r3d
-
-%ifdef PIC
- lea r5, [tab_ChromaCoeff]
- movd m0, [r5 + r4 * 4]
-%else
- movd m0, [tab_ChromaCoeff + r4 * 4]
-%endif
-
- pshufb m0, [tab_Cm]
-
- lea r5, [3 * r1]
-
- movd m2, [r0]
- movd m3, [r0 + r1]
- movd m4, [r0 + 2 * r1]
- movd m5, [r0 + r5]
-
- punpcklbw m2, m3
- punpcklbw m6, m4, m5
- punpcklbw m2, m6
-
- pmaddubsw m2, m0
-
- lea r0, [r0 + 4 * r1]
- movd m6, [r0]
-
- punpcklbw m3, m4
- punpcklbw m1, m5, m6
- punpcklbw m3, m1
-
- pmaddubsw m3, m0
- phaddw m2, m3
-
- mova m1, [pw_2000]
-
- psubw m2, m1
-
- movd [r2], m2
- pextrd [r2 + r3], m2, 2
-
- movd m2, [r0 + r1]
-
- punpcklbw m4, m5
- punpcklbw m3, m6, m2
- punpcklbw m4, m3
-
- pmaddubsw m4, m0
-
- movd m3, [r0 + 2 * r1]
-
- punpcklbw m5, m6
- punpcklbw m2, m3
- punpcklbw m5, m2
-
- pmaddubsw m5, m0
- phaddw m4, m5
- psubw m4, m1
-
- lea r2, [r2 + 2 * r3]
- movd [r2], m4
- pextrd [r2 + r3], m4, 2
-
- RET
-
-;-------------------------------------------------------------------------------------------------------------
-; void interp_4tap_vert_ps_2x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
-;-------------------------------------------------------------------------------------------------------------
-%macro FILTER_V_PS_W2 2
-INIT_XMM sse4
-cglobal interp_4tap_vert_ps_2x%2, 4, 6, 8
-
- mov r4d, r4m
- sub r0, r1
- add r3d, r3d
-
-%ifdef PIC
- lea r5, [tab_ChromaCoeff]
- movd m0, [r5 + r4 * 4]
-%else
- movd m0, [tab_ChromaCoeff + r4 * 4]
-%endif
-
- pshufb m0, [tab_Cm]
-
- mova m1, [pw_2000]
- lea r5, [3 * r1]
- mov r4d, %2/4
-.loop:
- movd m2, [r0]
- movd m3, [r0 + r1]
- movd m4, [r0 + 2 * r1]
- movd m5, [r0 + r5]
-
- punpcklbw m2, m3
- punpcklbw m6, m4, m5
- punpcklbw m2, m6
-
- pmaddubsw m2, m0
-
- lea r0, [r0 + 4 * r1]
- movd m6, [r0]
-
- punpcklbw m3, m4
- punpcklbw m7, m5, m6
- punpcklbw m3, m7
-
- pmaddubsw m3, m0
-
- phaddw m2, m3
- psubw m2, m1
-
-
- movd [r2], m2
- pshufd m2, m2, 2
- movd [r2 + r3], m2
-
- movd m2, [r0 + r1]
-
- punpcklbw m4, m5
- punpcklbw m3, m6, m2
- punpcklbw m4, m3
-
- pmaddubsw m4, m0
-
- movd m3, [r0 + 2 * r1]
-
- punpcklbw m5, m6
- punpcklbw m2, m3
- punpcklbw m5, m2
-
- pmaddubsw m5, m0
-
- phaddw m4, m5
-
- psubw m4, m1
-
- lea r2, [r2 + 2 * r3]
- movd [r2], m4
- pshufd m4 , m4 ,2
- movd [r2 + r3], m4
-
- lea r2, [r2 + 2 * r3]
-
- dec r4d
- jnz .loop
-
-RET
-%endmacro
-
-FILTER_V_PS_W2 2, 8
-
-FILTER_V_PS_W2 2, 16
-
-;-----------------------------------------------------------------------------------------------------------------
-; void interp_4tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------------------------------------------
-%macro FILTER_VER_CHROMA_SS 2
-INIT_XMM sse2
-cglobal interp_4tap_vert_ss_%1x%2, 5, 7, 6 ,0-gprsize
-
- add r1d, r1d
- add r3d, r3d
- sub r0, r1
- shl r4d, 5
-
-%ifdef PIC
- lea r5, [tab_ChromaCoeffV]
- lea r6, [r5 + r4]
-%else
- lea r6, [tab_ChromaCoeffV + r4]
-%endif
-
- mov dword [rsp], %2/4
-
-.loopH:
- mov r4d, (%1/4)
-.loopW:
- PROCESS_CHROMA_SP_W4_4R
-
- psrad m0, 6
- psrad m1, 6
- psrad m2, 6
- psrad m3, 6
-
- packssdw m0, m1
- packssdw m2, m3
-
- movlps [r2], m0
- movhps [r2 + r3], m0
- lea r5, [r2 + 2 * r3]
- movlps [r5], m2
- movhps [r5 + r3], m2
-
- lea r5, [4 * r1 - 2 * 4]
- sub r0, r5
- add r2, 2 * 4
-
- dec r4d
- jnz .loopW
-
- lea r0, [r0 + 4 * r1 - 2 * %1]
- lea r2, [r2 + 4 * r3 - 2 * %1]
-
- dec dword [rsp]
- jnz .loopH
-
- RET
-%endmacro
-
- FILTER_VER_CHROMA_SS 4, 4
- FILTER_VER_CHROMA_SS 4, 8
- FILTER_VER_CHROMA_SS 16, 16
- FILTER_VER_CHROMA_SS 16, 8
- FILTER_VER_CHROMA_SS 16, 12
- FILTER_VER_CHROMA_SS 12, 16
- FILTER_VER_CHROMA_SS 16, 4
- FILTER_VER_CHROMA_SS 4, 16
- FILTER_VER_CHROMA_SS 32, 32
- FILTER_VER_CHROMA_SS 32, 16
- FILTER_VER_CHROMA_SS 16, 32
- FILTER_VER_CHROMA_SS 32, 24
- FILTER_VER_CHROMA_SS 24, 32
- FILTER_VER_CHROMA_SS 32, 8
-
- FILTER_VER_CHROMA_SS 16, 24
- FILTER_VER_CHROMA_SS 12, 32
- FILTER_VER_CHROMA_SS 4, 32
- FILTER_VER_CHROMA_SS 32, 64
- FILTER_VER_CHROMA_SS 16, 64
- FILTER_VER_CHROMA_SS 32, 48
- FILTER_VER_CHROMA_SS 24, 64
-
- FILTER_VER_CHROMA_SS 64, 64
- FILTER_VER_CHROMA_SS 64, 32
- FILTER_VER_CHROMA_SS 64, 48
- FILTER_VER_CHROMA_SS 48, 64
- FILTER_VER_CHROMA_SS 64, 16
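The ss variants differ from the sp code only in the final rounding: no bias is added and the result stays 16-bit. A rough per-sample sketch (hypothetical helper, for illustration):

    #include <stdint.h>

    /* 4-tap vertical "ss" filter: 16-bit in, 16-bit out; just shift and
     * saturate (psrad 6 + packssdw), no offset and no clip to pixel range. */
    static int16_t vert_ss_4tap_sample(const int16_t *src, intptr_t stride,
                                       const int16_t coeff[4])
    {
        int sum = coeff[0] * src[-stride] + coeff[1] * src[0] +
                  coeff[2] * src[stride]  + coeff[3] * src[2 * stride];
        sum >>= 6;                                  /* psrad by 6 */
        if (sum > 32767)  sum = 32767;              /* packssdw saturation */
        if (sum < -32768) sum = -32768;
        return (int16_t)sum;
    }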
-
-
-;---------------------------------------------------------------------------------------------------------------------
-; void interp_4tap_vertical_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
-;---------------------------------------------------------------------------------------------------------------------
-%macro FILTER_VER_CHROMA_SS_W2_4R 2
-INIT_XMM sse4
-cglobal interp_4tap_vert_ss_%1x%2, 5, 6, 5
-
- add r1d, r1d
- add r3d, r3d
- sub r0, r1
- shl r4d, 5
-
-%ifdef PIC
- lea r5, [tab_ChromaCoeffV]
- lea r5, [r5 + r4]
-%else
- lea r5, [tab_ChromaCoeffV + r4]
-%endif
-
- mov r4d, (%2/4)
-
-.loopH:
- PROCESS_CHROMA_SP_W2_4R r5
-
- psrad m0, 6
- psrad m2, 6
-
- packssdw m0, m2
-
- movd [r2], m0
- pextrd [r2 + r3], m0, 1
- lea r2, [r2 + 2 * r3]
- pextrd [r2], m0, 2
- pextrd [r2 + r3], m0, 3
-
- lea r2, [r2 + 2 * r3]
-
- dec r4d
- jnz .loopH
-
- RET
-%endmacro
-
-FILTER_VER_CHROMA_SS_W2_4R 2, 4
-FILTER_VER_CHROMA_SS_W2_4R 2, 8
-
-FILTER_VER_CHROMA_SS_W2_4R 2, 16
-
-;---------------------------------------------------------------------------------------------------------------
-; void interp_4tap_vert_ss_4x2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
-;---------------------------------------------------------------------------------------------------------------
-INIT_XMM sse2
-cglobal interp_4tap_vert_ss_4x2, 5, 6, 4
-
- add r1d, r1d
- add r3d, r3d
- sub r0, r1
- shl r4d, 5
-
-%ifdef PIC
- lea r5, [tab_ChromaCoeffV]
- lea r5, [r5 + r4]
-%else
- lea r5, [tab_ChromaCoeffV + r4]
-%endif
-
- movq m0, [r0]
- movq m1, [r0 + r1]
- punpcklwd m0, m1 ;m0=[0 1]
- pmaddwd m0, [r5 + 0 *16] ;m0=[0+1] Row1
-
- lea r0, [r0 + 2 * r1]
- movq m2, [r0]
- punpcklwd m1, m2 ;m1=[1 2]
- pmaddwd m1, [r5 + 0 *16] ;m1=[1+2] Row2
-
- movq m3, [r0 + r1]
- punpcklwd m2, m3 ;m4=[2 3]
- pmaddwd m2, [r5 + 1 * 16]
- paddd m0, m2 ;m0=[0+1+2+3] Row1 done
- psrad m0, 6
-
- movq m2, [r0 + 2 * r1]
- punpcklwd m3, m2 ;m5=[3 4]
- pmaddwd m3, [r5 + 1 * 16]
- paddd m1, m3 ;m1=[1+2+3+4] Row2 done
- psrad m1, 6
-
- packssdw m0, m1
-
- movlps [r2], m0
- movhps [r2 + r3], m0
-
- RET
-
-;-------------------------------------------------------------------------------------------------------------------
-; void interp_4tap_vertical_ss_6x8(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
-;-------------------------------------------------------------------------------------------------------------------
-%macro FILTER_VER_CHROMA_SS_W6_H4 2
-INIT_XMM sse4
-cglobal interp_4tap_vert_ss_6x%2, 5, 7, 6
-
- add r1d, r1d
- add r3d, r3d
- sub r0, r1
- shl r4d, 5
-
-%ifdef PIC
- lea r5, [tab_ChromaCoeffV]
- lea r6, [r5 + r4]
-%else
- lea r6, [tab_ChromaCoeffV + r4]
-%endif
-
- mov r4d, %2/4
-
-.loopH:
- PROCESS_CHROMA_SP_W4_4R
-
- psrad m0, 6
- psrad m1, 6
- psrad m2, 6
- psrad m3, 6
-
- packssdw m0, m1
- packssdw m2, m3
-
- movlps [r2], m0
- movhps [r2 + r3], m0
- lea r5, [r2 + 2 * r3]
- movlps [r5], m2
- movhps [r5 + r3], m2
-
- lea r5, [4 * r1 - 2 * 4]
- sub r0, r5
- add r2, 2 * 4
-
- PROCESS_CHROMA_SP_W2_4R r6
-
- psrad m0, 6
- psrad m2, 6
-
- packssdw m0, m2
-
- movd [r2], m0
- pextrd [r2 + r3], m0, 1
- lea r2, [r2 + 2 * r3]
- pextrd [r2], m0, 2
- pextrd [r2 + r3], m0, 3
-
- sub r0, 2 * 4
- lea r2, [r2 + 2 * r3 - 2 * 4]
-
- dec r4d
- jnz .loopH
-
- RET
-%endmacro
-
-FILTER_VER_CHROMA_SS_W6_H4 6, 8
-
-FILTER_VER_CHROMA_SS_W6_H4 6, 16
-
-
-;----------------------------------------------------------------------------------------------------------------
-; void interp_4tap_vert_ss_8x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
-;----------------------------------------------------------------------------------------------------------------
-%macro FILTER_VER_CHROMA_SS_W8_H2 2
-INIT_XMM sse2
-cglobal interp_4tap_vert_ss_%1x%2, 5, 6, 7
-
- add r1d, r1d
- add r3d, r3d
- sub r0, r1
- shl r4d, 5
-
-%ifdef PIC
- lea r5, [tab_ChromaCoeffV]
- lea r5, [r5 + r4]
-%else
- lea r5, [tab_ChromaCoeffV + r4]
-%endif
-
- mov r4d, %2/2
-.loopH:
- PROCESS_CHROMA_SP_W8_2R
-
- psrad m0, 6
- psrad m1, 6
- psrad m2, 6
- psrad m3, 6
-
- packssdw m0, m1
- packssdw m2, m3
-
- movu [r2], m0
- movu [r2 + r3], m2
-
- lea r2, [r2 + 2 * r3]
-
- dec r4d
- jnz .loopH
-
- RET
-%endmacro
-
-FILTER_VER_CHROMA_SS_W8_H2 8, 2
-FILTER_VER_CHROMA_SS_W8_H2 8, 4
-FILTER_VER_CHROMA_SS_W8_H2 8, 6
-FILTER_VER_CHROMA_SS_W8_H2 8, 8
-FILTER_VER_CHROMA_SS_W8_H2 8, 16
-FILTER_VER_CHROMA_SS_W8_H2 8, 32
-
-FILTER_VER_CHROMA_SS_W8_H2 8, 12
-FILTER_VER_CHROMA_SS_W8_H2 8, 64
-
-;-----------------------------------------------------------------------------------------------------------------
-; void interp_8tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------------------------------------------
-%macro FILTER_VER_LUMA_SS 2
-INIT_XMM sse2
-cglobal interp_8tap_vert_ss_%1x%2, 5, 7, 7 ,0-gprsize
-
- add r1d, r1d
- add r3d, r3d
- lea r5, [3 * r1]
- sub r0, r5
- shl r4d, 6
-
-%ifdef PIC
- lea r5, [tab_LumaCoeffV]
- lea r6, [r5 + r4]
-%else
- lea r6, [tab_LumaCoeffV + r4]
-%endif
-
- mov dword [rsp], %2/4
-.loopH:
- mov r4d, (%1/4)
-.loopW:
- movq m0, [r0]
- movq m1, [r0 + r1]
- punpcklwd m0, m1 ;m0=[0 1]
- pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1
-
- lea r0, [r0 + 2 * r1]
- movq m4, [r0]
- punpcklwd m1, m4 ;m1=[1 2]
- pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2
-
- movq m5, [r0 + r1]
- punpcklwd m4, m5 ;m4=[2 3]
- pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3
- pmaddwd m4, [r6 + 1 * 16]
- paddd m0, m4 ;m0=[0+1+2+3] Row1
-
- lea r0, [r0 + 2 * r1]
- movq m4, [r0]
- punpcklwd m5, m4 ;m5=[3 4]
- pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4
- pmaddwd m5, [r6 + 1 * 16]
- paddd m1, m5 ;m1 = [1+2+3+4] Row2
-
- movq m5, [r0 + r1]
- punpcklwd m4, m5 ;m4=[4 5]
- pmaddwd m6, m4, [r6 + 1 * 16]
- paddd m2, m6 ;m2=[2+3+4+5] Row3
- pmaddwd m4, [r6 + 2 * 16]
- paddd m0, m4 ;m0=[0+1+2+3+4+5] Row1
-
- lea r0, [r0 + 2 * r1]
- movq m4, [r0]
- punpcklwd m5, m4 ;m5=[5 6]
- pmaddwd m6, m5, [r6 + 1 * 16]
- paddd m3, m6 ;m3=[3+4+5+6] Row4
- pmaddwd m5, [r6 + 2 * 16]
- paddd m1, m5 ;m1=[1+2+3+4+5+6] Row2
-
- movq m5, [r0 + r1]
- punpcklwd m4, m5 ;m4=[6 7]
- pmaddwd m6, m4, [r6 + 2 * 16]
- paddd m2, m6 ;m2=[2+3+4+5+6+7] Row3
- pmaddwd m4, [r6 + 3 * 16]
- paddd m0, m4 ;m0=[0+1+2+3+4+5+6+7] Row1 end
- psrad m0, 6
-
- lea r0, [r0 + 2 * r1]
- movq m4, [r0]
- punpcklwd m5, m4 ;m5=[7 8]
- pmaddwd m6, m5, [r6 + 2 * 16]
- paddd m3, m6 ;m3=[3+4+5+6+7+8] Row4
- pmaddwd m5, [r6 + 3 * 16]
- paddd m1, m5 ;m1=[1+2+3+4+5+6+7+8] Row2 end
- psrad m1, 6
-
- packssdw m0, m1
-
- movlps [r2], m0
- movhps [r2 + r3], m0
-
- movq m5, [r0 + r1]
- punpcklwd m4, m5 ;m4=[8 9]
- pmaddwd m4, [r6 + 3 * 16]
- paddd m2, m4 ;m2=[2+3+4+5+6+7+8+9] Row3 end
- psrad m2, 6
-
- movq m4, [r0 + 2 * r1]
- punpcklwd m5, m4 ;m5=[9 10]
- pmaddwd m5, [r6 + 3 * 16]
- paddd m3, m5 ;m3=[3+4+5+6+7+8+9+10] Row4 end
- psrad m3, 6
-
- packssdw m2, m3
-
- movlps [r2 + 2 * r3], m2
- lea r5, [3 * r3]
- movhps [r2 + r5], m2
-
- lea r5, [8 * r1 - 2 * 4]
- sub r0, r5
- add r2, 2 * 4
-
- dec r4d
- jnz .loopW
-
- lea r0, [r0 + 4 * r1 - 2 * %1]
- lea r2, [r2 + 4 * r3 - 2 * %1]
-
- dec dword [rsp]
- jnz .loopH
-
- RET
-%endmacro
-
- FILTER_VER_LUMA_SS 4, 4
- FILTER_VER_LUMA_SS 8, 8
- FILTER_VER_LUMA_SS 8, 4
- FILTER_VER_LUMA_SS 4, 8
- FILTER_VER_LUMA_SS 16, 16
- FILTER_VER_LUMA_SS 16, 8
- FILTER_VER_LUMA_SS 8, 16
- FILTER_VER_LUMA_SS 16, 12
- FILTER_VER_LUMA_SS 12, 16
- FILTER_VER_LUMA_SS 16, 4
- FILTER_VER_LUMA_SS 4, 16
- FILTER_VER_LUMA_SS 32, 32
- FILTER_VER_LUMA_SS 32, 16
- FILTER_VER_LUMA_SS 16, 32
- FILTER_VER_LUMA_SS 32, 24
- FILTER_VER_LUMA_SS 24, 32
- FILTER_VER_LUMA_SS 32, 8
- FILTER_VER_LUMA_SS 8, 32
- FILTER_VER_LUMA_SS 64, 64
- FILTER_VER_LUMA_SS 64, 32
- FILTER_VER_LUMA_SS 32, 64
- FILTER_VER_LUMA_SS 64, 48
- FILTER_VER_LUMA_SS 48, 64
- FILTER_VER_LUMA_SS 64, 16
- FILTER_VER_LUMA_SS 16, 64
+
+;--------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;--------------------------------------------------------------------------------------------------------------
+%macro IPFILTER_LUMA_PP_W8 2
+INIT_XMM sse4
+cglobal interp_8tap_horiz_pp_%1x%2, 4,6,7
+ mov r4d, r4m
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeff]
+ movh m3, [r5 + r4 * 8]
+%else
+ movh m3, [tab_LumaCoeff + r4 * 8]
+%endif
+ pshufd m0, m3, 0 ; m0 = coeff-L
+ pshufd m1, m3, 0x55 ; m1 = coeff-H
+ lea r5, [tab_Tm] ; r5 = shuffle
+ mova m2, [pw_512] ; m2 = 512
+
+ mov r4d, %2
+.loopH:
+%assign x 0
+%rep %1 / 8
+ movu m3, [r0 - 3 + x] ; m3 = [F E D C B A 9 8 7 6 5 4 3 2 1 0]
+ pshufb m4, m3, [r5 + 0*16] ; m4 = [6 5 4 3 5 4 3 2 4 3 2 1 3 2 1 0]
+ pshufb m5, m3, [r5 + 1*16] ; m5 = [A 9 8 7 9 8 7 6 8 7 6 5 7 6 5 4]
+ pshufb m3, [r5 + 2*16] ; m3 = [E D C B D C B A C B A 9 B A 9 8]
+ pmaddubsw m4, m0
+ pmaddubsw m6, m5, m1
+ pmaddubsw m5, m0
+ pmaddubsw m3, m1
+ paddw m4, m6
+ paddw m5, m3
+ phaddw m4, m5
+ pmulhrsw m4, m2
+ packuswb m4, m4
+ movh [r2 + x], m4
+%assign x x+8
+%endrep
+
+ add r0, r1
+ add r2, r3
+
+ dec r4d
+ jnz .loopH
+ RET
+%endmacro
+
+IPFILTER_LUMA_PP_W8 8, 4
+IPFILTER_LUMA_PP_W8 8, 8
+IPFILTER_LUMA_PP_W8 8, 16
+IPFILTER_LUMA_PP_W8 8, 32
+IPFILTER_LUMA_PP_W8 16, 4
+IPFILTER_LUMA_PP_W8 16, 8
+IPFILTER_LUMA_PP_W8 16, 12
+IPFILTER_LUMA_PP_W8 16, 16
+IPFILTER_LUMA_PP_W8 16, 32
+IPFILTER_LUMA_PP_W8 16, 64
+IPFILTER_LUMA_PP_W8 24, 32
+IPFILTER_LUMA_PP_W8 32, 8
+IPFILTER_LUMA_PP_W8 32, 16
+IPFILTER_LUMA_PP_W8 32, 24
+IPFILTER_LUMA_PP_W8 32, 32
+IPFILTER_LUMA_PP_W8 32, 64
+IPFILTER_LUMA_PP_W8 48, 64
+IPFILTER_LUMA_PP_W8 64, 16
+IPFILTER_LUMA_PP_W8 64, 32
+IPFILTER_LUMA_PP_W8 64, 48
+IPFILTER_LUMA_PP_W8 64, 64
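For reference, the pmulhrsw against pw_512 in the loop above is exact round-to-nearest at the 6-bit filter precision: (x*512 + 16384) >> 15 == (x + 32) >> 6. A per-pixel C sketch of what the macro computes (names are illustrative):

    #include <stdint.h>

    /* 8-tap horizontal "pp" filter at 8-bit depth: taps sit at x-3 .. x+4
     * (the asm loads from [r0 - 3]); sum, round by (+32)>>6, clip to 8 bits. */
    static uint8_t horiz_pp_8tap_sample(const uint8_t *src, const int8_t coeff[8])
    {
        int sum = 0;
        for (int k = 0; k < 8; k++)
            sum += coeff[k] * src[k - 3];
        sum = (sum + 32) >> 6;                      /* pmulhrsw with [pw_512] */
        return (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));   /* packuswb */
    }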
+
+;----------------------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+;----------------------------------------------------------------------------------------------------------------------------
+ IPFILTER_LUMA 4, 4, ps
+ IPFILTER_LUMA 8, 8, ps
+ IPFILTER_LUMA 8, 4, ps
+ IPFILTER_LUMA 4, 8, ps
+ IPFILTER_LUMA 16, 16, ps
+ IPFILTER_LUMA 16, 8, ps
+ IPFILTER_LUMA 8, 16, ps
+ IPFILTER_LUMA 16, 12, ps
+ IPFILTER_LUMA 12, 16, ps
+ IPFILTER_LUMA 16, 4, ps
+ IPFILTER_LUMA 4, 16, ps
+ IPFILTER_LUMA 32, 32, ps
+ IPFILTER_LUMA 32, 16, ps
+ IPFILTER_LUMA 16, 32, ps
+ IPFILTER_LUMA 32, 24, ps
+ IPFILTER_LUMA 24, 32, ps
+ IPFILTER_LUMA 32, 8, ps
+ IPFILTER_LUMA 8, 32, ps
+ IPFILTER_LUMA 64, 64, ps
+ IPFILTER_LUMA 64, 32, ps
+ IPFILTER_LUMA 32, 64, ps
+ IPFILTER_LUMA 64, 48, ps
+ IPFILTER_LUMA 48, 64, ps
+ IPFILTER_LUMA 64, 16, ps
+ IPFILTER_LUMA 16, 64, ps
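The IPFILTER_LUMA macro itself is defined earlier in the file; its ps outputs follow the same pixel-to-short convention as the chroma ps code in this file (16-bit result, re-biased by pw_2000, no shift at 8-bit depth). A hedged per-pixel sketch, assuming that convention:

    #include <stdint.h>

    /* 8-tap horizontal "ps" filter: keep full 16-bit precision, subtract the
     * 8192 internal offset (psubw [pw_2000]); no rounding shift at 8-bit depth. */
    static int16_t horiz_ps_8tap_sample(const uint8_t *src, const int8_t coeff[8])
    {
        int sum = 0;
        for (int k = 0; k < 8; k++)
            sum += coeff[k] * src[k - 3];
        return (int16_t)(sum - 8192);
    }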
+
+;-----------------------------------------------------------------------------
+; Interpolate HV
+;-----------------------------------------------------------------------------
+%macro FILTER_HV8_START 7 ; (t0, t1, t2, t3, t4, off_src, off_coeff) -> (t3, t5), (t4, t1), [2]
+ mova %5, [r0 + (%6 + 0) * 16]
+ mova %1, [r0 + (%6 + 1) * 16]
+ mova %2, [r0 + (%6 + 2) * 16]
+ punpcklwd %3, %5, %1
+ punpckhwd %5, %1
+ pmaddwd %3, [r5 + (%7) * 16] ; R3 = L[0+1] -- Row 0
+ pmaddwd %5, [r5 + (%7) * 16] ; R0 = H[0+1]
+ punpcklwd %4, %1, %2
+ punpckhwd %1, %2
+ pmaddwd %4, [r5 + (%7) * 16] ; R4 = L[1+2] -- Row 1
+ pmaddwd %1, [r5 + (%7) * 16] ; R1 = H[1+2]
+%endmacro ; FILTER_HV8_START
+
+%macro FILTER_HV8_MID 10 ; (Row3, prevRow, sum0L, sum1L, sum0H, sum1H, t6, t7, off_src, off_coeff) -> [6]
+ mova %8, [r0 + (%9 + 0) * 16]
+ mova %1, [r0 + (%9 + 1) * 16]
+ punpcklwd %7, %2, %8
+ punpckhwd %2, %8
+ pmaddwd %7, [r5 + %10 * 16]
+ pmaddwd %2, [r5 + %10 * 16]
+ paddd %3, %7 ; R3 = L[0+1+2+3] -- Row 0
+ paddd %5, %2 ; R0 = H[0+1+2+3]
+ punpcklwd %7, %8, %1
+ punpckhwd %8, %1
+ pmaddwd %7, [r5 + %10 * 16]
+ pmaddwd %8, [r5 + %10 * 16]
+ paddd %4, %7 ; R4 = L[1+2+3+4] -- Row 1
+ paddd %6, %8 ; R1 = H[1+2+3+4]
+%endmacro ; FILTER_HV8_MID
+
+; Round and Saturate
+%macro FILTER_HV8_END 4 ; output in [1, 3]
+ paddd %1, [tab_c_526336]
+ paddd %2, [tab_c_526336]
+ paddd %3, [tab_c_526336]
+ paddd %4, [tab_c_526336]
+ psrad %1, 12
+ psrad %2, 12
+ psrad %3, 12
+ psrad %4, 12
+ packssdw %1, %2
+ packssdw %3, %4
+
+ ; TODO: is merging better? I think this way keeps the dependency chain short
+ packuswb %1, %3
+%endmacro ; FILTER_HV8_END
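The constant behind FILTER_HV8_END: the horizontal pass below stores (sum - 8192), the eight vertical luma coefficients sum to 64, so 8192*64 restores the bias and 2048 rounds the final shift by 12; that is exactly the rodata definition tab_c_526336 = 8192*64 + 2048. As a one-sample C sketch:

    #include <stdint.h>

    static uint8_t hv8_end_sample(int32_t acc)      /* acc = vertical 8-tap sum of ps intermediates */
    {
        acc += 8192 * 64 + 2048;                    /* tab_c_526336 */
        acc >>= 12;                                 /* psrad 12 */
        return (uint8_t)(acc < 0 ? 0 : (acc > 255 ? 255 : acc));   /* packssdw + packuswb */
    }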
+
+;-----------------------------------------------------------------------------
+; void interp_8tap_hv_pp_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY)
+;-----------------------------------------------------------------------------
+INIT_XMM ssse3
+cglobal interp_8tap_hv_pp_8x8, 4, 7, 8, 0-15*16
+%define coef m7
+%define stk_buf rsp
+
+ mov r4d, r4m
+ mov r5d, r5m
+
+%ifdef PIC
+ lea r6, [tab_LumaCoeff]
+ movh coef, [r6 + r4 * 8]
+%else
+ movh coef, [tab_LumaCoeff + r4 * 8]
+%endif
+ punpcklqdq coef, coef
+
+ ; move to row -3
+ lea r6, [r1 + r1 * 2]
+ sub r0, r6
+
+ xor r6, r6
+ mov r4, rsp
+
+.loopH:
+ FILTER_H8_W8 m0, m1, m2, m3, coef, [pw_512], [r0 - 3]
+ psubw m1, [pw_2000]
+ mova [r4], m1
+
+ add r0, r1
+ add r4, 16
+ inc r6
+ cmp r6, 8+7
+ jnz .loopH
+
+ ; ready for the vertical phase
+ ; Here all of the mN registers are free
+
+ ; load coeff table
+ shl r5, 6
+ lea r6, [tab_LumaCoeffV]
+ lea r5, [r5 + r6]
+
+ ; load intermediate buffer
+ mov r0, stk_buf
+
+ ; register mapping
+ ; r0 - src
+ ; r5 - coeff
+ ; r6 - loop_i
+
+ ; let's go
+ xor r6, r6
+
+ ; TODO: this loop has more than 70 instructions; I think it exceeds the Intel loop decode cache
+.loopV:
+
+ FILTER_HV8_START m1, m2, m3, m4, m0, 0, 0
+ FILTER_HV8_MID m6, m2, m3, m4, m0, m1, m7, m5, 3, 1
+ FILTER_HV8_MID m5, m6, m3, m4, m0, m1, m7, m2, 5, 2
+ FILTER_HV8_MID m6, m5, m3, m4, m0, m1, m7, m2, 7, 3
+ FILTER_HV8_END m3, m0, m4, m1
+
+ movh [r2], m3
+ movhps [r2 + r3], m3
+
+ lea r0, [r0 + 16 * 2]
+ lea r2, [r2 + r3 * 2]
+
+ inc r6
+ cmp r6, 8/2
+ jnz .loopV
+
+ RET
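Putting the two phases together, interp_8tap_hv_pp_8x8 is a straight two-pass filter; the sketch below mirrors its structure (helper and parameter names are made up for illustration): 8+7 horizontally filtered rows, starting three rows above the block, land in a 15-row stack buffer as biased 16-bit values, and the vertical pass then consumes that buffer.

    #include <stdint.h>

    static void hv_pp_8x8_sketch(const uint8_t *src, intptr_t srcStride,
                                 uint8_t *dst, intptr_t dstStride,
                                 const int8_t coeffH[8], const int16_t coeffV[8])
    {
        int16_t buf[(8 + 7) * 8];                   /* matches the 15*16-byte stack buffer */

        /* pass 1: horizontal 8-tap, rows -3 .. 11, stored as (sum - 8192) */
        for (int y = 0; y < 8 + 7; y++)             /* "cmp r6, 8+7" */
            for (int x = 0; x < 8; x++)
            {
                int sum = 0;
                for (int k = 0; k < 8; k++)
                    sum += coeffH[k] * src[(y - 3) * srcStride + x + k - 3];
                buf[y * 8 + x] = (int16_t)(sum - 8192);     /* psubw [pw_2000] */
            }

        /* pass 2: vertical 8-tap over the buffer, then round/clip to pixels */
        for (int y = 0; y < 8; y++)
            for (int x = 0; x < 8; x++)
            {
                int sum = 0;
                for (int k = 0; k < 8; k++)
                    sum += coeffV[k] * buf[(y + k) * 8 + x];
                sum = (sum + 526336) >> 12;                 /* FILTER_HV8_END */
                dst[y * dstStride + x] = (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
            }
    }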
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_vert_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal interp_4tap_vert_pp_2x4, 4, 6, 8
+
+mov r4d, r4m
+sub r0, r1
+
+%ifdef PIC
+lea r5, [tab_ChromaCoeff]
+movd m0, [r5 + r4 * 4]
+%else
+movd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+lea r4, [r1 * 3]
+lea r5, [r0 + 4 * r1]
+pshufb m0, [tab_Cm]
+mova m1, [pw_512]
+
+movd m2, [r0]
+movd m3, [r0 + r1]
+movd m4, [r0 + 2 * r1]
+movd m5, [r0 + r4]
+
+punpcklbw m2, m3
+punpcklbw m6, m4, m5
+punpcklbw m2, m6
+
+pmaddubsw m2, m0
+
+movd m6, [r5]
+
+punpcklbw m3, m4
+punpcklbw m7, m5, m6
+punpcklbw m3, m7
+
+pmaddubsw m3, m0
+
+phaddw m2, m3
+
+pmulhrsw m2, m1
+
+movd m7, [r5 + r1]
+
+punpcklbw m4, m5
+punpcklbw m3, m6, m7
+punpcklbw m4, m3
+
+pmaddubsw m4, m0
+
+movd m3, [r5 + 2 * r1]
+
+punpcklbw m5, m6
+punpcklbw m7, m3
+punpcklbw m5, m7
+
+pmaddubsw m5, m0
+
+phaddw m4, m5
+
+pmulhrsw m4, m1
+packuswb m2, m4
+
+pextrw [r2], m2, 0
+pextrw [r2 + r3], m2, 2
+lea r2, [r2 + 2 * r3]
+pextrw [r2], m2, 4
+pextrw [r2 + r3], m2, 6
+
+RET
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_vert_pp_2x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro FILTER_V4_W2_H4 2
+INIT_XMM sse4
+cglobal interp_4tap_vert_pp_2x%2, 4, 6, 8
+
+mov r4d, r4m
+sub r0, r1
+
+%ifdef PIC
+lea r5, [tab_ChromaCoeff]
+movd m0, [r5 + r4 * 4]
+%else
+movd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+pshufb m0, [tab_Cm]
+
+mova m1, [pw_512]
+
+mov r4d, %2
+lea r5, [3 * r1]
+
+.loop:
+movd m2, [r0]
+movd m3, [r0 + r1]
+movd m4, [r0 + 2 * r1]
+movd m5, [r0 + r5]
+
+punpcklbw m2, m3
+punpcklbw m6, m4, m5
+punpcklbw m2, m6
+
+pmaddubsw m2, m0
+
+lea r0, [r0 + 4 * r1]
+movd m6, [r0]
+
+punpcklbw m3, m4
+punpcklbw m7, m5, m6
+punpcklbw m3, m7
+
+pmaddubsw m3, m0
+
+phaddw m2, m3
+
+pmulhrsw m2, m1
+
+movd m7, [r0 + r1]
+
+punpcklbw m4, m5
+punpcklbw m3, m6, m7
+punpcklbw m4, m3
+
+pmaddubsw m4, m0
+
+movd m3, [r0 + 2 * r1]
+
+punpcklbw m5, m6
+punpcklbw m7, m3
+punpcklbw m5, m7
+
+pmaddubsw m5, m0
+
+phaddw m4, m5
+
+pmulhrsw m4, m1
+packuswb m2, m4
+
+pextrw [r2], m2, 0
+pextrw [r2 + r3], m2, 2
+lea r2, [r2 + 2 * r3]
+pextrw [r2], m2, 4
+pextrw [r2 + r3], m2, 6
+
+lea r2, [r2 + 2 * r3]
+
+sub r4, 4
+jnz .loop
+RET
+%endmacro
+
+FILTER_V4_W2_H4 2, 8
+
+FILTER_V4_W2_H4 2, 16
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_vert_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal interp_4tap_vert_pp_4x2, 4, 6, 6
+
+mov r4d, r4m
+sub r0, r1
+
+%ifdef PIC
+lea r5, [tab_ChromaCoeff]
+movd m0, [r5 + r4 * 4]
+%else
+movd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+pshufb m0, [tab_Cm]
+lea r5, [r0 + 2 * r1]
+
+movd m2, [r0]
+movd m3, [r0 + r1]
+movd m4, [r5]
+movd m5, [r5 + r1]
+
+punpcklbw m2, m3
+punpcklbw m1, m4, m5
+punpcklbw m2, m1
+
+pmaddubsw m2, m0
+
+movd m1, [r0 + 4 * r1]
+
+punpcklbw m3, m4
+punpcklbw m5, m1
+punpcklbw m3, m5
+
+pmaddubsw m3, m0
+
+phaddw m2, m3
+
+pmulhrsw m2, [pw_512]
+packuswb m2, m2
+movd [r2], m2
+pextrd [r2 + r3], m2, 1
+
+RET
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal interp_4tap_vert_pp_4x4, 4, 6, 8
+
+mov r4d, r4m
+sub r0, r1
+
+%ifdef PIC
+lea r5, [tab_ChromaCoeff]
+movd m0, [r5 + r4 * 4]
+%else
+movd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+pshufb m0, [tab_Cm]
+mova m1, [pw_512]
+lea r5, [r0 + 4 * r1]
+lea r4, [r1 * 3]
+
+movd m2, [r0]
+movd m3, [r0 + r1]
+movd m4, [r0 + 2 * r1]
+movd m5, [r0 + r4]
+
+punpcklbw m2, m3
+punpcklbw m6, m4, m5
+punpcklbw m2, m6
+
+pmaddubsw m2, m0
+
+movd m6, [r5]
+
+punpcklbw m3, m4
+punpcklbw m7, m5, m6
+punpcklbw m3, m7
+
+pmaddubsw m3, m0
+
+phaddw m2, m3
+
+pmulhrsw m2, m1
+
+movd m7, [r5 + r1]
+
+punpcklbw m4, m5
+punpcklbw m3, m6, m7
+punpcklbw m4, m3
+
+pmaddubsw m4, m0
+
+movd m3, [r5 + 2 * r1]
+
+punpcklbw m5, m6
+punpcklbw m7, m3
+punpcklbw m5, m7
+
+pmaddubsw m5, m0
+
+phaddw m4, m5
+
+pmulhrsw m4, m1
+
+packuswb m2, m4
+movd [r2], m2
+pextrd [r2 + r3], m2, 1
+lea r2, [r2 + 2 * r3]
+pextrd [r2], m2, 2
+pextrd [r2 + r3], m2, 3
+
+RET
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro FILTER_V4_W4_H4 2
+INIT_XMM sse4
+cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8
+
+mov r4d, r4m
+sub r0, r1
+
+%ifdef PIC
+lea r5, [tab_ChromaCoeff]
+movd m0, [r5 + r4 * 4]
+%else
+movd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+pshufb m0, [tab_Cm]
+
+mova m1, [pw_512]
+
+mov r4d, %2
+
+lea r5, [3 * r1]
+
+.loop:
+movd m2, [r0]
+movd m3, [r0 + r1]
+movd m4, [r0 + 2 * r1]
+movd m5, [r0 + r5]
+
+punpcklbw m2, m3
+punpcklbw m6, m4, m5
+punpcklbw m2, m6
+
+pmaddubsw m2, m0
+
+lea r0, [r0 + 4 * r1]
+movd m6, [r0]
+
+punpcklbw m3, m4
+punpcklbw m7, m5, m6
+punpcklbw m3, m7
+
+pmaddubsw m3, m0
+
+phaddw m2, m3
+
+pmulhrsw m2, m1
+
+movd m7, [r0 + r1]
+
+punpcklbw m4, m5
+punpcklbw m3, m6, m7
+punpcklbw m4, m3
+
+pmaddubsw m4, m0
+
+movd m3, [r0 + 2 * r1]
+
+punpcklbw m5, m6
+punpcklbw m7, m3
+punpcklbw m5, m7
+
+pmaddubsw m5, m0
+
+phaddw m4, m5
+
+pmulhrsw m4, m1
+packuswb m2, m4
+movd [r2], m2
+pextrd [r2 + r3], m2, 1
+lea r2, [r2 + 2 * r3]
+pextrd [r2], m2, 2
+pextrd [r2 + r3], m2, 3
+
+lea r2, [r2 + 2 * r3]
+
+sub r4, 4
+jnz .loop
+RET
+%endmacro
+
+FILTER_V4_W4_H4 4, 8
+FILTER_V4_W4_H4 4, 16
+
+FILTER_V4_W4_H4 4, 32
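The chroma vertical pp kernels above all reduce to the same four-tap column sum with the (x + 32) >> 6 rounding that pmulhrsw against pw_512 implements; a per-pixel C sketch (illustrative names):

    #include <stdint.h>

    /* 4-tap vertical "pp" filter at 8-bit depth: taps at rows y-1 .. y+2
     * (the asm does "sub r0, r1" before the first load). */
    static uint8_t vert_pp_4tap_sample(const uint8_t *src, intptr_t stride,
                                       const int8_t coeff[4])
    {
        int sum = coeff[0] * src[-stride] + coeff[1] * src[0] +
                  coeff[2] * src[stride]  + coeff[3] * src[2 * stride];
        sum = (sum + 32) >> 6;                      /* pmulhrsw with [pw_512] */
        return (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));   /* packuswb */
    }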
+
+%macro FILTER_V4_W8_H2 0
+punpcklbw m1, m2
+punpcklbw m7, m3, m0
+
+pmaddubsw m1, m6
+pmaddubsw m7, m5
+
+paddw m1, m7
+
+pmulhrsw m1, m4
+packuswb m1, m1
+%endmacro
+
+%macro FILTER_V4_W8_H3 0
+punpcklbw m2, m3
+punpcklbw m7, m0, m1
+
+pmaddubsw m2, m6
+pmaddubsw m7, m5
+
+paddw m2, m7
+
+pmulhrsw m2, m4
+packuswb m2, m2
+%endmacro
+
+%macro FILTER_V4_W8_H4 0
+punpcklbw m3, m0
+punpcklbw m7, m1, m2
+
+pmaddubsw m3, m6
+pmaddubsw m7, m5
+
+paddw m3, m7
+
+pmulhrsw m3, m4
+packuswb m3, m3
+%endmacro
+
+%macro FILTER_V4_W8_H5 0
+punpcklbw m0, m1
+punpcklbw m7, m2, m3
+
+pmaddubsw m0, m6
+pmaddubsw m7, m5
+
+paddw m0, m7
+
+pmulhrsw m0, m4
+packuswb m0, m0
+%endmacro
+
+%macro FILTER_V4_W8_8x2 2
+FILTER_V4_W8 %1, %2
+movq m0, [r0 + 4 * r1]
+
+FILTER_V4_W8_H2
+
+movh [r2 + r3], m1
+%endmacro
+
+%macro FILTER_V4_W8_8x4 2
+FILTER_V4_W8_8x2 %1, %2
+;8x3
+lea r6, [r0 + 4 * r1]
+movq m1, [r6 + r1]
+
+FILTER_V4_W8_H3
+
+movh [r2 + 2 * r3], m2
+
+;8x4
+movq m2, [r6 + 2 * r1]
+
+FILTER_V4_W8_H4
+
+lea r5, [r2 + 2 * r3]
+movh [r5 + r3], m3
+%endmacro
+
+%macro FILTER_V4_W8_8x6 2
+FILTER_V4_W8_8x4 %1, %2
+;8x5
+lea r6, [r6 + 2 * r1]
+movq m3, [r6 + r1]
+
+FILTER_V4_W8_H5
+
+movh [r2 + 4 * r3], m0
+
+;8x6
+movq m0, [r0 + 8 * r1]
+
+FILTER_V4_W8_H2
+
+lea r5, [r2 + 4 * r3]
+movh [r5 + r3], m1
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro FILTER_V4_W8 2
+INIT_XMM sse4
+cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8
+
+mov r4d, r4m
+
+sub r0, r1
+movq m0, [r0]
+movq m1, [r0 + r1]
+movq m2, [r0 + 2 * r1]
+lea r5, [r0 + 2 * r1]
+movq m3, [r5 + r1]
+
+punpcklbw m0, m1
+punpcklbw m4, m2, m3
+
+%ifdef PIC
+lea r6, [tab_ChromaCoeff]
+movd m5, [r6 + r4 * 4]
+%else
+movd m5, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+pshufb m6, m5, [tab_Vm]
+pmaddubsw m0, m6
+
+pshufb m5, [tab_Vm + 16]
+pmaddubsw m4, m5
+
+paddw m0, m4
+
+mova m4, [pw_512]
+
+pmulhrsw m0, m4
+packuswb m0, m0
+movh [r2], m0
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_vert_pp_8x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+FILTER_V4_W8_8x2 8, 2
+
+RET
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+FILTER_V4_W8_8x4 8, 4
+
+RET
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_vert_pp_8x6(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+FILTER_V4_W8_8x6 8, 6
+
+RET
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_4tap_vert_ps_4x2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal interp_4tap_vert_ps_4x2, 4, 6, 6
+
+mov r4d, r4m
+sub r0, r1
+add r3d, r3d
+
+%ifdef PIC
+lea r5, [tab_ChromaCoeff]
+movd m0, [r5 + r4 * 4]
+%else
+movd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+pshufb m0, [tab_Cm]
+
+movd m2, [r0]
+movd m3, [r0 + r1]
+lea r5, [r0 + 2 * r1]
+movd m4, [r5]
+movd m5, [r5 + r1]
+
+punpcklbw m2, m3
+punpcklbw m1, m4, m5
+punpcklbw m2, m1
+
+pmaddubsw m2, m0
+
+movd m1, [r0 + 4 * r1]
+
+punpcklbw m3, m4
+punpcklbw m5, m1
+punpcklbw m3, m5
+
+pmaddubsw m3, m0
+
+phaddw m2, m3
+
+psubw m2, [pw_2000]
+movh [r2], m2
+movhps [r2 + r3], m2
+
+RET
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_4tap_vert_ps_4x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal interp_4tap_vert_ps_4x4, 4, 6, 7
+
+ mov r4d, r4m
+ sub r0, r1
+ add r3d, r3d
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff]
+ movd m0, [r5 + r4 * 4]
+%else
+ movd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ pshufb m0, [tab_Cm]
+
+ lea r4, [r1 * 3]
+ lea r5, [r0 + 4 * r1]
+
+ movd m2, [r0]
+ movd m3, [r0 + r1]
+ movd m4, [r0 + 2 * r1]
+ movd m5, [r0 + r4]
+
+ punpcklbw m2, m3
+ punpcklbw m6, m4, m5
+ punpcklbw m2, m6
+
+ pmaddubsw m2, m0
+
+ movd m6, [r5]
+
+ punpcklbw m3, m4
+ punpcklbw m1, m5, m6
+ punpcklbw m3, m1
+
+ pmaddubsw m3, m0
+
+ phaddw m2, m3
+
+ mova m1, [pw_2000]
+
+ psubw m2, m1
+ movh [r2], m2
+ movhps [r2 + r3], m2
+
+ movd m2, [r5 + r1]
+
+ punpcklbw m4, m5
+ punpcklbw m3, m6, m2
+ punpcklbw m4, m3
+
+ pmaddubsw m4, m0
+
+ movd m3, [r5 + 2 * r1]
+
+ punpcklbw m5, m6
+ punpcklbw m2, m3
+ punpcklbw m5, m2
+
+ pmaddubsw m5, m0
+
+ phaddw m4, m5
+
+ psubw m4, m1
+ lea r2, [r2 + 2 * r3]
+ movh [r2], m4
+ movhps [r2 + r3], m4
+
+ RET
+
+;---------------------------------------------------------------------------------------------------------------
+; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;---------------------------------------------------------------------------------------------------------------
+%macro FILTER_V_PS_W4_H4 2
+INIT_XMM sse4
+cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8
+
+ mov r4d, r4m
+ sub r0, r1
+ add r3d, r3d
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff]
+ movd m0, [r5 + r4 * 4]
+%else
+ movd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ pshufb m0, [tab_Cm]
+
+ mova m1, [pw_2000]
+
+ mov r4d, %2/4
+ lea r5, [3 * r1]
+
+.loop:
+ movd m2, [r0]
+ movd m3, [r0 + r1]
+ movd m4, [r0 + 2 * r1]
+ movd m5, [r0 + r5]
+
+ punpcklbw m2, m3
+ punpcklbw m6, m4, m5
+ punpcklbw m2, m6
+
+ pmaddubsw m2, m0
+
+ lea r0, [r0 + 4 * r1]
+ movd m6, [r0]
+
+ punpcklbw m3, m4
+ punpcklbw m7, m5, m6
+ punpcklbw m3, m7
+
+ pmaddubsw m3, m0
+
+ phaddw m2, m3
+
+ psubw m2, m1
+ movh [r2], m2
+ movhps [r2 + r3], m2
+
+ movd m2, [r0 + r1]
+
+ punpcklbw m4, m5
+ punpcklbw m3, m6, m2
+ punpcklbw m4, m3
+
+ pmaddubsw m4, m0
+
+ movd m3, [r0 + 2 * r1]
+
+ punpcklbw m5, m6
+ punpcklbw m2, m3
+ punpcklbw m5, m2
+
+ pmaddubsw m5, m0
+
+ phaddw m4, m5
+
+ psubw m4, m1
+ lea r2, [r2 + 2 * r3]
+ movh [r2], m4
+ movhps [r2 + r3], m4
+
+ lea r2, [r2 + 2 * r3]
+
+ dec r4d
+ jnz .loop
+ RET
+%endmacro
+
+FILTER_V_PS_W4_H4 4, 8
+FILTER_V_PS_W4_H4 4, 16
+
+FILTER_V_PS_W4_H4 4, 32
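The ps variants in this group keep the same taps but skip the rounding and clip; the 16-bit result is only re-biased by pw_2000. A short sketch with illustrative names:

    #include <stdint.h>

    static int16_t vert_ps_4tap_sample(const uint8_t *src, intptr_t stride,
                                       const int8_t coeff[4])
    {
        int sum = coeff[0] * src[-stride] + coeff[1] * src[0] +
                  coeff[2] * src[stride]  + coeff[3] * src[2 * stride];
        return (int16_t)(sum - 8192);               /* psubw [pw_2000] */
    }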
+
+;--------------------------------------------------------------------------------------------------------------
+; void interp_4tap_vert_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;--------------------------------------------------------------------------------------------------------------
+%macro FILTER_V_PS_W8_H8_H16_H2 2
+INIT_XMM sse4
+cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 7
+
+ mov r4d, r4m
+ sub r0, r1
+ add r3d, r3d
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff]
+ movd m5, [r5 + r4 * 4]
+%else
+ movd m5, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ pshufb m6, m5, [tab_Vm]
+ pshufb m5, [tab_Vm + 16]
+ mova m4, [pw_2000]
+
+ mov r4d, %2/2
+ lea r5, [3 * r1]
+
+.loopH:
+ movq m0, [r0]
+ movq m1, [r0 + r1]
+ movq m2, [r0 + 2 * r1]
+ movq m3, [r0 + r5]
+
+ punpcklbw m0, m1
+ punpcklbw m1, m2
+ punpcklbw m2, m3
+
+ pmaddubsw m0, m6
+ pmaddubsw m2, m5
+
+ paddw m0, m2
+
+ psubw m0, m4
+ movu [r2], m0
+
+ movq m0, [r0 + 4 * r1]
+
+ punpcklbw m3, m0
+
+ pmaddubsw m1, m6
+ pmaddubsw m3, m5
+
+ paddw m1, m3
+ psubw m1, m4
+
+ movu [r2 + r3], m1
+
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+
+ dec r4d
+ jnz .loopH
+
+ RET
+%endmacro
+
+FILTER_V_PS_W8_H8_H16_H2 8, 2
+FILTER_V_PS_W8_H8_H16_H2 8, 4
+FILTER_V_PS_W8_H8_H16_H2 8, 6
+
+FILTER_V_PS_W8_H8_H16_H2 8, 12
+FILTER_V_PS_W8_H8_H16_H2 8, 64
+
+;--------------------------------------------------------------------------------------------------------------
+; void interp_4tap_vert_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;--------------------------------------------------------------------------------------------------------------
+%macro FILTER_V_PS_W8_H8_H16_H32 2
+INIT_XMM sse4
+cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8
+
+ mov r4d, r4m
+ sub r0, r1
+ add r3d, r3d
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff]
+ movd m5, [r5 + r4 * 4]
+%else
+ movd m5, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ pshufb m6, m5, [tab_Vm]
+ pshufb m5, [tab_Vm + 16]
+ mova m4, [pw_2000]
+
+ mov r4d, %2/4
+ lea r5, [3 * r1]
+
+.loop:
+ movq m0, [r0]
+ movq m1, [r0 + r1]
+ movq m2, [r0 + 2 * r1]
+ movq m3, [r0 + r5]
+
+ punpcklbw m0, m1
+ punpcklbw m1, m2
+ punpcklbw m2, m3
+
+ pmaddubsw m0, m6
+ pmaddubsw m7, m2, m5
+
+ paddw m0, m7
+
+ psubw m0, m4
+ movu [r2], m0
+
+ lea r0, [r0 + 4 * r1]
+ movq m0, [r0]
+
+ punpcklbw m3, m0
+
+ pmaddubsw m1, m6
+ pmaddubsw m7, m3, m5
+
+ paddw m1, m7
+
+ psubw m1, m4
+ movu [r2 + r3], m1
+
+ movq m1, [r0 + r1]
+
+ punpcklbw m0, m1
+
+ pmaddubsw m2, m6
+ pmaddubsw m0, m5
+
+ paddw m2, m0
+
+ psubw m2, m4
+ lea r2, [r2 + 2 * r3]
+ movu [r2], m2
+
+ movq m2, [r0 + 2 * r1]
+
+ punpcklbw m1, m2
+
+ pmaddubsw m3, m6
+ pmaddubsw m1, m5
+
+ paddw m3, m1
+ psubw m3, m4
+
+ movu [r2 + r3], m3
+
+ lea r2, [r2 + 2 * r3]
+
+ dec r4d
+ jnz .loop
+ RET
+%endmacro
+
+FILTER_V_PS_W8_H8_H16_H32 8, 8
+FILTER_V_PS_W8_H8_H16_H32 8, 16
+FILTER_V_PS_W8_H8_H16_H32 8, 32
+
+;------------------------------------------------------------------------------------------------------------
+; void interp_4tap_vert_ps_6x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;------------------------------------------------------------------------------------------------------------
+%macro FILTER_V_PS_W6 2
+INIT_XMM sse4
+cglobal interp_4tap_vert_ps_6x%2, 4, 6, 8
+
+ mov r4d, r4m
+ sub r0, r1
+ add r3d, r3d
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff]
+ movd m5, [r5 + r4 * 4]
+%else
+ movd m5, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ pshufb m6, m5, [tab_Vm]
+ pshufb m5, [tab_Vm + 16]
+ mova m4, [pw_2000]
+ lea r5, [3 * r1]
+ mov r4d, %2/4
+
+.loop:
+ movq m0, [r0]
+ movq m1, [r0 + r1]
+ movq m2, [r0 + 2 * r1]
+ movq m3, [r0 + r5]
+
+ punpcklbw m0, m1
+ punpcklbw m1, m2
+ punpcklbw m2, m3
+
+ pmaddubsw m0, m6
+ pmaddubsw m7, m2, m5
+
+ paddw m0, m7
+ psubw m0, m4
+
+ movh [r2], m0
+ pshufd m0, m0, 2
+ movd [r2 + 8], m0
+
+ lea r0, [r0 + 4 * r1]
+ movq m0, [r0]
+ punpcklbw m3, m0
+
+ pmaddubsw m1, m6
+ pmaddubsw m7, m3, m5
+
+ paddw m1, m7
+ psubw m1, m4
+
+ movh [r2 + r3], m1
+ pshufd m1, m1, 2
+ movd [r2 + r3 + 8], m1
+
+ movq m1, [r0 + r1]
+ punpcklbw m0, m1
+
+ pmaddubsw m2, m6
+ pmaddubsw m0, m5
+
+ paddw m2, m0
+ psubw m2, m4
+
+ lea r2,[r2 + 2 * r3]
+ movh [r2], m2
+ pshufd m2, m2, 2
+ movd [r2 + 8], m2
+
+ movq m2,[r0 + 2 * r1]
+ punpcklbw m1, m2
+
+ pmaddubsw m3, m6
+ pmaddubsw m1, m5
+
+ paddw m3, m1
+ psubw m3, m4
+
+ movh [r2 + r3], m3
+ pshufd m3, m3, 2
+ movd [r2 + r3 + 8], m3
+
+ lea r2, [r2 + 2 * r3]
+
+ dec r4d
+ jnz .loop
+ RET
+%endmacro
+
+FILTER_V_PS_W6 6, 8
+FILTER_V_PS_W6 6, 16
+
+;---------------------------------------------------------------------------------------------------------------
+; void interp_4tap_vert_ps_12x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;---------------------------------------------------------------------------------------------------------------
+%macro FILTER_V_PS_W12 2
+INIT_XMM sse4
+cglobal interp_4tap_vert_ps_12x%2, 4, 6, 8
+
+ mov r4d, r4m
+ sub r0, r1
+ add r3d, r3d
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff]
+ movd m0, [r5 + r4 * 4]
+%else
+ movd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ pshufb m1, m0, [tab_Vm]
+ pshufb m0, [tab_Vm + 16]
+
+ mov r4d, %2/2
+
+.loop:
+ movu m2, [r0]
+ movu m3, [r0 + r1]
+
+ punpcklbw m4, m2, m3
+ punpckhbw m2, m3
+
+ pmaddubsw m4, m1
+ pmaddubsw m2, m1
+
+ lea r0, [r0 + 2 * r1]
+ movu m5, [r0]
+ movu m7, [r0 + r1]
+
+ punpcklbw m6, m5, m7
+ pmaddubsw m6, m0
+ paddw m4, m6
+
+ punpckhbw m6, m5, m7
+ pmaddubsw m6, m0
+ paddw m2, m6
+
+ mova m6, [pw_2000]
+
+ psubw m4, m6
+ psubw m2, m6
+
+ movu [r2], m4
+ movh [r2 + 16], m2
+
+ punpcklbw m4, m3, m5
+ punpckhbw m3, m5
+
+ pmaddubsw m4, m1
+ pmaddubsw m3, m1
+
+ movu m2, [r0 + 2 * r1]
+
+ punpcklbw m5, m7, m2
+ punpckhbw m7, m2
+
+ pmaddubsw m5, m0
+ pmaddubsw m7, m0
+
+ paddw m4, m5
+ paddw m3, m7
+
+ psubw m4, m6
+ psubw m3, m6
+
+ movu [r2 + r3], m4
+ movh [r2 + r3 + 16], m3
+
+ lea r2, [r2 + 2 * r3]
+
+ dec r4d
+ jnz .loop
+ RET
+%endmacro
+
+FILTER_V_PS_W12 12, 16
+FILTER_V_PS_W12 12, 32
+
+;---------------------------------------------------------------------------------------------------------------
+; void interp_4tap_vert_ps_16x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;---------------------------------------------------------------------------------------------------------------
+%macro FILTER_V_PS_W16 2
+INIT_XMM sse4
+cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8
+
+ mov r4d, r4m
+ sub r0, r1
+ add r3d, r3d
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff]
+ movd m0, [r5 + r4 * 4]
+%else
+ movd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ pshufb m1, m0, [tab_Vm]
+ pshufb m0, [tab_Vm + 16]
+ mov r4d, %2/2
+
+.loop:
+ movu m2, [r0]
+ movu m3, [r0 + r1]
+
+ punpcklbw m4, m2, m3
+ punpckhbw m2, m3
+
+ pmaddubsw m4, m1
+ pmaddubsw m2, m1
+
+ lea r0, [r0 + 2 * r1]
+ movu m5, [r0]
+ movu m7, [r0 + r1]
+
+ punpcklbw m6, m5, m7
+ pmaddubsw m6, m0
+ paddw m4, m6
+
+ punpckhbw m6, m5, m7
+ pmaddubsw m6, m0
+ paddw m2, m6
+
+ mova m6, [pw_2000]
+
+ psubw m4, m6
+ psubw m2, m6
+
+ movu [r2], m4
+ movu [r2 + 16], m2
+
+ punpcklbw m4, m3, m5
+ punpckhbw m3, m5
+
+ pmaddubsw m4, m1
+ pmaddubsw m3, m1
+
+ movu m5, [r0 + 2 * r1]
+
+ punpcklbw m2, m7, m5
+ punpckhbw m7, m5
+
+ pmaddubsw m2, m0
+ pmaddubsw m7, m0
+
+ paddw m4, m2
+ paddw m3, m7
+
+ psubw m4, m6
+ psubw m3, m6
+
+ movu [r2 + r3], m4
+ movu [r2 + r3 + 16], m3
+
+ lea r2, [r2 + 2 * r3]
+
+ dec r4d
+ jnz .loop
+ RET
+%endmacro
+
+FILTER_V_PS_W16 16, 4
+FILTER_V_PS_W16 16, 8
+FILTER_V_PS_W16 16, 12
+FILTER_V_PS_W16 16, 16
+FILTER_V_PS_W16 16, 32
+
+FILTER_V_PS_W16 16, 24
+FILTER_V_PS_W16 16, 64
+
+;--------------------------------------------------------------------------------------------------------------
+; void interp_4tap_vert_ps_24x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;--------------------------------------------------------------------------------------------------------------
+%macro FILTER_V4_PS_W24 2
+INIT_XMM sse4
+cglobal interp_4tap_vert_ps_24x%2, 4, 6, 8
+
+ mov r4d, r4m
+ sub r0, r1
+ add r3d, r3d
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff]
+ movd m0, [r5 + r4 * 4]
+%else
+ movd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ pshufb m1, m0, [tab_Vm]
+ pshufb m0, [tab_Vm + 16]
+
+ mov r4d, %2/2
+
+.loop:
+ movu m2, [r0]
+ movu m3, [r0 + r1]
+
+ punpcklbw m4, m2, m3
+ punpckhbw m2, m3
+
+ pmaddubsw m4, m1
+ pmaddubsw m2, m1
+
+ lea r5, [r0 + 2 * r1]
+
+ movu m5, [r5]
+ movu m7, [r5 + r1]
+
+ punpcklbw m6, m5, m7
+ pmaddubsw m6, m0
+ paddw m4, m6
+
+ punpckhbw m6, m5, m7
+ pmaddubsw m6, m0
+ paddw m2, m6
+
+ mova m6, [pw_2000]
+
+ psubw m4, m6
+ psubw m2, m6
+
+ movu [r2], m4
+ movu [r2 + 16], m2
+
+ punpcklbw m4, m3, m5
+ punpckhbw m3, m5
+
+ pmaddubsw m4, m1
+ pmaddubsw m3, m1
+
+ movu m2, [r5 + 2 * r1]
+
+ punpcklbw m5, m7, m2
+ punpckhbw m7, m2
+
+ pmaddubsw m5, m0
+ pmaddubsw m7, m0
+
+ paddw m4, m5
+ paddw m3, m7
+
+ psubw m4, m6
+ psubw m3, m6
+
+ movu [r2 + r3], m4
+ movu [r2 + r3 + 16], m3
+
+ movq m2, [r0 + 16]
+ movq m3, [r0 + r1 + 16]
+ movq m4, [r5 + 16]
+ movq m5, [r5 + r1 + 16]
+
+ punpcklbw m2, m3
+ punpcklbw m7, m4, m5
+
+ pmaddubsw m2, m1
+ pmaddubsw m7, m0
+
+ paddw m2, m7
+ psubw m2, m6
+
+ movu [r2 + 32], m2
+
+ movq m2, [r5 + 2 * r1 + 16]
+
+ punpcklbw m3, m4
+ punpcklbw m5, m2
+
+ pmaddubsw m3, m1
+ pmaddubsw m5, m0
+
+ paddw m3, m5
+ psubw m3, m6
+
+ movu [r2 + r3 + 32], m3
+
+ mov r0, r5
+ lea r2, [r2 + 2 * r3]
+
+ dec r4d
+ jnz .loop
+ RET
+%endmacro
+
+FILTER_V4_PS_W24 24, 32
+
+FILTER_V4_PS_W24 24, 64
+
+;---------------------------------------------------------------------------------------------------------------
+; void interp_4tap_vert_ps_32x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;---------------------------------------------------------------------------------------------------------------
+%macro FILTER_V_PS_W32 2
+INIT_XMM sse4
+cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8
+
+ mov r4d, r4m
+ sub r0, r1
+ add r3d, r3d
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff]
+ movd m0, [r5 + r4 * 4]
+%else
+ movd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ pshufb m1, m0, [tab_Vm]
+ pshufb m0, [tab_Vm + 16]
+
+ mova m7, [pw_2000]
+
+ mov r4d, %2
+
+.loop:
+ movu m2, [r0]
+ movu m3, [r0 + r1]
+
+ punpcklbw m4, m2, m3
+ punpckhbw m2, m3
+
+ pmaddubsw m4, m1
+ pmaddubsw m2, m1
+
+ lea r5, [r0 + 2 * r1]
+ movu m3, [r5]
+ movu m5, [r5 + r1]
+
+ punpcklbw m6, m3, m5
+ punpckhbw m3, m5
+
+ pmaddubsw m6, m0
+ pmaddubsw m3, m0
+
+ paddw m4, m6
+ paddw m2, m3
+
+ psubw m4, m7
+ psubw m2, m7
+
+ movu [r2], m4
+ movu [r2 + 16], m2
+
+ movu m2, [r0 + 16]
+ movu m3, [r0 + r1 + 16]
+
+ punpcklbw m4, m2, m3
+ punpckhbw m2, m3
+
+ pmaddubsw m4, m1
+ pmaddubsw m2, m1
+
+ movu m3, [r5 + 16]
+ movu m5, [r5 + r1 + 16]
+
+ punpcklbw m6, m3, m5
+ punpckhbw m3, m5
+
+ pmaddubsw m6, m0
+ pmaddubsw m3, m0
+
+ paddw m4, m6
+ paddw m2, m3
+
+ psubw m4, m7
+ psubw m2, m7
+
+ movu [r2 + 32], m4
+ movu [r2 + 48], m2
+
+ lea r0, [r0 + r1]
+ lea r2, [r2 + r3]
+
+ dec r4d
+ jnz .loop
+ RET
+%endmacro
+
+FILTER_V_PS_W32 32, 8
+FILTER_V_PS_W32 32, 16
+FILTER_V_PS_W32 32, 24
+FILTER_V_PS_W32 32, 32
+
+FILTER_V_PS_W32 32, 48
+FILTER_V_PS_W32 32, 64
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro FILTER_V4_W8_H8_H16_H32 2
+INIT_XMM sse4
+cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8
+
+mov r4d, r4m
+sub r0, r1
+
+%ifdef PIC
+lea r5, [tab_ChromaCoeff]
+movd m5, [r5 + r4 * 4]
+%else
+movd m5, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+pshufb m6, m5, [tab_Vm]
+pshufb m5, [tab_Vm + 16]
+mova m4, [pw_512]
+lea r5, [r1 * 3]
+
+mov r4d, %2
+
+.loop:
+movq m0, [r0]
+movq m1, [r0 + r1]
+movq m2, [r0 + 2 * r1]
+movq m3, [r0 + r5]
+
+punpcklbw m0, m1
+punpcklbw m1, m2
+punpcklbw m2, m3
+
+pmaddubsw m0, m6
+pmaddubsw m7, m2, m5
+
+paddw m0, m7
+
+pmulhrsw m0, m4
+packuswb m0, m0
+movh [r2], m0
+
+lea r0, [r0 + 4 * r1]
+movq m0, [r0]
+
+punpcklbw m3, m0
+
+pmaddubsw m1, m6
+pmaddubsw m7, m3, m5
+
+paddw m1, m7
+
+pmulhrsw m1, m4
+packuswb m1, m1
+movh [r2 + r3], m1
+
+movq m1, [r0 + r1]
+
+punpcklbw m0, m1
+
+pmaddubsw m2, m6
+pmaddubsw m0, m5
+
+paddw m2, m0
+
+pmulhrsw m2, m4
+
+movq m7, [r0 + 2 * r1]
+punpcklbw m1, m7
+
+pmaddubsw m3, m6
+pmaddubsw m1, m5
+
+paddw m3, m1
+
+pmulhrsw m3, m4
+packuswb m2, m3
+
+lea r2, [r2 + 2 * r3]
+movh [r2], m2
+movhps [r2 + r3], m2
+
+lea r2, [r2 + 2 * r3]
+
+sub r4, 4
+jnz .loop
+RET
+%endmacro
+
+FILTER_V4_W8_H8_H16_H32 8, 8
+FILTER_V4_W8_H8_H16_H32 8, 16
+FILTER_V4_W8_H8_H16_H32 8, 32
+
+FILTER_V4_W8_H8_H16_H32 8, 12
+FILTER_V4_W8_H8_H16_H32 8, 64
+
+
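The pp kernels above finish with pmulhrsw against pw_512 followed by packuswb; for any 16-bit sum this is the same as the usual (sum + 32) >> 6 rounding plus a clamp to [0, 255]. A small self-contained check of that identity (the helper name is only illustrative):

    #include <cassert>

    // pmulhrsw(x, 512) == (x*512 + 0x4000) >> 15 == (x + 32) >> 6 for 16-bit x.
    static inline int pmulhrsw_512(int x)
    {
        return (x * 512 + (1 << 14)) >> 15;
    }

    int main()
    {
        for (int x = -32768; x <= 32767; x++)
            assert(pmulhrsw_512(x) == ((x + 32) >> 6));
        return 0;
    }
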
+;-----------------------------------------------------------------------------
+;void interp_4tap_vert_pp_6x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro FILTER_V4_W6_H4 2
+INIT_XMM sse4
+cglobal interp_4tap_vert_pp_6x%2, 4, 6, 8
+
+mov r4d, r4m
+sub r0, r1
+
+%ifdef PIC
+lea r5, [tab_ChromaCoeff]
+movd m5, [r5 + r4 * 4]
+%else
+movd m5, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+pshufb m6, m5, [tab_Vm]
+pshufb m5, [tab_Vm + 16]
+mova m4, [pw_512]
+
+mov r4d, %2
+lea r5, [3 * r1]
+
+.loop:
+movq m0, [r0]
+movq m1, [r0 + r1]
+movq m2, [r0 + 2 * r1]
+movq m3, [r0 + r5]
+
+punpcklbw m0, m1
+punpcklbw m1, m2
+punpcklbw m2, m3
+
+pmaddubsw m0, m6
+pmaddubsw m7, m2, m5
+
+paddw m0, m7
+
+pmulhrsw m0, m4
+packuswb m0, m0
+movd [r2], m0
+pextrw [r2 + 4], m0, 2
+
+lea r0, [r0 + 4 * r1]
+
+movq m0, [r0]
+punpcklbw m3, m0
+
+pmaddubsw m1, m6
+pmaddubsw m7, m3, m5
+
+paddw m1, m7
+
+pmulhrsw m1, m4
+packuswb m1, m1
+movd [r2 + r3], m1
+pextrw [r2 + r3 + 4], m1, 2
+
+movq m1, [r0 + r1]
+punpcklbw m7, m0, m1
+
+pmaddubsw m2, m6
+pmaddubsw m7, m5
+
+paddw m2, m7
+
+pmulhrsw m2, m4
+packuswb m2, m2
+lea r2, [r2 + 2 * r3]
+movd [r2], m2
+pextrw [r2 + 4], m2, 2
+
+movq m2, [r0 + 2 * r1]
+punpcklbw m1, m2
+
+pmaddubsw m3, m6
+pmaddubsw m1, m5
+
+paddw m3, m1
+
+pmulhrsw m3, m4
+packuswb m3, m3
+
+movd [r2 + r3], m3
+pextrw [r2 + r3 + 4], m3, 2
+
+lea r2, [r2 + 2 * r3]
+
+sub r4, 4
+jnz .loop
+RET
+%endmacro
+
+FILTER_V4_W6_H4 6, 8
+
+FILTER_V4_W6_H4 6, 16
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_vert_pp_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro FILTER_V4_W12_H2 2
+INIT_XMM sse4
+cglobal interp_4tap_vert_pp_12x%2, 4, 6, 8
+
+mov r4d, r4m
+sub r0, r1
+
+%ifdef PIC
+lea r5, [tab_ChromaCoeff]
+movd m0, [r5 + r4 * 4]
+%else
+movd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+pshufb m1, m0, [tab_Vm]
+pshufb m0, [tab_Vm + 16]
+
+mov r4d, %2
+
+.loop:
+movu m2, [r0]
+movu m3, [r0 + r1]
+
+punpcklbw m4, m2, m3
+punpckhbw m2, m3
+
+pmaddubsw m4, m1
+pmaddubsw m2, m1
+
+lea r0, [r0 + 2 * r1]
+movu m5, [r0]
+movu m7, [r0 + r1]
+
+punpcklbw m6, m5, m7
+pmaddubsw m6, m0
+paddw m4, m6
+
+punpckhbw m6, m5, m7
+pmaddubsw m6, m0
+paddw m2, m6
+
+mova m6, [pw_512]
+
+pmulhrsw m4, m6
+pmulhrsw m2, m6
+
+packuswb m4, m2
+
+movh [r2], m4
+pextrd [r2 + 8], m4, 2
+
+punpcklbw m4, m3, m5
+punpckhbw m3, m5
+
+pmaddubsw m4, m1
+pmaddubsw m3, m1
+
+movu m5, [r0 + 2 * r1]
+
+punpcklbw m2, m7, m5
+punpckhbw m7, m5
+
+pmaddubsw m2, m0
+pmaddubsw m7, m0
+
+paddw m4, m2
+paddw m3, m7
+
+pmulhrsw m4, m6
+pmulhrsw m3, m6
+
+packuswb m4, m3
+
+movh [r2 + r3], m4
+pextrd [r2 + r3 + 8], m4, 2
+
+lea r2, [r2 + 2 * r3]
+
+sub r4, 2
+jnz .loop
+RET
+%endmacro
+
+FILTER_V4_W12_H2 12, 16
+
+FILTER_V4_W12_H2 12, 32
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_vert_pp_16x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro FILTER_V4_W16_H2 2
+INIT_XMM sse4
+cglobal interp_4tap_vert_pp_16x%2, 4, 6, 8
+
+mov r4d, r4m
+sub r0, r1
+
+%ifdef PIC
+lea r5, [tab_ChromaCoeff]
+movd m0, [r5 + r4 * 4]
+%else
+movd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+pshufb m1, m0, [tab_Vm]
+pshufb m0, [tab_Vm + 16]
+
+mov r4d, %2/2
+
+.loop:
+movu m2, [r0]
+movu m3, [r0 + r1]
+
+punpcklbw m4, m2, m3
+punpckhbw m2, m3
+
+pmaddubsw m4, m1
+pmaddubsw m2, m1
+
+lea r0, [r0 + 2 * r1]
+movu m5, [r0]
+movu m6, [r0 + r1]
+
+punpckhbw m7, m5, m6
+pmaddubsw m7, m0
+paddw m2, m7
+
+punpcklbw m7, m5, m6
+pmaddubsw m7, m0
+paddw m4, m7
+
+mova m7, [pw_512]
+
+pmulhrsw m4, m7
+pmulhrsw m2, m7
+
+packuswb m4, m2
+
+movu [r2], m4
+
+punpcklbw m4, m3, m5
+punpckhbw m3, m5
+
+pmaddubsw m4, m1
+pmaddubsw m3, m1
+
+movu m5, [r0 + 2 * r1]
+
+punpcklbw m2, m6, m5
+punpckhbw m6, m5
+
+pmaddubsw m2, m0
+pmaddubsw m6, m0
+
+paddw m4, m2
+paddw m3, m6
+
+pmulhrsw m4, m7
+pmulhrsw m3, m7
+
+packuswb m4, m3
+
+movu [r2 + r3], m4
+
+lea r2, [r2 + 2 * r3]
+
+dec r4d
+jnz .loop
+RET
+%endmacro
+
+FILTER_V4_W16_H2 16, 4
+FILTER_V4_W16_H2 16, 8
+FILTER_V4_W16_H2 16, 12
+FILTER_V4_W16_H2 16, 16
+FILTER_V4_W16_H2 16, 32
+
+FILTER_V4_W16_H2 16, 24
+FILTER_V4_W16_H2 16, 64
+
+;-----------------------------------------------------------------------------
+;void interp_4tap_vert_pp_24x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro FILTER_V4_W24 2
+INIT_XMM sse4
+cglobal interp_4tap_vert_pp_24x%2, 4, 6, 8
+
+mov r4d, r4m
+sub r0, r1
+
+%ifdef PIC
+lea r5, [tab_ChromaCoeff]
+movd m0, [r5 + r4 * 4]
+%else
+movd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+pshufb m1, m0, [tab_Vm]
+pshufb m0, [tab_Vm + 16]
+
+mov r4d, %2
+
+.loop:
+movu m2, [r0]
+movu m3, [r0 + r1]
+
+punpcklbw m4, m2, m3
+punpckhbw m2, m3
+
+pmaddubsw m4, m1
+pmaddubsw m2, m1
+
+lea r5, [r0 + 2 * r1]
+movu m5, [r5]
+movu m7, [r5 + r1]
+
+punpcklbw m6, m5, m7
+pmaddubsw m6, m0
+paddw m4, m6
+
+punpckhbw m6, m5, m7
+pmaddubsw m6, m0
+paddw m2, m6
+
+mova m6, [pw_512]
+
+pmulhrsw m4, m6
+pmulhrsw m2, m6
+
+packuswb m4, m2
+
+movu [r2], m4
+
+punpcklbw m4, m3, m5
+punpckhbw m3, m5
+
+pmaddubsw m4, m1
+pmaddubsw m3, m1
+
+movu m2, [r5 + 2 * r1]
+
+punpcklbw m5, m7, m2
+punpckhbw m7, m2
+
+pmaddubsw m5, m0
+pmaddubsw m7, m0
+
+paddw m4, m5
+paddw m3, m7
+
+pmulhrsw m4, m6
+pmulhrsw m3, m6
+
+packuswb m4, m3
+
+movu [r2 + r3], m4
+
+movq m2, [r0 + 16]
+movq m3, [r0 + r1 + 16]
+movq m4, [r5 + 16]
+movq m5, [r5 + r1 + 16]
+
+punpcklbw m2, m3
+punpcklbw m4, m5
+
+pmaddubsw m2, m1
+pmaddubsw m4, m0
+
+paddw m2, m4
+
+pmulhrsw m2, m6
+
+movq m3, [r0 + r1 + 16]
+movq m4, [r5 + 16]
+movq m5, [r5 + r1 + 16]
+movq m7, [r5 + 2 * r1 + 16]
+
+punpcklbw m3, m4
+punpcklbw m5, m7
+
+pmaddubsw m3, m1
+pmaddubsw m5, m0
+
+paddw m3, m5
+
+pmulhrsw m3, m6
+packuswb m2, m3
+
+movh [r2 + 16], m2
+movhps [r2 + r3 + 16], m2
+
+mov r0, r5
+lea r2, [r2 + 2 * r3]
+
+sub r4, 2
+jnz .loop
+RET
+%endmacro
+
+FILTER_V4_W24 24, 32
+
+FILTER_V4_W24 24, 64
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_vert_pp_32x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro FILTER_V4_W32 2
+INIT_XMM sse4
+cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8
+
+mov r4d, r4m
+sub r0, r1
+
+%ifdef PIC
+lea r5, [tab_ChromaCoeff]
+movd m0, [r5 + r4 * 4]
+%else
+movd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+pshufb m1, m0, [tab_Vm]
+pshufb m0, [tab_Vm + 16]
+
+mova m7, [pw_512]
+
+mov r4d, %2
+
+.loop:
+movu m2, [r0]
+movu m3, [r0 + r1]
+
+punpcklbw m4, m2, m3
+punpckhbw m2, m3
+
+pmaddubsw m4, m1
+pmaddubsw m2, m1
+
+lea r5, [r0 + 2 * r1]
+movu m3, [r5]
+movu m5, [r5 + r1]
+
+punpcklbw m6, m3, m5
+punpckhbw m3, m5
+
+pmaddubsw m6, m0
+pmaddubsw m3, m0
+
+paddw m4, m6
+paddw m2, m3
+
+pmulhrsw m4, m7
+pmulhrsw m2, m7
+
+packuswb m4, m2
+
+movu [r2], m4
+
+movu m2, [r0 + 16]
+movu m3, [r0 + r1 + 16]
+
+punpcklbw m4, m2, m3
+punpckhbw m2, m3
+
+pmaddubsw m4, m1
+pmaddubsw m2, m1
+
+movu m3, [r5 + 16]
+movu m5, [r5 + r1 + 16]
+
+punpcklbw m6, m3, m5
+punpckhbw m3, m5
+
+pmaddubsw m6, m0
+pmaddubsw m3, m0
+
+paddw m4, m6
+paddw m2, m3
+
+pmulhrsw m4, m7
+pmulhrsw m2, m7
+
+packuswb m4, m2
+
+movu [r2 + 16], m4
+
+lea r0, [r0 + r1]
+lea r2, [r2 + r3]
+
+dec r4
+jnz .loop
+RET
+%endmacro
+
+FILTER_V4_W32 32, 8
+FILTER_V4_W32 32, 16
+FILTER_V4_W32 32, 24
+FILTER_V4_W32 32, 32
+
+FILTER_V4_W32 32, 48
+FILTER_V4_W32 32, 64
+
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro FILTER_V4_W16n_H2 2
+INIT_XMM sse4
+cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8
+
+mov r4d, r4m
+sub r0, r1
+
+%ifdef PIC
+lea r5, [tab_ChromaCoeff]
+movd m0, [r5 + r4 * 4]
+%else
+movd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+pshufb m1, m0, [tab_Vm]
+pshufb m0, [tab_Vm + 16]
+
+mov r4d, %2/2
+
+.loop:
+
+mov r6d, %1/16
+
+.loopW:
+
+movu m2, [r0]
+movu m3, [r0 + r1]
+
+punpcklbw m4, m2, m3
+punpckhbw m2, m3
+
+pmaddubsw m4, m1
+pmaddubsw m2, m1
+
+lea r5, [r0 + 2 * r1]
+movu m5, [r5]
+movu m6, [r5 + r1]
+
+punpckhbw m7, m5, m6
+pmaddubsw m7, m0
+paddw m2, m7
+
+punpcklbw m7, m5, m6
+pmaddubsw m7, m0
+paddw m4, m7
+
+mova m7, [pw_512]
+
+pmulhrsw m4, m7
+pmulhrsw m2, m7
+
+packuswb m4, m2
+
+movu [r2], m4
+
+punpcklbw m4, m3, m5
+punpckhbw m3, m5
+
+pmaddubsw m4, m1
+pmaddubsw m3, m1
+
+movu m5, [r5 + 2 * r1]
+
+punpcklbw m2, m6, m5
+punpckhbw m6, m5
+
+pmaddubsw m2, m0
+pmaddubsw m6, m0
+
+paddw m4, m2
+paddw m3, m6
+
+pmulhrsw m4, m7
+pmulhrsw m3, m7
+
+packuswb m4, m3
+
+movu [r2 + r3], m4
+
+add r0, 16
+add r2, 16
+dec r6d
+jnz .loopW
+
+lea r0, [r0 + r1 * 2 - %1]
+lea r2, [r2 + r3 * 2 - %1]
+
+dec r4d
+jnz .loop
+RET
+%endmacro
+
+FILTER_V4_W16n_H2 64, 64
+FILTER_V4_W16n_H2 64, 32
+FILTER_V4_W16n_H2 64, 48
+FILTER_V4_W16n_H2 48, 64
+FILTER_V4_W16n_H2 64, 16
+
+
+;-----------------------------------------------------------------------------
+; void filterConvertPelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
+;-----------------------------------------------------------------------------
+INIT_XMM ssse3
+cglobal luma_p2s, 3, 7, 6
+
+ ; load width and height
+ mov r3d, r3m
+ mov r4d, r4m
+
+ ; load constant
+ mova m4, [pb_128]
+ mova m5, [tab_c_64_n64]
+
+.loopH:
+
+ xor r5d, r5d
+.loopW:
+ lea r6, [r0 + r5]
+
+ movh m0, [r6]
+ punpcklbw m0, m4
+ pmaddubsw m0, m5
+
+ movh m1, [r6 + r1]
+ punpcklbw m1, m4
+ pmaddubsw m1, m5
+
+ movh m2, [r6 + r1 * 2]
+ punpcklbw m2, m4
+ pmaddubsw m2, m5
+
+ lea r6, [r6 + r1 * 2]
+ movh m3, [r6 + r1]
+ punpcklbw m3, m4
+ pmaddubsw m3, m5
+
+ add r5, 8
+ cmp r5, r3
+ jg .width4
+ movu [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0
+ movu [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1
+ movu [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2
+ movu [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3
+ je .nextH
+ jmp .loopW
+
+.width4:
+ movh [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0
+ movh [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1
+ movh [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2
+ movh [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3
+
+.nextH:
+ lea r0, [r0 + r1 * 4]
+ add r2, FENC_STRIDE * 8
+
+ sub r4d, 4
+ jnz .loopH
+
+ RET
+
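In luma_p2s above, interleaving each source byte with pb_128 and multiplying by the (64, -64) pairs of tab_c_64_n64 yields pix*64 - 128*64, i.e. (pix << 6) - 8192 per sample. A scalar sketch of that conversion; the dstStride parameter is an assumption added for readability (the asm instead writes into a FENC_STRIDE-pitched block):

    #include <cstdint>

    // (pix << 6) - 8192: the 16-bit intermediate domain used by the ps/sp paths.
    static void luma_p2s_ref(const uint8_t* src, intptr_t srcStride,
                             int16_t* dst, intptr_t dstStride,
                             int width, int height)
    {
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
                dst[x] = (int16_t)((src[x] << 6) - 8192);   // pmaddubsw with {64, -64}
            src += srcStride;
            dst += dstStride;
        }
    }
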
+%macro PROCESS_LUMA_W4_4R 0
+ movd m0, [r0]
+ movd m1, [r0 + r1]
+ punpcklbw m2, m0, m1 ; m2=[0 1]
+
+ lea r0, [r0 + 2 * r1]
+ movd m0, [r0]
+ punpcklbw m1, m0 ; m1=[1 2]
+ punpcklqdq m2, m1 ; m2=[0 1 1 2]
+ pmaddubsw m4, m2, [r6 + 0 * 16] ; m4=[0+1 1+2]
+
+ movd m1, [r0 + r1]
+ punpcklbw m5, m0, m1 ; m5=[2 3]
+ lea r0, [r0 + 2 * r1]
+ movd m0, [r0]
+ punpcklbw m1, m0 ; m1=[3 4]
+ punpcklqdq m5, m1 ; m5=[2 3 3 4]
+ pmaddubsw m2, m5, [r6 + 1 * 16] ; m2=[2+3 3+4]
+ paddw m4, m2 ; m4=[0+1+2+3 1+2+3+4] Row1-2
+ pmaddubsw m5, [r6 + 0 * 16] ; m5=[2+3 3+4] Row3-4
+
+ movd m1, [r0 + r1]
+ punpcklbw m2, m0, m1 ; m2=[4 5]
+ lea r0, [r0 + 2 * r1]
+ movd m0, [r0]
+ punpcklbw m1, m0 ; m1=[5 6]
+ punpcklqdq m2, m1 ; m2=[4 5 5 6]
+ pmaddubsw m1, m2, [r6 + 2 * 16] ; m1=[4+5 5+6]
+ paddw m4, m1 ; m4=[0+1+2+3+4+5 1+2+3+4+5+6] Row1-2
+ pmaddubsw m2, [r6 + 1 * 16] ; m2=[4+5 5+6]
+ paddw m5, m2 ; m5=[2+3+4+5 3+4+5+6] Row3-4
+
+ movd m1, [r0 + r1]
+ punpcklbw m2, m0, m1 ; m2=[6 7]
+ lea r0, [r0 + 2 * r1]
+ movd m0, [r0]
+ punpcklbw m1, m0 ; m1=[7 8]
+ punpcklqdq m2, m1 ; m2=[6 7 7 8]
+ pmaddubsw m1, m2, [r6 + 3 * 16] ; m1=[6+7 7+8]
+ paddw m4, m1 ; m4=[0+1+2+3+4+5+6+7 1+2+3+4+5+6+7+8] Row1-2 end
+ pmaddubsw m2, [r6 + 2 * 16] ; m2=[6+7 7+8]
+ paddw m5, m2 ; m5=[2+3+4+5+6+7 3+4+5+6+7+8] Row3-4
+
+ movd m1, [r0 + r1]
+ punpcklbw m2, m0, m1 ; m2=[8 9]
+ movd m0, [r0 + 2 * r1]
+ punpcklbw m1, m0 ; m1=[9 10]
+ punpcklqdq m2, m1 ; m2=[8 9 9 10]
+ pmaddubsw m2, [r6 + 3 * 16] ; m2=[8+9 9+10]
+ paddw m5, m2 ; m5=[2+3+4+5+6+7+8+9 3+4+5+6+7+8+9+10] Row3-4 end
+%endmacro
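
PROCESS_LUMA_W4_4R above produces four rows of the vertical 8-tap luma filter for a 4-pixel-wide column, holding rows 1-2 in m4 and rows 3-4 in m5 (two rows packed per register via punpcklqdq), so the caller only has to apply the pp or ps epilogue. The sum each lane carries at that point is, in scalar form (a sketch, assuming a plain 8-entry tap array rather than the interleaved tab_LumaCoeffVer layout):

    #include <cstdint>

    // One vertical 8-tap sum; src points at the topmost contributing row,
    // i.e. three rows above the output row (the callers do sub r0, 3*r1).
    static int32_t luma_vert_sum(const uint8_t* src, intptr_t srcStride,
                                 const int8_t coeff[8])
    {
        int32_t sum = 0;
        for (int t = 0; t < 8; t++)
            sum += coeff[t] * src[t * srcStride];
        return sum;
    }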
+
+%macro PROCESS_LUMA_W8_4R 0
+ movq m0, [r0]
+ movq m1, [r0 + r1]
+ punpcklbw m0, m1
+ pmaddubsw m7, m0, [r6 + 0 *16] ;m7=[0+1] Row1
+
+ lea r0, [r0 + 2 * r1]
+ movq m0, [r0]
+ punpcklbw m1, m0
+ pmaddubsw m6, m1, [r6 + 0 *16] ;m6=[1+2] Row2
+
+ movq m1, [r0 + r1]
+ punpcklbw m0, m1
+ pmaddubsw m5, m0, [r6 + 0 *16] ;m5=[2+3] Row3
+ pmaddubsw m0, [r6 + 1 * 16]
+ paddw m7, m0 ;m7=[0+1+2+3] Row1
+
+ lea r0, [r0 + 2 * r1]
+ movq m0, [r0]
+ punpcklbw m1, m0
+ pmaddubsw m4, m1, [r6 + 0 *16] ;m4=[3+4] Row4
+ pmaddubsw m1, [r6 + 1 * 16]
+ paddw m6, m1 ;m6 = [1+2+3+4] Row2
+
+ movq m1, [r0 + r1]
+ punpcklbw m0, m1
+ pmaddubsw m2, m0, [r6 + 1 * 16]
+ pmaddubsw m0, [r6 + 2 * 16]
+ paddw m7, m0 ;m7=[0+1+2+3+4+5] Row1
+ paddw m5, m2 ;m5=[2+3+4+5] Row3
+
+ lea r0, [r0 + 2 * r1]
+ movq m0, [r0]
+ punpcklbw m1, m0
+ pmaddubsw m2, m1, [r6 + 1 * 16]
+ pmaddubsw m1, [r6 + 2 * 16]
+ paddw m6, m1 ;m6=[1+2+3+4+5+6] Row2
+ paddw m4, m2 ;m4=[3+4+5+6] Row4
+
+ movq m1, [r0 + r1]
+ punpcklbw m0, m1
+ pmaddubsw m2, m0, [r6 + 2 * 16]
+ pmaddubsw m0, [r6 + 3 * 16]
+ paddw m7, m0 ;m7=[0+1+2+3+4+5+6+7] Row1 end
+ paddw m5, m2 ;m5=[2+3+4+5+6+7] Row3
+
+ lea r0, [r0 + 2 * r1]
+ movq m0, [r0]
+ punpcklbw m1, m0
+ pmaddubsw m2, m1, [r6 + 2 * 16]
+ pmaddubsw m1, [r6 + 3 * 16]
+ paddw m6, m1 ;m6=[1+2+3+4+5+6+7+8] Row2 end
+ paddw m4, m2 ;m4=[3+4+5+6+7+8] Row4
+
+ movq m1, [r0 + r1]
+ punpcklbw m0, m1
+ pmaddubsw m0, [r6 + 3 * 16]
+ paddw m5, m0 ;m5=[2+3+4+5+6+7+8+9] Row3 end
+
+ movq m0, [r0 + 2 * r1]
+ punpcklbw m1, m0
+ pmaddubsw m1, [r6 + 3 * 16]
+ paddw m4, m1 ;m4=[3+4+5+6+7+8+9+10] Row4 end
+%endmacro
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_%3_4x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+%macro FILTER_VER_LUMA_4xN 3
+INIT_XMM sse4
+cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 6
+ lea r5, [3 * r1]
+ sub r0, r5
+ shl r4d, 6
+%ifidn %3,ps
+ add r3d, r3d
+%endif
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeffVer]
+ lea r6, [r5 + r4]
+%else
+ lea r6, [tab_LumaCoeffVer + r4]
+%endif
+
+%ifidn %3,pp
+ mova m3, [pw_512]
+%else
+ mova m3, [pw_2000]
+%endif
+
+ mov r4d, %2/4
+ lea r5, [4 * r1]
+
+.loopH:
+ PROCESS_LUMA_W4_4R
+
+%ifidn %3,pp
+ pmulhrsw m4, m3
+ pmulhrsw m5, m3
+
+ packuswb m4, m5
+
+ movd [r2], m4
+ pextrd [r2 + r3], m4, 1
+ lea r2, [r2 + 2 * r3]
+ pextrd [r2], m4, 2
+ pextrd [r2 + r3], m4, 3
+%else
+ psubw m4, m3
+ psubw m5, m3
+
+ movlps [r2], m4
+ movhps [r2 + r3], m4
+ lea r2, [r2 + 2 * r3]
+ movlps [r2], m5
+ movhps [r2 + r3], m5
+%endif
+
+ sub r0, r5
+ lea r2, [r2 + 2 * r3]
+
+ dec r4d
+ jnz .loopH
+
+ RET
+%endmacro
+
+
+INIT_YMM avx2
+cglobal interp_8tap_vert_pp_4x4, 4,6,8
+ mov r4d, r4m
+ lea r5, [r1 * 3]
+ sub r0, r5
+
+ ; TODO: VPGATHERDD
+ movd xm1, [r0] ; m1 = row0
+ movd xm2, [r0 + r1] ; m2 = row1
+ punpcklbw xm1, xm2 ; m1 = [13 03 12 02 11 01 10 00]
+
+ movd xm3, [r0 + r1 * 2] ; m3 = row2
+ punpcklbw xm2, xm3 ; m2 = [23 13 22 12 21 11 20 10]
+ movd xm4, [r0 + r5]
+ punpcklbw xm3, xm4 ; m3 = [33 23 32 22 31 21 30 20]
+ punpcklwd xm1, xm3 ; m1 = [33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00]
+
+ lea r0, [r0 + r1 * 4]
+ movd xm5, [r0] ; m5 = row4
+ punpcklbw xm4, xm5 ; m4 = [43 33 42 32 41 31 40 30]
+ punpcklwd xm2, xm4 ; m2 = [43 33 23 13 42 32 22 12 41 31 21 11 40 30 20 10]
+ vinserti128 m1, m1, xm2, 1 ; m1 = [43 33 23 13 42 32 22 12 41 31 21 11 40 30 20 10] - [33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00]
+ movd xm2, [r0 + r1] ; m2 = row5
+ punpcklbw xm5, xm2 ; m5 = [53 43 52 42 51 41 50 40]
+ punpcklwd xm3, xm5 ; m3 = [53 43 33 23 52 42 32 22 51 41 31 21 50 40 30 20]
+ movd xm6, [r0 + r1 * 2] ; m6 = row6
+ punpcklbw xm2, xm6 ; m2 = [63 53 62 52 61 51 60 50]
+ punpcklwd xm4, xm2 ; m4 = [63 53 43 33 62 52 42 32 61 51 41 31 60 50 40 30]
+ vinserti128 m3, m3, xm4, 1 ; m3 = [63 53 43 33 62 52 42 32 61 51 41 31 60 50 40 30] - [53 43 33 23 52 42 32 22 51 41 31 21 50 40 30 20]
+ movd xm4, [r0 + r5] ; m4 = row7
+ punpcklbw xm6, xm4 ; m6 = [73 63 72 62 71 61 70 60]
+ punpcklwd xm5, xm6 ; m5 = [73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40]
+
+ lea r0, [r0 + r1 * 4]
+ movd xm7, [r0] ; m7 = row8
+ punpcklbw xm4, xm7 ; m4 = [83 73 82 72 81 71 80 70]
+ punpcklwd xm2, xm4 ; m2 = [83 73 63 53 82 72 62 52 81 71 61 51 80 70 60 50]
+ vinserti128 m5, m5, xm2, 1 ; m5 = [83 73 63 53 82 72 62 52 81 71 61 51 80 70 60 50] - [73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40]
+ movd xm2, [r0 + r1] ; m2 = row9
+ punpcklbw xm7, xm2 ; m7 = [93 83 92 82 91 81 90 80]
+ punpcklwd xm6, xm7 ; m6 = [93 83 73 63 92 82 72 62 91 81 71 61 90 80 70 60]
+ movd xm7, [r0 + r1 * 2] ; m7 = rowA
+ punpcklbw xm2, xm7 ; m2 = [A3 93 A2 92 A1 91 A0 90]
+ punpcklwd xm4, xm2 ; m4 = [A3 93 83 73 A2 92 82 72 A1 91 81 71 A0 90 80 70]
+ vinserti128 m6, m6, xm4, 1 ; m6 = [A3 93 83 73 A2 92 82 72 A1 91 81 71 A0 90 80 70] - [93 83 73 63 92 82 72 62 91 81 71 61 90 80 70 60]
+
+ ; load filter coeff
+%ifdef PIC
+ lea r5, [tab_LumaCoeff]
+ vpbroadcastd m0, [r5 + r4 * 8 + 0]
+ vpbroadcastd m2, [r5 + r4 * 8 + 4]
+%else
+ vpbroadcastd m0, [tab_LumaCoeff + r4 * 8 + 0]
+ vpbroadcastd m2, [tab_LumaCoeff + r4 * 8 + 4]
+%endif
+
+ pmaddubsw m1, m0
+ pmaddubsw m3, m0
+ pmaddubsw m5, m2
+ pmaddubsw m6, m2
+ vbroadcasti128 m0, [pw_1]
+ pmaddwd m1, m0
+ pmaddwd m3, m0
+ pmaddwd m5, m0
+ pmaddwd m6, m0
+ paddd m1, m5 ; m1 = DQWORD ROW[1 0]
+ paddd m3, m6 ; m3 = DQWORD ROW[3 2]
+ packssdw m1, m3 ; m1 = QWORD ROW[3 1 2 0]
+
+ ; TODO: does it overflow?
+ pmulhrsw m1, [pw_512]
+ vextracti128 xm2, m1, 1
+ packuswb xm1, xm2 ; m1 = DWORD ROW[3 1 2 0]
+ movd [r2], xm1
+ pextrd [r2 + r3], xm1, 2
+ pextrd [r2 + r3 * 2], xm1, 1
+ lea r4, [r3 * 3]
+ pextrd [r2 + r4], xm1, 3
+ RET
+
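The AVX2 4x4 kernel above takes a different route from the SSE4 macros: the eight taps are split into two dword coefficient groups (vpbroadcastd), each group is applied with pmaddubsw, and pmaddwd against pw_1 widens and completes the per-pixel add in 32 bits before the two halves are summed; pmulhrsw with pw_512 then applies the (sum + 32) >> 6 rounding and packuswb clamps. In scalar terms (names are illustrative only, not the x265 C primitive):

    #include <algorithm>
    #include <cstdint>

    // One pp output pixel of the 8-tap vertical filter, split into the same
    // low-tap / high-tap halves the AVX2 code keeps in separate registers.
    static uint8_t luma_vert_pp_one(const uint8_t* src, intptr_t srcStride,
                                    const int8_t coeff[8])
    {
        int lo = 0, hi = 0;
        for (int t = 0; t < 4; t++) lo += coeff[t]     * src[t * srcStride];
        for (int t = 0; t < 4; t++) hi += coeff[t + 4] * src[(t + 4) * srcStride];
        int val = ((lo + hi) + 32) >> 6;                    // pmulhrsw with pw_512
        return (uint8_t)std::min(255, std::max(0, val));    // packuswb clamp
    }
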
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+FILTER_VER_LUMA_4xN 4, 4, pp
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_pp_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+FILTER_VER_LUMA_4xN 4, 8, pp
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_pp_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+FILTER_VER_LUMA_4xN 4, 16, pp
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_ps_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+FILTER_VER_LUMA_4xN 4, 4, ps
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_ps_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+FILTER_VER_LUMA_4xN 4, 8, ps
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_ps_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+FILTER_VER_LUMA_4xN 4, 16, ps
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_%3_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+%macro FILTER_VER_LUMA_8xN 3
+INIT_XMM sse4
+cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8
+ lea r5, [3 * r1]
+ sub r0, r5
+ shl r4d, 6
+
+%ifidn %3,ps
+ add r3d, r3d
+%endif
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeffVer]
+ lea r6, [r5 + r4]
+%else
+ lea r6, [tab_LumaCoeffVer + r4]
+%endif
+
+%ifidn %3,pp
+ mova m3, [pw_512]
+%else
+ mova m3, [pw_2000]
+%endif
+
+ mov r4d, %2/4
+ lea r5, [4 * r1]
+
+.loopH:
+ PROCESS_LUMA_W8_4R
+
+%ifidn %3,pp
+ pmulhrsw m7, m3
+ pmulhrsw m6, m3
+ pmulhrsw m5, m3
+ pmulhrsw m4, m3
+
+ packuswb m7, m6
+ packuswb m5, m4
+
+ movlps [r2], m7
+ movhps [r2 + r3], m7
+ lea r2, [r2 + 2 * r3]
+ movlps [r2], m5
+ movhps [r2 + r3], m5
+%else
+ psubw m7, m3
+ psubw m6, m3
+ psubw m5, m3
+ psubw m4, m3
+
+ movu [r2], m7
+ movu [r2 + r3], m6
+ lea r2, [r2 + 2 * r3]
+ movu [r2], m5
+ movu [r2 + r3], m4
+%endif
+
+ sub r0, r5
+ lea r2, [r2 + 2 * r3]
+
+ dec r4d
+ jnz .loopH
+
+ RET
+%endmacro
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+FILTER_VER_LUMA_8xN 8, 4, pp
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_pp_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+FILTER_VER_LUMA_8xN 8, 8, pp
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_pp_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+FILTER_VER_LUMA_8xN 8, 16, pp
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_pp_8x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+FILTER_VER_LUMA_8xN 8, 32, pp
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_ps_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+FILTER_VER_LUMA_8xN 8, 4, ps
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_ps_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+FILTER_VER_LUMA_8xN 8, 8, ps
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_ps_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+FILTER_VER_LUMA_8xN 8, 16, ps
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_ps_8x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+FILTER_VER_LUMA_8xN 8, 32, ps
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_%3_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+%macro FILTER_VER_LUMA_12xN 3
+INIT_XMM sse4
+cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8
+ lea r5, [3 * r1]
+ sub r0, r5
+ shl r4d, 6
+%ifidn %3,ps
+ add r3d, r3d
+%endif
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeffVer]
+ lea r6, [r5 + r4]
+%else
+ lea r6, [tab_LumaCoeffVer + r4]
+%endif
+
+%ifidn %3,pp
+ mova m3, [pw_512]
+%else
+ mova m3, [pw_2000]
+%endif
+
+ mov r4d, %2/4
+
+.loopH:
+ PROCESS_LUMA_W8_4R
+
+%ifidn %3,pp
+ pmulhrsw m7, m3
+ pmulhrsw m6, m3
+ pmulhrsw m5, m3
+ pmulhrsw m4, m3
+
+ packuswb m7, m6
+ packuswb m5, m4
+
+ movlps [r2], m7
+ movhps [r2 + r3], m7
+ lea r5, [r2 + 2 * r3]
+ movlps [r5], m5
+ movhps [r5 + r3], m5
+%else
+ psubw m7, m3
+ psubw m6, m3
+ psubw m5, m3
+ psubw m4, m3
+
+ movu [r2], m7
+ movu [r2 + r3], m6
+ lea r5, [r2 + 2 * r3]
+ movu [r5], m5
+ movu [r5 + r3], m4
+%endif
+
+ lea r5, [8 * r1 - 8]
+ sub r0, r5
+%ifidn %3,pp
+ add r2, 8
+%else
+ add r2, 16
+%endif
+
+ PROCESS_LUMA_W4_4R
+
+%ifidn %3,pp
+ pmulhrsw m4, m3
+ pmulhrsw m5, m3
+
+ packuswb m4, m5
+
+ movd [r2], m4
+ pextrd [r2 + r3], m4, 1
+ lea r5, [r2 + 2 * r3]
+ pextrd [r5], m4, 2
+ pextrd [r5 + r3], m4, 3
+%else
+ psubw m4, m3
+ psubw m5, m3
+
+ movlps [r2], m4
+ movhps [r2 + r3], m4
+ lea r5, [r2 + 2 * r3]
+ movlps [r5], m5
+ movhps [r5 + r3], m5
+%endif
+
+ lea r5, [4 * r1 + 8]
+ sub r0, r5
+%ifidn %3,pp
+ lea r2, [r2 + 4 * r3 - 8]
+%else
+ lea r2, [r2 + 4 * r3 - 16]
+%endif
+
+ dec r4d
+ jnz .loopH
+
+ RET
+%endmacro
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_pp_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+FILTER_VER_LUMA_12xN 12, 16, pp
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_ps_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+FILTER_VER_LUMA_12xN 12, 16, ps
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+%macro FILTER_VER_LUMA 3
+INIT_XMM sse4
+cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8 ,0-gprsize
+ lea r5, [3 * r1]
+ sub r0, r5
+ shl r4d, 6
+%ifidn %3,ps
+ add r3d, r3d
+%endif
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeffVer]
+ lea r6, [r5 + r4]
+%else
+ lea r6, [tab_LumaCoeffVer + r4]
+%endif
+
+%ifidn %3,pp
+ mova m3, [pw_512]
+%else
+ mova m3, [pw_2000]
+%endif
+ mov dword [rsp], %2/4
+
+.loopH:
+ mov r4d, (%1/8)
+.loopW:
+ PROCESS_LUMA_W8_4R
+%ifidn %3,pp
+ pmulhrsw m7, m3
+ pmulhrsw m6, m3
+ pmulhrsw m5, m3
+ pmulhrsw m4, m3
+
+ packuswb m7, m6
+ packuswb m5, m4
+
+ movlps [r2], m7
+ movhps [r2 + r3], m7
+ lea r5, [r2 + 2 * r3]
+ movlps [r5], m5
+ movhps [r5 + r3], m5
+%else
+ psubw m7, m3
+ psubw m6, m3
+ psubw m5, m3
+ psubw m4, m3
+
+ movu [r2], m7
+ movu [r2 + r3], m6
+ lea r5, [r2 + 2 * r3]
+ movu [r5], m5
+ movu [r5 + r3], m4
+%endif
+
+ lea r5, [8 * r1 - 8]
+ sub r0, r5
+%ifidn %3,pp
+ add r2, 8
+%else
+ add r2, 16
+%endif
+ dec r4d
+ jnz .loopW
+
+ lea r0, [r0 + 4 * r1 - %1]
+%ifidn %3,pp
+ lea r2, [r2 + 4 * r3 - %1]
+%else
+ lea r2, [r2 + 4 * r3 - 2 * %1]
+%endif
+
+ dec dword [rsp]
+ jnz .loopH
+
+ RET
+%endmacro
+
+FILTER_VER_LUMA 16, 4, pp
+FILTER_VER_LUMA 16, 8, pp
+FILTER_VER_LUMA 16, 12, pp
+FILTER_VER_LUMA 16, 16, pp
+FILTER_VER_LUMA 16, 32, pp
+FILTER_VER_LUMA 16, 64, pp
+FILTER_VER_LUMA 24, 32, pp
+FILTER_VER_LUMA 32, 8, pp
+FILTER_VER_LUMA 32, 16, pp
+FILTER_VER_LUMA 32, 24, pp
+FILTER_VER_LUMA 32, 32, pp
+FILTER_VER_LUMA 32, 64, pp
+FILTER_VER_LUMA 48, 64, pp
+FILTER_VER_LUMA 64, 16, pp
+FILTER_VER_LUMA 64, 32, pp
+FILTER_VER_LUMA 64, 48, pp
+FILTER_VER_LUMA 64, 64, pp
+
+FILTER_VER_LUMA 16, 4, ps
+FILTER_VER_LUMA 16, 8, ps
+FILTER_VER_LUMA 16, 12, ps
+FILTER_VER_LUMA 16, 16, ps
+FILTER_VER_LUMA 16, 32, ps
+FILTER_VER_LUMA 16, 64, ps
+FILTER_VER_LUMA 24, 32, ps
+FILTER_VER_LUMA 32, 8, ps
+FILTER_VER_LUMA 32, 16, ps
+FILTER_VER_LUMA 32, 24, ps
+FILTER_VER_LUMA 32, 32, ps
+FILTER_VER_LUMA 32, 64, ps
+FILTER_VER_LUMA 48, 64, ps
+FILTER_VER_LUMA 64, 16, ps
+FILTER_VER_LUMA 64, 32, ps
+FILTER_VER_LUMA 64, 48, ps
+FILTER_VER_LUMA 64, 64, ps
+
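FILTER_VER_LUMA walks the larger blocks as 8-wide by 4-tall tiles: .loopW covers %1/8 tiles per strip and .loopH covers %2/4 strips, with the strip counter spilled to [rsp] because all eight XMM registers and the general-purpose registers are already in use. Roughly the following loop nest, written here with explicit x/y indexing rather than the pointer rewinds the asm performs (a skeleton sketch, not the C primitive):

    #include <cstdint>

    static void luma_vert_tile_walk(const uint8_t* src, intptr_t srcStride,
                                    uint8_t* dst, intptr_t dstStride,
                                    int width, int height)
    {
        for (int y = 0; y < height; y += 4)          // dec dword [rsp] / jnz .loopH
        {
            for (int x = 0; x < width; x += 8)       // dec r4d / jnz .loopW
            {
                // PROCESS_LUMA_W8_4R plus the pp or ps epilogue would fill
                // the 8x4 tile at (x, y) here.
                (void)(src + x);
                (void)(dst + x);
            }
            src += 4 * srcStride;                    // lea r0, [r0 + 4 * r1 - %1]
            dst += 4 * dstStride;                    // lea r2, [r2 + 4 * r3 - %1]
        }
    }
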
+%macro PROCESS_LUMA_SP_W4_4R 0
+ movq m0, [r0]
+ movq m1, [r0 + r1]
+ punpcklwd m0, m1 ;m0=[0 1]
+ pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1
+
+ lea r0, [r0 + 2 * r1]
+ movq m4, [r0]
+ punpcklwd m1, m4 ;m1=[1 2]
+ pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2
+
+ movq m5, [r0 + r1]
+ punpcklwd m4, m5 ;m4=[2 3]
+ pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3
+ pmaddwd m4, [r6 + 1 * 16]
+ paddd m0, m4 ;m0=[0+1+2+3] Row1
+
+ lea r0, [r0 + 2 * r1]
+ movq m4, [r0]
+ punpcklwd m5, m4 ;m5=[3 4]
+ pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4
+ pmaddwd m5, [r6 + 1 * 16]
+ paddd m1, m5 ;m1 = [1+2+3+4] Row2
+
+ movq m5, [r0 + r1]
+ punpcklwd m4, m5 ;m4=[4 5]
+ pmaddwd m6, m4, [r6 + 1 * 16]
+ paddd m2, m6 ;m2=[2+3+4+5] Row3
+ pmaddwd m4, [r6 + 2 * 16]
+ paddd m0, m4 ;m0=[0+1+2+3+4+5] Row1
+
+ lea r0, [r0 + 2 * r1]
+ movq m4, [r0]
+ punpcklwd m5, m4 ;m5=[5 6]
+ pmaddwd m6, m5, [r6 + 1 * 16]
+ paddd m3, m6 ;m3=[3+4+5+6] Row4
+ pmaddwd m5, [r6 + 2 * 16]
+ paddd m1, m5 ;m1=[1+2+3+4+5+6] Row2
+
+ movq m5, [r0 + r1]
+ punpcklwd m4, m5 ;m4=[6 7]
+ pmaddwd m6, m4, [r6 + 2 * 16]
+ paddd m2, m6 ;m2=[2+3+4+5+6+7] Row3
+ pmaddwd m4, [r6 + 3 * 16]
+ paddd m0, m4 ;m0=[0+1+2+3+4+5+6+7] Row1 end
+
+ lea r0, [r0 + 2 * r1]
+ movq m4, [r0]
+ punpcklwd m5, m4 ;m5=[7 8]
+ pmaddwd m6, m5, [r6 + 2 * 16]
+ paddd m3, m6 ;m3=[3+4+5+6+7+8] Row4
+ pmaddwd m5, [r6 + 3 * 16]
+ paddd m1, m5 ;m1=[1+2+3+4+5+6+7+8] Row2 end
+
+ movq m5, [r0 + r1]
+ punpcklwd m4, m5 ;m4=[8 9]
+ pmaddwd m4, [r6 + 3 * 16]
+ paddd m2, m4 ;m2=[2+3+4+5+6+7+8+9] Row3 end
+
+ movq m4, [r0 + 2 * r1]
+ punpcklwd m5, m4 ;m5=[9 10]
+ pmaddwd m5, [r6 + 3 * 16]
+ paddd m3, m5 ;m3=[3+4+5+6+7+8+9+10] Row4 end
+%endmacro
+
+;--------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;--------------------------------------------------------------------------------------------------------------
+%macro FILTER_VER_LUMA_SP 2
+INIT_XMM sse4
+cglobal interp_8tap_vert_sp_%1x%2, 5, 7, 8 ,0-gprsize
+
+ add r1d, r1d
+ lea r5, [r1 + 2 * r1]
+ sub r0, r5
+ shl r4d, 6
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeffV]
+ lea r6, [r5 + r4]
+%else
+ lea r6, [tab_LumaCoeffV + r4]
+%endif
+
+ mova m7, [tab_c_526336]
+
+ mov dword [rsp], %2/4
+.loopH:
+ mov r4d, (%1/4)
+.loopW:
+ PROCESS_LUMA_SP_W4_4R
+
+ paddd m0, m7
+ paddd m1, m7
+ paddd m2, m7
+ paddd m3, m7
+
+ psrad m0, 12
+ psrad m1, 12
+ psrad m2, 12
+ psrad m3, 12
+
+ packssdw m0, m1
+ packssdw m2, m3
+
+ packuswb m0, m2
+
+ movd [r2], m0
+ pextrd [r2 + r3], m0, 1
+ lea r5, [r2 + 2 * r3]
+ pextrd [r5], m0, 2
+ pextrd [r5 + r3], m0, 3
+
+ lea r5, [8 * r1 - 2 * 4]
+ sub r0, r5
+ add r2, 4
+
+ dec r4d
+ jnz .loopW
+
+ lea r0, [r0 + 4 * r1 - 2 * %1]
+ lea r2, [r2 + 4 * r3 - %1]
+
+ dec dword [rsp]
+ jnz .loopH
+
+ RET
+%endmacro
+
+;--------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;--------------------------------------------------------------------------------------------------------------
+ FILTER_VER_LUMA_SP 4, 4
+ FILTER_VER_LUMA_SP 8, 8
+ FILTER_VER_LUMA_SP 8, 4
+ FILTER_VER_LUMA_SP 4, 8
+ FILTER_VER_LUMA_SP 16, 16
+ FILTER_VER_LUMA_SP 16, 8
+ FILTER_VER_LUMA_SP 8, 16
+ FILTER_VER_LUMA_SP 16, 12
+ FILTER_VER_LUMA_SP 12, 16
+ FILTER_VER_LUMA_SP 16, 4
+ FILTER_VER_LUMA_SP 4, 16
+ FILTER_VER_LUMA_SP 32, 32
+ FILTER_VER_LUMA_SP 32, 16
+ FILTER_VER_LUMA_SP 16, 32
+ FILTER_VER_LUMA_SP 32, 24
+ FILTER_VER_LUMA_SP 24, 32
+ FILTER_VER_LUMA_SP 32, 8
+ FILTER_VER_LUMA_SP 8, 32
+ FILTER_VER_LUMA_SP 64, 64
+ FILTER_VER_LUMA_SP 64, 32
+ FILTER_VER_LUMA_SP 32, 64
+ FILTER_VER_LUMA_SP 64, 48
+ FILTER_VER_LUMA_SP 48, 64
+ FILTER_VER_LUMA_SP 64, 16
+ FILTER_VER_LUMA_SP 16, 64
+
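The sp kernels take the 16-bit ps intermediate as input, so each filtered sum carries the 6-bit coefficient scale plus the -8192 input bias multiplied by the tap sum (64). Adding tab_c_526336 = 8192*64 + 2048 cancels that bias and supplies the rounding term before the 12-bit shift (psrad 12); packssdw/packuswb then clamp back to pixels. A scalar sketch covering both the luma (8-tap) and chroma (4-tap) sp paths (the helper name is an assumption):

    #include <algorithm>
    #include <cstdint>

    // One sp output pixel: 16-bit intermediate in, 8-bit pixel out.
    static uint8_t vert_sp_one(const int16_t* src, intptr_t srcStride,
                               const int16_t* coeff, int nTaps)  // nTaps: 8 luma, 4 chroma
    {
        int32_t sum = 0;
        for (int t = 0; t < nTaps; t++)
            sum += coeff[t] * src[t * srcStride];
        int val = (sum + 8192 * 64 + 2048) >> 12;            // paddd tab_c_526336, psrad 12
        return (uint8_t)std::min(255, std::max(0, val));     // packssdw + packuswb
    }
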
+; TODO: combining U and V would give better performance, but needs more registers
+; TODO: using two paths (heights that are multiples of 4 vs. the rest) may improve performance by ~10%, but the code is more complex, so it is disabled
+INIT_XMM ssse3
+cglobal chroma_p2s, 3, 7, 4
+
+ ; load width and height
+ mov r3d, r3m
+ mov r4d, r4m
+
+ ; load constant
+ mova m2, [pb_128]
+ mova m3, [tab_c_64_n64]
+
+.loopH:
+
+ xor r5d, r5d
+.loopW:
+ lea r6, [r0 + r5]
+
+ movh m0, [r6]
+ punpcklbw m0, m2
+ pmaddubsw m0, m3
+
+ movh m1, [r6 + r1]
+ punpcklbw m1, m2
+ pmaddubsw m1, m3
+
+ add r5d, 8
+ cmp r5d, r3d
+ lea r6, [r2 + r5 * 2]
+ jg .width4
+ movu [r6 + FENC_STRIDE / 2 * 0 - 16], m0
+ movu [r6 + FENC_STRIDE / 2 * 2 - 16], m1
+ je .nextH
+ jmp .loopW
+
+.width4:
+ test r3d, 4
+ jz .width2
+ test r3d, 2
+ movh [r6 + FENC_STRIDE / 2 * 0 - 16], m0
+ movh [r6 + FENC_STRIDE / 2 * 2 - 16], m1
+ lea r6, [r6 + 8]
+ pshufd m0, m0, 2
+ pshufd m1, m1, 2
+ jz .nextH
+
+.width2:
+ movd [r6 + FENC_STRIDE / 2 * 0 - 16], m0
+ movd [r6 + FENC_STRIDE / 2 * 2 - 16], m1
+
+.nextH:
+ lea r0, [r0 + r1 * 2]
+ add r2, FENC_STRIDE / 2 * 4
+
+ sub r4d, 2
+ jnz .loopH
+
+ RET
+
+%macro PROCESS_CHROMA_SP_W4_4R 0
+ movq m0, [r0]
+ movq m1, [r0 + r1]
+ punpcklwd m0, m1 ;m0=[0 1]
+ pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1
+
+ lea r0, [r0 + 2 * r1]
+ movq m4, [r0]
+ punpcklwd m1, m4 ;m1=[1 2]
+ pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2
+
+ movq m5, [r0 + r1]
+ punpcklwd m4, m5 ;m4=[2 3]
+ pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3
+ pmaddwd m4, [r6 + 1 * 16]
+ paddd m0, m4 ;m0=[0+1+2+3] Row1 done
+
+ lea r0, [r0 + 2 * r1]
+ movq m4, [r0]
+ punpcklwd m5, m4 ;m5=[3 4]
+ pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4
+ pmaddwd m5, [r6 + 1 * 16]
+ paddd m1, m5 ;m1 = [1+2+3+4] Row2
+
+ movq m5, [r0 + r1]
+ punpcklwd m4, m5 ;m4=[4 5]
+ pmaddwd m4, [r6 + 1 * 16]
+ paddd m2, m4 ;m2=[2+3+4+5] Row3
+
+ movq m4, [r0 + 2 * r1]
+ punpcklwd m5, m4 ;m5=[5 6]
+ pmaddwd m5, [r6 + 1 * 16]
+ paddd m3, m5 ;m3=[3+4+5+6] Row4
+%endmacro
+
+;--------------------------------------------------------------------------------------------------------------
+; void interp_4tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;--------------------------------------------------------------------------------------------------------------
+%macro FILTER_VER_CHROMA_SP 2
+INIT_XMM sse4
+cglobal interp_4tap_vert_sp_%1x%2, 5, 7, 7 ,0-gprsize
+
+ add r1d, r1d
+ sub r0, r1
+ shl r4d, 5
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeffV]
+ lea r6, [r5 + r4]
+%else
+ lea r6, [tab_ChromaCoeffV + r4]
+%endif
+
+ mova m6, [tab_c_526336]
+
+ mov dword [rsp], %2/4
+
+.loopH:
+ mov r4d, (%1/4)
+.loopW:
+ PROCESS_CHROMA_SP_W4_4R
+
+ paddd m0, m6
+ paddd m1, m6
+ paddd m2, m6
+ paddd m3, m6
+
+ psrad m0, 12
+ psrad m1, 12
+ psrad m2, 12
+ psrad m3, 12
+
+ packssdw m0, m1
+ packssdw m2, m3
+
+ packuswb m0, m2
+
+ movd [r2], m0
+ pextrd [r2 + r3], m0, 1
+ lea r5, [r2 + 2 * r3]
+ pextrd [r5], m0, 2
+ pextrd [r5 + r3], m0, 3
+
+ lea r5, [4 * r1 - 2 * 4]
+ sub r0, r5
+ add r2, 4
+
+ dec r4d
+ jnz .loopW
+
+ lea r0, [r0 + 4 * r1 - 2 * %1]
+ lea r2, [r2 + 4 * r3 - %1]
+
+ dec dword [rsp]
+ jnz .loopH
+
+ RET
+%endmacro
+
+ FILTER_VER_CHROMA_SP 4, 4
+ FILTER_VER_CHROMA_SP 4, 8
+ FILTER_VER_CHROMA_SP 16, 16
+ FILTER_VER_CHROMA_SP 16, 8
+ FILTER_VER_CHROMA_SP 16, 12
+ FILTER_VER_CHROMA_SP 12, 16
+ FILTER_VER_CHROMA_SP 16, 4
+ FILTER_VER_CHROMA_SP 4, 16
+ FILTER_VER_CHROMA_SP 32, 32
+ FILTER_VER_CHROMA_SP 32, 16
+ FILTER_VER_CHROMA_SP 16, 32
+ FILTER_VER_CHROMA_SP 32, 24
+ FILTER_VER_CHROMA_SP 24, 32
+ FILTER_VER_CHROMA_SP 32, 8
+
+ FILTER_VER_CHROMA_SP 16, 24
+ FILTER_VER_CHROMA_SP 16, 64
+ FILTER_VER_CHROMA_SP 12, 32
+ FILTER_VER_CHROMA_SP 4, 32
+ FILTER_VER_CHROMA_SP 32, 64
+ FILTER_VER_CHROMA_SP 32, 48
+ FILTER_VER_CHROMA_SP 24, 64
+
+ FILTER_VER_CHROMA_SP 64, 64
+ FILTER_VER_CHROMA_SP 64, 32
+ FILTER_VER_CHROMA_SP 64, 48
+ FILTER_VER_CHROMA_SP 48, 64
+ FILTER_VER_CHROMA_SP 64, 16
+
+
+%macro PROCESS_CHROMA_SP_W2_4R 1
+ movd m0, [r0]
+ movd m1, [r0 + r1]
+ punpcklwd m0, m1 ;m0=[0 1]
+
+ lea r0, [r0 + 2 * r1]
+ movd m2, [r0]
+ punpcklwd m1, m2 ;m1=[1 2]
+ punpcklqdq m0, m1 ;m0=[0 1 1 2]
+ pmaddwd m0, [%1 + 0 *16] ;m0=[0+1 1+2] Row 1-2
+
+ movd m1, [r0 + r1]
+ punpcklwd m2, m1 ;m2=[2 3]
+
+ lea r0, [r0 + 2 * r1]
+ movd m3, [r0]
+ punpcklwd m1, m3 ;m1=[3 4]
+ punpcklqdq m2, m1 ;m2=[2 3 3 4]
+
+ pmaddwd m4, m2, [%1 + 1 * 16] ;m4=[2+3 3+4] Row 1-2
+ pmaddwd m2, [%1 + 0 * 16] ;m2=[2+3 3+4] Row 3-4
+ paddd m0, m4 ;m0=[0+1+2+3 1+2+3+4] Row 1-2
+
+ movd m1, [r0 + r1]
+ punpcklwd m3, m1 ;m3=[4 5]
+
+ movd m4, [r0 + 2 * r1]
+ punpcklwd m1, m4 ;m1=[5 6]
+ punpcklqdq m3, m1 ;m3=[4 5 5 6]
+ pmaddwd m3, [%1 + 1 * 16] ;m3=[4+5 5+6] Row 3-4
+ paddd m2, m3 ;m2=[2+3+4+5 3+4+5+6] Row 3-4
+%endmacro
+
+;-------------------------------------------------------------------------------------------------------------------
+; void interp_4tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------------
+%macro FILTER_VER_CHROMA_SP_W2_4R 2
+INIT_XMM sse4
+cglobal interp_4tap_vert_sp_%1x%2, 5, 6, 6
+
+ add r1d, r1d
+ sub r0, r1
+ shl r4d, 5
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeffV]
+ lea r5, [r5 + r4]
+%else
+ lea r5, [tab_ChromaCoeffV + r4]
+%endif
+
+ mova m5, [tab_c_526336]
+
+ mov r4d, (%2/4)
+
+.loopH:
+ PROCESS_CHROMA_SP_W2_4R r5
+
+ paddd m0, m5
+ paddd m2, m5
+
+ psrad m0, 12
+ psrad m2, 12
+
+ packssdw m0, m2
+ packuswb m0, m0
+
+ pextrw [r2], m0, 0
+ pextrw [r2 + r3], m0, 1
+ lea r2, [r2 + 2 * r3]
+ pextrw [r2], m0, 2
+ pextrw [r2 + r3], m0, 3
+
+ lea r2, [r2 + 2 * r3]
+
+ dec r4d
+ jnz .loopH
+
+ RET
+%endmacro
+
+FILTER_VER_CHROMA_SP_W2_4R 2, 4
+FILTER_VER_CHROMA_SP_W2_4R 2, 8
+
+FILTER_VER_CHROMA_SP_W2_4R 2, 16
+
+;--------------------------------------------------------------------------------------------------------------
+; void interp_4tap_vert_sp_4x2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;--------------------------------------------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal interp_4tap_vert_sp_4x2, 5, 6, 5
+
+ add r1d, r1d
+ sub r0, r1
+ shl r4d, 5
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeffV]
+ lea r5, [r5 + r4]
+%else
+ lea r5, [tab_ChromaCoeffV + r4]
+%endif
+
+ mova m4, [tab_c_526336]
+
+ movq m0, [r0]
+ movq m1, [r0 + r1]
+ punpcklwd m0, m1 ;m0=[0 1]
+ pmaddwd m0, [r5 + 0 *16] ;m0=[0+1] Row1
+
+ lea r0, [r0 + 2 * r1]
+ movq m2, [r0]
+ punpcklwd m1, m2 ;m1=[1 2]
+ pmaddwd m1, [r5 + 0 *16] ;m1=[1+2] Row2
+
+ movq m3, [r0 + r1]
+ punpcklwd m2, m3 ;m2=[2 3]
+ pmaddwd m2, [r5 + 1 * 16]
+ paddd m0, m2 ;m0=[0+1+2+3] Row1 done
+ paddd m0, m4
+ psrad m0, 12
+
+ movq m2, [r0 + 2 * r1]
+ punpcklwd m3, m2 ;m3=[3 4]
+ pmaddwd m3, [r5 + 1 * 16]
+ paddd m1, m3 ;m1 = [1+2+3+4] Row2 done
+ paddd m1, m4
+ psrad m1, 12
+
+ packssdw m0, m1
+ packuswb m0, m0
+
+ movd [r2], m0
+ pextrd [r2 + r3], m0, 1
+
+ RET
+
+;-------------------------------------------------------------------------------------------------------------------
+; void interp_4tap_vert_sp_6x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------------
+%macro FILTER_VER_CHROMA_SP_W6_H4 2
+INIT_XMM sse4
+cglobal interp_4tap_vert_sp_6x%2, 5, 7, 7
+
+ add r1d, r1d
+ sub r0, r1
+ shl r4d, 5
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeffV]
+ lea r6, [r5 + r4]
+%else
+ lea r6, [tab_ChromaCoeffV + r4]
+%endif
+
+ mova m6, [tab_c_526336]
+
+ mov r4d, %2/4
+
+.loopH:
+ PROCESS_CHROMA_SP_W4_4R
+
+ paddd m0, m6
+ paddd m1, m6
+ paddd m2, m6
+ paddd m3, m6
+
+ psrad m0, 12
+ psrad m1, 12
+ psrad m2, 12
+ psrad m3, 12
+
+ packssdw m0, m1
+ packssdw m2, m3
+
+ packuswb m0, m2
+
+ movd [r2], m0
+ pextrd [r2 + r3], m0, 1
+ lea r5, [r2 + 2 * r3]
+ pextrd [r5], m0, 2
+ pextrd [r5 + r3], m0, 3
+
+ lea r5, [4 * r1 - 2 * 4]
+ sub r0, r5
+ add r2, 4
+
+ PROCESS_CHROMA_SP_W2_4R r6
+
+ paddd m0, m6
+ paddd m2, m6
+
+ psrad m0, 12
+ psrad m2, 12
+
+ packssdw m0, m2
+ packuswb m0, m0
+
+ pextrw [r2], m0, 0
+ pextrw [r2 + r3], m0, 1
+ lea r2, [r2 + 2 * r3]
+ pextrw [r2], m0, 2
+ pextrw [r2 + r3], m0, 3
+
+ sub r0, 2 * 4
+ lea r2, [r2 + 2 * r3 - 4]
+
+ dec r4d
+ jnz .loopH
+
+ RET
+%endmacro
+
+FILTER_VER_CHROMA_SP_W6_H4 6, 8
+
+FILTER_VER_CHROMA_SP_W6_H4 6, 16
+
+%macro PROCESS_CHROMA_SP_W8_2R 0
+ movu m1, [r0]
+ movu m3, [r0 + r1]
+ punpcklwd m0, m1, m3
+ pmaddwd m0, [r5 + 0 * 16] ;m0 = [0l+1l] Row1l
+ punpckhwd m1, m3
+ pmaddwd m1, [r5 + 0 * 16] ;m1 = [0h+1h] Row1h
+
+ movu m4, [r0 + 2 * r1]
+ punpcklwd m2, m3, m4
+ pmaddwd m2, [r5 + 0 * 16] ;m2 = [1l+2l] Row2l
+ punpckhwd m3, m4
+ pmaddwd m3, [r5 + 0 * 16] ;m3 = [1h+2h] Row2h
+
+ lea r0, [r0 + 2 * r1]
+ movu m5, [r0 + r1]
+ punpcklwd m6, m4, m5
+ pmaddwd m6, [r5 + 1 * 16] ;m6 = [2l+3l] Row1l
+ paddd m0, m6 ;m0 = [0l+1l+2l+3l] Row1l sum
+ punpckhwd m4, m5
+ pmaddwd m4, [r5 + 1 * 16] ;m4 = [2h+3h] Row1h
+ paddd m1, m4 ;m1 = [0h+1h+2h+3h] Row1h sum
+
+ movu m4, [r0 + 2 * r1]
+ punpcklwd m6, m5, m4
+ pmaddwd m6, [r5 + 1 * 16] ;m6 = [3l+4l] Row2l
+ paddd m2, m6 ;m2 = [1l+2l+3l+4l] Row2l sum
+ punpckhwd m5, m4
+ pmaddwd m5, [r5 + 1 * 16] ;m5 = [3h+4h] Row2h
+ paddd m3, m5 ;m3 = [1h+2h+3h+4h] Row2h sum
+%endmacro
+
+;--------------------------------------------------------------------------------------------------------------
+; void interp_4tap_vert_sp_8x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;--------------------------------------------------------------------------------------------------------------
+%macro FILTER_VER_CHROMA_SP_W8_H2 2
+INIT_XMM sse2
+cglobal interp_4tap_vert_sp_%1x%2, 5, 6, 8
+
+ add r1d, r1d
+ sub r0, r1
+ shl r4d, 5
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeffV]
+ lea r5, [r5 + r4]
+%else
+ lea r5, [tab_ChromaCoeffV + r4]
+%endif
+
+ mova m7, [tab_c_526336]
+
+ mov r4d, %2/2
+.loopH:
+ PROCESS_CHROMA_SP_W8_2R
+
+ paddd m0, m7
+ paddd m1, m7
+ paddd m2, m7
+ paddd m3, m7
+
+ psrad m0, 12
+ psrad m1, 12
+ psrad m2, 12
+ psrad m3, 12
+
+ packssdw m0, m1
+ packssdw m2, m3
+
+ packuswb m0, m2
+
+ movlps [r2], m0
+ movhps [r2 + r3], m0
+
+ lea r2, [r2 + 2 * r3]
+
+ dec r4d
+ jnz .loopH
+
+ RET
+%endmacro
+
+FILTER_VER_CHROMA_SP_W8_H2 8, 2
+FILTER_VER_CHROMA_SP_W8_H2 8, 4
+FILTER_VER_CHROMA_SP_W8_H2 8, 6
+FILTER_VER_CHROMA_SP_W8_H2 8, 8
+FILTER_VER_CHROMA_SP_W8_H2 8, 16
+FILTER_VER_CHROMA_SP_W8_H2 8, 32
+
+FILTER_VER_CHROMA_SP_W8_H2 8, 12
+FILTER_VER_CHROMA_SP_W8_H2 8, 64
+
+
+;-----------------------------------------------------------------------------------------------------------------------------
+; void interp_4tap_horiz_ps_2x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+;-----------------------------------------------------------------------------------------------------------------------------
+%macro FILTER_HORIZ_CHROMA_2xN 2
+INIT_XMM sse4
+cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 4, src, srcstride, dst, dststride
+%define coef2 m3
+%define Tm0 m2
+%define t1 m1
+%define t0 m0
+
+ dec srcq
+ mov r4d, r4m
+ add dststrided, dststrided
+
+%ifdef PIC
+ lea r6, [tab_ChromaCoeff]
+ movd coef2, [r6 + r4 * 4]
+%else
+ movd coef2, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ pshufd coef2, coef2, 0
+ mova t1, [pw_2000]
+ mova Tm0, [tab_Tm]
+
+ mov r4d, %2
+ cmp r5m, byte 0
+ je .loopH
+ sub srcq, srcstrideq
+ add r4d, 3
+
+.loopH:
+ movh t0, [srcq]
+ pshufb t0, t0, Tm0
+ pmaddubsw t0, coef2
+ phaddw t0, t0
+ psubw t0, t1
+ movd [dstq], t0
+
+ lea srcq, [srcq + srcstrideq]
+ lea dstq, [dstq + dststrideq]
+
+ dec r4d
+ jnz .loopH
+
+ RET
+%endmacro
+
+FILTER_HORIZ_CHROMA_2xN 2, 4
+FILTER_HORIZ_CHROMA_2xN 2, 8
+
+FILTER_HORIZ_CHROMA_2xN 2, 16
+
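The ps horizontal kernels take an extra isRowExt argument (r5m): when it is non-zero the block being filtered feeds a following 4-tap vertical pass, so the code backs src up by one row and filters height + 3 rows, giving that pass its one row of top margin and two rows of bottom margin (sub srcq, srcstrideq / add r4d, 3). A scalar sketch of the 2xN case with that prologue (reference name and coeff layout are assumptions):

    #include <cstdint>

    static void horiz_ps_2xN_ref(const uint8_t* src, intptr_t srcStride,
                                 int16_t* dst, intptr_t dstStride,
                                 const int8_t coeff[4], int height, int isRowExt)
    {
        src -= 1;                                    // dec srcq: taps span x-1 .. x+2
        if (isRowExt)
        {
            src -= srcStride;                        // one margin row above ...
            height += 3;                             // ... three extra rows in total
        }
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < 2; x++)
            {
                int sum = 0;
                for (int t = 0; t < 4; t++)
                    sum += coeff[t] * src[x + t];
                dst[x] = (int16_t)(sum - 8192);      // psubw with pw_2000
            }
            src += srcStride;
            dst += dstStride;
        }
    }
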
+;-----------------------------------------------------------------------------------------------------------------------------
+; void interp_4tap_horiz_ps_4x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+;-----------------------------------------------------------------------------------------------------------------------------
+%macro FILTER_HORIZ_CHROMA_4xN 2
+INIT_XMM sse4
+cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 4, src, srcstride, dst, dststride
+%define coef2 m3
+%define Tm0 m2
+%define t1 m1
+%define t0 m0
+
+ dec srcq
+ mov r4d, r4m
+ add dststrided, dststrided
+
+%ifdef PIC
+ lea r6, [tab_ChromaCoeff]
+ movd coef2, [r6 + r4 * 4]
+%else
+ movd coef2, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ pshufd coef2, coef2, 0
+ mova t1, [pw_2000]
+ mova Tm0, [tab_Tm]
+
+ mov r4d, %2
+ cmp r5m, byte 0
+ je .loopH
+ sub srcq, srcstrideq
+ add r4d, 3
+
+.loopH:
+ movh t0, [srcq]
+ pshufb t0, t0, Tm0
+ pmaddubsw t0, coef2
+ phaddw t0, t0
+ psubw t0, t1
+ movlps [dstq], t0
+
+ lea srcq, [srcq + srcstrideq]
+ lea dstq, [dstq + dststrideq]
+
+ dec r4d
+ jnz .loopH
+ RET
+%endmacro
+
+FILTER_HORIZ_CHROMA_4xN 4, 2
+FILTER_HORIZ_CHROMA_4xN 4, 4
+FILTER_HORIZ_CHROMA_4xN 4, 8
+FILTER_HORIZ_CHROMA_4xN 4, 16
+
+FILTER_HORIZ_CHROMA_4xN 4, 32
+
+%macro PROCESS_CHROMA_W6 3
+ movu %1, [srcq]
+ pshufb %2, %1, Tm0
+ pmaddubsw %2, coef2
+ pshufb %1, %1, Tm1
+ pmaddubsw %1, coef2
+ phaddw %2, %1
+ psubw %2, %3
+ movh [dstq], %2
+ pshufd %2, %2, 2
+ movd [dstq + 8], %2
+%endmacro
+
+%macro PROCESS_CHROMA_W12 3
+ movu %1, [srcq]
+ pshufb %2, %1, Tm0
+ pmaddubsw %2, coef2
+ pshufb %1, %1, Tm1
+ pmaddubsw %1, coef2
+ phaddw %2, %1
+ psubw %2, %3
+ movu [dstq], %2
+ movu %1, [srcq + 8]
+ pshufb %1, %1, Tm0
+ pmaddubsw %1, coef2
+ phaddw %1, %1
+ psubw %1, %3
+ movh [dstq + 16], %1
+%endmacro
+
+;-----------------------------------------------------------------------------------------------------------------------------
+; void interp_4tap_horiz_ps_6x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+;-----------------------------------------------------------------------------------------------------------------------------
+%macro FILTER_HORIZ_CHROMA 2
+INIT_XMM sse4
+cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 6, src, srcstride, dst, dststride
+%define coef2 m5
+%define Tm0 m4
+%define Tm1 m3
+%define t2 m2
+%define t1 m1
+%define t0 m0
+
+ dec srcq
+ mov r4d, r4m
+ add dststrided, dststrided
+
+%ifdef PIC
+ lea r6, [tab_ChromaCoeff]
+ movd coef2, [r6 + r4 * 4]
+%else
+ movd coef2, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ pshufd coef2, coef2, 0
+ mova t2, [pw_2000]
+ mova Tm0, [tab_Tm]
+ mova Tm1, [tab_Tm + 16]
+
+ mov r4d, %2
+ cmp r5m, byte 0
+ je .loopH
+ sub srcq, srcstrideq
+ add r4d, 3
+
+.loopH:
+ PROCESS_CHROMA_W%1 t0, t1, t2
+ add srcq, srcstrideq
+ add dstq, dststrideq
+
+ dec r4d
+ jnz .loopH
+
+ RET
+%endmacro
+
+FILTER_HORIZ_CHROMA 6, 8
+FILTER_HORIZ_CHROMA 12, 16
+
+FILTER_HORIZ_CHROMA 6, 16
+FILTER_HORIZ_CHROMA 12, 32
+
+%macro PROCESS_CHROMA_W8 3
+ movu %1, [srcq]
+ pshufb %2, %1, Tm0
+ pmaddubsw %2, coef2
+ pshufb %1, %1, Tm1
+ pmaddubsw %1, coef2
+ phaddw %2, %1
+ psubw %2, %3
+ movu [dstq], %2
+%endmacro
+
+;-----------------------------------------------------------------------------------------------------------------------------
+; void interp_4tap_horiz_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+;-----------------------------------------------------------------------------------------------------------------------------
+%macro FILTER_HORIZ_CHROMA_8xN 2
+INIT_XMM sse4
+cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 6, src, srcstride, dst, dststride
+%define coef2 m5
+%define Tm0 m4
+%define Tm1 m3
+%define t2 m2
+%define t1 m1
+%define t0 m0
+
+ dec srcq
+ mov r4d, r4m
+ add dststrided, dststrided
+
+%ifdef PIC
+ lea r6, [tab_ChromaCoeff]
+ movd coef2, [r6 + r4 * 4]
+%else
+ movd coef2, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ pshufd coef2, coef2, 0
+ mova t2, [pw_2000]
+ mova Tm0, [tab_Tm]
+ mova Tm1, [tab_Tm + 16]
+
+ mov r4d, %2
+ cmp r5m, byte 0
+ je .loopH
+ sub srcq, srcstrideq
+ add r4d, 3
+
+.loopH:
+ PROCESS_CHROMA_W8 t0, t1, t2
+ add srcq, srcstrideq
+ add dstq, dststrideq
+
+ dec r4d
+ jnz .loopH
+
+ RET
+%endmacro
+
+FILTER_HORIZ_CHROMA_8xN 8, 2
+FILTER_HORIZ_CHROMA_8xN 8, 4
+FILTER_HORIZ_CHROMA_8xN 8, 6
+FILTER_HORIZ_CHROMA_8xN 8, 8
+FILTER_HORIZ_CHROMA_8xN 8, 16
+FILTER_HORIZ_CHROMA_8xN 8, 32
+
+FILTER_HORIZ_CHROMA_8xN 8, 12
+FILTER_HORIZ_CHROMA_8xN 8, 64
+
+%macro PROCESS_CHROMA_W16 4
+ movu %1, [srcq]
+ pshufb %2, %1, Tm0
+ pmaddubsw %2, coef2
+ pshufb %1, %1, Tm1
+ pmaddubsw %1, coef2
+ phaddw %2, %1
+ movu %1, [srcq + 8]
+ pshufb %4, %1, Tm0
+ pmaddubsw %4, coef2
+ pshufb %1, %1, Tm1
+ pmaddubsw %1, coef2
+ phaddw %4, %1
+ psubw %2, %3
+ psubw %4, %3
+ movu [dstq], %2
+ movu [dstq + 16], %4
+%endmacro
+
+%macro PROCESS_CHROMA_W24 4
+ movu %1, [srcq]
+ pshufb %2, %1, Tm0
+ pmaddubsw %2, coef2
+ pshufb %1, %1, Tm1
+ pmaddubsw %1, coef2
+ phaddw %2, %1
+ movu %1, [srcq + 8]
+ pshufb %4, %1, Tm0
+ pmaddubsw %4, coef2
+ pshufb %1, %1, Tm1
+ pmaddubsw %1, coef2
+ phaddw %4, %1
+ psubw %2, %3
+ psubw %4, %3
+ movu [dstq], %2
+ movu [dstq + 16], %4
+ movu %1, [srcq + 16]
+ pshufb %2, %1, Tm0
+ pmaddubsw %2, coef2
+ pshufb %1, %1, Tm1
+ pmaddubsw %1, coef2
+ phaddw %2, %1
+ psubw %2, %3
+ movu [dstq + 32], %2
+%endmacro
+
+%macro PROCESS_CHROMA_W32 4
+ movu %1, [srcq]
+ pshufb %2, %1, Tm0
+ pmaddubsw %2, coef2
+ pshufb %1, %1, Tm1
+ pmaddubsw %1, coef2
+ phaddw %2, %1
+ movu %1, [srcq + 8]
+ pshufb %4, %1, Tm0
+ pmaddubsw %4, coef2
+ pshufb %1, %1, Tm1
+ pmaddubsw %1, coef2
+ phaddw %4, %1
+ psubw %2, %3
+ psubw %4, %3
+ movu [dstq], %2
+ movu [dstq + 16], %4
+ movu %1, [srcq + 16]
+ pshufb %2, %1, Tm0
+ pmaddubsw %2, coef2
+ pshufb %1, %1, Tm1
+ pmaddubsw %1, coef2
+ phaddw %2, %1
+ movu %1, [srcq + 24]
+ pshufb %4, %1, Tm0
+ pmaddubsw %4, coef2
+ pshufb %1, %1, Tm1
+ pmaddubsw %1, coef2
+ phaddw %4, %1
+ psubw %2, %3
+ psubw %4, %3
+ movu [dstq + 32], %2
+ movu [dstq + 48], %4
+%endmacro
+
+%macro PROCESS_CHROMA_W16o 5
+ movu %1, [srcq + %5]
+ pshufb %2, %1, Tm0
+ pmaddubsw %2, coef2
+ pshufb %1, %1, Tm1
+ pmaddubsw %1, coef2
+ phaddw %2, %1
+ movu %1, [srcq + %5 + 8]
+ pshufb %4, %1, Tm0
+ pmaddubsw %4, coef2
+ pshufb %1, %1, Tm1
+ pmaddubsw %1, coef2
+ phaddw %4, %1
+ psubw %2, %3
+ psubw %4, %3
+ movu [dstq + %5 * 2], %2
+ movu [dstq + %5 * 2 + 16], %4
+%endmacro
+
+%macro PROCESS_CHROMA_W48 4
+ PROCESS_CHROMA_W16o %1, %2, %3, %4, 0
+ PROCESS_CHROMA_W16o %1, %2, %3, %4, 16
+ PROCESS_CHROMA_W16o %1, %2, %3, %4, 32
+%endmacro
+
+%macro PROCESS_CHROMA_W64 4
+ PROCESS_CHROMA_W16o %1, %2, %3, %4, 0
+ PROCESS_CHROMA_W16o %1, %2, %3, %4, 16
+ PROCESS_CHROMA_W16o %1, %2, %3, %4, 32
+ PROCESS_CHROMA_W16o %1, %2, %3, %4, 48
+%endmacro
+
+;------------------------------------------------------------------------------------------------------------------------------
+; void interp_4tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+;------------------------------------------------------------------------------------------------------------------------------
+%macro FILTER_HORIZ_CHROMA_WxN 2
+INIT_XMM sse4
+cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 7, src, srcstride, dst, dststride
+%define coef2 m6
+%define Tm0 m5
+%define Tm1 m4
+%define t3 m3
+%define t2 m2
+%define t1 m1
+%define t0 m0
+
+ dec srcq
+ mov r4d, r4m
+ add dststrided, dststrided
+
+%ifdef PIC
+ lea r6, [tab_ChromaCoeff]
+ movd coef2, [r6 + r4 * 4]
+%else
+ movd coef2, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ pshufd coef2, coef2, 0
+ mova t2, [pw_2000]
+ mova Tm0, [tab_Tm]
+ mova Tm1, [tab_Tm + 16]
+
+ mov r4d, %2
+ cmp r5m, byte 0
+ je .loopH
+ sub srcq, srcstrideq
+ add r4d, 3
+
+.loopH:
+ PROCESS_CHROMA_W%1 t0, t1, t2, t3
+ add srcq, srcstrideq
+ add dstq, dststrideq
+
+ dec r4d
+ jnz .loopH
+
+ RET
+%endmacro
+
+FILTER_HORIZ_CHROMA_WxN 16, 4
+FILTER_HORIZ_CHROMA_WxN 16, 8
+FILTER_HORIZ_CHROMA_WxN 16, 12
+FILTER_HORIZ_CHROMA_WxN 16, 16
+FILTER_HORIZ_CHROMA_WxN 16, 32
+FILTER_HORIZ_CHROMA_WxN 24, 32
+FILTER_HORIZ_CHROMA_WxN 32, 8
+FILTER_HORIZ_CHROMA_WxN 32, 16
+FILTER_HORIZ_CHROMA_WxN 32, 24
+FILTER_HORIZ_CHROMA_WxN 32, 32
+
+FILTER_HORIZ_CHROMA_WxN 16, 24
+FILTER_HORIZ_CHROMA_WxN 16, 64
+FILTER_HORIZ_CHROMA_WxN 24, 64
+FILTER_HORIZ_CHROMA_WxN 32, 48
+FILTER_HORIZ_CHROMA_WxN 32, 64
+
+FILTER_HORIZ_CHROMA_WxN 64, 64
+FILTER_HORIZ_CHROMA_WxN 64, 32
+FILTER_HORIZ_CHROMA_WxN 64, 48
+FILTER_HORIZ_CHROMA_WxN 48, 64
+FILTER_HORIZ_CHROMA_WxN 64, 16
+
+
+;---------------------------------------------------------------------------------------------------------------
+; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;---------------------------------------------------------------------------------------------------------------
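+; 16 columns per .loopW pass and two output rows per .loop iteration: source
+; rows n-1..n+2 are combined with the byte coefficient pairs from tab_Vm
+; (punpcklbw + pmaddubsw), then pw_2000 (8192) is subtracted before the int16
+; stores, matching the intermediate format of the horizontal ps kernels above.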
+%macro FILTER_V_PS_W16n 2
+INIT_XMM sse4
+cglobal interp_4tap_vert_ps_%1x%2, 4, 7, 8
+
+ mov r4d, r4m
+ sub r0, r1
+ add r3d, r3d
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff]
+ movd m0, [r5 + r4 * 4]
+%else
+ movd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ pshufb m1, m0, [tab_Vm]
+ pshufb m0, [tab_Vm + 16]
+ mov r4d, %2/2
+
+.loop:
+
+ mov r6d, %1/16
+
+.loopW:
+
+ movu m2, [r0]
+ movu m3, [r0 + r1]
+
+ punpcklbw m4, m2, m3
+ punpckhbw m2, m3
+
+ pmaddubsw m4, m1
+ pmaddubsw m2, m1
+
+ lea r5, [r0 + 2 * r1]
+ movu m5, [r5]
+ movu m7, [r5 + r1]
+
+ punpcklbw m6, m5, m7
+ pmaddubsw m6, m0
+ paddw m4, m6
+
+ punpckhbw m6, m5, m7
+ pmaddubsw m6, m0
+ paddw m2, m6
+
+ mova m6, [pw_2000]
+
+ psubw m4, m6
+ psubw m2, m6
+
+ movu [r2], m4
+ movu [r2 + 16], m2
+
+ punpcklbw m4, m3, m5
+ punpckhbw m3, m5
+
+ pmaddubsw m4, m1
+ pmaddubsw m3, m1
+
+ movu m5, [r5 + 2 * r1]
+
+ punpcklbw m2, m7, m5
+ punpckhbw m7, m5
+
+ pmaddubsw m2, m0
+ pmaddubsw m7, m0
+
+ paddw m4, m2
+ paddw m3, m7
+
+ psubw m4, m6
+ psubw m3, m6
+
+ movu [r2 + r3], m4
+ movu [r2 + r3 + 16], m3
+
+ add r0, 16
+ add r2, 32
+ dec r6d
+ jnz .loopW
+
+ lea r0, [r0 + r1 * 2 - %1]
+ lea r2, [r2 + r3 * 2 - %1 * 2]
+
+ dec r4d
+ jnz .loop
+ RET
+%endmacro
+
+FILTER_V_PS_W16n 64, 64
+FILTER_V_PS_W16n 64, 32
+FILTER_V_PS_W16n 64, 48
+FILTER_V_PS_W16n 48, 64
+FILTER_V_PS_W16n 64, 16
+
+
+;------------------------------------------------------------------------------------------------------------
+; void interp_4tap_vert_ps_2x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;------------------------------------------------------------------------------------------------------------
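+; 2-wide case: tab_Cm reorders the four filter taps so that, after two levels
+; of punpcklbw, each pmaddubsw produces the partial sums of one output row and
+; a single phaddw finishes two rows at once; pw_2000 (8192) is subtracted
+; before the movd/pextrd stores.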
+INIT_XMM sse4
+cglobal interp_4tap_vert_ps_2x4, 4, 6, 7
+
+ mov r4d, r4m
+ sub r0, r1
+ add r3d, r3d
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff]
+ movd m0, [r5 + r4 * 4]
+%else
+ movd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ pshufb m0, [tab_Cm]
+
+ lea r5, [3 * r1]
+
+ movd m2, [r0]
+ movd m3, [r0 + r1]
+ movd m4, [r0 + 2 * r1]
+ movd m5, [r0 + r5]
+
+ punpcklbw m2, m3
+ punpcklbw m6, m4, m5
+ punpcklbw m2, m6
+
+ pmaddubsw m2, m0
+
+ lea r0, [r0 + 4 * r1]
+ movd m6, [r0]
+
+ punpcklbw m3, m4
+ punpcklbw m1, m5, m6
+ punpcklbw m3, m1
+
+ pmaddubsw m3, m0
+ phaddw m2, m3
+
+ mova m1, [pw_2000]
+
+ psubw m2, m1
+
+ movd [r2], m2
+ pextrd [r2 + r3], m2, 2
+
+ movd m2, [r0 + r1]
+
+ punpcklbw m4, m5
+ punpcklbw m3, m6, m2
+ punpcklbw m4, m3
+
+ pmaddubsw m4, m0
+
+ movd m3, [r0 + 2 * r1]
+
+ punpcklbw m5, m6
+ punpcklbw m2, m3
+ punpcklbw m5, m2
+
+ pmaddubsw m5, m0
+ phaddw m4, m5
+ psubw m4, m1
+
+ lea r2, [r2 + 2 * r3]
+ movd [r2], m4
+ pextrd [r2 + r3], m4, 2
+
+ RET
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_4tap_vert_ps_2x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
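+; Looped variant of the 2x4 routine above: the same four-row interleave is
+; repeated %2/4 times, advancing four source rows and four dst rows per pass.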
+%macro FILTER_V_PS_W2 2
+INIT_XMM sse4
+cglobal interp_4tap_vert_ps_2x%2, 4, 6, 8
+
+ mov r4d, r4m
+ sub r0, r1
+ add r3d, r3d
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeff]
+ movd m0, [r5 + r4 * 4]
+%else
+ movd m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ pshufb m0, [tab_Cm]
+
+ mova m1, [pw_2000]
+ lea r5, [3 * r1]
+ mov r4d, %2/4
+.loop:
+ movd m2, [r0]
+ movd m3, [r0 + r1]
+ movd m4, [r0 + 2 * r1]
+ movd m5, [r0 + r5]
+
+ punpcklbw m2, m3
+ punpcklbw m6, m4, m5
+ punpcklbw m2, m6
+
+ pmaddubsw m2, m0
+
+ lea r0, [r0 + 4 * r1]
+ movd m6, [r0]
+
+ punpcklbw m3, m4
+ punpcklbw m7, m5, m6
+ punpcklbw m3, m7
+
+ pmaddubsw m3, m0
+
+ phaddw m2, m3
+ psubw m2, m1
+
+ movd [r2], m2
+ pshufd m2, m2, 2
+ movd [r2 + r3], m2
+
+ movd m2, [r0 + r1]
+
+ punpcklbw m4, m5
+ punpcklbw m3, m6, m2
+ punpcklbw m4, m3
+
+ pmaddubsw m4, m0
+
+ movd m3, [r0 + 2 * r1]
+
+ punpcklbw m5, m6
+ punpcklbw m2, m3
+ punpcklbw m5, m2
+
+ pmaddubsw m5, m0
+
+ phaddw m4, m5
+
+ psubw m4, m1
+
+ lea r2, [r2 + 2 * r3]
+ movd [r2], m4
+ pshufd m4, m4, 2
+ movd [r2 + r3], m4
+
+ lea r2, [r2 + 2 * r3]
+
+ dec r4d
+ jnz .loop
+
+ RET
+%endmacro
+
+FILTER_V_PS_W2 2, 8
+
+FILTER_V_PS_W2 2, 16
+
+;-----------------------------------------------------------------------------------------------------------------
+; void interp_4tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------------------------------------------
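+; ss variant: src and dst are both int16, so the taps are the word pairs of
+; tab_ChromaCoeffV (coeffIdx * 32 bytes, hence the shl r4d, 5).  The helper
+; macro leaves four rows of four 32-bit sums in m0-m3; per sample this is
+; roughly (scalar sketch only; s[] are source rows around the output row and
+; saturate_int16 is just shorthand for the packssdw clamp):
+;     sum = c0*s[-1] + c1*s[0] + c2*s[+1] + c3*s[+2];
+;     dst = saturate_int16(sum >> 6);       /* psrad 6 + packssdw */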
+%macro FILTER_VER_CHROMA_SS 2
+INIT_XMM sse2
+cglobal interp_4tap_vert_ss_%1x%2, 5, 7, 6, 0-gprsize
+
+ add r1d, r1d
+ add r3d, r3d
+ sub r0, r1
+ shl r4d, 5
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeffV]
+ lea r6, [r5 + r4]
+%else
+ lea r6, [tab_ChromaCoeffV + r4]
+%endif
+
+ mov dword [rsp], %2/4
+
+.loopH:
+ mov r4d, (%1/4)
+.loopW:
+ PROCESS_CHROMA_SP_W4_4R
+
+ psrad m0, 6
+ psrad m1, 6
+ psrad m2, 6
+ psrad m3, 6
+
+ packssdw m0, m1
+ packssdw m2, m3
+
+ movlps [r2], m0
+ movhps [r2 + r3], m0
+ lea r5, [r2 + 2 * r3]
+ movlps [r5], m2
+ movhps [r5 + r3], m2
+
+ lea r5, [4 * r1 - 2 * 4]
+ sub r0, r5
+ add r2, 2 * 4
+
+ dec r4d
+ jnz .loopW
+
+ lea r0, [r0 + 4 * r1 - 2 * %1]
+ lea r2, [r2 + 4 * r3 - 2 * %1]
+
+ dec dword [rsp]
+ jnz .loopH
+
+ RET
+%endmacro
+
+ FILTER_VER_CHROMA_SS 4, 4
+ FILTER_VER_CHROMA_SS 4, 8
+ FILTER_VER_CHROMA_SS 16, 16
+ FILTER_VER_CHROMA_SS 16, 8
+ FILTER_VER_CHROMA_SS 16, 12
+ FILTER_VER_CHROMA_SS 12, 16
+ FILTER_VER_CHROMA_SS 16, 4
+ FILTER_VER_CHROMA_SS 4, 16
+ FILTER_VER_CHROMA_SS 32, 32
+ FILTER_VER_CHROMA_SS 32, 16
+ FILTER_VER_CHROMA_SS 16, 32
+ FILTER_VER_CHROMA_SS 32, 24
+ FILTER_VER_CHROMA_SS 24, 32
+ FILTER_VER_CHROMA_SS 32, 8
+
+ FILTER_VER_CHROMA_SS 16, 24
+ FILTER_VER_CHROMA_SS 12, 32
+ FILTER_VER_CHROMA_SS 4, 32
+ FILTER_VER_CHROMA_SS 32, 64
+ FILTER_VER_CHROMA_SS 16, 64
+ FILTER_VER_CHROMA_SS 32, 48
+ FILTER_VER_CHROMA_SS 24, 64
+
+ FILTER_VER_CHROMA_SS 64, 64
+ FILTER_VER_CHROMA_SS 64, 32
+ FILTER_VER_CHROMA_SS 64, 48
+ FILTER_VER_CHROMA_SS 48, 64
+ FILTER_VER_CHROMA_SS 64, 16
+
+
+;---------------------------------------------------------------------------------------------------------------------
+; void interp_4tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;---------------------------------------------------------------------------------------------------------------------
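+; 2-wide ss path: PROCESS_CHROMA_SP_W2_4R leaves four rows of two 32-bit sums
+; in m0/m2; they are shifted right by 6, saturated with packssdw and written
+; out two samples at a time with movd/pextrd.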
+%macro FILTER_VER_CHROMA_SS_W2_4R 2
+INIT_XMM sse4
+cglobal interp_4tap_vert_ss_%1x%2, 5, 6, 5
+
+ add r1d, r1d
+ add r3d, r3d
+ sub r0, r1
+ shl r4d, 5
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeffV]
+ lea r5, [r5 + r4]
+%else
+ lea r5, [tab_ChromaCoeffV + r4]
+%endif
+
+ mov r4d, (%2/4)
+
+.loopH:
+ PROCESS_CHROMA_SP_W2_4R r5
+
+ psrad m0, 6
+ psrad m2, 6
+
+ packssdw m0, m2
+
+ movd [r2], m0
+ pextrd [r2 + r3], m0, 1
+ lea r2, [r2 + 2 * r3]
+ pextrd [r2], m0, 2
+ pextrd [r2 + r3], m0, 3
+
+ lea r2, [r2 + 2 * r3]
+
+ dec r4d
+ jnz .loopH
+
+ RET
+%endmacro
+
+FILTER_VER_CHROMA_SS_W2_4R 2, 4
+FILTER_VER_CHROMA_SS_W2_4R 2, 8
+
+FILTER_VER_CHROMA_SS_W2_4R 2, 16
+
+;---------------------------------------------------------------------------------------------------------------
+; void interp_4tap_vert_ss_4x2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;---------------------------------------------------------------------------------------------------------------
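+; Single 4x2 tile: the two output rows are accumulated directly in m0/m1 (see
+; the per-row comments below), shifted right by 6 and saturated with packssdw.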
+INIT_XMM sse2
+cglobal interp_4tap_vert_ss_4x2, 5, 6, 4
+
+ add r1d, r1d
+ add r3d, r3d
+ sub r0, r1
+ shl r4d, 5
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeffV]
+ lea r5, [r5 + r4]
+%else
+ lea r5, [tab_ChromaCoeffV + r4]
+%endif
+
+ movq m0, [r0]
+ movq m1, [r0 + r1]
+ punpcklwd m0, m1 ;m0=[0 1]
+ pmaddwd m0, [r5 + 0 * 16] ;m0=[0+1] Row1
+
+ lea r0, [r0 + 2 * r1]
+ movq m2, [r0]
+ punpcklwd m1, m2 ;m1=[1 2]
+ pmaddwd m1, [r5 + 0 * 16] ;m1=[1+2] Row2
+
+ movq m3, [r0 + r1]
+ punpcklwd m2, m3 ;m2=[2 3]
+ pmaddwd m2, [r5 + 1 * 16]
+ paddd m0, m2 ;m0=[0+1+2+3] Row1 done
+ psrad m0, 6
+
+ movq m2, [r0 + 2 * r1]
+ punpcklwd m3, m2 ;m3=[3 4]
+ pmaddwd m3, [r5 + 1 * 16]
+ paddd m1, m3 ;m1=[1+2+3+4] Row2 done
+ psrad m1, 6
+
+ packssdw m0, m1
+
+ movlps [r2], m0
+ movhps [r2 + r3], m0
+
+ RET
+
+;-------------------------------------------------------------------------------------------------------------------
+; void interp_4tap_vert_ss_6x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------------
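+; The 6-wide block is handled as a 4-wide column (PROCESS_CHROMA_SP_W4_4R)
+; plus a 2-wide column (PROCESS_CHROMA_SP_W2_4R) over the same four rows; the
+; lea/sub between the two halves rewinds src by four rows and steps it eight
+; bytes (four int16 samples) to the right.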
+%macro FILTER_VER_CHROMA_SS_W6_H4 2
+INIT_XMM sse4
+cglobal interp_4tap_vert_ss_6x%2, 5, 7, 6
+
+ add r1d, r1d
+ add r3d, r3d
+ sub r0, r1
+ shl r4d, 5
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeffV]
+ lea r6, [r5 + r4]
+%else
+ lea r6, [tab_ChromaCoeffV + r4]
+%endif
+
+ mov r4d, %2/4
+
+.loopH:
+ PROCESS_CHROMA_SP_W4_4R
+
+ psrad m0, 6
+ psrad m1, 6
+ psrad m2, 6
+ psrad m3, 6
+
+ packssdw m0, m1
+ packssdw m2, m3
+
+ movlps [r2], m0
+ movhps [r2 + r3], m0
+ lea r5, [r2 + 2 * r3]
+ movlps [r5], m2
+ movhps [r5 + r3], m2
+
+ lea r5, [4 * r1 - 2 * 4]
+ sub r0, r5
+ add r2, 2 * 4
+
+ PROCESS_CHROMA_SP_W2_4R r6
+
+ psrad m0, 6
+ psrad m2, 6
+
+ packssdw m0, m2
+
+ movd [r2], m0
+ pextrd [r2 + r3], m0, 1
+ lea r2, [r2 + 2 * r3]
+ pextrd [r2], m0, 2
+ pextrd [r2 + r3], m0, 3
+
+ sub r0, 2 * 4
+ lea r2, [r2 + 2 * r3 - 2 * 4]
+
+ dec r4d
+ jnz .loopH
+
+ RET
+%endmacro
+
+FILTER_VER_CHROMA_SS_W6_H4 6, 8
+
+FILTER_VER_CHROMA_SS_W6_H4 6, 16
+
+
+;----------------------------------------------------------------------------------------------------------------
+; void interp_4tap_vert_ss_8x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;----------------------------------------------------------------------------------------------------------------
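+; 8-wide ss path: PROCESS_CHROMA_SP_W8_2R computes two rows of eight 32-bit
+; sums (m0/m1 and m2/m3), which are shifted right by 6 and saturated to int16
+; before the two full-row stores.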
+%macro FILTER_VER_CHROMA_SS_W8_H2 2
+INIT_XMM sse2
+cglobal interp_4tap_vert_ss_%1x%2, 5, 6, 7
+
+ add r1d, r1d
+ add r3d, r3d
+ sub r0, r1
+ shl r4d, 5
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeffV]
+ lea r5, [r5 + r4]
+%else
+ lea r5, [tab_ChromaCoeffV + r4]
+%endif
+
+ mov r4d, %2/2
+.loopH:
+ PROCESS_CHROMA_SP_W8_2R
+
+ psrad m0, 6
+ psrad m1, 6
+ psrad m2, 6
+ psrad m3, 6
+
+ packssdw m0, m1
+ packssdw m2, m3
+
+ movu [r2], m0
+ movu [r2 + r3], m2
+
+ lea r2, [r2 + 2 * r3]
+
+ dec r4d
+ jnz .loopH
+
+ RET
+%endmacro
+
+FILTER_VER_CHROMA_SS_W8_H2 8, 2
+FILTER_VER_CHROMA_SS_W8_H2 8, 4
+FILTER_VER_CHROMA_SS_W8_H2 8, 6
+FILTER_VER_CHROMA_SS_W8_H2 8, 8
+FILTER_VER_CHROMA_SS_W8_H2 8, 16
+FILTER_VER_CHROMA_SS_W8_H2 8, 32
+
+FILTER_VER_CHROMA_SS_W8_H2 8, 12
+FILTER_VER_CHROMA_SS_W8_H2 8, 64
+
+;-----------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------------------------------------------
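+; 8-tap ss variant: coeffIdx selects 64 bytes of tab_LumaCoeffV (shl r4d, 6),
+; i.e. four 16-byte blocks of replicated tap pairs.  The inner loop slides a
+; window of row pairs and accumulates four output rows (Row1-Row4) of four
+; samples each; per sample this is roughly (scalar sketch only, saturate_int16
+; being shorthand for the packssdw clamp):
+;     sum = c0*s[-3] + c1*s[-2] + ... + c7*s[+4];
+;     dst = saturate_int16(sum >> 6);       /* psrad 6 + packssdw */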
+%macro FILTER_VER_LUMA_SS 2
+INIT_XMM sse2
+cglobal interp_8tap_vert_ss_%1x%2, 5, 7, 7, 0-gprsize
+
+ add r1d, r1d
+ add r3d, r3d
+ lea r5, [3 * r1]
+ sub r0, r5
+ shl r4d, 6
+
+%ifdef PIC
+ lea r5, [tab_LumaCoeffV]
+ lea r6, [r5 + r4]
+%else
+ lea r6, [tab_LumaCoeffV + r4]
+%endif
+
+ mov dword [rsp], %2/4
+.loopH:
+ mov r4d, (%1/4)
+.loopW:
+ movq m0, [r0]
+ movq m1, [r0 + r1]
+ punpcklwd m0, m1 ;m0=[0 1]
+ pmaddwd m0, [r6 + 0 * 16] ;m0=[0+1] Row1
+
+ lea r0, [r0 + 2 * r1]
+ movq m4, [r0]
+ punpcklwd m1, m4 ;m1=[1 2]
+ pmaddwd m1, [r6 + 0 * 16] ;m1=[1+2] Row2
+
+ movq m5, [r0 + r1]
+ punpcklwd m4, m5 ;m4=[2 3]
+ pmaddwd m2, m4, [r6 + 0 * 16] ;m2=[2+3] Row3
+ pmaddwd m4, [r6 + 1 * 16]
+ paddd m0, m4 ;m0=[0+1+2+3] Row1
+
+ lea r0, [r0 + 2 * r1]
+ movq m4, [r0]
+ punpcklwd m5, m4 ;m5=[3 4]
+ pmaddwd m3, m5, [r6 + 0 * 16] ;m3=[3+4] Row4
+ pmaddwd m5, [r6 + 1 * 16]
+ paddd m1, m5 ;m1 = [1+2+3+4] Row2
+
+ movq m5, [r0 + r1]
+ punpcklwd m4, m5 ;m4=[4 5]
+ pmaddwd m6, m4, [r6 + 1 * 16]
+ paddd m2, m6 ;m2=[2+3+4+5] Row3
+ pmaddwd m4, [r6 + 2 * 16]
+ paddd m0, m4 ;m0=[0+1+2+3+4+5] Row1
+
+ lea r0, [r0 + 2 * r1]
+ movq m4, [r0]
+ punpcklwd m5, m4 ;m5=[5 6]
+ pmaddwd m6, m5, [r6 + 1 * 16]
+ paddd m3, m6 ;m3=[3+4+5+6] Row4
+ pmaddwd m5, [r6 + 2 * 16]
+ paddd m1, m5 ;m1=[1+2+3+4+5+6] Row2
+
+ movq m5, [r0 + r1]
+ punpcklwd m4, m5 ;m4=[6 7]
+ pmaddwd m6, m4, [r6 + 2 * 16]
+ paddd m2, m6 ;m2=[2+3+4+5+6+7] Row3
+ pmaddwd m4, [r6 + 3 * 16]
+ paddd m0, m4 ;m0=[0+1+2+3+4+5+6+7] Row1 end
+ psrad m0, 6
+
+ lea r0, [r0 + 2 * r1]
+ movq m4, [r0]
+ punpcklwd m5, m4 ;m5=[7 8]
+ pmaddwd m6, m5, [r6 + 2 * 16]
+ paddd m3, m6 ;m3=[3+4+5+6+7+8] Row4
+ pmaddwd m5, [r6 + 3 * 16]
+ paddd m1, m5 ;m1=[1+2+3+4+5+6+7+8] Row2 end
+ psrad m1, 6
+
+ packssdw m0, m1
+
+ movlps [r2], m0
+ movhps [r2 + r3], m0
+
+ movq m5, [r0 + r1]
+ punpcklwd m4, m5 ;m4=[8 9]
+ pmaddwd m4, [r6 + 3 * 16]
+ paddd m2, m4 ;m2=[2+3+4+5+6+7+8+9] Row3 end
+ psrad m2, 6
+
+ movq m4, [r0 + 2 * r1]
+ punpcklwd m5, m4 ;m5=[9 10]
+ pmaddwd m5, [r6 + 3 * 16]
+ paddd m3, m5 ;m3=[3+4+5+6+7+8+9+10] Row4 end
+ psrad m3, 6
+
+ packssdw m2, m3
+
+ movlps [r2 + 2 * r3], m2
+ lea r5, [3 * r3]
+ movhps [r2 + r5], m2
+
+ lea r5, [8 * r1 - 2 * 4]
+ sub r0, r5
+ add r2, 2 * 4
+
+ dec r4d
+ jnz .loopW
+
+ lea r0, [r0 + 4 * r1 - 2 * %1]
+ lea r2, [r2 + 4 * r3 - 2 * %1]
+
+ dec dword [rsp]
+ jnz .loopH
+
+ RET
+%endmacro
+
+ FILTER_VER_LUMA_SS 4, 4
+ FILTER_VER_LUMA_SS 8, 8
+ FILTER_VER_LUMA_SS 8, 4
+ FILTER_VER_LUMA_SS 4, 8
+ FILTER_VER_LUMA_SS 16, 16
+ FILTER_VER_LUMA_SS 16, 8
+ FILTER_VER_LUMA_SS 8, 16
+ FILTER_VER_LUMA_SS 16, 12
+ FILTER_VER_LUMA_SS 12, 16
+ FILTER_VER_LUMA_SS 16, 4
+ FILTER_VER_LUMA_SS 4, 16
+ FILTER_VER_LUMA_SS 32, 32
+ FILTER_VER_LUMA_SS 32, 16
+ FILTER_VER_LUMA_SS 16, 32
+ FILTER_VER_LUMA_SS 32, 24
+ FILTER_VER_LUMA_SS 24, 32
+ FILTER_VER_LUMA_SS 32, 8
+ FILTER_VER_LUMA_SS 8, 32
+ FILTER_VER_LUMA_SS 64, 64
+ FILTER_VER_LUMA_SS 64, 32
+ FILTER_VER_LUMA_SS 32, 64
+ FILTER_VER_LUMA_SS 64, 48
+ FILTER_VER_LUMA_SS 48, 64
+ FILTER_VER_LUMA_SS 64, 16
+ FILTER_VER_LUMA_SS 16, 64