<div dir="ltr">Please ignore this patch; I will clean it up and resend it.<div><br></div><div>Regards,</div><div>Divya</div></div><div class="gmail_extra"><br><div class="gmail_quote">On Tue, Nov 11, 2014 at 2:13 PM, <span dir="ltr">&lt;<a href="mailto:divya@multicorewareinc.com" target="_blank">divya@multicorewareinc.com</a>&gt;</span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"># HG changeset patch<br>
# User Divya Manivannan<br>
# Date 1415694311 -19800<br>
# Tue Nov 11 13:55:11 2014 +0530<br>
# Node ID 6adafe6ef2868b28b74c66f1ef82cf3cec6bb2a7<br>
# Parent a4c68926ff170d619c26bb78c7a988fa5ad715db<br>
luma_hpp[8x8, 8x16, 8x32] avx2 asm code: improve 657c->567c, 1192c->1074c, 2602c->2113c<br>
<br>
diff -r a4c68926ff17 -r 6adafe6ef286 source/common/x86/ipfilter8.asm<br>
--- a/source/common/x86/ipfilter8.asm Mon Nov 10 12:28:06 2014 +0530<br>
+++ b/source/common/x86/ipfilter8.asm Tue Nov 11 13:55:11 2014 +0530<br>
@@ -1,862 +1,803 @@<br>
-;*****************************************************************************<br>
-;* Copyright (C) 2013 x265 project<br>
-;*<br>
-;* Authors: Min Chen &lt;<a href="mailto:chenm003@163.com">chenm003@163.com</a>&gt;<br>
-;* Nabajit Deka &lt;<a href="mailto:nabajit@multicorewareinc.com">nabajit@multicorewareinc.com</a>&gt;<br>
-;* Praveen Kumar Tiwari &lt;<a href="mailto:praveen@multicorewareinc.com">praveen@multicorewareinc.com</a>&gt;<br>
-;*<br>
-;* This program is free software; you can redistribute it and/or modify<br>
-;* it under the terms of the GNU General Public License as published by<br>
-;* the Free Software Foundation; either version 2 of the License, or<br>
-;* (at your option) any later version.<br>
-;*<br>
-;* This program is distributed in the hope that it will be useful,<br>
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of<br>
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the<br>
-;* GNU General Public License for more details.<br>
-;*<br>
-;* You should have received a copy of the GNU General Public License<br>
-;* along with this program; if not, write to the Free Software<br>
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.<br>
-;*<br>
-;* This program is also available under a commercial proprietary license.<br>
-;* For more information, contact us at license @ <a href="http://x265.com" target="_blank">x265.com</a>.<br>
-;*****************************************************************************/<br>
-<br>
-%include "x86inc.asm"<br>
-%include "x86util.asm"<br>
-<br>
-SECTION_RODATA 32<br>
-tab_Tm: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6<br>
- db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10<br>
- db 8, 9,10,11, 9,10,11,12,10,11,12,13,11,12,13, 14<br>
-<br>
-ALIGN 32<br>
-tab_Lm: db 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8<br>
- db 2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10<br>
- db 4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12<br>
- db 6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14<br>
-<br>
-tab_Vm: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1<br>
- db 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3<br>
-<br>
-tab_Cm: db 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3<br>
-<br>
-tab_c_526336: times 4 dd 8192*64+2048<br>
-<br>
-tab_ChromaCoeff: db 0, 64, 0, 0<br>
- db -2, 58, 10, -2<br>
- db -4, 54, 16, -2<br>
- db -6, 46, 28, -4<br>
- db -4, 36, 36, -4<br>
- db -4, 28, 46, -6<br>
- db -2, 16, 54, -4<br>
- db -2, 10, 58, -2<br>
-<br>
-tab_ChromaCoeffV: times 4 dw 0, 64<br>
- times 4 dw 0, 0<br>
-<br>
- times 4 dw -2, 58<br>
- times 4 dw 10, -2<br>
-<br>
- times 4 dw -4, 54<br>
- times 4 dw 16, -2<br>
-<br>
- times 4 dw -6, 46<br>
- times 4 dw 28, -4<br>
-<br>
- times 4 dw -4, 36<br>
- times 4 dw 36, -4<br>
-<br>
- times 4 dw -4, 28<br>
- times 4 dw 46, -6<br>
-<br>
- times 4 dw -2, 16<br>
- times 4 dw 54, -4<br>
-<br>
- times 4 dw -2, 10<br>
- times 4 dw 58, -2<br>
-<br>
-tab_LumaCoeff: db 0, 0, 0, 64, 0, 0, 0, 0<br>
- db -1, 4, -10, 58, 17, -5, 1, 0<br>
- db -1, 4, -11, 40, 40, -11, 4, -1<br>
- db 0, 1, -5, 17, 58, -10, 4, -1<br>
-<br>
-tab_LumaCoeffV: times 4 dw 0, 0<br>
- times 4 dw 0, 64<br>
- times 4 dw 0, 0<br>
- times 4 dw 0, 0<br>
-<br>
- times 4 dw -1, 4<br>
- times 4 dw -10, 58<br>
- times 4 dw 17, -5<br>
- times 4 dw 1, 0<br>
-<br>
- times 4 dw -1, 4<br>
- times 4 dw -11, 40<br>
- times 4 dw 40, -11<br>
- times 4 dw 4, -1<br>
-<br>
- times 4 dw 0, 1<br>
- times 4 dw -5, 17<br>
- times 4 dw 58, -10<br>
- times 4 dw 4, -1<br>
-<br>
-tab_LumaCoeffVer: times 8 db 0, 0<br>
- times 8 db 0, 64<br>
- times 8 db 0, 0<br>
- times 8 db 0, 0<br>
-<br>
- times 8 db -1, 4<br>
- times 8 db -10, 58<br>
- times 8 db 17, -5<br>
- times 8 db 1, 0<br>
-<br>
- times 8 db -1, 4<br>
- times 8 db -11, 40<br>
- times 8 db 40, -11<br>
- times 8 db 4, -1<br>
-<br>
- times 8 db 0, 1<br>
- times 8 db -5, 17<br>
- times 8 db 58, -10<br>
- times 8 db 4, -1<br>
-<br>
-tab_c_64_n64: times 8 db 64, -64<br>
-<br>
-shuf1: times 2 db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15<br>
-<br>
-SECTION .text<br>
-<br>
-cextern idct4_shuf1<br>
-cextern pb_128<br>
-cextern pw_1<br>
-cextern pw_512<br>
-cextern pw_2000<br>
-<br>
-%macro FILTER_H4_w2_2 3<br>
- movh %2, [srcq - 1]<br>
- pshufb %2, %2, Tm0<br>
- movh %1, [srcq + srcstrideq - 1]<br>
- pshufb %1, %1, Tm0<br>
- punpcklqdq %2, %1<br>
- pmaddubsw %2, coef2<br>
- phaddw %2, %2<br>
- pmulhrsw %2, %3<br>
- packuswb %2, %2<br>
- movd r4, %2<br>
- mov [dstq], r4w<br>
- shr r4, 16<br>
- mov [dstq + dststrideq], r4w<br>
-%endmacro<br>
-<br>
-;-----------------------------------------------------------------------------<br>
-; void interp_4tap_horiz_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-----------------------------------------------------------------------------<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_horiz_pp_2x4, 4, 6, 5, src, srcstride, dst, dststride<br>
-%define coef2 m4<br>
-%define Tm0 m3<br>
-%define t2 m2<br>
-%define t1 m1<br>
-%define t0 m0<br>
-<br>
-mov r4d, r4m<br>
-<br>
-%ifdef PIC<br>
-lea r5, [tab_ChromaCoeff]<br>
-movd coef2, [r5 + r4 * 4]<br>
-%else<br>
-movd coef2, [tab_ChromaCoeff + r4 * 4]<br>
-%endif<br>
-<br>
-pshufd coef2, coef2, 0<br>
-mova t2, [pw_512]<br>
-mova Tm0, [tab_Tm]<br>
-<br>
-%rep 2<br>
-FILTER_H4_w2_2 t0, t1, t2<br>
-lea srcq, [srcq + srcstrideq * 2]<br>
-lea dstq, [dstq + dststrideq * 2]<br>
-%endrep<br>
-<br>
-RET<br>
-<br>
-;-----------------------------------------------------------------------------<br>
-; void interp_4tap_horiz_pp_2x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-----------------------------------------------------------------------------<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_horiz_pp_2x8, 4, 6, 5, src, srcstride, dst, dststride<br>
-%define coef2 m4<br>
-%define Tm0 m3<br>
-%define t2 m2<br>
-%define t1 m1<br>
-%define t0 m0<br>
-<br>
-mov r4d, r4m<br>
-<br>
-%ifdef PIC<br>
-lea r5, [tab_ChromaCoeff]<br>
-movd coef2, [r5 + r4 * 4]<br>
-%else<br>
-movd coef2, [tab_ChromaCoeff + r4 * 4]<br>
-%endif<br>
-<br>
-pshufd coef2, coef2, 0<br>
-mova t2, [pw_512]<br>
-mova Tm0, [tab_Tm]<br>
-<br>
-%rep 4<br>
-FILTER_H4_w2_2 t0, t1, t2<br>
-lea srcq, [srcq + srcstrideq * 2]<br>
-lea dstq, [dstq + dststrideq * 2]<br>
-%endrep<br>
-<br>
-RET<br>
-<br>
-;-----------------------------------------------------------------------------<br>
-; void interp_4tap_horiz_pp_2x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-----------------------------------------------------------------------------<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_horiz_pp_2x16, 4, 6, 5, src, srcstride, dst, dststride<br>
-%define coef2 m4<br>
-%define Tm0 m3<br>
-%define t2 m2<br>
-%define t1 m1<br>
-%define t0 m0<br>
-<br>
-mov r4d, r4m<br>
-<br>
-%ifdef PIC<br>
-lea r5, [tab_ChromaCoeff]<br>
-movd coef2, [r5 + r4 * 4]<br>
-%else<br>
-movd coef2, [tab_ChromaCoeff + r4 * 4]<br>
-%endif<br>
-<br>
-pshufd coef2, coef2, 0<br>
-mova t2, [pw_512]<br>
-mova Tm0, [tab_Tm]<br>
-<br>
-mov r5d, 16/2<br>
-<br>
-.loop:<br>
-FILTER_H4_w2_2 t0, t1, t2<br>
-lea srcq, [srcq + srcstrideq * 2]<br>
-lea dstq, [dstq + dststrideq * 2]<br>
-dec r5d<br>
-jnz .loop<br>
-<br>
-RET<br>
-<br>
-%macro FILTER_H4_w4_2 3<br>
- movh %2, [srcq - 1]<br>
- pshufb %2, %2, Tm0<br>
- pmaddubsw %2, coef2<br>
- movh %1, [srcq + srcstrideq - 1]<br>
- pshufb %1, %1, Tm0<br>
- pmaddubsw %1, coef2<br>
- phaddw %2, %1<br>
- pmulhrsw %2, %3<br>
- packuswb %2, %2<br>
- movd [dstq], %2<br>
- palignr %2, %2, 4<br>
- movd [dstq + dststrideq], %2<br>
-%endmacro<br>
-<br>
-;-----------------------------------------------------------------------------<br>
-; void interp_4tap_horiz_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-----------------------------------------------------------------------------<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_horiz_pp_4x2, 4, 6, 5, src, srcstride, dst, dststride<br>
-%define coef2 m4<br>
-%define Tm0 m3<br>
-%define t2 m2<br>
-%define t1 m1<br>
-%define t0 m0<br>
-<br>
-mov r4d, r4m<br>
-<br>
-%ifdef PIC<br>
-lea r5, [tab_ChromaCoeff]<br>
-movd coef2, [r5 + r4 * 4]<br>
-%else<br>
-movd coef2, [tab_ChromaCoeff + r4 * 4]<br>
-%endif<br>
-<br>
-pshufd coef2, coef2, 0<br>
-mova t2, [pw_512]<br>
-mova Tm0, [tab_Tm]<br>
-<br>
-FILTER_H4_w4_2 t0, t1, t2<br>
-<br>
-RET<br>
-<br>
-;-----------------------------------------------------------------------------<br>
-; void interp_4tap_horiz_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-----------------------------------------------------------------------------<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_horiz_pp_4x4, 4, 6, 5, src, srcstride, dst, dststride<br>
-%define coef2 m4<br>
-%define Tm0 m3<br>
-%define t2 m2<br>
-%define t1 m1<br>
-%define t0 m0<br>
-<br>
-mov r4d, r4m<br>
-<br>
-%ifdef PIC<br>
-lea r5, [tab_ChromaCoeff]<br>
-movd coef2, [r5 + r4 * 4]<br>
-%else<br>
-movd coef2, [tab_ChromaCoeff + r4 * 4]<br>
-%endif<br>
-<br>
-pshufd coef2, coef2, 0<br>
-mova t2, [pw_512]<br>
-mova Tm0, [tab_Tm]<br>
-<br>
-%rep 2<br>
-FILTER_H4_w4_2 t0, t1, t2<br>
-lea srcq, [srcq + srcstrideq * 2]<br>
-lea dstq, [dstq + dststrideq * 2]<br>
-%endrep<br>
-<br>
-RET<br>
-<br>
-;-----------------------------------------------------------------------------<br>
-; void interp_4tap_horiz_pp_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-----------------------------------------------------------------------------<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_horiz_pp_4x8, 4, 6, 5, src, srcstride, dst, dststride<br>
-%define coef2 m4<br>
-%define Tm0 m3<br>
-%define t2 m2<br>
-%define t1 m1<br>
-%define t0 m0<br>
-<br>
-mov r4d, r4m<br>
-<br>
-%ifdef PIC<br>
-lea r5, [tab_ChromaCoeff]<br>
-movd coef2, [r5 + r4 * 4]<br>
-%else<br>
-movd coef2, [tab_ChromaCoeff + r4 * 4]<br>
-%endif<br>
-<br>
-pshufd coef2, coef2, 0<br>
-mova t2, [pw_512]<br>
-mova Tm0, [tab_Tm]<br>
-<br>
-%rep 4<br>
-FILTER_H4_w4_2 t0, t1, t2<br>
-lea srcq, [srcq + srcstrideq * 2]<br>
-lea dstq, [dstq + dststrideq * 2]<br>
-%endrep<br>
-<br>
-RET<br>
-<br>
-;-----------------------------------------------------------------------------<br>
-; void interp_4tap_horiz_pp_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-----------------------------------------------------------------------------<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_horiz_pp_4x16, 4, 6, 5, src, srcstride, dst, dststride<br>
-%define coef2 m4<br>
-%define Tm0 m3<br>
-%define t2 m2<br>
-%define t1 m1<br>
-%define t0 m0<br>
-<br>
-mov r4d, r4m<br>
-<br>
-%ifdef PIC<br>
-lea r5, [tab_ChromaCoeff]<br>
-movd coef2, [r5 + r4 * 4]<br>
-%else<br>
-movd coef2, [tab_ChromaCoeff + r4 * 4]<br>
-%endif<br>
-<br>
-pshufd coef2, coef2, 0<br>
-mova t2, [pw_512]<br>
-mova Tm0, [tab_Tm]<br>
-<br>
-%rep 8<br>
-FILTER_H4_w4_2 t0, t1, t2<br>
-lea srcq, [srcq + srcstrideq * 2]<br>
-lea dstq, [dstq + dststrideq * 2]<br>
-%endrep<br>
-<br>
-RET<br>
-<br>
-;-----------------------------------------------------------------------------<br>
-; void interp_4tap_horiz_pp_4x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-----------------------------------------------------------------------------<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_horiz_pp_4x32, 4, 6, 5, src, srcstride, dst, dststride<br>
-%define coef2 m4<br>
-%define Tm0 m3<br>
-%define t2 m2<br>
-%define t1 m1<br>
-%define t0 m0<br>
-<br>
-mov r4d, r4m<br>
-<br>
-%ifdef PIC<br>
-lea r5, [tab_ChromaCoeff]<br>
-movd coef2, [r5 + r4 * 4]<br>
-%else<br>
-movd coef2, [tab_ChromaCoeff + r4 * 4]<br>
-%endif<br>
-<br>
-pshufd coef2, coef2, 0<br>
-mova t2, [pw_512]<br>
-mova Tm0, [tab_Tm]<br>
-<br>
-mov r5d, 32/2<br>
-<br>
-.loop:<br>
-FILTER_H4_w4_2 t0, t1, t2<br>
-lea srcq, [srcq + srcstrideq * 2]<br>
-lea dstq, [dstq + dststrideq * 2]<br>
-dec r5d<br>
-jnz .loop<br>
-<br>
-RET<br>
-<br>
-<br>
-%macro FILTER_H4_w6 3<br>
- movu %1, [srcq - 1]<br>
- pshufb %2, %1, Tm0<br>
- pmaddubsw %2, coef2<br>
- pshufb %1, %1, Tm1<br>
- pmaddubsw %1, coef2<br>
- phaddw %2, %1<br>
- pmulhrsw %2, %3<br>
- packuswb %2, %2<br>
- movd [dstq], %2<br>
- pextrw [dstq + 4], %2, 2<br>
-%endmacro<br>
-<br>
-%macro FILTER_H4_w8 3<br>
- movu %1, [srcq - 1]<br>
- pshufb %2, %1, Tm0<br>
- pmaddubsw %2, coef2<br>
- pshufb %1, %1, Tm1<br>
- pmaddubsw %1, coef2<br>
- phaddw %2, %1<br>
- pmulhrsw %2, %3<br>
- packuswb %2, %2<br>
- movh [dstq], %2<br>
-%endmacro<br>
-<br>
-%macro FILTER_H4_w12 3<br>
- movu %1, [srcq - 1]<br>
- pshufb %2, %1, Tm0<br>
- pmaddubsw %2, coef2<br>
- pshufb %1, %1, Tm1<br>
- pmaddubsw %1, coef2<br>
- phaddw %2, %1<br>
- pmulhrsw %2, %3<br>
- movu %1, [srcq - 1 + 8]<br>
- pshufb %1, %1, Tm0<br>
- pmaddubsw %1, coef2<br>
- phaddw %1, %1<br>
- pmulhrsw %1, %3<br>
- packuswb %2, %1<br>
- movh [dstq], %2<br>
- pextrd [dstq + 8], %2, 2<br>
-%endmacro<br>
-<br>
-%macro FILTER_H4_w16 4<br>
- movu %1, [srcq - 1]<br>
- pshufb %2, %1, Tm0<br>
- pmaddubsw %2, coef2<br>
- pshufb %1, %1, Tm1<br>
- pmaddubsw %1, coef2<br>
- phaddw %2, %1<br>
- movu %1, [srcq - 1 + 8]<br>
- pshufb %4, %1, Tm0<br>
- pmaddubsw %4, coef2<br>
- pshufb %1, %1, Tm1<br>
- pmaddubsw %1, coef2<br>
- phaddw %4, %1<br>
- pmulhrsw %2, %3<br>
- pmulhrsw %4, %3<br>
- packuswb %2, %4<br>
- movu [dstq], %2<br>
-%endmacro<br>
-<br>
-%macro FILTER_H4_w24 4<br>
- movu %1, [srcq - 1]<br>
- pshufb %2, %1, Tm0<br>
- pmaddubsw %2, coef2<br>
- pshufb %1, %1, Tm1<br>
- pmaddubsw %1, coef2<br>
- phaddw %2, %1<br>
- movu %1, [srcq - 1 + 8]<br>
- pshufb %4, %1, Tm0<br>
- pmaddubsw %4, coef2<br>
- pshufb %1, %1, Tm1<br>
- pmaddubsw %1, coef2<br>
- phaddw %4, %1<br>
- pmulhrsw %2, %3<br>
- pmulhrsw %4, %3<br>
- packuswb %2, %4<br>
- movu [dstq], %2<br>
- movu %1, [srcq - 1 + 16]<br>
- pshufb %2, %1, Tm0<br>
- pmaddubsw %2, coef2<br>
- pshufb %1, %1, Tm1<br>
- pmaddubsw %1, coef2<br>
- phaddw %2, %1<br>
- pmulhrsw %2, %3<br>
- packuswb %2, %2<br>
- movh [dstq + 16], %2<br>
-%endmacro<br>
-<br>
-%macro FILTER_H4_w32 4<br>
- movu %1, [srcq - 1]<br>
- pshufb %2, %1, Tm0<br>
- pmaddubsw %2, coef2<br>
- pshufb %1, %1, Tm1<br>
- pmaddubsw %1, coef2<br>
- phaddw %2, %1<br>
- movu %1, [srcq - 1 + 8]<br>
- pshufb %4, %1, Tm0<br>
- pmaddubsw %4, coef2<br>
- pshufb %1, %1, Tm1<br>
- pmaddubsw %1, coef2<br>
- phaddw %4, %1<br>
- pmulhrsw %2, %3<br>
- pmulhrsw %4, %3<br>
- packuswb %2, %4<br>
- movu [dstq], %2<br>
- movu %1, [srcq - 1 + 16]<br>
- pshufb %2, %1, Tm0<br>
- pmaddubsw %2, coef2<br>
- pshufb %1, %1, Tm1<br>
- pmaddubsw %1, coef2<br>
- phaddw %2, %1<br>
- movu %1, [srcq - 1 + 24]<br>
- pshufb %4, %1, Tm0<br>
- pmaddubsw %4, coef2<br>
- pshufb %1, %1, Tm1<br>
- pmaddubsw %1, coef2<br>
- phaddw %4, %1<br>
- pmulhrsw %2, %3<br>
- pmulhrsw %4, %3<br>
- packuswb %2, %4<br>
- movu [dstq + 16], %2<br>
-%endmacro<br>
-<br>
-%macro FILTER_H4_w16o 5<br>
- movu %1, [srcq + %5 - 1]<br>
- pshufb %2, %1, Tm0<br>
- pmaddubsw %2, coef2<br>
- pshufb %1, %1, Tm1<br>
- pmaddubsw %1, coef2<br>
- phaddw %2, %1<br>
- movu %1, [srcq + %5 - 1 + 8]<br>
- pshufb %4, %1, Tm0<br>
- pmaddubsw %4, coef2<br>
- pshufb %1, %1, Tm1<br>
- pmaddubsw %1, coef2<br>
- phaddw %4, %1<br>
- pmulhrsw %2, %3<br>
- pmulhrsw %4, %3<br>
- packuswb %2, %4<br>
- movu [dstq + %5], %2<br>
-%endmacro<br>
-<br>
-%macro FILTER_H4_w48 4<br>
- FILTER_H4_w16o %1, %2, %3, %4, 0<br>
- FILTER_H4_w16o %1, %2, %3, %4, 16<br>
- FILTER_H4_w16o %1, %2, %3, %4, 32<br>
-%endmacro<br>
-<br>
-%macro FILTER_H4_w64 4<br>
- FILTER_H4_w16o %1, %2, %3, %4, 0<br>
- FILTER_H4_w16o %1, %2, %3, %4, 16<br>
- FILTER_H4_w16o %1, %2, %3, %4, 32<br>
- FILTER_H4_w16o %1, %2, %3, %4, 48<br>
-%endmacro<br>
-<br>
-;-----------------------------------------------------------------------------<br>
-; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-----------------------------------------------------------------------------<br>
-%macro IPFILTER_CHROMA 2<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_horiz_pp_%1x%2, 4, 6, 6, src, srcstride, dst, dststride<br>
-%define coef2 m5<br>
-%define Tm0 m4<br>
-%define Tm1 m3<br>
-%define t2 m2<br>
-%define t1 m1<br>
-%define t0 m0<br>
-<br>
-mov r4d, r4m<br>
-<br>
-%ifdef PIC<br>
-lea r5, [tab_ChromaCoeff]<br>
-movd coef2, [r5 + r4 * 4]<br>
-%else<br>
-movd coef2, [tab_ChromaCoeff + r4 * 4]<br>
-%endif<br>
-<br>
-mov r5d, %2<br>
-<br>
-pshufd coef2, coef2, 0<br>
-mova t2, [pw_512]<br>
-mova Tm0, [tab_Tm]<br>
-mova Tm1, [tab_Tm + 16]<br>
-<br>
-.loop:<br>
-FILTER_H4_w%1 t0, t1, t2<br>
-add srcq, srcstrideq<br>
-add dstq, dststrideq<br>
-<br>
-dec r5d<br>
-jnz .loop<br>
-<br>
-RET<br>
-%endmacro<br>
-<br>
-<br>
-IPFILTER_CHROMA 6, 8<br>
-IPFILTER_CHROMA 8, 2<br>
-IPFILTER_CHROMA 8, 4<br>
-IPFILTER_CHROMA 8, 6<br>
-IPFILTER_CHROMA 8, 8<br>
-IPFILTER_CHROMA 8, 16<br>
-IPFILTER_CHROMA 8, 32<br>
-IPFILTER_CHROMA 12, 16<br>
-<br>
-IPFILTER_CHROMA 6, 16<br>
-IPFILTER_CHROMA 8, 12<br>
-IPFILTER_CHROMA 8, 64<br>
-IPFILTER_CHROMA 12, 32<br>
-<br>
-;-----------------------------------------------------------------------------<br>
-; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-----------------------------------------------------------------------------<br>
-%macro IPFILTER_CHROMA_W 2<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_horiz_pp_%1x%2, 4, 6, 7, src, srcstride, dst, dststride<br>
-%define coef2 m6<br>
-%define Tm0 m5<br>
-%define Tm1 m4<br>
-%define t3 m3<br>
-%define t2 m2<br>
-%define t1 m1<br>
-%define t0 m0<br>
-<br>
-mov r4d, r4m<br>
-<br>
-%ifdef PIC<br>
-lea r5, [tab_ChromaCoeff]<br>
-movd coef2, [r5 + r4 * 4]<br>
-%else<br>
-movd coef2, [tab_ChromaCoeff + r4 * 4]<br>
-%endif<br>
-<br>
-mov r5d, %2<br>
-<br>
-pshufd coef2, coef2, 0<br>
-mova t2, [pw_512]<br>
-mova Tm0, [tab_Tm]<br>
-mova Tm1, [tab_Tm + 16]<br>
-<br>
-.loop:<br>
-FILTER_H4_w%1 t0, t1, t2, t3<br>
-add srcq, srcstrideq<br>
-add dstq, dststrideq<br>
-<br>
-dec r5d<br>
-jnz .loop<br>
-<br>
-RET<br>
-%endmacro<br>
-<br>
-IPFILTER_CHROMA_W 16, 4<br>
-IPFILTER_CHROMA_W 16, 8<br>
-IPFILTER_CHROMA_W 16, 12<br>
-IPFILTER_CHROMA_W 16, 16<br>
-IPFILTER_CHROMA_W 16, 32<br>
-IPFILTER_CHROMA_W 32, 8<br>
-IPFILTER_CHROMA_W 32, 16<br>
-IPFILTER_CHROMA_W 32, 24<br>
-IPFILTER_CHROMA_W 24, 32<br>
-IPFILTER_CHROMA_W 32, 32<br>
-<br>
-IPFILTER_CHROMA_W 16, 24<br>
-IPFILTER_CHROMA_W 16, 64<br>
-IPFILTER_CHROMA_W 32, 48<br>
-IPFILTER_CHROMA_W 24, 64<br>
-IPFILTER_CHROMA_W 32, 64<br>
-<br>
-IPFILTER_CHROMA_W 64, 64<br>
-IPFILTER_CHROMA_W 64, 32<br>
-IPFILTER_CHROMA_W 64, 48<br>
-IPFILTER_CHROMA_W 48, 64<br>
-IPFILTER_CHROMA_W 64, 16<br>
-<br>
-<br>
-%macro FILTER_H8_W8 7-8 ; t0, t1, t2, t3, coef, c512, src, dst<br>
- movu %1, %7<br>
- pshufb %2, %1, [tab_Lm + 0]<br>
- pmaddubsw %2, %5<br>
- pshufb %3, %1, [tab_Lm + 16]<br>
- pmaddubsw %3, %5<br>
- phaddw %2, %3<br>
- pshufb %4, %1, [tab_Lm + 32]<br>
- pmaddubsw %4, %5<br>
- pshufb %1, %1, [tab_Lm + 48]<br>
- pmaddubsw %1, %5<br>
- phaddw %4, %1<br>
- phaddw %2, %4<br>
- %if %0 == 8<br>
- pmulhrsw %2, %6<br>
- packuswb %2, %2<br>
- movh %8, %2<br>
- %endif<br>
-%endmacro<br>
-<br>
-%macro FILTER_H8_W4 2<br>
- movu %1, [r0 - 3 + r5]<br>
- pshufb %2, %1, [tab_Lm]<br>
- pmaddubsw %2, m3<br>
- pshufb m7, %1, [tab_Lm + 16]<br>
- pmaddubsw m7, m3<br>
- phaddw %2, m7<br>
- phaddw %2, %2<br>
-%endmacro<br>
-<br>
-;----------------------------------------------------------------------------------------------------------------------------<br>
-; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)<br>
-;----------------------------------------------------------------------------------------------------------------------------<br>
-%macro IPFILTER_LUMA 3<br>
-INIT_XMM sse4<br>
-cglobal interp_8tap_horiz_%3_%1x%2, 4,7,8<br>
-<br>
- mov r4d, r4m<br>
-<br>
-%ifdef PIC<br>
- lea r6, [tab_LumaCoeff]<br>
- movh m3, [r6 + r4 * 8]<br>
-%else<br>
- movh m3, [tab_LumaCoeff + r4 * 8]<br>
-%endif<br>
- punpcklqdq m3, m3<br>
-<br>
-%ifidn %3, pp<br>
- mova m2, [pw_512]<br>
-%else<br>
- mova m2, [pw_2000]<br>
-%endif<br>
-<br>
- mov r4d, %2<br>
-%ifidn %3, ps<br>
- add r3, r3<br>
- cmp r5m, byte 0<br>
- je .loopH<br>
- lea r6, [r1 + 2 * r1]<br>
- sub r0, r6<br>
- add r4d, 7<br>
-%endif<br>
-<br>
-.loopH:<br>
- xor r5, r5<br>
-%rep %1 / 8<br>
- %ifidn %3, pp<br>
- FILTER_H8_W8 m0, m1, m4, m5, m3, m2, [r0 - 3 + r5], [r2 + r5]<br>
- %else<br>
- FILTER_H8_W8 m0, m1, m4, m5, m3, UNUSED, [r0 - 3 + r5]<br>
- psubw m1, m2<br>
- movu [r2 + 2 * r5], m1<br>
- %endif<br>
- add r5, 8<br>
-%endrep<br>
-<br>
-%rep (%1 % 8) / 4<br>
- FILTER_H8_W4 m0, m1<br>
- %ifidn %3, pp<br>
- pmulhrsw m1, m2<br>
- packuswb m1, m1<br>
- movd [r2 + r5], m1<br>
- %else<br>
- psubw m1, m2<br>
- movh [r2 + 2 * r5], m1<br>
- %endif<br>
-%endrep<br>
-<br>
- add r0, r1<br>
- add r2, r3<br>
-<br>
- dec r4d<br>
- jnz .loopH<br>
- RET<br>
-%endmacro<br>
-<br>
-<br>
-INIT_YMM avx2<br>
-cglobal interp_8tap_horiz_pp_4x4, 4,6,6<br>
- mov r4d, r4m<br>
-<br>
-%ifdef PIC<br>
- lea r5, [tab_LumaCoeff]<br>
- vpbroadcastq m0, [r5 + r4 * 8]<br>
-%else<br>
- vpbroadcastq m0, [tab_LumaCoeff + r4 * 8]<br>
-%endif<br>
-<br>
- mova m1, [tab_Lm]<br>
- vpbroadcastd m2, [pw_1]<br>
-<br>
- ; register map<br>
- ; m0 - interpolate coeff<br>
- ; m1 - shuffle order table<br>
- ; m2 - constant word 1<br>
-<br>
- sub r0, 3<br>
- ; Row 0-1<br>
- vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]<br>
- pshufb m3, m1<br>
- pmaddubsw m3, m0<br>
- pmaddwd m3, m2<br>
- vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]<br>
- pshufb m4, m1<br>
- pmaddubsw m4, m0<br>
- pmaddwd m4, m2<br>
- phaddd m3, m4 ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A]<br>
-<br>
- ; Row 2-3<br>
- lea r0, [r0 + r1 * 2]<br>
- vbroadcasti128 m4, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]<br>
- pshufb m4, m1<br>
- pmaddubsw m4, m0<br>
- pmaddwd m4, m2<br>
- vbroadcasti128 m5, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]<br>
- pshufb m5, m1<br>
- pmaddubsw m5, m0<br>
- pmaddwd m5, m2<br>
- phaddd m4, m5 ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A]<br>
-<br>
- packssdw m3, m4 ; WORD [R3D R3C R2D R2C R1D R1C R0D R0C R3B R3A R2B R2A R1B R1A R0B R0A]<br>
- pmulhrsw m3, [pw_512]<br>
- vextracti128 xm4, m3, 1<br>
- packuswb xm3, xm4 ; BYTE [R3D R3C R2D R2C R1D R1C R0D R0C R3B R3A R2B R2A R1B R1A R0B R0A]<br>
- pshufb xm3, [idct4_shuf1] ; [row3 row1 row2 row0]<br>
-<br>
- lea r0, [r3 * 3]<br>
- movd [r2], xm3<br>
- pextrd [r2+r3], xm3, 2<br>
- pextrd [r2+r3*2], xm3, 1<br>
- pextrd [r2+r0], xm3, 3<br>
- RET<br>
-<br>
-%macro IPFILTER_LUMA_AVX2 2<br>
+;*****************************************************************************<br>
+;* Copyright (C) 2013 x265 project<br>
+;*<br>
+;* Authors: Min Chen &lt;<a href="mailto:chenm003@163.com">chenm003@163.com</a>&gt;<br>
+;* Nabajit Deka &lt;<a href="mailto:nabajit@multicorewareinc.com">nabajit@multicorewareinc.com</a>&gt;<br>
+;* Praveen Kumar Tiwari &lt;<a href="mailto:praveen@multicorewareinc.com">praveen@multicorewareinc.com</a>&gt;<br>
+;*<br>
+;* This program is free software; you can redistribute it and/or modify<br>
+;* it under the terms of the GNU General Public License as published by<br>
+;* the Free Software Foundation; either version 2 of the License, or<br>
+;* (at your option) any later version.<br>
+;*<br>
+;* This program is distributed in the hope that it will be useful,<br>
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of<br>
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the<br>
+;* GNU General Public License for more details.<br>
+;*<br>
+;* You should have received a copy of the GNU General Public License<br>
+;* along with this program; if not, write to the Free Software<br>
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.<br>
+;*<br>
+;* This program is also available under a commercial proprietary license.<br>
+;* For more information, contact us at license @ <a href="http://x265.com" target="_blank">x265.com</a>.<br>
+;*****************************************************************************/<br>
+<br>
+%include "x86inc.asm"<br>
+%include "x86util.asm"<br>
+<br>
+SECTION_RODATA 32<br>
+tab_Tm: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6<br>
+ db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10<br>
+ db 8, 9,10,11, 9,10,11,12,10,11,12,13,11,12,13, 14<br>
+<br>
+ALIGN 32<br>
+tab_Lm: db 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8<br>
+ db 2, 3, 4, 5, 6, 7, 8, 9, 3, 4, 5, 6, 7, 8, 9, 10<br>
+ db 4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12<br>
+ db 6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14<br>
+<br>
+tab_Vm: db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1<br>
+ db 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3<br>
+<br>
+tab_Cm: db 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3, 0, 2, 1, 3<br>
+<br>
+tab_c_526336: times 4 dd 8192*64+2048<br>
+<br>
+tab_ChromaCoeff: db 0, 64, 0, 0<br>
+ db -2, 58, 10, -2<br>
+ db -4, 54, 16, -2<br>
+ db -6, 46, 28, -4<br>
+ db -4, 36, 36, -4<br>
+ db -4, 28, 46, -6<br>
+ db -2, 16, 54, -4<br>
+ db -2, 10, 58, -2<br>
+<br>
+tab_ChromaCoeffV: times 4 dw 0, 64<br>
+ times 4 dw 0, 0<br>
+<br>
+ times 4 dw -2, 58<br>
+ times 4 dw 10, -2<br>
+<br>
+ times 4 dw -4, 54<br>
+ times 4 dw 16, -2<br>
+<br>
+ times 4 dw -6, 46<br>
+ times 4 dw 28, -4<br>
+<br>
+ times 4 dw -4, 36<br>
+ times 4 dw 36, -4<br>
+<br>
+ times 4 dw -4, 28<br>
+ times 4 dw 46, -6<br>
+<br>
+ times 4 dw -2, 16<br>
+ times 4 dw 54, -4<br>
+<br>
+ times 4 dw -2, 10<br>
+ times 4 dw 58, -2<br>
+<br>
+tab_LumaCoeff: db 0, 0, 0, 64, 0, 0, 0, 0<br>
+ db -1, 4, -10, 58, 17, -5, 1, 0<br>
+ db -1, 4, -11, 40, 40, -11, 4, -1<br>
+ db 0, 1, -5, 17, 58, -10, 4, -1<br>
+<br>
+tab_LumaCoeffV: times 4 dw 0, 0<br>
+ times 4 dw 0, 64<br>
+ times 4 dw 0, 0<br>
+ times 4 dw 0, 0<br>
+<br>
+ times 4 dw -1, 4<br>
+ times 4 dw -10, 58<br>
+ times 4 dw 17, -5<br>
+ times 4 dw 1, 0<br>
+<br>
+ times 4 dw -1, 4<br>
+ times 4 dw -11, 40<br>
+ times 4 dw 40, -11<br>
+ times 4 dw 4, -1<br>
+<br>
+ times 4 dw 0, 1<br>
+ times 4 dw -5, 17<br>
+ times 4 dw 58, -10<br>
+ times 4 dw 4, -1<br>
+<br>
+tab_LumaCoeffVer: times 8 db 0, 0<br>
+ times 8 db 0, 64<br>
+ times 8 db 0, 0<br>
+ times 8 db 0, 0<br>
+<br>
+ times 8 db -1, 4<br>
+ times 8 db -10, 58<br>
+ times 8 db 17, -5<br>
+ times 8 db 1, 0<br>
+<br>
+ times 8 db -1, 4<br>
+ times 8 db -11, 40<br>
+ times 8 db 40, -11<br>
+ times 8 db 4, -1<br>
+<br>
+ times 8 db 0, 1<br>
+ times 8 db -5, 17<br>
+ times 8 db 58, -10<br>
+ times 8 db 4, -1<br>
+<br>
+tab_c_64_n64: times 8 db 64, -64<br>
+<br>
+SECTION .text<br>
+<br>
+cextern idct4_shuf1<br>
+cextern pb_128<br>
+cextern pw_1<br>
+cextern pw_512<br>
+cextern pw_2000<br>
+<br>
+%macro FILTER_H4_w2_2 3<br>
+ movh %2, [srcq - 1]<br>
+ pshufb %2, %2, Tm0<br>
+ movh %1, [srcq + srcstrideq - 1]<br>
+ pshufb %1, %1, Tm0<br>
+ punpcklqdq %2, %1<br>
+ pmaddubsw %2, coef2<br>
+ phaddw %2, %2<br>
+ pmulhrsw %2, %3<br>
+ packuswb %2, %2<br>
+ movd r4, %2<br>
+ mov [dstq], r4w<br>
+ shr r4, 16<br>
+ mov [dstq + dststrideq], r4w<br>
+%endmacro<br>
+<br>
+;-----------------------------------------------------------------------------<br>
+; void interp_4tap_horiz_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-----------------------------------------------------------------------------<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_horiz_pp_2x4, 4, 6, 5, src, srcstride, dst, dststride<br>
+%define coef2 m4<br>
+%define Tm0 m3<br>
+%define t2 m2<br>
+%define t1 m1<br>
+%define t0 m0<br>
+<br>
+mov r4d, r4m<br>
+<br>
+%ifdef PIC<br>
+lea r5, [tab_ChromaCoeff]<br>
+movd coef2, [r5 + r4 * 4]<br>
+%else<br>
+movd coef2, [tab_ChromaCoeff + r4 * 4]<br>
+%endif<br>
+<br>
+pshufd coef2, coef2, 0<br>
+mova t2, [pw_512]<br>
+mova Tm0, [tab_Tm]<br>
+<br>
+%rep 2<br>
+FILTER_H4_w2_2 t0, t1, t2<br>
+lea srcq, [srcq + srcstrideq * 2]<br>
+lea dstq, [dstq + dststrideq * 2]<br>
+%endrep<br>
+<br>
+RET<br>
+<br>
+;-----------------------------------------------------------------------------<br>
+; void interp_4tap_horiz_pp_2x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-----------------------------------------------------------------------------<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_horiz_pp_2x8, 4, 6, 5, src, srcstride, dst, dststride<br>
+%define coef2 m4<br>
+%define Tm0 m3<br>
+%define t2 m2<br>
+%define t1 m1<br>
+%define t0 m0<br>
+<br>
+mov r4d, r4m<br>
+<br>
+%ifdef PIC<br>
+lea r5, [tab_ChromaCoeff]<br>
+movd coef2, [r5 + r4 * 4]<br>
+%else<br>
+movd coef2, [tab_ChromaCoeff + r4 * 4]<br>
+%endif<br>
+<br>
+pshufd coef2, coef2, 0<br>
+mova t2, [pw_512]<br>
+mova Tm0, [tab_Tm]<br>
+<br>
+%rep 4<br>
+FILTER_H4_w2_2 t0, t1, t2<br>
+lea srcq, [srcq + srcstrideq * 2]<br>
+lea dstq, [dstq + dststrideq * 2]<br>
+%endrep<br>
+<br>
+RET<br>
+<br>
+;-----------------------------------------------------------------------------<br>
+; void interp_4tap_horiz_pp_2x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-----------------------------------------------------------------------------<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_horiz_pp_2x16, 4, 6, 5, src, srcstride, dst, dststride<br>
+%define coef2 m4<br>
+%define Tm0 m3<br>
+%define t2 m2<br>
+%define t1 m1<br>
+%define t0 m0<br>
+<br>
+mov r4d, r4m<br>
+<br>
+%ifdef PIC<br>
+lea r5, [tab_ChromaCoeff]<br>
+movd coef2, [r5 + r4 * 4]<br>
+%else<br>
+movd coef2, [tab_ChromaCoeff + r4 * 4]<br>
+%endif<br>
+<br>
+pshufd coef2, coef2, 0<br>
+mova t2, [pw_512]<br>
+mova Tm0, [tab_Tm]<br>
+<br>
+mov r5d, 16/2<br>
+<br>
+.loop:<br>
+FILTER_H4_w2_2 t0, t1, t2<br>
+lea srcq, [srcq + srcstrideq * 2]<br>
+lea dstq, [dstq + dststrideq * 2]<br>
+dec r5d<br>
+jnz .loop<br>
+<br>
+RET<br>
+<br>
+%macro FILTER_H4_w4_2 3<br>
+ movh %2, [srcq - 1]<br>
+ pshufb %2, %2, Tm0<br>
+ pmaddubsw %2, coef2<br>
+ movh %1, [srcq + srcstrideq - 1]<br>
+ pshufb %1, %1, Tm0<br>
+ pmaddubsw %1, coef2<br>
+ phaddw %2, %1<br>
+ pmulhrsw %2, %3<br>
+ packuswb %2, %2<br>
+ movd [dstq], %2<br>
+ palignr %2, %2, 4<br>
+ movd [dstq + dststrideq], %2<br>
+%endmacro<br>
+<br>
+;-----------------------------------------------------------------------------<br>
+; void interp_4tap_horiz_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-----------------------------------------------------------------------------<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_horiz_pp_4x2, 4, 6, 5, src, srcstride, dst, dststride<br>
+%define coef2 m4<br>
+%define Tm0 m3<br>
+%define t2 m2<br>
+%define t1 m1<br>
+%define t0 m0<br>
+<br>
+mov r4d, r4m<br>
+<br>
+%ifdef PIC<br>
+lea r5, [tab_ChromaCoeff]<br>
+movd coef2, [r5 + r4 * 4]<br>
+%else<br>
+movd coef2, [tab_ChromaCoeff + r4 * 4]<br>
+%endif<br>
+<br>
+pshufd coef2, coef2, 0<br>
+mova t2, [pw_512]<br>
+mova Tm0, [tab_Tm]<br>
+<br>
+FILTER_H4_w4_2 t0, t1, t2<br>
+<br>
+RET<br>
+<br>
+;-----------------------------------------------------------------------------<br>
+; void interp_4tap_horiz_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-----------------------------------------------------------------------------<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_horiz_pp_4x4, 4, 6, 5, src, srcstride, dst, dststride<br>
+%define coef2 m4<br>
+%define Tm0 m3<br>
+%define t2 m2<br>
+%define t1 m1<br>
+%define t0 m0<br>
+<br>
+mov r4d, r4m<br>
+<br>
+%ifdef PIC<br>
+lea r5, [tab_ChromaCoeff]<br>
+movd coef2, [r5 + r4 * 4]<br>
+%else<br>
+movd coef2, [tab_ChromaCoeff + r4 * 4]<br>
+%endif<br>
+<br>
+pshufd coef2, coef2, 0<br>
+mova t2, [pw_512]<br>
+mova Tm0, [tab_Tm]<br>
+<br>
+%rep 2<br>
+FILTER_H4_w4_2 t0, t1, t2<br>
+lea srcq, [srcq + srcstrideq * 2]<br>
+lea dstq, [dstq + dststrideq * 2]<br>
+%endrep<br>
+<br>
+RET<br>
+<br>
+;-----------------------------------------------------------------------------<br>
+; void interp_4tap_horiz_pp_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-----------------------------------------------------------------------------<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_horiz_pp_4x8, 4, 6, 5, src, srcstride, dst, dststride<br>
+%define coef2 m4<br>
+%define Tm0 m3<br>
+%define t2 m2<br>
+%define t1 m1<br>
+%define t0 m0<br>
+<br>
+mov r4d, r4m<br>
+<br>
+%ifdef PIC<br>
+lea r5, [tab_ChromaCoeff]<br>
+movd coef2, [r5 + r4 * 4]<br>
+%else<br>
+movd coef2, [tab_ChromaCoeff + r4 * 4]<br>
+%endif<br>
+<br>
+pshufd coef2, coef2, 0<br>
+mova t2, [pw_512]<br>
+mova Tm0, [tab_Tm]<br>
+<br>
+%rep 4<br>
+FILTER_H4_w4_2 t0, t1, t2<br>
+lea srcq, [srcq + srcstrideq * 2]<br>
+lea dstq, [dstq + dststrideq * 2]<br>
+%endrep<br>
+<br>
+RET<br>
+<br>
+;-----------------------------------------------------------------------------<br>
+; void interp_4tap_horiz_pp_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-----------------------------------------------------------------------------<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_horiz_pp_4x16, 4, 6, 5, src, srcstride, dst, dststride<br>
+%define coef2 m4<br>
+%define Tm0 m3<br>
+%define t2 m2<br>
+%define t1 m1<br>
+%define t0 m0<br>
+<br>
+mov r4d, r4m<br>
+<br>
+%ifdef PIC<br>
+lea r5, [tab_ChromaCoeff]<br>
+movd coef2, [r5 + r4 * 4]<br>
+%else<br>
+movd coef2, [tab_ChromaCoeff + r4 * 4]<br>
+%endif<br>
+<br>
+pshufd coef2, coef2, 0<br>
+mova t2, [pw_512]<br>
+mova Tm0, [tab_Tm]<br>
+<br>
+%rep 8<br>
+FILTER_H4_w4_2 t0, t1, t2<br>
+lea srcq, [srcq + srcstrideq * 2]<br>
+lea dstq, [dstq + dststrideq * 2]<br>
+%endrep<br>
+<br>
+RET<br>
+<br>
+;-----------------------------------------------------------------------------<br>
+; void interp_4tap_horiz_pp_4x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-----------------------------------------------------------------------------<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_horiz_pp_4x32, 4, 6, 5, src, srcstride, dst, dststride<br>
+%define coef2 m4<br>
+%define Tm0 m3<br>
+%define t2 m2<br>
+%define t1 m1<br>
+%define t0 m0<br>
+<br>
+mov r4d, r4m<br>
+<br>
+%ifdef PIC<br>
+lea r5, [tab_ChromaCoeff]<br>
+movd coef2, [r5 + r4 * 4]<br>
+%else<br>
+movd coef2, [tab_ChromaCoeff + r4 * 4]<br>
+%endif<br>
+<br>
+pshufd coef2, coef2, 0<br>
+mova t2, [pw_512]<br>
+mova Tm0, [tab_Tm]<br>
+<br>
+mov r5d, 32/2<br>
+<br>
+.loop:<br>
+FILTER_H4_w4_2 t0, t1, t2<br>
+lea srcq, [srcq + srcstrideq * 2]<br>
+lea dstq, [dstq + dststrideq * 2]<br>
+dec r5d<br>
+jnz .loop<br>
+<br>
+RET<br>
+<br>
+<br>
+%macro FILTER_H4_w6 3<br>
+ movu %1, [srcq - 1]<br>
+ pshufb %2, %1, Tm0<br>
+ pmaddubsw %2, coef2<br>
+ pshufb %1, %1, Tm1<br>
+ pmaddubsw %1, coef2<br>
+ phaddw %2, %1<br>
+ pmulhrsw %2, %3<br>
+ packuswb %2, %2<br>
+ movd [dstq], %2<br>
+ pextrw [dstq + 4], %2, 2<br>
+%endmacro<br>
+<br>
+%macro FILTER_H4_w8 3<br>
+ movu %1, [srcq - 1]<br>
+ pshufb %2, %1, Tm0<br>
+ pmaddubsw %2, coef2<br>
+ pshufb %1, %1, Tm1<br>
+ pmaddubsw %1, coef2<br>
+ phaddw %2, %1<br>
+ pmulhrsw %2, %3<br>
+ packuswb %2, %2<br>
+ movh [dstq], %2<br>
+%endmacro<br>
+<br>
+%macro FILTER_H4_w12 3<br>
+ movu %1, [srcq - 1]<br>
+ pshufb %2, %1, Tm0<br>
+ pmaddubsw %2, coef2<br>
+ pshufb %1, %1, Tm1<br>
+ pmaddubsw %1, coef2<br>
+ phaddw %2, %1<br>
+ pmulhrsw %2, %3<br>
+ movu %1, [srcq - 1 + 8]<br>
+ pshufb %1, %1, Tm0<br>
+ pmaddubsw %1, coef2<br>
+ phaddw %1, %1<br>
+ pmulhrsw %1, %3<br>
+ packuswb %2, %1<br>
+ movh [dstq], %2<br>
+ pextrd [dstq + 8], %2, 2<br>
+%endmacro<br>
+<br>
+%macro FILTER_H4_w16 4<br>
+ movu %1, [srcq - 1]<br>
+ pshufb %2, %1, Tm0<br>
+ pmaddubsw %2, coef2<br>
+ pshufb %1, %1, Tm1<br>
+ pmaddubsw %1, coef2<br>
+ phaddw %2, %1<br>
+ movu %1, [srcq - 1 + 8]<br>
+ pshufb %4, %1, Tm0<br>
+ pmaddubsw %4, coef2<br>
+ pshufb %1, %1, Tm1<br>
+ pmaddubsw %1, coef2<br>
+ phaddw %4, %1<br>
+ pmulhrsw %2, %3<br>
+ pmulhrsw %4, %3<br>
+ packuswb %2, %4<br>
+ movu [dstq], %2<br>
+%endmacro<br>
+<br>
+%macro FILTER_H4_w24 4<br>
+ movu %1, [srcq - 1]<br>
+ pshufb %2, %1, Tm0<br>
+ pmaddubsw %2, coef2<br>
+ pshufb %1, %1, Tm1<br>
+ pmaddubsw %1, coef2<br>
+ phaddw %2, %1<br>
+ movu %1, [srcq - 1 + 8]<br>
+ pshufb %4, %1, Tm0<br>
+ pmaddubsw %4, coef2<br>
+ pshufb %1, %1, Tm1<br>
+ pmaddubsw %1, coef2<br>
+ phaddw %4, %1<br>
+ pmulhrsw %2, %3<br>
+ pmulhrsw %4, %3<br>
+ packuswb %2, %4<br>
+ movu [dstq], %2<br>
+ movu %1, [srcq - 1 + 16]<br>
+ pshufb %2, %1, Tm0<br>
+ pmaddubsw %2, coef2<br>
+ pshufb %1, %1, Tm1<br>
+ pmaddubsw %1, coef2<br>
+ phaddw %2, %1<br>
+ pmulhrsw %2, %3<br>
+ packuswb %2, %2<br>
+ movh [dstq + 16], %2<br>
+%endmacro<br>
+<br>
+%macro FILTER_H4_w32 4<br>
+ movu %1, [srcq - 1]<br>
+ pshufb %2, %1, Tm0<br>
+ pmaddubsw %2, coef2<br>
+ pshufb %1, %1, Tm1<br>
+ pmaddubsw %1, coef2<br>
+ phaddw %2, %1<br>
+ movu %1, [srcq - 1 + 8]<br>
+ pshufb %4, %1, Tm0<br>
+ pmaddubsw %4, coef2<br>
+ pshufb %1, %1, Tm1<br>
+ pmaddubsw %1, coef2<br>
+ phaddw %4, %1<br>
+ pmulhrsw %2, %3<br>
+ pmulhrsw %4, %3<br>
+ packuswb %2, %4<br>
+ movu [dstq], %2<br>
+ movu %1, [srcq - 1 + 16]<br>
+ pshufb %2, %1, Tm0<br>
+ pmaddubsw %2, coef2<br>
+ pshufb %1, %1, Tm1<br>
+ pmaddubsw %1, coef2<br>
+ phaddw %2, %1<br>
+ movu %1, [srcq - 1 + 24]<br>
+ pshufb %4, %1, Tm0<br>
+ pmaddubsw %4, coef2<br>
+ pshufb %1, %1, Tm1<br>
+ pmaddubsw %1, coef2<br>
+ phaddw %4, %1<br>
+ pmulhrsw %2, %3<br>
+ pmulhrsw %4, %3<br>
+ packuswb %2, %4<br>
+ movu [dstq + 16], %2<br>
+%endmacro<br>
+<br>
+%macro FILTER_H4_w16o 5<br>
+ movu %1, [srcq + %5 - 1]<br>
+ pshufb %2, %1, Tm0<br>
+ pmaddubsw %2, coef2<br>
+ pshufb %1, %1, Tm1<br>
+ pmaddubsw %1, coef2<br>
+ phaddw %2, %1<br>
+ movu %1, [srcq + %5 - 1 + 8]<br>
+ pshufb %4, %1, Tm0<br>
+ pmaddubsw %4, coef2<br>
+ pshufb %1, %1, Tm1<br>
+ pmaddubsw %1, coef2<br>
+ phaddw %4, %1<br>
+ pmulhrsw %2, %3<br>
+ pmulhrsw %4, %3<br>
+ packuswb %2, %4<br>
+ movu [dstq + %5], %2<br>
+%endmacro<br>
+<br>
+%macro FILTER_H4_w48 4<br>
+ FILTER_H4_w16o %1, %2, %3, %4, 0<br>
+ FILTER_H4_w16o %1, %2, %3, %4, 16<br>
+ FILTER_H4_w16o %1, %2, %3, %4, 32<br>
+%endmacro<br>
+<br>
+%macro FILTER_H4_w64 4<br>
+ FILTER_H4_w16o %1, %2, %3, %4, 0<br>
+ FILTER_H4_w16o %1, %2, %3, %4, 16<br>
+ FILTER_H4_w16o %1, %2, %3, %4, 32<br>
+ FILTER_H4_w16o %1, %2, %3, %4, 48<br>
+%endmacro<br>
+<br>
+;-----------------------------------------------------------------------------<br>
+; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-----------------------------------------------------------------------------<br>
+%macro IPFILTER_CHROMA 2<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_horiz_pp_%1x%2, 4, 6, 6, src, srcstride, dst, dststride<br>
+%define coef2 m5<br>
+%define Tm0 m4<br>
+%define Tm1 m3<br>
+%define t2 m2<br>
+%define t1 m1<br>
+%define t0 m0<br>
+<br>
+mov r4d, r4m<br>
+<br>
+%ifdef PIC<br>
+lea r5, [tab_ChromaCoeff]<br>
+movd coef2, [r5 + r4 * 4]<br>
+%else<br>
+movd coef2, [tab_ChromaCoeff + r4 * 4]<br>
+%endif<br>
+<br>
+mov r5d, %2<br>
+<br>
+pshufd coef2, coef2, 0<br>
+mova t2, [pw_512]<br>
+mova Tm0, [tab_Tm]<br>
+mova Tm1, [tab_Tm + 16]<br>
+<br>
+.loop:<br>
+FILTER_H4_w%1 t0, t1, t2<br>
+add srcq, srcstrideq<br>
+add dstq, dststrideq<br>
+<br>
+dec r5d<br>
+jnz .loop<br>
+<br>
+RET<br>
+%endmacro<br>
+<br>
+<br>
+IPFILTER_CHROMA 6, 8<br>
+IPFILTER_CHROMA 8, 2<br>
+IPFILTER_CHROMA 8, 4<br>
+IPFILTER_CHROMA 8, 6<br>
+IPFILTER_CHROMA 8, 8<br>
+IPFILTER_CHROMA 8, 16<br>
+IPFILTER_CHROMA 8, 32<br>
+IPFILTER_CHROMA 12, 16<br>
+<br>
+IPFILTER_CHROMA 6, 16<br>
+IPFILTER_CHROMA 8, 12<br>
+IPFILTER_CHROMA 8, 64<br>
+IPFILTER_CHROMA 12, 32<br>
+<br>
+;-----------------------------------------------------------------------------<br>
+; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-----------------------------------------------------------------------------<br>
+%macro IPFILTER_CHROMA_W 2<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_horiz_pp_%1x%2, 4, 6, 7, src, srcstride, dst, dststride<br>
+%define coef2 m6<br>
+%define Tm0 m5<br>
+%define Tm1 m4<br>
+%define t3 m3<br>
+%define t2 m2<br>
+%define t1 m1<br>
+%define t0 m0<br>
+<br>
+mov r4d, r4m<br>
+<br>
+%ifdef PIC<br>
+lea r5, [tab_ChromaCoeff]<br>
+movd coef2, [r5 + r4 * 4]<br>
+%else<br>
+movd coef2, [tab_ChromaCoeff + r4 * 4]<br>
+%endif<br>
+<br>
+mov r5d, %2<br>
+<br>
+pshufd coef2, coef2, 0<br>
+mova t2, [pw_512]<br>
+mova Tm0, [tab_Tm]<br>
+mova Tm1, [tab_Tm + 16]<br>
+<br>
+.loop:<br>
+FILTER_H4_w%1 t0, t1, t2, t3<br>
+add srcq, srcstrideq<br>
+add dstq, dststrideq<br>
+<br>
+dec r5d<br>
+jnz .loop<br>
+<br>
+RET<br>
+%endmacro<br>
+<br>
+IPFILTER_CHROMA_W 16, 4<br>
+IPFILTER_CHROMA_W 16, 8<br>
+IPFILTER_CHROMA_W 16, 12<br>
+IPFILTER_CHROMA_W 16, 16<br>
+IPFILTER_CHROMA_W 16, 32<br>
+IPFILTER_CHROMA_W 32, 8<br>
+IPFILTER_CHROMA_W 32, 16<br>
+IPFILTER_CHROMA_W 32, 24<br>
+IPFILTER_CHROMA_W 24, 32<br>
+IPFILTER_CHROMA_W 32, 32<br>
+<br>
+IPFILTER_CHROMA_W 16, 24<br>
+IPFILTER_CHROMA_W 16, 64<br>
+IPFILTER_CHROMA_W 32, 48<br>
+IPFILTER_CHROMA_W 24, 64<br>
+IPFILTER_CHROMA_W 32, 64<br>
+<br>
+IPFILTER_CHROMA_W 64, 64<br>
+IPFILTER_CHROMA_W 64, 32<br>
+IPFILTER_CHROMA_W 64, 48<br>
+IPFILTER_CHROMA_W 48, 64<br>
+IPFILTER_CHROMA_W 64, 16<br>
+<br>
+<br>
+%macro FILTER_H8_W8 7-8 ; t0, t1, t2, t3, coef, c512, src, dst<br>
+ movu %1, %7<br>
+ pshufb %2, %1, [tab_Lm + 0]<br>
+ pmaddubsw %2, %5<br>
+ pshufb %3, %1, [tab_Lm + 16]<br>
+ pmaddubsw %3, %5<br>
+ phaddw %2, %3<br>
+ pshufb %4, %1, [tab_Lm + 32]<br>
+ pmaddubsw %4, %5<br>
+ pshufb %1, %1, [tab_Lm + 48]<br>
+ pmaddubsw %1, %5<br>
+ phaddw %4, %1<br>
+ phaddw %2, %4<br>
+ %if %0 == 8<br>
+ pmulhrsw %2, %6<br>
+ packuswb %2, %2<br>
+ movh %8, %2<br>
+ %endif<br>
+%endmacro<br>
+<br>
+%macro FILTER_H8_W4 2<br>
+ movu %1, [r0 - 3 + r5]<br>
+ pshufb %2, %1, [tab_Lm]<br>
+ pmaddubsw %2, m3<br>
+ pshufb m7, %1, [tab_Lm + 16]<br>
+ pmaddubsw m7, m3<br>
+ phaddw %2, m7<br>
+ phaddw %2, %2<br>
+%endmacro<br>
+<br>
+;----------------------------------------------------------------------------------------------------------------------------<br>
+; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)<br>
+;----------------------------------------------------------------------------------------------------------------------------<br>
+%macro IPFILTER_LUMA 3<br>
+INIT_XMM sse4<br>
+cglobal interp_8tap_horiz_%3_%1x%2, 4,7,8<br>
+<br>
+ mov r4d, r4m<br>
+<br>
+%ifdef PIC<br>
+ lea r6, [tab_LumaCoeff]<br>
+ movh m3, [r6 + r4 * 8]<br>
+%else<br>
+ movh m3, [tab_LumaCoeff + r4 * 8]<br>
+%endif<br>
+ punpcklqdq m3, m3<br>
+<br>
+%ifidn %3, pp<br>
+ mova m2, [pw_512]<br>
+%else<br>
+ mova m2, [pw_2000]<br>
+%endif<br>
+<br>
+ mov r4d, %2<br>
+%ifidn %3, ps<br>
+ add r3, r3<br>
+ cmp r5m, byte 0<br>
+ je .loopH<br>
+ lea r6, [r1 + 2 * r1]<br>
+ sub r0, r6<br>
+ add r4d, 7<br>
+%endif<br>
+<br>
+.loopH:<br>
+ xor r5, r5<br>
+%rep %1 / 8<br>
+ %ifidn %3, pp<br>
+ FILTER_H8_W8 m0, m1, m4, m5, m3, m2, [r0 - 3 + r5], [r2 + r5]<br>
+ %else<br>
+ FILTER_H8_W8 m0, m1, m4, m5, m3, UNUSED, [r0 - 3 + r5]<br>
+ psubw m1, m2<br>
+ movu [r2 + 2 * r5], m1<br>
+ %endif<br>
+ add r5, 8<br>
+%endrep<br>
+<br>
+%rep (%1 % 8) / 4<br>
+ FILTER_H8_W4 m0, m1<br>
+ %ifidn %3, pp<br>
+ pmulhrsw m1, m2<br>
+ packuswb m1, m1<br>
+ movd [r2 + r5], m1<br>
+ %else<br>
+ psubw m1, m2<br>
+ movh [r2 + 2 * r5], m1<br>
+ %endif<br>
+%endrep<br>
+<br>
+ add r0, r1<br>
+ add r2, r3<br>
+<br>
+ dec r4d<br>
+ jnz .loopH<br>
+ RET<br>
+%endmacro<br>
+<br>
+<br>
INIT_YMM avx2<br>
-cglobal interp_8tap_horiz_pp_%1x%2, 4,6,6<br>
+cglobal interp_8tap_horiz_pp_4x4, 4,6,6<br>
mov r4d, r4m<br>
<br>
%ifdef PIC<br>
@@ -867,6 +808,63 @@<br>
%endif<br>
<br>
mova m1, [tab_Lm]<br>
+ vpbroadcastd m2, [pw_1]<br>
+<br>
+ ; register map<br>
+ ; m0 - interpolate coeff<br>
+ ; m1 - shuffle order table<br>
+ ; m2 - constant word 1<br>
+<br>
+ sub r0, 3<br>
+ ; Row 0-1<br>
+ vbroadcasti128 m3, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]<br>
+ pshufb m3, m1<br>
+ pmaddubsw m3, m0<br>
+ pmaddwd m3, m2<br>
+ vbroadcasti128 m4, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]<br>
+ pshufb m4, m1<br>
+ pmaddubsw m4, m0<br>
+ pmaddwd m4, m2<br>
+ phaddd m3, m4 ; DWORD [R1D R1C R0D R0C R1B R1A R0B R0A]<br>
+<br>
+ ; Row 2-3<br>
+ lea r0, [r0 + r1 * 2]<br>
+ vbroadcasti128 m4, [r0] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]<br>
+ pshufb m4, m1<br>
+ pmaddubsw m4, m0<br>
+ pmaddwd m4, m2<br>
+ vbroadcasti128 m5, [r0 + r1] ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]<br>
+ pshufb m5, m1<br>
+ pmaddubsw m5, m0<br>
+ pmaddwd m5, m2<br>
+ phaddd m4, m5 ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A]<br>
+<br>
+ packssdw m3, m4 ; WORD [R3D R3C R2D R2C R1D R1C R0D R0C R3B R3A R2B R2A R1B R1A R0B R0A]<br>
+ pmulhrsw m3, [pw_512]<br>
+ vextracti128 xm4, m3, 1<br>
+ packuswb xm3, xm4 ; BYTE [R3D R3C R2D R2C R1D R1C R0D R0C R3B R3A R2B R2A R1B R1A R0B R0A]<br>
+ pshufb xm3, [idct4_shuf1] ; [row3 row1 row2 row0]<br>
+<br>
+ lea r0, [r3 * 3]<br>
+ movd [r2], xm3<br>
+ pextrd [r2+r3], xm3, 2<br>
+ pextrd [r2+r3*2], xm3, 1<br>
+ pextrd [r2+r0], xm3, 3<br>
+ RET<br>
+<br>
+%macro IPFILTER_LUMA_AVX2 2<br>
+INIT_YMM avx2<br>
+cglobal interp_8tap_horiz_pp_%1x%2, 4,6,7<br>
+ mov r4d, r4m<br>
+<br>
+%ifdef PIC<br>
+ lea r5, [tab_LumaCoeff]<br>
+ vpbroadcastq m0, [r5 + r4 * 8]<br>
+%else<br>
+ vpbroadcastq m0, [tab_LumaCoeff + r4 * 8]<br>
+%endif<br>
+<br>
+ mova m1, [tab_Lm]<br>
mova m2, [tab_Lm + 32]<br>
<br>
; register map<br>
@@ -874,7 +872,7 @@<br>
; m1, m2 - shuffle order table<br>
<br>
sub r0, 3<br>
- mov r4, %2 / 2<br>
+ mov r4d, %2 / 4<br>
.loop:<br>
; Row 0<br>
vbroadcasti128 m3, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]<br>
@@ -893,4844 +891,4870 @@<br>
<br>
phaddw m3, m4 ; WORD [R1H R1G R1D R1C R0H R0G R0D R0C R1F R1E R1B R1A R0F R0E R0B R0A]<br>
pmulhrsw m3, [pw_512]<br>
+<br>
+ ; Row 2<br>
+ lea r0, [r0 + r1 * 2]<br>
+ vbroadcasti128 m4, [r0] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]<br>
+ pshufb m5, m4, m2<br>
+ pshufb m4, m1<br>
+ pmaddubsw m4, m0<br>
+ pmaddubsw m5, m0<br>
+ phaddw m4, m5<br>
+ ; Row 3<br>
+ vbroadcasti128 m5, [r0 + r1] ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]<br>
+ pshufb m6, m5, m2<br>
+ pshufb m5, m1<br>
+ pmaddubsw m5, m0<br>
+ pmaddubsw m6, m0<br>
+ phaddw m5, m6<br>
+<br>
+ phaddw m4, m5 ; WORD [R3H R3G R3D R3C R2H R2G R2D R2C R3F R3E R3B R3A R2F R2E R2B R2A]<br>
+ pmulhrsw m4, [pw_512]<br>
+<br>
+ packuswb m3, m4<br>
vextracti128 xm4, m3, 1<br>
- packuswb xm3, xm4<br>
- pshufb xm3, [shuf1]<br>
-<br>
- movq [r2], xm3<br>
- movhps [r2 + r3], xm3<br>
-<br>
+ punpcklwd xm5, xm3, xm4<br>
+<br>
+ movq [r2], xm5<br>
+ movhps [r2 + r3], xm5<br>
+<br>
+ punpckhwd xm5, xm3, xm4<br>
lea r2, [r2 + r3 * 2]<br>
+<br>
+ movq [r2], xm5<br>
+ movhps [r2 + r3], xm5<br>
+<br>
lea r0, [r0 + r1 * 2]<br>
- dec r4<br>
+ lea r2, [r2 + r3 * 2]<br>
+ dec r4d<br>
jnz .loop<br>
RET<br>
%endmacro<br>
-<br>
-;--------------------------------------------------------------------------------------------------------------<br>
-; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;--------------------------------------------------------------------------------------------------------------<br>
- IPFILTER_LUMA 4, 4, pp<br>
- IPFILTER_LUMA 4, 8, pp<br>
- IPFILTER_LUMA 12, 16, pp<br>
- IPFILTER_LUMA 4, 16, pp<br>
+<br>
+;--------------------------------------------------------------------------------------------------------------<br>
+; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;--------------------------------------------------------------------------------------------------------------<br>
+ IPFILTER_LUMA 4, 4, pp<br>
+ IPFILTER_LUMA 4, 8, pp<br>
+ IPFILTER_LUMA 12, 16, pp<br>
+ IPFILTER_LUMA 4, 16, pp<br>
IPFILTER_LUMA_AVX2 8, 8<br>
IPFILTER_LUMA_AVX2 8, 16<br>
IPFILTER_LUMA_AVX2 8, 32<br>
-<br>
-;--------------------------------------------------------------------------------------------------------------<br>
-; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;--------------------------------------------------------------------------------------------------------------<br>
-%macro IPFILTER_LUMA_PP_W8 2<br>
-INIT_XMM sse4<br>
-cglobal interp_8tap_horiz_pp_%1x%2, 4,6,7<br>
- mov r4d, r4m<br>
-<br>
-%ifdef PIC<br>
- lea r5, [tab_LumaCoeff]<br>
- movh m3, [r5 + r4 * 8]<br>
-%else<br>
- movh m3, [tab_LumaCoeff + r4 * 8]<br>
-%endif<br>
- pshufd m0, m3, 0 ; m0 = coeff-L<br>
- pshufd m1, m3, 0x55 ; m1 = coeff-H<br>
- lea r5, [tab_Tm] ; r5 = shuffle<br>
- mova m2, [pw_512] ; m2 = 512<br>
-<br>
- mov r4d, %2<br>
-.loopH:<br>
-%assign x 0<br>
-%rep %1 / 8<br>
- movu m3, [r0 - 3 + x] ; m3 = [F E D C B A 9 8 7 6 5 4 3 2 1 0]<br>
- pshufb m4, m3, [r5 + 0*16] ; m4 = [6 5 4 3 5 4 3 2 4 3 2 1 3 2 1 0]<br>
- pshufb m5, m3, [r5 + 1*16] ; m5 = [A 9 8 7 9 8 7 6 8 7 6 5 7 6 5 4]<br>
- pshufb m3, [r5 + 2*16] ; m3 = [E D C B D C B A C B A 9 B A 9 8]<br>
- pmaddubsw m4, m0<br>
- pmaddubsw m6, m5, m1<br>
- pmaddubsw m5, m0<br>
- pmaddubsw m3, m1<br>
- paddw m4, m6<br>
- paddw m5, m3<br>
- phaddw m4, m5<br>
- pmulhrsw m4, m2<br>
- packuswb m4, m4<br>
- movh [r2 + x], m4<br>
-%assign x x+8<br>
-%endrep<br>
-<br>
- add r0, r1<br>
- add r2, r3<br>
-<br>
- dec r4d<br>
- jnz .loopH<br>
- RET<br>
-%endmacro<br>
-<br>
-IPFILTER_LUMA_PP_W8 8, 4<br>
-IPFILTER_LUMA_PP_W8 8, 8<br>
-IPFILTER_LUMA_PP_W8 8, 16<br>
-IPFILTER_LUMA_PP_W8 8, 32<br>
-IPFILTER_LUMA_PP_W8 16, 4<br>
-IPFILTER_LUMA_PP_W8 16, 8<br>
-IPFILTER_LUMA_PP_W8 16, 12<br>
-IPFILTER_LUMA_PP_W8 16, 16<br>
-IPFILTER_LUMA_PP_W8 16, 32<br>
-IPFILTER_LUMA_PP_W8 16, 64<br>
-IPFILTER_LUMA_PP_W8 24, 32<br>
-IPFILTER_LUMA_PP_W8 32, 8<br>
-IPFILTER_LUMA_PP_W8 32, 16<br>
-IPFILTER_LUMA_PP_W8 32, 24<br>
-IPFILTER_LUMA_PP_W8 32, 32<br>
-IPFILTER_LUMA_PP_W8 32, 64<br>
-IPFILTER_LUMA_PP_W8 48, 64<br>
-IPFILTER_LUMA_PP_W8 64, 16<br>
-IPFILTER_LUMA_PP_W8 64, 32<br>
-IPFILTER_LUMA_PP_W8 64, 48<br>
-IPFILTER_LUMA_PP_W8 64, 64<br>
-<br>
-;----------------------------------------------------------------------------------------------------------------------------<br>
-; void interp_8tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)<br>
-;----------------------------------------------------------------------------------------------------------------------------<br>
- IPFILTER_LUMA 4, 4, ps<br>
- IPFILTER_LUMA 8, 8, ps<br>
- IPFILTER_LUMA 8, 4, ps<br>
- IPFILTER_LUMA 4, 8, ps<br>
- IPFILTER_LUMA 16, 16, ps<br>
- IPFILTER_LUMA 16, 8, ps<br>
- IPFILTER_LUMA 8, 16, ps<br>
- IPFILTER_LUMA 16, 12, ps<br>
- IPFILTER_LUMA 12, 16, ps<br>
- IPFILTER_LUMA 16, 4, ps<br>
- IPFILTER_LUMA 4, 16, ps<br>
- IPFILTER_LUMA 32, 32, ps<br>
- IPFILTER_LUMA 32, 16, ps<br>
- IPFILTER_LUMA 16, 32, ps<br>
- IPFILTER_LUMA 32, 24, ps<br>
- IPFILTER_LUMA 24, 32, ps<br>
- IPFILTER_LUMA 32, 8, ps<br>
- IPFILTER_LUMA 8, 32, ps<br>
- IPFILTER_LUMA 64, 64, ps<br>
- IPFILTER_LUMA 64, 32, ps<br>
- IPFILTER_LUMA 32, 64, ps<br>
- IPFILTER_LUMA 64, 48, ps<br>
- IPFILTER_LUMA 48, 64, ps<br>
- IPFILTER_LUMA 64, 16, ps<br>
- IPFILTER_LUMA 16, 64, ps<br>
-<br>
-;-----------------------------------------------------------------------------<br>
-; Interpolate HV<br>
-;-----------------------------------------------------------------------------<br>
-%macro FILTER_HV8_START 7 ; (t0, t1, t2, t3, t4, off_src, off_coeff) -> (t3, t5), (t4, t1), [2]<br>
- mova %5, [r0 + (%6 + 0) * 16]<br>
- mova %1, [r0 + (%6 + 1) * 16]<br>
- mova %2, [r0 + (%6 + 2) * 16]<br>
- punpcklwd %3, %5, %1<br>
- punpckhwd %5, %1<br>
- pmaddwd %3, [r5 + (%7) * 16] ; R3 = L[0+1] -- Row 0<br>
- pmaddwd %5, [r5 + (%7) * 16] ; R0 = H[0+1]<br>
- punpcklwd %4, %1, %2<br>
- punpckhwd %1, %2<br>
- pmaddwd %4, [r5 + (%7) * 16] ; R4 = L[1+2] -- Row 1<br>
- pmaddwd %1, [r5 + (%7) * 16] ; R1 = H[1+2]<br>
-%endmacro ; FILTER_HV8_START<br>
-<br>
-%macro FILTER_HV8_MID 10 ; (Row3, prevRow, sum0L, sum1L, sum0H, sum1H, t6, t7, off_src, off_coeff) -> [6]<br>
- mova %8, [r0 + (%9 + 0) * 16]<br>
- mova %1, [r0 + (%9 + 1) * 16]<br>
- punpcklwd %7, %2, %8<br>
- punpckhwd %2, %8<br>
- pmaddwd %7, [r5 + %10 * 16]<br>
- pmaddwd %2, [r5 + %10 * 16]<br>
- paddd %3, %7 ; R3 = L[0+1+2+3] -- Row 0<br>
- paddd %5, %2 ; R0 = H[0+1+2+3]<br>
- punpcklwd %7, %8, %1<br>
- punpckhwd %8, %1<br>
- pmaddwd %7, [r5 + %10 * 16]<br>
- pmaddwd %8, [r5 + %10 * 16]<br>
- paddd %4, %7 ; R4 = L[1+2+3+4] -- Row 1<br>
- paddd %6, %8 ; R1 = H[1+2+3+4]<br>
-%endmacro ; FILTER_HV8_MID<br>
-<br>
-; Round and Saturate<br>
-%macro FILTER_HV8_END 4 ; output in [1, 3]<br>
- paddd %1, [tab_c_526336]<br>
- paddd %2, [tab_c_526336]<br>
- paddd %3, [tab_c_526336]<br>
- paddd %4, [tab_c_526336]<br>
- psrad %1, 12<br>
- psrad %2, 12<br>
- psrad %3, 12<br>
- psrad %4, 12<br>
- packssdw %1, %2<br>
- packssdw %3, %4<br>
-<br>
- ; TODO: is merge better? I think this way is short dependency link<br>
- packuswb %1, %3<br>
-%endmacro ; FILTER_HV8_END<br>
-<br>
-;-----------------------------------------------------------------------------<br>
-; void interp_8tap_hv_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY)<br>
-;-----------------------------------------------------------------------------<br>
-INIT_XMM ssse3<br>
-cglobal interp_8tap_hv_pp_8x8, 4, 7, 8, 0-15*16<br>
-%define coef m7<br>
-%define stk_buf rsp<br>
-<br>
- mov r4d, r4m<br>
- mov r5d, r5m<br>
-<br>
-%ifdef PIC<br>
- lea r6, [tab_LumaCoeff]<br>
- movh coef, [r6 + r4 * 8]<br>
-%else<br>
- movh coef, [tab_LumaCoeff + r4 * 8]<br>
-%endif<br>
- punpcklqdq coef, coef<br>
-<br>
- ; move to row -3<br>
- lea r6, [r1 + r1 * 2]<br>
- sub r0, r6<br>
-<br>
- xor r6, r6<br>
- mov r4, rsp<br>
-<br>
-.loopH:<br>
- FILTER_H8_W8 m0, m1, m2, m3, coef, [pw_512], [r0 - 3]<br>
- psubw m1, [pw_2000]<br>
- mova [r4], m1<br>
-<br>
- add r0, r1<br>
- add r4, 16<br>
- inc r6<br>
- cmp r6, 8+7<br>
- jnz .loopH<br>
-<br>
- ; ready to phase V<br>
- ; Here all of mN is free<br>
-<br>
- ; load coeff table<br>
- shl r5, 6<br>
- lea r6, [tab_LumaCoeffV]<br>
- lea r5, [r5 + r6]<br>
-<br>
- ; load intermedia buffer<br>
- mov r0, stk_buf<br>
-<br>
- ; register mapping<br>
- ; r0 - src<br>
- ; r5 - coeff<br>
- ; r6 - loop_i<br>
-<br>
- ; let's go<br>
- xor r6, r6<br>
-<br>
- ; TODO: this loop have more than 70 instructions, I think it is more than Intel loop decode cache<br>
-.loopV:<br>
-<br>
- FILTER_HV8_START m1, m2, m3, m4, m0, 0, 0<br>
- FILTER_HV8_MID m6, m2, m3, m4, m0, m1, m7, m5, 3, 1<br>
- FILTER_HV8_MID m5, m6, m3, m4, m0, m1, m7, m2, 5, 2<br>
- FILTER_HV8_MID m6, m5, m3, m4, m0, m1, m7, m2, 7, 3<br>
- FILTER_HV8_END m3, m0, m4, m1<br>
-<br>
- movh [r2], m3<br>
- movhps [r2 + r3], m3<br>
-<br>
- lea r0, [r0 + 16 * 2]<br>
- lea r2, [r2 + r3 * 2]<br>
-<br>
- inc r6<br>
- cmp r6, 8/2<br>
- jnz .loopV<br>
-<br>
- RET<br>
-<br>
-;-----------------------------------------------------------------------------<br>
-;void interp_4tap_vert_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-----------------------------------------------------------------------------<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_vert_pp_2x4, 4, 6, 8<br>
-<br>
-mov r4d, r4m<br>
-sub r0, r1<br>
-<br>
-%ifdef PIC<br>
-lea r5, [tab_ChromaCoeff]<br>
-movd m0, [r5 + r4 * 4]<br>
-%else<br>
-movd m0, [tab_ChromaCoeff + r4 * 4]<br>
-%endif<br>
-lea r4, [r1 * 3]<br>
-lea r5, [r0 + 4 * r1]<br>
-pshufb m0, [tab_Cm]<br>
-mova m1, [pw_512]<br>
-<br>
-movd m2, [r0]<br>
-movd m3, [r0 + r1]<br>
-movd m4, [r0 + 2 * r1]<br>
-movd m5, [r0 + r4]<br>
-<br>
-punpcklbw m2, m3<br>
-punpcklbw m6, m4, m5<br>
-punpcklbw m2, m6<br>
-<br>
-pmaddubsw m2, m0<br>
-<br>
-movd m6, [r5]<br>
-<br>
-punpcklbw m3, m4<br>
-punpcklbw m7, m5, m6<br>
-punpcklbw m3, m7<br>
-<br>
-pmaddubsw m3, m0<br>
-<br>
-phaddw m2, m3<br>
-<br>
-pmulhrsw m2, m1<br>
-<br>
-movd m7, [r5 + r1]<br>
-<br>
-punpcklbw m4, m5<br>
-punpcklbw m3, m6, m7<br>
-punpcklbw m4, m3<br>
-<br>
-pmaddubsw m4, m0<br>
-<br>
-movd m3, [r5 + 2 * r1]<br>
-<br>
-punpcklbw m5, m6<br>
-punpcklbw m7, m3<br>
-punpcklbw m5, m7<br>
-<br>
-pmaddubsw m5, m0<br>
-<br>
-phaddw m4, m5<br>
-<br>
-pmulhrsw m4, m1<br>
-packuswb m2, m4<br>
-<br>
-pextrw [r2], m2, 0<br>
-pextrw [r2 + r3], m2, 2<br>
-lea r2, [r2 + 2 * r3]<br>
-pextrw [r2], m2, 4<br>
-pextrw [r2 + r3], m2, 6<br>
-<br>
-RET<br>
-<br>
-;-----------------------------------------------------------------------------<br>
-; void interp_4tap_vert_pp_2x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-----------------------------------------------------------------------------<br>
-%macro FILTER_V4_W2_H4 2<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_vert_pp_2x%2, 4, 6, 8<br>
-<br>
-mov r4d, r4m<br>
-sub r0, r1<br>
-<br>
-%ifdef PIC<br>
-lea r5, [tab_ChromaCoeff]<br>
-movd m0, [r5 + r4 * 4]<br>
-%else<br>
-movd m0, [tab_ChromaCoeff + r4 * 4]<br>
-%endif<br>
-<br>
-pshufb m0, [tab_Cm]<br>
-<br>
-mova m1, [pw_512]<br>
-<br>
-mov r4d, %2<br>
-lea r5, [3 * r1]<br>
-<br>
-.loop:<br>
-movd m2, [r0]<br>
-movd m3, [r0 + r1]<br>
-movd m4, [r0 + 2 * r1]<br>
-movd m5, [r0 + r5]<br>
-<br>
-punpcklbw m2, m3<br>
-punpcklbw m6, m4, m5<br>
-punpcklbw m2, m6<br>
-<br>
-pmaddubsw m2, m0<br>
-<br>
-lea r0, [r0 + 4 * r1]<br>
-movd m6, [r0]<br>
-<br>
-punpcklbw m3, m4<br>
-punpcklbw m7, m5, m6<br>
-punpcklbw m3, m7<br>
-<br>
-pmaddubsw m3, m0<br>
-<br>
-phaddw m2, m3<br>
-<br>
-pmulhrsw m2, m1<br>
-<br>
-movd m7, [r0 + r1]<br>
-<br>
-punpcklbw m4, m5<br>
-punpcklbw m3, m6, m7<br>
-punpcklbw m4, m3<br>
-<br>
-pmaddubsw m4, m0<br>
-<br>
-movd m3, [r0 + 2 * r1]<br>
-<br>
-punpcklbw m5, m6<br>
-punpcklbw m7, m3<br>
-punpcklbw m5, m7<br>
-<br>
-pmaddubsw m5, m0<br>
-<br>
-phaddw m4, m5<br>
-<br>
-pmulhrsw m4, m1<br>
-packuswb m2, m4<br>
-<br>
-pextrw [r2], m2, 0<br>
-pextrw [r2 + r3], m2, 2<br>
-lea r2, [r2 + 2 * r3]<br>
-pextrw [r2], m2, 4<br>
-pextrw [r2 + r3], m2, 6<br>
-<br>
-lea r2, [r2 + 2 * r3]<br>
-<br>
-sub r4, 4<br>
-jnz .loop<br>
-RET<br>
-%endmacro<br>
-<br>
-FILTER_V4_W2_H4 2, 8<br>
-<br>
-FILTER_V4_W2_H4 2, 16<br>
-<br>
-;-----------------------------------------------------------------------------<br>
-; void interp_4tap_vert_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-----------------------------------------------------------------------------<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_vert_pp_4x2, 4, 6, 6<br>
-<br>
-mov r4d, r4m<br>
-sub r0, r1<br>
-<br>
-%ifdef PIC<br>
-lea r5, [tab_ChromaCoeff]<br>
-movd m0, [r5 + r4 * 4]<br>
-%else<br>
-movd m0, [tab_ChromaCoeff + r4 * 4]<br>
-%endif<br>
-<br>
-pshufb m0, [tab_Cm]<br>
-lea r5, [r0 + 2 * r1]<br>
-<br>
-movd m2, [r0]<br>
-movd m3, [r0 + r1]<br>
-movd m4, [r5]<br>
-movd m5, [r5 + r1]<br>
-<br>
-punpcklbw m2, m3<br>
-punpcklbw m1, m4, m5<br>
-punpcklbw m2, m1<br>
-<br>
-pmaddubsw m2, m0<br>
-<br>
-movd m1, [r0 + 4 * r1]<br>
-<br>
-punpcklbw m3, m4<br>
-punpcklbw m5, m1<br>
-punpcklbw m3, m5<br>
-<br>
-pmaddubsw m3, m0<br>
-<br>
-phaddw m2, m3<br>
-<br>
-pmulhrsw m2, [pw_512]<br>
-packuswb m2, m2<br>
-movd [r2], m2<br>
-pextrd [r2 + r3], m2, 1<br>
-<br>
-RET<br>
-<br>
-;-----------------------------------------------------------------------------<br>
-; void interp_4tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-----------------------------------------------------------------------------<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_vert_pp_4x4, 4, 6, 8<br>
-<br>
-mov r4d, r4m<br>
-sub r0, r1<br>
-<br>
-%ifdef PIC<br>
-lea r5, [tab_ChromaCoeff]<br>
-movd m0, [r5 + r4 * 4]<br>
-%else<br>
-movd m0, [tab_ChromaCoeff + r4 * 4]<br>
-%endif<br>
-<br>
-pshufb m0, [tab_Cm]<br>
-mova m1, [pw_512]<br>
-lea r5, [r0 + 4 * r1]<br>
-lea r4, [r1 * 3]<br>
-<br>
-movd m2, [r0]<br>
-movd m3, [r0 + r1]<br>
-movd m4, [r0 + 2 * r1]<br>
-movd m5, [r0 + r4]<br>
-<br>
-punpcklbw m2, m3<br>
-punpcklbw m6, m4, m5<br>
-punpcklbw m2, m6<br>
-<br>
-pmaddubsw m2, m0<br>
-<br>
-movd m6, [r5]<br>
-<br>
-punpcklbw m3, m4<br>
-punpcklbw m7, m5, m6<br>
-punpcklbw m3, m7<br>
-<br>
-pmaddubsw m3, m0<br>
-<br>
-phaddw m2, m3<br>
-<br>
-pmulhrsw m2, m1<br>
-<br>
-movd m7, [r5 + r1]<br>
-<br>
-punpcklbw m4, m5<br>
-punpcklbw m3, m6, m7<br>
-punpcklbw m4, m3<br>
-<br>
-pmaddubsw m4, m0<br>
-<br>
-movd m3, [r5 + 2 * r1]<br>
-<br>
-punpcklbw m5, m6<br>
-punpcklbw m7, m3<br>
-punpcklbw m5, m7<br>
-<br>
-pmaddubsw m5, m0<br>
-<br>
-phaddw m4, m5<br>
-<br>
-pmulhrsw m4, m1<br>
-<br>
-packuswb m2, m4<br>
-movd [r2], m2<br>
-pextrd [r2 + r3], m2, 1<br>
-lea r2, [r2 + 2 * r3]<br>
-pextrd [r2], m2, 2<br>
-pextrd [r2 + r3], m2, 3<br>
-<br>
-RET<br>
-<br>
-;-----------------------------------------------------------------------------<br>
-; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-----------------------------------------------------------------------------<br>
-%macro FILTER_V4_W4_H4 2<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8<br>
-<br>
-mov r4d, r4m<br>
-sub r0, r1<br>
-<br>
-%ifdef PIC<br>
-lea r5, [tab_ChromaCoeff]<br>
-movd m0, [r5 + r4 * 4]<br>
-%else<br>
-movd m0, [tab_ChromaCoeff + r4 * 4]<br>
-%endif<br>
-<br>
-pshufb m0, [tab_Cm]<br>
-<br>
-mova m1, [pw_512]<br>
-<br>
-mov r4d, %2<br>
-<br>
-lea r5, [3 * r1]<br>
-<br>
-.loop:<br>
-movd m2, [r0]<br>
-movd m3, [r0 + r1]<br>
-movd m4, [r0 + 2 * r1]<br>
-movd m5, [r0 + r5]<br>
-<br>
-punpcklbw m2, m3<br>
-punpcklbw m6, m4, m5<br>
-punpcklbw m2, m6<br>
-<br>
-pmaddubsw m2, m0<br>
-<br>
-lea r0, [r0 + 4 * r1]<br>
-movd m6, [r0]<br>
-<br>
-punpcklbw m3, m4<br>
-punpcklbw m7, m5, m6<br>
-punpcklbw m3, m7<br>
-<br>
-pmaddubsw m3, m0<br>
-<br>
-phaddw m2, m3<br>
-<br>
-pmulhrsw m2, m1<br>
-<br>
-movd m7, [r0 + r1]<br>
-<br>
-punpcklbw m4, m5<br>
-punpcklbw m3, m6, m7<br>
-punpcklbw m4, m3<br>
-<br>
-pmaddubsw m4, m0<br>
-<br>
-movd m3, [r0 + 2 * r1]<br>
-<br>
-punpcklbw m5, m6<br>
-punpcklbw m7, m3<br>
-punpcklbw m5, m7<br>
-<br>
-pmaddubsw m5, m0<br>
-<br>
-phaddw m4, m5<br>
-<br>
-pmulhrsw m4, m1<br>
-packuswb m2, m4<br>
-movd [r2], m2<br>
-pextrd [r2 + r3], m2, 1<br>
-lea r2, [r2 + 2 * r3]<br>
-pextrd [r2], m2, 2<br>
-pextrd [r2 + r3], m2, 3<br>
-<br>
-lea r2, [r2 + 2 * r3]<br>
-<br>
-sub r4, 4<br>
-jnz .loop<br>
-RET<br>
-%endmacro<br>
-<br>
-FILTER_V4_W4_H4 4, 8<br>
-FILTER_V4_W4_H4 4, 16<br>
-<br>
-FILTER_V4_W4_H4 4, 32<br>
-<br>
-%macro FILTER_V4_W8_H2 0<br>
-punpcklbw m1, m2<br>
-punpcklbw m7, m3, m0<br>
-<br>
-pmaddubsw m1, m6<br>
-pmaddubsw m7, m5<br>
-<br>
-paddw m1, m7<br>
-<br>
-pmulhrsw m1, m4<br>
-packuswb m1, m1<br>
-%endmacro<br>
-<br>
-%macro FILTER_V4_W8_H3 0<br>
-punpcklbw m2, m3<br>
-punpcklbw m7, m0, m1<br>
-<br>
-pmaddubsw m2, m6<br>
-pmaddubsw m7, m5<br>
-<br>
-paddw m2, m7<br>
-<br>
-pmulhrsw m2, m4<br>
-packuswb m2, m2<br>
-%endmacro<br>
-<br>
-%macro FILTER_V4_W8_H4 0<br>
-punpcklbw m3, m0<br>
-punpcklbw m7, m1, m2<br>
-<br>
-pmaddubsw m3, m6<br>
-pmaddubsw m7, m5<br>
-<br>
-paddw m3, m7<br>
-<br>
-pmulhrsw m3, m4<br>
-packuswb m3, m3<br>
-%endmacro<br>
-<br>
-%macro FILTER_V4_W8_H5 0<br>
-punpcklbw m0, m1<br>
-punpcklbw m7, m2, m3<br>
-<br>
-pmaddubsw m0, m6<br>
-pmaddubsw m7, m5<br>
-<br>
-paddw m0, m7<br>
-<br>
-pmulhrsw m0, m4<br>
-packuswb m0, m0<br>
-%endmacro<br>
-<br>
-%macro FILTER_V4_W8_8x2 2<br>
-FILTER_V4_W8 %1, %2<br>
-movq m0, [r0 + 4 * r1]<br>
-<br>
-FILTER_V4_W8_H2<br>
-<br>
-movh [r2 + r3], m1<br>
-%endmacro<br>
-<br>
-%macro FILTER_V4_W8_8x4 2<br>
-FILTER_V4_W8_8x2 %1, %2<br>
-;8x3<br>
-lea r6, [r0 + 4 * r1]<br>
-movq m1, [r6 + r1]<br>
-<br>
-FILTER_V4_W8_H3<br>
-<br>
-movh [r2 + 2 * r3], m2<br>
-<br>
-;8x4<br>
-movq m2, [r6 + 2 * r1]<br>
-<br>
-FILTER_V4_W8_H4<br>
-<br>
-lea r5, [r2 + 2 * r3]<br>
-movh [r5 + r3], m3<br>
-%endmacro<br>
-<br>
-%macro FILTER_V4_W8_8x6 2<br>
-FILTER_V4_W8_8x4 %1, %2<br>
-;8x5<br>
-lea r6, [r6 + 2 * r1]<br>
-movq m3, [r6 + r1]<br>
-<br>
-FILTER_V4_W8_H5<br>
-<br>
-movh [r2 + 4 * r3], m0<br>
-<br>
-;8x6<br>
-movq m0, [r0 + 8 * r1]<br>
-<br>
-FILTER_V4_W8_H2<br>
-<br>
-lea r5, [r2 + 4 * r3]<br>
-movh [r5 + r3], m1<br>
-%endmacro<br>
-<br>
-;-----------------------------------------------------------------------------<br>
-; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-----------------------------------------------------------------------------<br>
-%macro FILTER_V4_W8 2<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8<br>
-<br>
-mov r4d, r4m<br>
-<br>
-sub r0, r1<br>
-movq m0, [r0]<br>
-movq m1, [r0 + r1]<br>
-movq m2, [r0 + 2 * r1]<br>
-lea r5, [r0 + 2 * r1]<br>
-movq m3, [r5 + r1]<br>
-<br>
-punpcklbw m0, m1<br>
-punpcklbw m4, m2, m3<br>
-<br>
-%ifdef PIC<br>
-lea r6, [tab_ChromaCoeff]<br>
-movd m5, [r6 + r4 * 4]<br>
-%else<br>
-movd m5, [tab_ChromaCoeff + r4 * 4]<br>
-%endif<br>
-<br>
-pshufb m6, m5, [tab_Vm]<br>
-pmaddubsw m0, m6<br>
-<br>
-pshufb m5, [tab_Vm + 16]<br>
-pmaddubsw m4, m5<br>
-<br>
-paddw m0, m4<br>
-<br>
-mova m4, [pw_512]<br>
-<br>
-pmulhrsw m0, m4<br>
-packuswb m0, m0<br>
-movh [r2], m0<br>
-%endmacro<br>
-<br>
-;-----------------------------------------------------------------------------<br>
-; void interp_4tap_vert_pp_8x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-----------------------------------------------------------------------------<br>
-FILTER_V4_W8_8x2 8, 2<br>
-<br>
-RET<br>
-<br>
-;-----------------------------------------------------------------------------<br>
-; void interp_4tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-----------------------------------------------------------------------------<br>
-FILTER_V4_W8_8x4 8, 4<br>
-<br>
-RET<br>
-<br>
-;-----------------------------------------------------------------------------<br>
-; void interp_4tap_vert_pp_8x6(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-----------------------------------------------------------------------------<br>
-FILTER_V4_W8_8x6 8, 6<br>
-<br>
-RET<br>
-<br>
-;-------------------------------------------------------------------------------------------------------------<br>
-; void interp_4tap_vert_ps_4x2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)<br>
-;-------------------------------------------------------------------------------------------------------------<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_vert_ps_4x2, 4, 6, 6<br>
-<br>
-mov r4d, r4m<br>
-sub r0, r1<br>
-add r3d, r3d<br>
-<br>
-%ifdef PIC<br>
-lea r5, [tab_ChromaCoeff]<br>
-movd m0, [r5 + r4 * 4]<br>
-%else<br>
-movd m0, [tab_ChromaCoeff + r4 * 4]<br>
-%endif<br>
-<br>
-pshufb m0, [tab_Cm]<br>
-<br>
-movd m2, [r0]<br>
-movd m3, [r0 + r1]<br>
-lea r5, [r0 + 2 * r1]<br>
-movd m4, [r5]<br>
-movd m5, [r5 + r1]<br>
-<br>
-punpcklbw m2, m3<br>
-punpcklbw m1, m4, m5<br>
-punpcklbw m2, m1<br>
-<br>
-pmaddubsw m2, m0<br>
-<br>
-movd m1, [r0 + 4 * r1]<br>
-<br>
-punpcklbw m3, m4<br>
-punpcklbw m5, m1<br>
-punpcklbw m3, m5<br>
-<br>
-pmaddubsw m3, m0<br>
-<br>
-phaddw m2, m3<br>
-<br>
-psubw m2, [pw_2000]<br>
-movh [r2], m2<br>
-movhps [r2 + r3], m2<br>
-<br>
-RET<br>
-<br>
-;-------------------------------------------------------------------------------------------------------------<br>
-; void interp_4tap_vert_ps_4x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)<br>
-;-------------------------------------------------------------------------------------------------------------<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_vert_ps_4x4, 4, 6, 7<br>
-<br>
- mov r4d, r4m<br>
- sub r0, r1<br>
- add r3d, r3d<br>
-<br>
-%ifdef PIC<br>
- lea r5, [tab_ChromaCoeff]<br>
- movd m0, [r5 + r4 * 4]<br>
-%else<br>
- movd m0, [tab_ChromaCoeff + r4 * 4]<br>
-%endif<br>
-<br>
- pshufb m0, [tab_Cm]<br>
-<br>
- lea r4, [r1 * 3]<br>
- lea r5, [r0 + 4 * r1]<br>
-<br>
- movd m2, [r0]<br>
- movd m3, [r0 + r1]<br>
- movd m4, [r0 + 2 * r1]<br>
- movd m5, [r0 + r4]<br>
-<br>
- punpcklbw m2, m3<br>
- punpcklbw m6, m4, m5<br>
- punpcklbw m2, m6<br>
-<br>
- pmaddubsw m2, m0<br>
-<br>
- movd m6, [r5]<br>
-<br>
- punpcklbw m3, m4<br>
- punpcklbw m1, m5, m6<br>
- punpcklbw m3, m1<br>
-<br>
- pmaddubsw m3, m0<br>
-<br>
- phaddw m2, m3<br>
-<br>
- mova m1, [pw_2000]<br>
-<br>
- psubw m2, m1<br>
- movh [r2], m2<br>
- movhps [r2 + r3], m2<br>
-<br>
- movd m2, [r5 + r1]<br>
-<br>
- punpcklbw m4, m5<br>
- punpcklbw m3, m6, m2<br>
- punpcklbw m4, m3<br>
-<br>
- pmaddubsw m4, m0<br>
-<br>
- movd m3, [r5 + 2 * r1]<br>
-<br>
- punpcklbw m5, m6<br>
- punpcklbw m2, m3<br>
- punpcklbw m5, m2<br>
-<br>
- pmaddubsw m5, m0<br>
-<br>
- phaddw m4, m5<br>
-<br>
- psubw m4, m1<br>
- lea r2, [r2 + 2 * r3]<br>
- movh [r2], m4<br>
- movhps [r2 + r3], m4<br>
-<br>
- RET<br>
-<br>
-;---------------------------------------------------------------------------------------------------------------<br>
-; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)<br>
-;---------------------------------------------------------------------------------------------------------------<br>
-%macro FILTER_V_PS_W4_H4 2<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8<br>
-<br>
- mov r4d, r4m<br>
- sub r0, r1<br>
- add r3d, r3d<br>
-<br>
-%ifdef PIC<br>
- lea r5, [tab_ChromaCoeff]<br>
- movd m0, [r5 + r4 * 4]<br>
-%else<br>
- movd m0, [tab_ChromaCoeff + r4 * 4]<br>
-%endif<br>
-<br>
- pshufb m0, [tab_Cm]<br>
-<br>
- mova m1, [pw_2000]<br>
-<br>
- mov r4d, %2/4<br>
- lea r5, [3 * r1]<br>
-<br>
-.loop:<br>
- movd m2, [r0]<br>
- movd m3, [r0 + r1]<br>
- movd m4, [r0 + 2 * r1]<br>
- movd m5, [r0 + r5]<br>
-<br>
- punpcklbw m2, m3<br>
- punpcklbw m6, m4, m5<br>
- punpcklbw m2, m6<br>
-<br>
- pmaddubsw m2, m0<br>
-<br>
- lea r0, [r0 + 4 * r1]<br>
- movd m6, [r0]<br>
-<br>
- punpcklbw m3, m4<br>
- punpcklbw m7, m5, m6<br>
- punpcklbw m3, m7<br>
-<br>
- pmaddubsw m3, m0<br>
-<br>
- phaddw m2, m3<br>
-<br>
- psubw m2, m1<br>
- movh [r2], m2<br>
- movhps [r2 + r3], m2<br>
-<br>
- movd m2, [r0 + r1]<br>
-<br>
- punpcklbw m4, m5<br>
- punpcklbw m3, m6, m2<br>
- punpcklbw m4, m3<br>
-<br>
- pmaddubsw m4, m0<br>
-<br>
- movd m3, [r0 + 2 * r1]<br>
-<br>
- punpcklbw m5, m6<br>
- punpcklbw m2, m3<br>
- punpcklbw m5, m2<br>
-<br>
- pmaddubsw m5, m0<br>
-<br>
- phaddw m4, m5<br>
-<br>
- psubw m4, m1<br>
- lea r2, [r2 + 2 * r3]<br>
- movh [r2], m4<br>
- movhps [r2 + r3], m4<br>
-<br>
- lea r2, [r2 + 2 * r3]<br>
-<br>
- dec r4d<br>
- jnz .loop<br>
- RET<br>
-%endmacro<br>
-<br>
-FILTER_V_PS_W4_H4 4, 8<br>
-FILTER_V_PS_W4_H4 4, 16<br>
-<br>
-FILTER_V_PS_W4_H4 4, 32<br>
-<br>
-;--------------------------------------------------------------------------------------------------------------<br>
-; void interp_4tap_vert_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)<br>
-;--------------------------------------------------------------------------------------------------------------<br>
-%macro FILTER_V_PS_W8_H8_H16_H2 2<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 7<br>
-<br>
- mov r4d, r4m<br>
- sub r0, r1<br>
- add r3d, r3d<br>
-<br>
-%ifdef PIC<br>
- lea r5, [tab_ChromaCoeff]<br>
- movd m5, [r5 + r4 * 4]<br>
-%else<br>
- movd m5, [tab_ChromaCoeff + r4 * 4]<br>
-%endif<br>
-<br>
- pshufb m6, m5, [tab_Vm]<br>
- pshufb m5, [tab_Vm + 16]<br>
- mova m4, [pw_2000]<br>
-<br>
- mov r4d, %2/2<br>
- lea r5, [3 * r1]<br>
-<br>
-.loopH:<br>
- movq m0, [r0]<br>
- movq m1, [r0 + r1]<br>
- movq m2, [r0 + 2 * r1]<br>
- movq m3, [r0 + r5]<br>
-<br>
- punpcklbw m0, m1<br>
- punpcklbw m1, m2<br>
- punpcklbw m2, m3<br>
-<br>
- pmaddubsw m0, m6<br>
- pmaddubsw m2, m5<br>
-<br>
- paddw m0, m2<br>
-<br>
- psubw m0, m4<br>
- movu [r2], m0<br>
-<br>
- movq m0, [r0 + 4 * r1]<br>
-<br>
- punpcklbw m3, m0<br>
-<br>
- pmaddubsw m1, m6<br>
- pmaddubsw m3, m5<br>
-<br>
- paddw m1, m3<br>
- psubw m1, m4<br>
-<br>
- movu [r2 + r3], m1<br>
-<br>
- lea r0, [r0 + 2 * r1]<br>
- lea r2, [r2 + 2 * r3]<br>
-<br>
- dec r4d<br>
- jnz .loopH<br>
-<br>
- RET<br>
-%endmacro<br>
-<br>
-FILTER_V_PS_W8_H8_H16_H2 8, 2<br>
-FILTER_V_PS_W8_H8_H16_H2 8, 4<br>
-FILTER_V_PS_W8_H8_H16_H2 8, 6<br>
-<br>
-FILTER_V_PS_W8_H8_H16_H2 8, 12<br>
-FILTER_V_PS_W8_H8_H16_H2 8, 64<br>
-<br>
-;--------------------------------------------------------------------------------------------------------------<br>
-; void interp_4tap_vert_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)<br>
-;--------------------------------------------------------------------------------------------------------------<br>
-%macro FILTER_V_PS_W8_H8_H16_H32 2<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8<br>
-<br>
- mov r4d, r4m<br>
- sub r0, r1<br>
- add r3d, r3d<br>
-<br>
-%ifdef PIC<br>
- lea r5, [tab_ChromaCoeff]<br>
- movd m5, [r5 + r4 * 4]<br>
-%else<br>
- movd m5, [tab_ChromaCoeff + r4 * 4]<br>
-%endif<br>
-<br>
- pshufb m6, m5, [tab_Vm]<br>
- pshufb m5, [tab_Vm + 16]<br>
- mova m4, [pw_2000]<br>
-<br>
- mov r4d, %2/4<br>
- lea r5, [3 * r1]<br>
-<br>
-.loop:<br>
- movq m0, [r0]<br>
- movq m1, [r0 + r1]<br>
- movq m2, [r0 + 2 * r1]<br>
- movq m3, [r0 + r5]<br>
-<br>
- punpcklbw m0, m1<br>
- punpcklbw m1, m2<br>
- punpcklbw m2, m3<br>
-<br>
- pmaddubsw m0, m6<br>
- pmaddubsw m7, m2, m5<br>
-<br>
- paddw m0, m7<br>
-<br>
- psubw m0, m4<br>
- movu [r2], m0<br>
-<br>
- lea r0, [r0 + 4 * r1]<br>
- movq m0, [r0]<br>
-<br>
- punpcklbw m3, m0<br>
-<br>
- pmaddubsw m1, m6<br>
- pmaddubsw m7, m3, m5<br>
-<br>
- paddw m1, m7<br>
-<br>
- psubw m1, m4<br>
- movu [r2 + r3], m1<br>
-<br>
- movq m1, [r0 + r1]<br>
-<br>
- punpcklbw m0, m1<br>
-<br>
- pmaddubsw m2, m6<br>
- pmaddubsw m0, m5<br>
-<br>
- paddw m2, m0<br>
-<br>
- psubw m2, m4<br>
- lea r2, [r2 + 2 * r3]<br>
- movu [r2], m2<br>
-<br>
- movq m2, [r0 + 2 * r1]<br>
-<br>
- punpcklbw m1, m2<br>
-<br>
- pmaddubsw m3, m6<br>
- pmaddubsw m1, m5<br>
-<br>
- paddw m3, m1<br>
- psubw m3, m4<br>
-<br>
- movu [r2 + r3], m3<br>
-<br>
- lea r2, [r2 + 2 * r3]<br>
-<br>
- dec r4d<br>
- jnz .loop<br>
- RET<br>
-%endmacro<br>
-<br>
-FILTER_V_PS_W8_H8_H16_H32 8, 8<br>
-FILTER_V_PS_W8_H8_H16_H32 8, 16<br>
-FILTER_V_PS_W8_H8_H16_H32 8, 32<br>
-<br>
-;------------------------------------------------------------------------------------------------------------<br>
-;void interp_4tap_vert_ps_6x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)<br>
-;------------------------------------------------------------------------------------------------------------<br>
-%macro FILTER_V_PS_W6 2<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_vert_ps_6x%2, 4, 6, 8<br>
-<br>
- mov r4d, r4m<br>
- sub r0, r1<br>
- add r3d, r3d<br>
-<br>
-%ifdef PIC<br>
- lea r5, [tab_ChromaCoeff]<br>
- movd m5, [r5 + r4 * 4]<br>
-%else<br>
- movd m5, [tab_ChromaCoeff + r4 * 4]<br>
-%endif<br>
-<br>
- pshufb m6, m5, [tab_Vm]<br>
- pshufb m5, [tab_Vm + 16]<br>
- mova m4, [pw_2000]<br>
- lea r5, [3 * r1]<br>
- mov r4d, %2/4<br>
-<br>
-.loop:<br>
- movq m0, [r0]<br>
- movq m1, [r0 + r1]<br>
- movq m2, [r0 + 2 * r1]<br>
- movq m3, [r0 + r5]<br>
-<br>
- punpcklbw m0, m1<br>
- punpcklbw m1, m2<br>
- punpcklbw m2, m3<br>
-<br>
- pmaddubsw m0, m6<br>
- pmaddubsw m7, m2, m5<br>
-<br>
- paddw m0, m7<br>
- psubw m0, m4<br>
-<br>
- movh [r2], m0<br>
- pshufd m0, m0, 2<br>
- movd [r2 + 8], m0<br>
-<br>
- lea r0, [r0 + 4 * r1]<br>
- movq m0, [r0]<br>
- punpcklbw m3, m0<br>
-<br>
- pmaddubsw m1, m6<br>
- pmaddubsw m7, m3, m5<br>
-<br>
- paddw m1, m7<br>
- psubw m1, m4<br>
-<br>
- movh [r2 + r3], m1<br>
- pshufd m1, m1, 2<br>
- movd [r2 + r3 + 8], m1<br>
-<br>
- movq m1, [r0 + r1]<br>
- punpcklbw m0, m1<br>
-<br>
- pmaddubsw m2, m6<br>
- pmaddubsw m0, m5<br>
-<br>
- paddw m2, m0<br>
- psubw m2, m4<br>
-<br>
- lea r2,[r2 + 2 * r3]<br>
- movh [r2], m2<br>
- pshufd m2, m2, 2<br>
- movd [r2 + 8], m2<br>
-<br>
- movq m2,[r0 + 2 * r1]<br>
- punpcklbw m1, m2<br>
-<br>
- pmaddubsw m3, m6<br>
- pmaddubsw m1, m5<br>
-<br>
- paddw m3, m1<br>
- psubw m3, m4<br>
-<br>
- movh [r2 + r3], m3<br>
- pshufd m3, m3, 2<br>
- movd [r2 + r3 + 8], m3<br>
-<br>
- lea r2, [r2 + 2 * r3]<br>
-<br>
- dec r4d<br>
- jnz .loop<br>
- RET<br>
-%endmacro<br>
-<br>
-FILTER_V_PS_W6 6, 8<br>
-FILTER_V_PS_W6 6, 16<br>
-<br>
-;---------------------------------------------------------------------------------------------------------------<br>
-; void interp_4tap_vert_ps_12x16(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)<br>
-;---------------------------------------------------------------------------------------------------------------<br>
-%macro FILTER_V_PS_W12 2<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_vert_ps_12x%2, 4, 6, 8<br>
-<br>
- mov r4d, r4m<br>
- sub r0, r1<br>
- add r3d, r3d<br>
-<br>
-%ifdef PIC<br>
- lea r5, [tab_ChromaCoeff]<br>
- movd m0, [r5 + r4 * 4]<br>
-%else<br>
- movd m0, [tab_ChromaCoeff + r4 * 4]<br>
-%endif<br>
-<br>
- pshufb m1, m0, [tab_Vm]<br>
- pshufb m0, [tab_Vm + 16]<br>
-<br>
- mov r4d, %2/2<br>
-<br>
-.loop:<br>
- movu m2, [r0]<br>
- movu m3, [r0 + r1]<br>
-<br>
- punpcklbw m4, m2, m3<br>
- punpckhbw m2, m3<br>
-<br>
- pmaddubsw m4, m1<br>
- pmaddubsw m2, m1<br>
-<br>
- lea r0, [r0 + 2 * r1]<br>
- movu m5, [r0]<br>
- movu m7, [r0 + r1]<br>
-<br>
- punpcklbw m6, m5, m7<br>
- pmaddubsw m6, m0<br>
- paddw m4, m6<br>
-<br>
- punpckhbw m6, m5, m7<br>
- pmaddubsw m6, m0<br>
- paddw m2, m6<br>
-<br>
- mova m6, [pw_2000]<br>
-<br>
- psubw m4, m6<br>
- psubw m2, m6<br>
-<br>
- movu [r2], m4<br>
- movh [r2 + 16], m2<br>
-<br>
- punpcklbw m4, m3, m5<br>
- punpckhbw m3, m5<br>
-<br>
- pmaddubsw m4, m1<br>
- pmaddubsw m3, m1<br>
-<br>
- movu m2, [r0 + 2 * r1]<br>
-<br>
- punpcklbw m5, m7, m2<br>
- punpckhbw m7, m2<br>
-<br>
- pmaddubsw m5, m0<br>
- pmaddubsw m7, m0<br>
-<br>
- paddw m4, m5<br>
- paddw m3, m7<br>
-<br>
- psubw m4, m6<br>
- psubw m3, m6<br>
-<br>
- movu [r2 + r3], m4<br>
- movh [r2 + r3 + 16], m3<br>
-<br>
- lea r2, [r2 + 2 * r3]<br>
-<br>
- dec r4d<br>
- jnz .loop<br>
- RET<br>
-%endmacro<br>
-<br>
-FILTER_V_PS_W12 12, 16<br>
-FILTER_V_PS_W12 12, 32<br>
-<br>
-;---------------------------------------------------------------------------------------------------------------<br>
-; void interp_4tap_vert_ps_16x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)<br>
-;---------------------------------------------------------------------------------------------------------------<br>
-%macro FILTER_V_PS_W16 2<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8<br>
-<br>
- mov r4d, r4m<br>
- sub r0, r1<br>
- add r3d, r3d<br>
-<br>
-%ifdef PIC<br>
- lea r5, [tab_ChromaCoeff]<br>
- movd m0, [r5 + r4 * 4]<br>
-%else<br>
- movd m0, [tab_ChromaCoeff + r4 * 4]<br>
-%endif<br>
-<br>
- pshufb m1, m0, [tab_Vm]<br>
- pshufb m0, [tab_Vm + 16]<br>
- mov r4d, %2/2<br>
-<br>
-.loop:<br>
- movu m2, [r0]<br>
- movu m3, [r0 + r1]<br>
-<br>
- punpcklbw m4, m2, m3<br>
- punpckhbw m2, m3<br>
-<br>
- pmaddubsw m4, m1<br>
- pmaddubsw m2, m1<br>
-<br>
- lea r0, [r0 + 2 * r1]<br>
- movu m5, [r0]<br>
- movu m7, [r0 + r1]<br>
-<br>
- punpcklbw m6, m5, m7<br>
- pmaddubsw m6, m0<br>
- paddw m4, m6<br>
-<br>
- punpckhbw m6, m5, m7<br>
- pmaddubsw m6, m0<br>
- paddw m2, m6<br>
-<br>
- mova m6, [pw_2000]<br>
-<br>
- psubw m4, m6<br>
- psubw m2, m6<br>
-<br>
- movu [r2], m4<br>
- movu [r2 + 16], m2<br>
-<br>
- punpcklbw m4, m3, m5<br>
- punpckhbw m3, m5<br>
-<br>
- pmaddubsw m4, m1<br>
- pmaddubsw m3, m1<br>
-<br>
- movu m5, [r0 + 2 * r1]<br>
-<br>
- punpcklbw m2, m7, m5<br>
- punpckhbw m7, m5<br>
-<br>
- pmaddubsw m2, m0<br>
- pmaddubsw m7, m0<br>
-<br>
- paddw m4, m2<br>
- paddw m3, m7<br>
-<br>
- psubw m4, m6<br>
- psubw m3, m6<br>
-<br>
- movu [r2 + r3], m4<br>
- movu [r2 + r3 + 16], m3<br>
-<br>
- lea r2, [r2 + 2 * r3]<br>
-<br>
- dec r4d<br>
- jnz .loop<br>
- RET<br>
-%endmacro<br>
-<br>
-FILTER_V_PS_W16 16, 4<br>
-FILTER_V_PS_W16 16, 8<br>
-FILTER_V_PS_W16 16, 12<br>
-FILTER_V_PS_W16 16, 16<br>
-FILTER_V_PS_W16 16, 32<br>
-<br>
-FILTER_V_PS_W16 16, 24<br>
-FILTER_V_PS_W16 16, 64<br>
-<br>
-;--------------------------------------------------------------------------------------------------------------<br>
-;void interp_4tap_vert_ps_24x32(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)<br>
-;--------------------------------------------------------------------------------------------------------------<br>
-%macro FILTER_V4_PS_W24 2<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_vert_ps_24x%2, 4, 6, 8<br>
-<br>
- mov r4d, r4m<br>
- sub r0, r1<br>
- add r3d, r3d<br>
-<br>
-%ifdef PIC<br>
- lea r5, [tab_ChromaCoeff]<br>
- movd m0, [r5 + r4 * 4]<br>
-%else<br>
- movd m0, [tab_ChromaCoeff + r4 * 4]<br>
-%endif<br>
-<br>
- pshufb m1, m0, [tab_Vm]<br>
- pshufb m0, [tab_Vm + 16]<br>
-<br>
- mov r4d, %2/2<br>
-<br>
-.loop:<br>
- movu m2, [r0]<br>
- movu m3, [r0 + r1]<br>
-<br>
- punpcklbw m4, m2, m3<br>
- punpckhbw m2, m3<br>
-<br>
- pmaddubsw m4, m1<br>
- pmaddubsw m2, m1<br>
-<br>
- lea r5, [r0 + 2 * r1]<br>
-<br>
- movu m5, [r5]<br>
- movu m7, [r5 + r1]<br>
-<br>
- punpcklbw m6, m5, m7<br>
- pmaddubsw m6, m0<br>
- paddw m4, m6<br>
-<br>
- punpckhbw m6, m5, m7<br>
- pmaddubsw m6, m0<br>
- paddw m2, m6<br>
-<br>
- mova m6, [pw_2000]<br>
-<br>
- psubw m4, m6<br>
- psubw m2, m6<br>
-<br>
- movu [r2], m4<br>
- movu [r2 + 16], m2<br>
-<br>
- punpcklbw m4, m3, m5<br>
- punpckhbw m3, m5<br>
-<br>
- pmaddubsw m4, m1<br>
- pmaddubsw m3, m1<br>
-<br>
- movu m2, [r5 + 2 * r1]<br>
-<br>
- punpcklbw m5, m7, m2<br>
- punpckhbw m7, m2<br>
-<br>
- pmaddubsw m5, m0<br>
- pmaddubsw m7, m0<br>
-<br>
- paddw m4, m5<br>
- paddw m3, m7<br>
-<br>
- psubw m4, m6<br>
- psubw m3, m6<br>
-<br>
- movu [r2 + r3], m4<br>
- movu [r2 + r3 + 16], m3<br>
-<br>
- movq m2, [r0 + 16]<br>
- movq m3, [r0 + r1 + 16]<br>
- movq m4, [r5 + 16]<br>
- movq m5, [r5 + r1 + 16]<br>
-<br>
- punpcklbw m2, m3<br>
- punpcklbw m7, m4, m5<br>
-<br>
- pmaddubsw m2, m1<br>
- pmaddubsw m7, m0<br>
-<br>
- paddw m2, m7<br>
- psubw m2, m6<br>
-<br>
- movu [r2 + 32], m2<br>
-<br>
- movq m2, [r5 + 2 * r1 + 16]<br>
-<br>
- punpcklbw m3, m4<br>
- punpcklbw m5, m2<br>
-<br>
- pmaddubsw m3, m1<br>
- pmaddubsw m5, m0<br>
-<br>
- paddw m3, m5<br>
- psubw m3, m6<br>
-<br>
- movu [r2 + r3 + 32], m3<br>
-<br>
- mov r0, r5<br>
- lea r2, [r2 + 2 * r3]<br>
-<br>
- dec r4d<br>
- jnz .loop<br>
- RET<br>
-%endmacro<br>
-<br>
-FILTER_V4_PS_W24 24, 32<br>
-<br>
-FILTER_V4_PS_W24 24, 64<br>
-<br>
-;---------------------------------------------------------------------------------------------------------------<br>
-; void interp_4tap_vert_ps_32x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)<br>
-;---------------------------------------------------------------------------------------------------------------<br>
-%macro FILTER_V_PS_W32 2<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8<br>
-<br>
- mov r4d, r4m<br>
- sub r0, r1<br>
- add r3d, r3d<br>
-<br>
-%ifdef PIC<br>
- lea r5, [tab_ChromaCoeff]<br>
- movd m0, [r5 + r4 * 4]<br>
-%else<br>
- movd m0, [tab_ChromaCoeff + r4 * 4]<br>
-%endif<br>
-<br>
- pshufb m1, m0, [tab_Vm]<br>
- pshufb m0, [tab_Vm + 16]<br>
-<br>
- mova m7, [pw_2000]<br>
-<br>
- mov r4d, %2<br>
-<br>
-.loop:<br>
- movu m2, [r0]<br>
- movu m3, [r0 + r1]<br>
-<br>
- punpcklbw m4, m2, m3<br>
- punpckhbw m2, m3<br>
-<br>
- pmaddubsw m4, m1<br>
- pmaddubsw m2, m1<br>
-<br>
- lea r5, [r0 + 2 * r1]<br>
- movu m3, [r5]<br>
- movu m5, [r5 + r1]<br>
-<br>
- punpcklbw m6, m3, m5<br>
- punpckhbw m3, m5<br>
-<br>
- pmaddubsw m6, m0<br>
- pmaddubsw m3, m0<br>
-<br>
- paddw m4, m6<br>
- paddw m2, m3<br>
-<br>
- psubw m4, m7<br>
- psubw m2, m7<br>
-<br>
- movu [r2], m4<br>
- movu [r2 + 16], m2<br>
-<br>
- movu m2, [r0 + 16]<br>
- movu m3, [r0 + r1 + 16]<br>
-<br>
- punpcklbw m4, m2, m3<br>
- punpckhbw m2, m3<br>
-<br>
- pmaddubsw m4, m1<br>
- pmaddubsw m2, m1<br>
-<br>
- movu m3, [r5 + 16]<br>
- movu m5, [r5 + r1 + 16]<br>
-<br>
- punpcklbw m6, m3, m5<br>
- punpckhbw m3, m5<br>
-<br>
- pmaddubsw m6, m0<br>
- pmaddubsw m3, m0<br>
-<br>
- paddw m4, m6<br>
- paddw m2, m3<br>
-<br>
- psubw m4, m7<br>
- psubw m2, m7<br>
-<br>
- movu [r2 + 32], m4<br>
- movu [r2 + 48], m2<br>
-<br>
- lea r0, [r0 + r1]<br>
- lea r2, [r2 + r3]<br>
-<br>
- dec r4d<br>
- jnz .loop<br>
- RET<br>
-%endmacro<br>
-<br>
-FILTER_V_PS_W32 32, 8<br>
-FILTER_V_PS_W32 32, 16<br>
-FILTER_V_PS_W32 32, 24<br>
-FILTER_V_PS_W32 32, 32<br>
-<br>
-FILTER_V_PS_W32 32, 48<br>
-FILTER_V_PS_W32 32, 64<br>
-<br>
-;-----------------------------------------------------------------------------<br>
-; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-----------------------------------------------------------------------------<br>
-%macro FILTER_V4_W8_H8_H16_H32 2<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8<br>
-<br>
-mov r4d, r4m<br>
-sub r0, r1<br>
-<br>
-%ifdef PIC<br>
-lea r5, [tab_ChromaCoeff]<br>
-movd m5, [r5 + r4 * 4]<br>
-%else<br>
-movd m5, [tab_ChromaCoeff + r4 * 4]<br>
-%endif<br>
-<br>
-pshufb m6, m5, [tab_Vm]<br>
-pshufb m5, [tab_Vm + 16]<br>
-mova m4, [pw_512]<br>
-lea r5, [r1 * 3]<br>
-<br>
-mov r4d, %2<br>
-<br>
-.loop:<br>
-movq m0, [r0]<br>
-movq m1, [r0 + r1]<br>
-movq m2, [r0 + 2 * r1]<br>
-movq m3, [r0 + r5]<br>
-<br>
-punpcklbw m0, m1<br>
-punpcklbw m1, m2<br>
-punpcklbw m2, m3<br>
-<br>
-pmaddubsw m0, m6<br>
-pmaddubsw m7, m2, m5<br>
-<br>
-paddw m0, m7<br>
-<br>
-pmulhrsw m0, m4<br>
-packuswb m0, m0<br>
-movh [r2], m0<br>
-<br>
-lea r0, [r0 + 4 * r1]<br>
-movq m0, [r0]<br>
-<br>
-punpcklbw m3, m0<br>
-<br>
-pmaddubsw m1, m6<br>
-pmaddubsw m7, m3, m5<br>
-<br>
-paddw m1, m7<br>
-<br>
-pmulhrsw m1, m4<br>
-packuswb m1, m1<br>
-movh [r2 + r3], m1<br>
-<br>
-movq m1, [r0 + r1]<br>
-<br>
-punpcklbw m0, m1<br>
-<br>
-pmaddubsw m2, m6<br>
-pmaddubsw m0, m5<br>
-<br>
-paddw m2, m0<br>
-<br>
-pmulhrsw m2, m4<br>
-<br>
-movq m7, [r0 + 2 * r1]<br>
-punpcklbw m1, m7<br>
-<br>
-pmaddubsw m3, m6<br>
-pmaddubsw m1, m5<br>
-<br>
-paddw m3, m1<br>
-<br>
-pmulhrsw m3, m4<br>
-packuswb m2, m3<br>
-<br>
-lea r2, [r2 + 2 * r3]<br>
-movh [r2], m2<br>
-movhps [r2 + r3], m2<br>
-<br>
-lea r2, [r2 + 2 * r3]<br>
-<br>
-sub r4, 4<br>
-jnz .loop<br>
-RET<br>
-%endmacro<br>
-<br>
-FILTER_V4_W8_H8_H16_H32 8, 8<br>
-FILTER_V4_W8_H8_H16_H32 8, 16<br>
-FILTER_V4_W8_H8_H16_H32 8, 32<br>
-<br>
-FILTER_V4_W8_H8_H16_H32 8, 12<br>
-FILTER_V4_W8_H8_H16_H32 8, 64<br>
-<br>
-<br>
-;-----------------------------------------------------------------------------<br>
-;void interp_4tap_vert_pp_6x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-----------------------------------------------------------------------------<br>
-%macro FILTER_V4_W6_H4 2<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_vert_pp_6x%2, 4, 6, 8<br>
-<br>
-mov r4d, r4m<br>
-sub r0, r1<br>
-<br>
-%ifdef PIC<br>
-lea r5, [tab_ChromaCoeff]<br>
-movd m5, [r5 + r4 * 4]<br>
-%else<br>
-movd m5, [tab_ChromaCoeff + r4 * 4]<br>
-%endif<br>
-<br>
-pshufb m6, m5, [tab_Vm]<br>
-pshufb m5, [tab_Vm + 16]<br>
-mova m4, [pw_512]<br>
-<br>
-mov r4d, %2<br>
-lea r5, [3 * r1]<br>
-<br>
-.loop:<br>
-movq m0, [r0]<br>
-movq m1, [r0 + r1]<br>
-movq m2, [r0 + 2 * r1]<br>
-movq m3, [r0 + r5]<br>
-<br>
-punpcklbw m0, m1<br>
-punpcklbw m1, m2<br>
-punpcklbw m2, m3<br>
-<br>
-pmaddubsw m0, m6<br>
-pmaddubsw m7, m2, m5<br>
-<br>
-paddw m0, m7<br>
-<br>
-pmulhrsw m0, m4<br>
-packuswb m0, m0<br>
-movd [r2], m0<br>
-pextrw [r2 + 4], m0, 2<br>
-<br>
-lea r0, [r0 + 4 * r1]<br>
-<br>
-movq m0, [r0]<br>
-punpcklbw m3, m0<br>
-<br>
-pmaddubsw m1, m6<br>
-pmaddubsw m7, m3, m5<br>
-<br>
-paddw m1, m7<br>
-<br>
-pmulhrsw m1, m4<br>
-packuswb m1, m1<br>
-movd [r2 + r3], m1<br>
-pextrw [r2 + r3 + 4], m1, 2<br>
-<br>
-movq m1, [r0 + r1]<br>
-punpcklbw m7, m0, m1<br>
-<br>
-pmaddubsw m2, m6<br>
-pmaddubsw m7, m5<br>
-<br>
-paddw m2, m7<br>
-<br>
-pmulhrsw m2, m4<br>
-packuswb m2, m2<br>
-lea r2, [r2 + 2 * r3]<br>
-movd [r2], m2<br>
-pextrw [r2 + 4], m2, 2<br>
-<br>
-movq m2, [r0 + 2 * r1]<br>
-punpcklbw m1, m2<br>
-<br>
-pmaddubsw m3, m6<br>
-pmaddubsw m1, m5<br>
-<br>
-paddw m3, m1<br>
-<br>
-pmulhrsw m3, m4<br>
-packuswb m3, m3<br>
-<br>
-movd [r2 + r3], m3<br>
-pextrw [r2 + r3 + 4], m3, 2<br>
-<br>
-lea r2, [r2 + 2 * r3]<br>
-<br>
-sub r4, 4<br>
-jnz .loop<br>
-RET<br>
-%endmacro<br>
-<br>
-FILTER_V4_W6_H4 6, 8<br>
-<br>
-FILTER_V4_W6_H4 6, 16<br>
-<br>
-;-----------------------------------------------------------------------------<br>
-; void interp_4tap_vert_pp_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-----------------------------------------------------------------------------<br>
-%macro FILTER_V4_W12_H2 2<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_vert_pp_12x%2, 4, 6, 8<br>
-<br>
-mov r4d, r4m<br>
-sub r0, r1<br>
-<br>
-%ifdef PIC<br>
-lea r5, [tab_ChromaCoeff]<br>
-movd m0, [r5 + r4 * 4]<br>
-%else<br>
-movd m0, [tab_ChromaCoeff + r4 * 4]<br>
-%endif<br>
-<br>
-pshufb m1, m0, [tab_Vm]<br>
-pshufb m0, [tab_Vm + 16]<br>
-<br>
-mov r4d, %2<br>
-<br>
-.loop:<br>
-movu m2, [r0]<br>
-movu m3, [r0 + r1]<br>
-<br>
-punpcklbw m4, m2, m3<br>
-punpckhbw m2, m3<br>
-<br>
-pmaddubsw m4, m1<br>
-pmaddubsw m2, m1<br>
-<br>
-lea r0, [r0 + 2 * r1]<br>
-movu m5, [r0]<br>
-movu m7, [r0 + r1]<br>
-<br>
-punpcklbw m6, m5, m7<br>
-pmaddubsw m6, m0<br>
-paddw m4, m6<br>
-<br>
-punpckhbw m6, m5, m7<br>
-pmaddubsw m6, m0<br>
-paddw m2, m6<br>
-<br>
-mova m6, [pw_512]<br>
-<br>
-pmulhrsw m4, m6<br>
-pmulhrsw m2, m6<br>
-<br>
-packuswb m4, m2<br>
-<br>
-movh [r2], m4<br>
-pextrd [r2 + 8], m4, 2<br>
-<br>
-punpcklbw m4, m3, m5<br>
-punpckhbw m3, m5<br>
-<br>
-pmaddubsw m4, m1<br>
-pmaddubsw m3, m1<br>
-<br>
-movu m5, [r0 + 2 * r1]<br>
-<br>
-punpcklbw m2, m7, m5<br>
-punpckhbw m7, m5<br>
-<br>
-pmaddubsw m2, m0<br>
-pmaddubsw m7, m0<br>
-<br>
-paddw m4, m2<br>
-paddw m3, m7<br>
-<br>
-pmulhrsw m4, m6<br>
-pmulhrsw m3, m6<br>
-<br>
-packuswb m4, m3<br>
-<br>
-movh [r2 + r3], m4<br>
-pextrd [r2 + r3 + 8], m4, 2<br>
-<br>
-lea r2, [r2 + 2 * r3]<br>
-<br>
-sub r4, 2<br>
-jnz .loop<br>
-RET<br>
-%endmacro<br>
-<br>
-FILTER_V4_W12_H2 12, 16<br>
-<br>
-FILTER_V4_W12_H2 12, 32<br>
-<br>
-;-----------------------------------------------------------------------------<br>
-; void interp_4tap_vert_pp_16x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-----------------------------------------------------------------------------<br>
-%macro FILTER_V4_W16_H2 2<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_vert_pp_16x%2, 4, 6, 8<br>
-<br>
-mov r4d, r4m<br>
-sub r0, r1<br>
-<br>
-%ifdef PIC<br>
-lea r5, [tab_ChromaCoeff]<br>
-movd m0, [r5 + r4 * 4]<br>
-%else<br>
-movd m0, [tab_ChromaCoeff + r4 * 4]<br>
-%endif<br>
-<br>
-pshufb m1, m0, [tab_Vm]<br>
-pshufb m0, [tab_Vm + 16]<br>
-<br>
-mov r4d, %2/2<br>
-<br>
-.loop:<br>
-movu m2, [r0]<br>
-movu m3, [r0 + r1]<br>
-<br>
-punpcklbw m4, m2, m3<br>
-punpckhbw m2, m3<br>
-<br>
-pmaddubsw m4, m1<br>
-pmaddubsw m2, m1<br>
-<br>
-lea r0, [r0 + 2 * r1]<br>
-movu m5, [r0]<br>
-movu m6, [r0 + r1]<br>
-<br>
-punpckhbw m7, m5, m6<br>
-pmaddubsw m7, m0<br>
-paddw m2, m7<br>
-<br>
-punpcklbw m7, m5, m6<br>
-pmaddubsw m7, m0<br>
-paddw m4, m7<br>
-<br>
-mova m7, [pw_512]<br>
-<br>
-pmulhrsw m4, m7<br>
-pmulhrsw m2, m7<br>
-<br>
-packuswb m4, m2<br>
-<br>
-movu [r2], m4<br>
-<br>
-punpcklbw m4, m3, m5<br>
-punpckhbw m3, m5<br>
-<br>
-pmaddubsw m4, m1<br>
-pmaddubsw m3, m1<br>
-<br>
-movu m5, [r0 + 2 * r1]<br>
-<br>
-punpcklbw m2, m6, m5<br>
-punpckhbw m6, m5<br>
-<br>
-pmaddubsw m2, m0<br>
-pmaddubsw m6, m0<br>
-<br>
-paddw m4, m2<br>
-paddw m3, m6<br>
-<br>
-pmulhrsw m4, m7<br>
-pmulhrsw m3, m7<br>
-<br>
-packuswb m4, m3<br>
-<br>
-movu [r2 + r3], m4<br>
-<br>
-lea r2, [r2 + 2 * r3]<br>
-<br>
-dec r4d<br>
-jnz .loop<br>
-RET<br>
-%endmacro<br>
-<br>
-FILTER_V4_W16_H2 16, 4<br>
-FILTER_V4_W16_H2 16, 8<br>
-FILTER_V4_W16_H2 16, 12<br>
-FILTER_V4_W16_H2 16, 16<br>
-FILTER_V4_W16_H2 16, 32<br>
-<br>
-FILTER_V4_W16_H2 16, 24<br>
-FILTER_V4_W16_H2 16, 64<br>
-<br>
-;-----------------------------------------------------------------------------<br>
-;void interp_4tap_vert_pp_24x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-----------------------------------------------------------------------------<br>
-%macro FILTER_V4_W24 2<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_vert_pp_24x%2, 4, 6, 8<br>
-<br>
-mov r4d, r4m<br>
-sub r0, r1<br>
-<br>
-%ifdef PIC<br>
-lea r5, [tab_ChromaCoeff]<br>
-movd m0, [r5 + r4 * 4]<br>
-%else<br>
-movd m0, [tab_ChromaCoeff + r4 * 4]<br>
-%endif<br>
-<br>
-pshufb m1, m0, [tab_Vm]<br>
-pshufb m0, [tab_Vm + 16]<br>
-<br>
-mov r4d, %2<br>
-<br>
-.loop:<br>
-movu m2, [r0]<br>
-movu m3, [r0 + r1]<br>
-<br>
-punpcklbw m4, m2, m3<br>
-punpckhbw m2, m3<br>
-<br>
-pmaddubsw m4, m1<br>
-pmaddubsw m2, m1<br>
-<br>
-lea r5, [r0 + 2 * r1]<br>
-movu m5, [r5]<br>
-movu m7, [r5 + r1]<br>
-<br>
-punpcklbw m6, m5, m7<br>
-pmaddubsw m6, m0<br>
-paddw m4, m6<br>
-<br>
-punpckhbw m6, m5, m7<br>
-pmaddubsw m6, m0<br>
-paddw m2, m6<br>
-<br>
-mova m6, [pw_512]<br>
-<br>
-pmulhrsw m4, m6<br>
-pmulhrsw m2, m6<br>
-<br>
-packuswb m4, m2<br>
-<br>
-movu [r2], m4<br>
-<br>
-punpcklbw m4, m3, m5<br>
-punpckhbw m3, m5<br>
-<br>
-pmaddubsw m4, m1<br>
-pmaddubsw m3, m1<br>
-<br>
-movu m2, [r5 + 2 * r1]<br>
-<br>
-punpcklbw m5, m7, m2<br>
-punpckhbw m7, m2<br>
-<br>
-pmaddubsw m5, m0<br>
-pmaddubsw m7, m0<br>
-<br>
-paddw m4, m5<br>
-paddw m3, m7<br>
-<br>
-pmulhrsw m4, m6<br>
-pmulhrsw m3, m6<br>
-<br>
-packuswb m4, m3<br>
-<br>
-movu [r2 + r3], m4<br>
-<br>
-movq m2, [r0 + 16]<br>
-movq m3, [r0 + r1 + 16]<br>
-movq m4, [r5 + 16]<br>
-movq m5, [r5 + r1 + 16]<br>
-<br>
-punpcklbw m2, m3<br>
-punpcklbw m4, m5<br>
-<br>
-pmaddubsw m2, m1<br>
-pmaddubsw m4, m0<br>
-<br>
-paddw m2, m4<br>
-<br>
-pmulhrsw m2, m6<br>
-<br>
-movq m3, [r0 + r1 + 16]<br>
-movq m4, [r5 + 16]<br>
-movq m5, [r5 + r1 + 16]<br>
-movq m7, [r5 + 2 * r1 + 16]<br>
-<br>
-punpcklbw m3, m4<br>
-punpcklbw m5, m7<br>
-<br>
-pmaddubsw m3, m1<br>
-pmaddubsw m5, m0<br>
-<br>
-paddw m3, m5<br>
-<br>
-pmulhrsw m3, m6<br>
-packuswb m2, m3<br>
-<br>
-movh [r2 + 16], m2<br>
-movhps [r2 + r3 + 16], m2<br>
-<br>
-mov r0, r5<br>
-lea r2, [r2 + 2 * r3]<br>
-<br>
-sub r4, 2<br>
-jnz .loop<br>
-RET<br>
-%endmacro<br>
-<br>
-FILTER_V4_W24 24, 32<br>
-<br>
-FILTER_V4_W24 24, 64<br>
-<br>
-;-----------------------------------------------------------------------------<br>
-; void interp_4tap_vert_pp_32x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-----------------------------------------------------------------------------<br>
-%macro FILTER_V4_W32 2<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8<br>
-<br>
-mov r4d, r4m<br>
-sub r0, r1<br>
-<br>
-%ifdef PIC<br>
-lea r5, [tab_ChromaCoeff]<br>
-movd m0, [r5 + r4 * 4]<br>
-%else<br>
-movd m0, [tab_ChromaCoeff + r4 * 4]<br>
-%endif<br>
-<br>
-pshufb m1, m0, [tab_Vm]<br>
-pshufb m0, [tab_Vm + 16]<br>
-<br>
-mova m7, [pw_512]<br>
-<br>
-mov r4d, %2<br>
-<br>
-.loop:<br>
-movu m2, [r0]<br>
-movu m3, [r0 + r1]<br>
-<br>
-punpcklbw m4, m2, m3<br>
-punpckhbw m2, m3<br>
-<br>
-pmaddubsw m4, m1<br>
-pmaddubsw m2, m1<br>
-<br>
-lea r5, [r0 + 2 * r1]<br>
-movu m3, [r5]<br>
-movu m5, [r5 + r1]<br>
-<br>
-punpcklbw m6, m3, m5<br>
-punpckhbw m3, m5<br>
-<br>
-pmaddubsw m6, m0<br>
-pmaddubsw m3, m0<br>
-<br>
-paddw m4, m6<br>
-paddw m2, m3<br>
-<br>
-pmulhrsw m4, m7<br>
-pmulhrsw m2, m7<br>
-<br>
-packuswb m4, m2<br>
-<br>
-movu [r2], m4<br>
-<br>
-movu m2, [r0 + 16]<br>
-movu m3, [r0 + r1 + 16]<br>
-<br>
-punpcklbw m4, m2, m3<br>
-punpckhbw m2, m3<br>
-<br>
-pmaddubsw m4, m1<br>
-pmaddubsw m2, m1<br>
-<br>
-movu m3, [r5 + 16]<br>
-movu m5, [r5 + r1 + 16]<br>
-<br>
-punpcklbw m6, m3, m5<br>
-punpckhbw m3, m5<br>
-<br>
-pmaddubsw m6, m0<br>
-pmaddubsw m3, m0<br>
-<br>
-paddw m4, m6<br>
-paddw m2, m3<br>
-<br>
-pmulhrsw m4, m7<br>
-pmulhrsw m2, m7<br>
-<br>
-packuswb m4, m2<br>
-<br>
-movu [r2 + 16], m4<br>
-<br>
-lea r0, [r0 + r1]<br>
-lea r2, [r2 + r3]<br>
-<br>
-dec r4<br>
-jnz .loop<br>
-RET<br>
-%endmacro<br>
-<br>
-FILTER_V4_W32 32, 8<br>
-FILTER_V4_W32 32, 16<br>
-FILTER_V4_W32 32, 24<br>
-FILTER_V4_W32 32, 32<br>
-<br>
-FILTER_V4_W32 32, 48<br>
-FILTER_V4_W32 32, 64<br>
-<br>
-<br>
-;-----------------------------------------------------------------------------<br>
-; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-----------------------------------------------------------------------------<br>
-%macro FILTER_V4_W16n_H2 2<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8<br>
-<br>
-mov r4d, r4m<br>
-sub r0, r1<br>
-<br>
-%ifdef PIC<br>
-lea r5, [tab_ChromaCoeff]<br>
-movd m0, [r5 + r4 * 4]<br>
-%else<br>
-movd m0, [tab_ChromaCoeff + r4 * 4]<br>
-%endif<br>
-<br>
-pshufb m1, m0, [tab_Vm]<br>
-pshufb m0, [tab_Vm + 16]<br>
-<br>
-mov r4d, %2/2<br>
-<br>
-.loop:<br>
-<br>
-mov r6d, %1/16<br>
-<br>
-.loopW:<br>
-<br>
-movu m2, [r0]<br>
-movu m3, [r0 + r1]<br>
-<br>
-punpcklbw m4, m2, m3<br>
-punpckhbw m2, m3<br>
-<br>
-pmaddubsw m4, m1<br>
-pmaddubsw m2, m1<br>
-<br>
-lea r5, [r0 + 2 * r1]<br>
-movu m5, [r5]<br>
-movu m6, [r5 + r1]<br>
-<br>
-punpckhbw m7, m5, m6<br>
-pmaddubsw m7, m0<br>
-paddw m2, m7<br>
-<br>
-punpcklbw m7, m5, m6<br>
-pmaddubsw m7, m0<br>
-paddw m4, m7<br>
-<br>
-mova m7, [pw_512]<br>
-<br>
-pmulhrsw m4, m7<br>
-pmulhrsw m2, m7<br>
-<br>
-packuswb m4, m2<br>
-<br>
-movu [r2], m4<br>
-<br>
-punpcklbw m4, m3, m5<br>
-punpckhbw m3, m5<br>
-<br>
-pmaddubsw m4, m1<br>
-pmaddubsw m3, m1<br>
-<br>
-movu m5, [r5 + 2 * r1]<br>
-<br>
-punpcklbw m2, m6, m5<br>
-punpckhbw m6, m5<br>
-<br>
-pmaddubsw m2, m0<br>
-pmaddubsw m6, m0<br>
-<br>
-paddw m4, m2<br>
-paddw m3, m6<br>
-<br>
-pmulhrsw m4, m7<br>
-pmulhrsw m3, m7<br>
-<br>
-packuswb m4, m3<br>
-<br>
-movu [r2 + r3], m4<br>
-<br>
-add r0, 16<br>
-add r2, 16<br>
-dec r6d<br>
-jnz .loopW<br>
-<br>
-lea r0, [r0 + r1 * 2 - %1]<br>
-lea r2, [r2 + r3 * 2 - %1]<br>
-<br>
-dec r4d<br>
-jnz .loop<br>
-RET<br>
-%endmacro<br>
-<br>
-FILTER_V4_W16n_H2 64, 64<br>
-FILTER_V4_W16n_H2 64, 32<br>
-FILTER_V4_W16n_H2 64, 48<br>
-FILTER_V4_W16n_H2 48, 64<br>
-FILTER_V4_W16n_H2 64, 16<br>
-<br>
-<br>
-;-----------------------------------------------------------------------------<br>
-; void filterConvertPelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)<br>
-;-----------------------------------------------------------------------------<br>
-INIT_XMM ssse3<br>
-cglobal luma_p2s, 3, 7, 6<br>
-<br>
- ; load width and height<br>
- mov r3d, r3m<br>
- mov r4d, r4m<br>
-<br>
- ; load constant<br>
- mova m4, [pb_128]<br>
- mova m5, [tab_c_64_n64]<br>
-<br>
-.loopH:<br>
-<br>
- xor r5d, r5d<br>
-.loopW:<br>
- lea r6, [r0 + r5]<br>
-<br>
- movh m0, [r6]<br>
- punpcklbw m0, m4<br>
- pmaddubsw m0, m5<br>
-<br>
- movh m1, [r6 + r1]<br>
- punpcklbw m1, m4<br>
- pmaddubsw m1, m5<br>
-<br>
- movh m2, [r6 + r1 * 2]<br>
- punpcklbw m2, m4<br>
- pmaddubsw m2, m5<br>
-<br>
- lea r6, [r6 + r1 * 2]<br>
- movh m3, [r6 + r1]<br>
- punpcklbw m3, m4<br>
- pmaddubsw m3, m5<br>
-<br>
- add r5, 8<br>
- cmp r5, r3<br>
- jg .width4<br>
- movu [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0<br>
- movu [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1<br>
- movu [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2<br>
- movu [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3<br>
- je .nextH<br>
- jmp .loopW<br>
-<br>
-.width4:<br>
- movh [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0<br>
- movh [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1<br>
- movh [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2<br>
- movh [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3<br>
-<br>
-.nextH:<br>
- lea r0, [r0 + r1 * 4]<br>
- add r2, FENC_STRIDE * 8<br>
-<br>
- sub r4d, 4<br>
- jnz .loopH<br>
-<br>
- RET<br>
-<br>
-%macro PROCESS_LUMA_W4_4R 0<br>
- movd m0, [r0]<br>
- movd m1, [r0 + r1]<br>
- punpcklbw m2, m0, m1 ; m2=[0 1]<br>
-<br>
- lea r0, [r0 + 2 * r1]<br>
- movd m0, [r0]<br>
- punpcklbw m1, m0 ; m1=[1 2]<br>
- punpcklqdq m2, m1 ; m2=[0 1 1 2]<br>
- pmaddubsw m4, m2, [r6 + 0 * 16] ; m4=[0+1 1+2]<br>
-<br>
- movd m1, [r0 + r1]<br>
- punpcklbw m5, m0, m1 ; m2=[2 3]<br>
- lea r0, [r0 + 2 * r1]<br>
- movd m0, [r0]<br>
- punpcklbw m1, m0 ; m1=[3 4]<br>
- punpcklqdq m5, m1 ; m5=[2 3 3 4]<br>
- pmaddubsw m2, m5, [r6 + 1 * 16] ; m2=[2+3 3+4]<br>
- paddw m4, m2 ; m4=[0+1+2+3 1+2+3+4] Row1-2<br>
- pmaddubsw m5, [r6 + 0 * 16] ; m5=[2+3 3+4] Row3-4<br>
-<br>
- movd m1, [r0 + r1]<br>
- punpcklbw m2, m0, m1 ; m2=[4 5]<br>
- lea r0, [r0 + 2 * r1]<br>
- movd m0, [r0]<br>
- punpcklbw m1, m0 ; m1=[5 6]<br>
- punpcklqdq m2, m1 ; m2=[4 5 5 6]<br>
- pmaddubsw m1, m2, [r6 + 2 * 16] ; m1=[4+5 5+6]<br>
- paddw m4, m1 ; m4=[0+1+2+3+4+5 1+2+3+4+5+6] Row1-2<br>
- pmaddubsw m2, [r6 + 1 * 16] ; m2=[4+5 5+6]<br>
- paddw m5, m2 ; m5=[2+3+4+5 3+4+5+6] Row3-4<br>
-<br>
- movd m1, [r0 + r1]<br>
- punpcklbw m2, m0, m1 ; m2=[6 7]<br>
- lea r0, [r0 + 2 * r1]<br>
- movd m0, [r0]<br>
- punpcklbw m1, m0 ; m1=[7 8]<br>
- punpcklqdq m2, m1 ; m2=[6 7 7 8]<br>
- pmaddubsw m1, m2, [r6 + 3 * 16] ; m1=[6+7 7+8]<br>
- paddw m4, m1 ; m4=[0+1+2+3+4+5+6+7 1+2+3+4+5+6+7+8] Row1-2 end<br>
- pmaddubsw m2, [r6 + 2 * 16] ; m2=[6+7 7+8]<br>
- paddw m5, m2 ; m5=[2+3+4+5+6+7 3+4+5+6+7+8] Row3-4<br>
-<br>
- movd m1, [r0 + r1]<br>
- punpcklbw m2, m0, m1 ; m2=[8 9]<br>
- movd m0, [r0 + 2 * r1]<br>
- punpcklbw m1, m0 ; m1=[9 10]<br>
- punpcklqdq m2, m1 ; m2=[8 9 9 10]<br>
- pmaddubsw m2, [r6 + 3 * 16] ; m2=[8+9 9+10]<br>
- paddw m5, m2 ; m5=[2+3+4+5+6+7+8+9 3+4+5+6+7+8+9+10] Row3-4 end<br>
-%endmacro<br>
-<br>
-%macro PROCESS_LUMA_W8_4R 0<br>
- movq m0, [r0]<br>
- movq m1, [r0 + r1]<br>
- punpcklbw m0, m1<br>
- pmaddubsw m7, m0, [r6 + 0 *16] ;m7=[0+1] Row1<br>
-<br>
- lea r0, [r0 + 2 * r1]<br>
- movq m0, [r0]<br>
- punpcklbw m1, m0<br>
- pmaddubsw m6, m1, [r6 + 0 *16] ;m6=[1+2] Row2<br>
-<br>
- movq m1, [r0 + r1]<br>
- punpcklbw m0, m1<br>
- pmaddubsw m5, m0, [r6 + 0 *16] ;m5=[2+3] Row3<br>
- pmaddubsw m0, [r6 + 1 * 16]<br>
- paddw m7, m0 ;m7=[0+1+2+3] Row1<br>
-<br>
- lea r0, [r0 + 2 * r1]<br>
- movq m0, [r0]<br>
- punpcklbw m1, m0<br>
- pmaddubsw m4, m1, [r6 + 0 *16] ;m4=[3+4] Row4<br>
- pmaddubsw m1, [r6 + 1 * 16]<br>
- paddw m6, m1 ;m6 = [1+2+3+4] Row2<br>
-<br>
- movq m1, [r0 + r1]<br>
- punpcklbw m0, m1<br>
- pmaddubsw m2, m0, [r6 + 1 * 16]<br>
- pmaddubsw m0, [r6 + 2 * 16]<br>
- paddw m7, m0 ;m7=[0+1+2+3+4+5] Row1<br>
- paddw m5, m2 ;m5=[2+3+4+5] Row3<br>
-<br>
- lea r0, [r0 + 2 * r1]<br>
- movq m0, [r0]<br>
- punpcklbw m1, m0<br>
- pmaddubsw m2, m1, [r6 + 1 * 16]<br>
- pmaddubsw m1, [r6 + 2 * 16]<br>
- paddw m6, m1 ;m6=[1+2+3+4+5+6] Row2<br>
- paddw m4, m2 ;m4=[3+4+5+6] Row4<br>
-<br>
- movq m1, [r0 + r1]<br>
- punpcklbw m0, m1<br>
- pmaddubsw m2, m0, [r6 + 2 * 16]<br>
- pmaddubsw m0, [r6 + 3 * 16]<br>
- paddw m7, m0 ;m7=[0+1+2+3+4+5+6+7] Row1 end<br>
- paddw m5, m2 ;m5=[2+3+4+5+6+7] Row3<br>
-<br>
- lea r0, [r0 + 2 * r1]<br>
- movq m0, [r0]<br>
- punpcklbw m1, m0<br>
- pmaddubsw m2, m1, [r6 + 2 * 16]<br>
- pmaddubsw m1, [r6 + 3 * 16]<br>
- paddw m6, m1 ;m6=[1+2+3+4+5+6+7+8] Row2 end<br>
- paddw m4, m2 ;m4=[3+4+5+6+7+8] Row4<br>
-<br>
- movq m1, [r0 + r1]<br>
- punpcklbw m0, m1<br>
- pmaddubsw m0, [r6 + 3 * 16]<br>
- paddw m5, m0 ;m5=[2+3+4+5+6+7+8+9] Row3 end<br>
-<br>
- movq m0, [r0 + 2 * r1]<br>
- punpcklbw m1, m0<br>
- pmaddubsw m1, [r6 + 3 * 16]<br>
- paddw m4, m1 ;m4=[3+4+5+6+7+8+9+10] Row4 end<br>
-%endmacro<br>
-<br>
-;-------------------------------------------------------------------------------------------------------------<br>
-; void interp_8tap_vert_%3_4x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-------------------------------------------------------------------------------------------------------------<br>
-%macro FILTER_VER_LUMA_4xN 3<br>
-INIT_XMM sse4<br>
-cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 6<br>
- lea r5, [3 * r1]<br>
- sub r0, r5<br>
- shl r4d, 6<br>
-%ifidn %3,ps<br>
- add r3d, r3d<br>
-%endif<br>
-<br>
-%ifdef PIC<br>
- lea r5, [tab_LumaCoeffVer]<br>
- lea r6, [r5 + r4]<br>
-%else<br>
- lea r6, [tab_LumaCoeffVer + r4]<br>
-%endif<br>
-<br>
-%ifidn %3,pp<br>
- mova m3, [pw_512]<br>
-%else<br>
- mova m3, [pw_2000]<br>
-%endif<br>
-<br>
- mov r4d, %2/4<br>
- lea r5, [4 * r1]<br>
-<br>
-.loopH:<br>
- PROCESS_LUMA_W4_4R<br>
-<br>
-%ifidn %3,pp<br>
- pmulhrsw m4, m3<br>
- pmulhrsw m5, m3<br>
-<br>
- packuswb m4, m5<br>
-<br>
- movd [r2], m4<br>
- pextrd [r2 + r3], m4, 1<br>
- lea r2, [r2 + 2 * r3]<br>
- pextrd [r2], m4, 2<br>
- pextrd [r2 + r3], m4, 3<br>
-%else<br>
- psubw m4, m3<br>
- psubw m5, m3<br>
-<br>
- movlps [r2], m4<br>
- movhps [r2 + r3], m4<br>
- lea r2, [r2 + 2 * r3]<br>
- movlps [r2], m5<br>
- movhps [r2 + r3], m5<br>
-%endif<br>
-<br>
- sub r0, r5<br>
- lea r2, [r2 + 2 * r3]<br>
-<br>
- dec r4d<br>
- jnz .loopH<br>
-<br>
- RET<br>
-%endmacro<br>
-<br>
-<br>
-INIT_YMM avx2<br>
-cglobal interp_8tap_vert_pp_4x4, 4,6,8<br>
- mov r4d, r4m<br>
- lea r5, [r1 * 3]<br>
- sub r0, r5<br>
-<br>
- ; TODO: VPGATHERDD<br>
- movd xm1, [r0] ; m1 = row0<br>
- movd xm2, [r0 + r1] ; m2 = row1<br>
- punpcklbw xm1, xm2 ; m1 = [13 03 12 02 11 01 10 00]<br>
-<br>
- movd xm3, [r0 + r1 * 2] ; m3 = row2<br>
- punpcklbw xm2, xm3 ; m2 = [23 13 22 12 21 11 20 10]<br>
- movd xm4, [r0 + r5]<br>
- punpcklbw xm3, xm4 ; m3 = [33 23 32 22 31 21 30 20]<br>
- punpcklwd xm1, xm3 ; m1 = [33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00]<br>
-<br>
- lea r0, [r0 + r1 * 4]<br>
- movd xm5, [r0] ; m5 = row4<br>
- punpcklbw xm4, xm5 ; m4 = [43 33 42 32 41 31 40 30]<br>
- punpcklwd xm2, xm4 ; m2 = [43 33 21 13 42 32 22 12 41 31 21 11 40 30 20 10]<br>
- vinserti128 m1, m1, xm2, 1 ; m1 = [43 33 21 13 42 32 22 12 41 31 21 11 40 30 20 10] - [33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00]<br>
- movd xm2, [r0 + r1] ; m2 = row5<br>
- punpcklbw xm5, xm2 ; m5 = [53 43 52 42 51 41 50 40]<br>
- punpcklwd xm3, xm5 ; m3 = [53 43 44 23 52 42 32 22 51 41 31 21 50 40 30 20]<br>
- movd xm6, [r0 + r1 * 2] ; m6 = row6<br>
- punpcklbw xm2, xm6 ; m2 = [63 53 62 52 61 51 60 50]<br>
- punpcklwd xm4, xm2 ; m4 = [63 53 43 33 62 52 42 32 61 51 41 31 60 50 40 30]<br>
- vinserti128 m3, m3, xm4, 1 ; m3 = [63 53 43 33 62 52 42 32 61 51 41 31 60 50 40 30] - [53 43 44 23 52 42 32 22 51 41 31 21 50 40 30 20]<br>
- movd xm4, [r0 + r5] ; m4 = row7<br>
- punpcklbw xm6, xm4 ; m6 = [73 63 72 62 71 61 70 60]<br>
- punpcklwd xm5, xm6 ; m5 = [73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40]<br>
-<br>
- lea r0, [r0 + r1 * 4]<br>
- movd xm7, [r0] ; m7 = row8<br>
- punpcklbw xm4, xm7 ; m4 = [83 73 82 72 81 71 80 70]<br>
- punpcklwd xm2, xm4 ; m2 = [83 73 63 53 82 72 62 52 81 71 61 51 80 70 60 50]<br>
- vinserti128 m5, m5, xm2, 1 ; m5 = [83 73 63 53 82 72 62 52 81 71 61 51 80 70 60 50] - [73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40]<br>
- movd xm2, [r0 + r1] ; m2 = row9<br>
- punpcklbw xm7, xm2 ; m7 = [93 83 92 82 91 81 90 80]<br>
- punpcklwd xm6, xm7 ; m6 = [93 83 73 63 92 82 72 62 91 81 71 61 90 80 70 60]<br>
- movd xm7, [r0 + r1 * 2] ; m7 = rowA<br>
- punpcklbw xm2, xm7 ; m2 = [A3 93 A2 92 A1 91 A0 90]<br>
- punpcklwd xm4, xm2 ; m4 = [A3 93 83 73 A2 92 82 72 A1 91 81 71 A0 90 80 70]<br>
- vinserti128 m6, m6, xm4, 1 ; m6 = [A3 93 83 73 A2 92 82 72 A1 91 81 71 A0 90 80 70] - [93 83 73 63 92 82 72 62 91 81 71 61 90 80 70 60]<br>
-<br>
- ; load filter coeff<br>
-%ifdef PIC<br>
- lea r5, [tab_LumaCoeff]<br>
- vpbroadcastd m0, [r5 + r4 * 8 + 0]<br>
- vpbroadcastd m2, [r5 + r4 * 8 + 4]<br>
-%else<br>
- vpbroadcastd m0, [tab_LumaCoeff + r4 * 8 + 0]<br>
- vpbroadcastd m2, [tab_LumaCoeff + r4 * 8 + 4]<br>
-%endif<br>
-<br>
- pmaddubsw m1, m0<br>
- pmaddubsw m3, m0<br>
- pmaddubsw m5, m2<br>
- pmaddubsw m6, m2<br>
- vbroadcasti128 m0, [pw_1]<br>
- pmaddwd m1, m0<br>
- pmaddwd m3, m0<br>
- pmaddwd m5, m0<br>
- pmaddwd m6, m0<br>
- paddd m1, m5 ; m1 = DQWORD ROW[1 0]<br>
- paddd m3, m6 ; m3 = DQWORD ROW[3 2]<br>
- packssdw m1, m3 ; m1 = QWORD ROW[3 1 2 0]<br>
-<br>
- ; TODO: does it overflow?<br>
- pmulhrsw m1, [pw_512]<br>
- vextracti128 xm2, m1, 1<br>
- packuswb xm1, xm2 ; m1 = DWORD ROW[3 1 2 0]<br>
- movd [r2], xm1<br>
- pextrd [r2 + r3], xm1, 2<br>
- pextrd [r2 + r3 * 2], xm1, 1<br>
- lea r4, [r3 * 3]<br>
- pextrd [r2 + r4], xm1, 3<br>
- RET<br>
-<br>
-<br>
-;-------------------------------------------------------------------------------------------------------------<br>
-; void interp_8tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-------------------------------------------------------------------------------------------------------------<br>
-FILTER_VER_LUMA_4xN 4, 4, pp<br>
-<br>
-;-------------------------------------------------------------------------------------------------------------<br>
-; void interp_8tap_vert_pp_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-------------------------------------------------------------------------------------------------------------<br>
-FILTER_VER_LUMA_4xN 4, 8, pp<br>
-<br>
-;-------------------------------------------------------------------------------------------------------------<br>
-; void interp_8tap_vert_pp_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-------------------------------------------------------------------------------------------------------------<br>
-FILTER_VER_LUMA_4xN 4, 16, pp<br>
-<br>
-;-------------------------------------------------------------------------------------------------------------<br>
-; void interp_8tap_vert_ps_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-------------------------------------------------------------------------------------------------------------<br>
-FILTER_VER_LUMA_4xN 4, 4, ps<br>
-<br>
-;-------------------------------------------------------------------------------------------------------------<br>
-; void interp_8tap_vert_ps_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-------------------------------------------------------------------------------------------------------------<br>
-FILTER_VER_LUMA_4xN 4, 8, ps<br>
-<br>
-;-------------------------------------------------------------------------------------------------------------<br>
-; void interp_8tap_vert_ps_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-------------------------------------------------------------------------------------------------------------<br>
-FILTER_VER_LUMA_4xN 4, 16, ps<br>
-<br>
-;-------------------------------------------------------------------------------------------------------------<br>
-; void interp_8tap_vert_%3_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-------------------------------------------------------------------------------------------------------------<br>
-%macro FILTER_VER_LUMA_8xN 3<br>
-INIT_XMM sse4<br>
-cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8<br>
- lea r5, [3 * r1]<br>
- sub r0, r5<br>
- shl r4d, 6<br>
-<br>
-%ifidn %3,ps<br>
- add r3d, r3d<br>
-%endif<br>
-<br>
-%ifdef PIC<br>
- lea r5, [tab_LumaCoeffVer]<br>
- lea r6, [r5 + r4]<br>
-%else<br>
- lea r6, [tab_LumaCoeffVer + r4]<br>
-%endif<br>
-<br>
- %ifidn %3,pp<br>
- mova m3, [pw_512]<br>
-%else<br>
- mova m3, [pw_2000]<br>
-%endif<br>
-<br>
- mov r4d, %2/4<br>
- lea r5, [4 * r1]<br>
-<br>
-.loopH:<br>
- PROCESS_LUMA_W8_4R<br>
-<br>
-%ifidn %3,pp<br>
- pmulhrsw m7, m3<br>
- pmulhrsw m6, m3<br>
- pmulhrsw m5, m3<br>
- pmulhrsw m4, m3<br>
-<br>
- packuswb m7, m6<br>
- packuswb m5, m4<br>
-<br>
- movlps [r2], m7<br>
- movhps [r2 + r3], m7<br>
- lea r2, [r2 + 2 * r3]<br>
- movlps [r2], m5<br>
- movhps [r2 + r3], m5<br>
-%else<br>
- psubw m7, m3<br>
- psubw m6, m3<br>
- psubw m5, m3<br>
- psubw m4, m3<br>
-<br>
- movu [r2], m7<br>
- movu [r2 + r3], m6<br>
- lea r2, [r2 + 2 * r3]<br>
- movu [r2], m5<br>
- movu [r2 + r3], m4<br>
-%endif<br>
-<br>
- sub r0, r5<br>
- lea r2, [r2 + 2 * r3]<br>
-<br>
- dec r4d<br>
- jnz .loopH<br>
-<br>
- RET<br>
-%endmacro<br>
-<br>
-;-------------------------------------------------------------------------------------------------------------<br>
-; void interp_8tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-------------------------------------------------------------------------------------------------------------<br>
-FILTER_VER_LUMA_8xN 8, 4, pp<br>
-<br>
-;-------------------------------------------------------------------------------------------------------------<br>
-; void interp_8tap_vert_pp_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-------------------------------------------------------------------------------------------------------------<br>
-FILTER_VER_LUMA_8xN 8, 8, pp<br>
-<br>
-;-------------------------------------------------------------------------------------------------------------<br>
-; void interp_8tap_vert_pp_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-------------------------------------------------------------------------------------------------------------<br>
-FILTER_VER_LUMA_8xN 8, 16, pp<br>
-<br>
-;-------------------------------------------------------------------------------------------------------------<br>
-; void interp_8tap_vert_pp_8x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-------------------------------------------------------------------------------------------------------------<br>
-FILTER_VER_LUMA_8xN 8, 32, pp<br>
-<br>
-;-------------------------------------------------------------------------------------------------------------<br>
-; void interp_8tap_vert_ps_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-------------------------------------------------------------------------------------------------------------<br>
-FILTER_VER_LUMA_8xN 8, 4, ps<br>
-<br>
-;-------------------------------------------------------------------------------------------------------------<br>
-; void interp_8tap_vert_ps_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-------------------------------------------------------------------------------------------------------------<br>
-FILTER_VER_LUMA_8xN 8, 8, ps<br>
-<br>
-;-------------------------------------------------------------------------------------------------------------<br>
-; void interp_8tap_vert_ps_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-------------------------------------------------------------------------------------------------------------<br>
-FILTER_VER_LUMA_8xN 8, 16, ps<br>
-<br>
-;-------------------------------------------------------------------------------------------------------------<br>
-; void interp_8tap_vert_ps_8x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-------------------------------------------------------------------------------------------------------------<br>
-FILTER_VER_LUMA_8xN 8, 32, ps<br>
-<br>
-;-------------------------------------------------------------------------------------------------------------<br>
-; void interp_8tap_vert_%3_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-------------------------------------------------------------------------------------------------------------<br>
-%macro FILTER_VER_LUMA_12xN 3<br>
-INIT_XMM sse4<br>
-cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8<br>
- lea r5, [3 * r1]<br>
- sub r0, r5<br>
- shl r4d, 6<br>
-%ifidn %3,ps<br>
- add r3d, r3d<br>
-%endif<br>
-<br>
-%ifdef PIC<br>
- lea r5, [tab_LumaCoeffVer]<br>
- lea r6, [r5 + r4]<br>
-%else<br>
- lea r6, [tab_LumaCoeffVer + r4]<br>
-%endif<br>
-<br>
- %ifidn %3,pp<br>
- mova m3, [pw_512]<br>
-%else<br>
- mova m3, [pw_2000]<br>
-%endif<br>
-<br>
- mov r4d, %2/4<br>
-<br>
-.loopH:<br>
- PROCESS_LUMA_W8_4R<br>
-<br>
-%ifidn %3,pp<br>
- pmulhrsw m7, m3<br>
- pmulhrsw m6, m3<br>
- pmulhrsw m5, m3<br>
- pmulhrsw m4, m3<br>
-<br>
- packuswb m7, m6<br>
- packuswb m5, m4<br>
-<br>
- movlps [r2], m7<br>
- movhps [r2 + r3], m7<br>
- lea r5, [r2 + 2 * r3]<br>
- movlps [r5], m5<br>
- movhps [r5 + r3], m5<br>
-%else<br>
- psubw m7, m3<br>
- psubw m6, m3<br>
- psubw m5, m3<br>
- psubw m4, m3<br>
-<br>
- movu [r2], m7<br>
- movu [r2 + r3], m6<br>
- lea r5, [r2 + 2 * r3]<br>
- movu [r5], m5<br>
- movu [r5 + r3], m4<br>
-%endif<br>
-<br>
- lea r5, [8 * r1 - 8]<br>
- sub r0, r5<br>
-%ifidn %3,pp<br>
- add r2, 8<br>
-%else<br>
- add r2, 16<br>
-%endif<br>
-<br>
- PROCESS_LUMA_W4_4R<br>
-<br>
-%ifidn %3,pp<br>
- pmulhrsw m4, m3<br>
- pmulhrsw m5, m3<br>
-<br>
- packuswb m4, m5<br>
-<br>
- movd [r2], m4<br>
- pextrd [r2 + r3], m4, 1<br>
- lea r5, [r2 + 2 * r3]<br>
- pextrd [r5], m4, 2<br>
- pextrd [r5 + r3], m4, 3<br>
-%else<br>
- psubw m4, m3<br>
- psubw m5, m3<br>
-<br>
- movlps [r2], m4<br>
- movhps [r2 + r3], m4<br>
- lea r5, [r2 + 2 * r3]<br>
- movlps [r5], m5<br>
- movhps [r5 + r3], m5<br>
-%endif<br>
-<br>
- lea r5, [4 * r1 + 8]<br>
- sub r0, r5<br>
-%ifidn %3,pp<br>
- lea r2, [r2 + 4 * r3 - 8]<br>
-%else<br>
- lea r2, [r2 + 4 * r3 - 16]<br>
-%endif<br>
-<br>
- dec r4d<br>
- jnz .loopH<br>
-<br>
- RET<br>
-%endmacro<br>
-<br>
-;-------------------------------------------------------------------------------------------------------------<br>
-; void interp_8tap_vert_pp_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-------------------------------------------------------------------------------------------------------------<br>
-FILTER_VER_LUMA_12xN 12, 16, pp<br>
-<br>
-;-------------------------------------------------------------------------------------------------------------<br>
-; void interp_8tap_vert_ps_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-------------------------------------------------------------------------------------------------------------<br>
-FILTER_VER_LUMA_12xN 12, 16, ps<br>
-<br>
-;-------------------------------------------------------------------------------------------------------------<br>
-; void interp_8tap_vert_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-------------------------------------------------------------------------------------------------------------<br>
-%macro FILTER_VER_LUMA 3<br>
-INIT_XMM sse4<br>
-cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8 ,0-gprsize<br>
- lea r5, [3 * r1]<br>
- sub r0, r5<br>
- shl r4d, 6<br>
-%ifidn %3,ps<br>
- add r3d, r3d<br>
-%endif<br>
-<br>
-%ifdef PIC<br>
- lea r5, [tab_LumaCoeffVer]<br>
- lea r6, [r5 + r4]<br>
-%else<br>
- lea r6, [tab_LumaCoeffVer + r4]<br>
-%endif<br>
-<br>
-%ifidn %3,pp<br>
- mova m3, [pw_512]<br>
-%else<br>
- mova m3, [pw_2000]<br>
-%endif<br>
- mov dword [rsp], %2/4<br>
-<br>
-.loopH:<br>
- mov r4d, (%1/8)<br>
-.loopW:<br>
- PROCESS_LUMA_W8_4R<br>
-%ifidn %3,pp<br>
- pmulhrsw m7, m3<br>
- pmulhrsw m6, m3<br>
- pmulhrsw m5, m3<br>
- pmulhrsw m4, m3<br>
-<br>
- packuswb m7, m6<br>
- packuswb m5, m4<br>
-<br>
- movlps [r2], m7<br>
- movhps [r2 + r3], m7<br>
- lea r5, [r2 + 2 * r3]<br>
- movlps [r5], m5<br>
- movhps [r5 + r3], m5<br>
-%else<br>
- psubw m7, m3<br>
- psubw m6, m3<br>
- psubw m5, m3<br>
- psubw m4, m3<br>
-<br>
- movu [r2], m7<br>
- movu [r2 + r3], m6<br>
- lea r5, [r2 + 2 * r3]<br>
- movu [r5], m5<br>
- movu [r5 + r3], m4<br>
-%endif<br>
-<br>
- lea r5, [8 * r1 - 8]<br>
- sub r0, r5<br>
-%ifidn %3,pp<br>
- add r2, 8<br>
-%else<br>
- add r2, 16<br>
-%endif<br>
- dec r4d<br>
- jnz .loopW<br>
-<br>
- lea r0, [r0 + 4 * r1 - %1]<br>
-%ifidn %3,pp<br>
- lea r2, [r2 + 4 * r3 - %1]<br>
-%else<br>
- lea r2, [r2 + 4 * r3 - 2 * %1]<br>
-%endif<br>
-<br>
- dec dword [rsp]<br>
- jnz .loopH<br>
-<br>
- RET<br>
-%endmacro<br>
-<br>
-FILTER_VER_LUMA 16, 4, pp<br>
-FILTER_VER_LUMA 16, 8, pp<br>
-FILTER_VER_LUMA 16, 12, pp<br>
-FILTER_VER_LUMA 16, 16, pp<br>
-FILTER_VER_LUMA 16, 32, pp<br>
-FILTER_VER_LUMA 16, 64, pp<br>
-FILTER_VER_LUMA 24, 32, pp<br>
-FILTER_VER_LUMA 32, 8, pp<br>
-FILTER_VER_LUMA 32, 16, pp<br>
-FILTER_VER_LUMA 32, 24, pp<br>
-FILTER_VER_LUMA 32, 32, pp<br>
-FILTER_VER_LUMA 32, 64, pp<br>
-FILTER_VER_LUMA 48, 64, pp<br>
-FILTER_VER_LUMA 64, 16, pp<br>
-FILTER_VER_LUMA 64, 32, pp<br>
-FILTER_VER_LUMA 64, 48, pp<br>
-FILTER_VER_LUMA 64, 64, pp<br>
-<br>
-FILTER_VER_LUMA 16, 4, ps<br>
-FILTER_VER_LUMA 16, 8, ps<br>
-FILTER_VER_LUMA 16, 12, ps<br>
-FILTER_VER_LUMA 16, 16, ps<br>
-FILTER_VER_LUMA 16, 32, ps<br>
-FILTER_VER_LUMA 16, 64, ps<br>
-FILTER_VER_LUMA 24, 32, ps<br>
-FILTER_VER_LUMA 32, 8, ps<br>
-FILTER_VER_LUMA 32, 16, ps<br>
-FILTER_VER_LUMA 32, 24, ps<br>
-FILTER_VER_LUMA 32, 32, ps<br>
-FILTER_VER_LUMA 32, 64, ps<br>
-FILTER_VER_LUMA 48, 64, ps<br>
-FILTER_VER_LUMA 64, 16, ps<br>
-FILTER_VER_LUMA 64, 32, ps<br>
-FILTER_VER_LUMA 64, 48, ps<br>
-FILTER_VER_LUMA 64, 64, ps<br>
-<br>
-%macro PROCESS_LUMA_SP_W4_4R 0<br>
- movq m0, [r0]<br>
- movq m1, [r0 + r1]<br>
- punpcklwd m0, m1 ;m0=[0 1]<br>
- pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1<br>
-<br>
- lea r0, [r0 + 2 * r1]<br>
- movq m4, [r0]<br>
- punpcklwd m1, m4 ;m1=[1 2]<br>
- pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2<br>
-<br>
- movq m5, [r0 + r1]<br>
- punpcklwd m4, m5 ;m4=[2 3]<br>
- pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3<br>
- pmaddwd m4, [r6 + 1 * 16]<br>
- paddd m0, m4 ;m0=[0+1+2+3] Row1<br>
-<br>
- lea r0, [r0 + 2 * r1]<br>
- movq m4, [r0]<br>
- punpcklwd m5, m4 ;m5=[3 4]<br>
- pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4<br>
- pmaddwd m5, [r6 + 1 * 16]<br>
- paddd m1, m5 ;m1 = [1+2+3+4] Row2<br>
-<br>
- movq m5, [r0 + r1]<br>
- punpcklwd m4, m5 ;m4=[4 5]<br>
- pmaddwd m6, m4, [r6 + 1 * 16]<br>
- paddd m2, m6 ;m2=[2+3+4+5] Row3<br>
- pmaddwd m4, [r6 + 2 * 16]<br>
- paddd m0, m4 ;m0=[0+1+2+3+4+5] Row1<br>
-<br>
- lea r0, [r0 + 2 * r1]<br>
- movq m4, [r0]<br>
- punpcklwd m5, m4 ;m5=[5 6]<br>
- pmaddwd m6, m5, [r6 + 1 * 16]<br>
- paddd m3, m6 ;m3=[3+4+5+6] Row4<br>
- pmaddwd m5, [r6 + 2 * 16]<br>
- paddd m1, m5 ;m1=[1+2+3+4+5+6] Row2<br>
-<br>
- movq m5, [r0 + r1]<br>
- punpcklwd m4, m5 ;m4=[6 7]<br>
- pmaddwd m6, m4, [r6 + 2 * 16]<br>
- paddd m2, m6 ;m2=[2+3+4+5+6+7] Row3<br>
- pmaddwd m4, [r6 + 3 * 16]<br>
- paddd m0, m4 ;m0=[0+1+2+3+4+5+6+7] Row1 end<br>
-<br>
- lea r0, [r0 + 2 * r1]<br>
- movq m4, [r0]<br>
- punpcklwd m5, m4 ;m5=[7 8]<br>
- pmaddwd m6, m5, [r6 + 2 * 16]<br>
- paddd m3, m6 ;m3=[3+4+5+6+7+8] Row4<br>
- pmaddwd m5, [r6 + 3 * 16]<br>
- paddd m1, m5 ;m1=[1+2+3+4+5+6+7+8] Row2 end<br>
-<br>
- movq m5, [r0 + r1]<br>
- punpcklwd m4, m5 ;m4=[8 9]<br>
- pmaddwd m4, [r6 + 3 * 16]<br>
- paddd m2, m4 ;m2=[2+3+4+5+6+7+8+9] Row3 end<br>
-<br>
- movq m4, [r0 + 2 * r1]<br>
- punpcklwd m5, m4 ;m5=[9 10]<br>
- pmaddwd m5, [r6 + 3 * 16]<br>
- paddd m3, m5 ;m3=[3+4+5+6+7+8+9+10] Row4 end<br>
-%endmacro<br>
-<br>
-;--------------------------------------------------------------------------------------------------------------<br>
-; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;--------------------------------------------------------------------------------------------------------------<br>
-%macro FILTER_VER_LUMA_SP 2<br>
-INIT_XMM sse4<br>
-cglobal interp_8tap_vert_sp_%1x%2, 5, 7, 8 ,0-gprsize<br>
-<br>
- add r1d, r1d<br>
- lea r5, [r1 + 2 * r1]<br>
- sub r0, r5<br>
- shl r4d, 6<br>
-<br>
-%ifdef PIC<br>
- lea r5, [tab_LumaCoeffV]<br>
- lea r6, [r5 + r4]<br>
-%else<br>
- lea r6, [tab_LumaCoeffV + r4]<br>
-%endif<br>
-<br>
- mova m7, [tab_c_526336]<br>
-<br>
- mov dword [rsp], %2/4<br>
-.loopH:<br>
- mov r4d, (%1/4)<br>
-.loopW:<br>
- PROCESS_LUMA_SP_W4_4R<br>
-<br>
- paddd m0, m7<br>
- paddd m1, m7<br>
- paddd m2, m7<br>
- paddd m3, m7<br>
-<br>
- psrad m0, 12<br>
- psrad m1, 12<br>
- psrad m2, 12<br>
- psrad m3, 12<br>
-<br>
- packssdw m0, m1<br>
- packssdw m2, m3<br>
-<br>
- packuswb m0, m2<br>
-<br>
- movd [r2], m0<br>
- pextrd [r2 + r3], m0, 1<br>
- lea r5, [r2 + 2 * r3]<br>
- pextrd [r5], m0, 2<br>
- pextrd [r5 + r3], m0, 3<br>
-<br>
- lea r5, [8 * r1 - 2 * 4]<br>
- sub r0, r5<br>
- add r2, 4<br>
-<br>
- dec r4d<br>
- jnz .loopW<br>
-<br>
- lea r0, [r0 + 4 * r1 - 2 * %1]<br>
- lea r2, [r2 + 4 * r3 - %1]<br>
-<br>
- dec dword [rsp]<br>
- jnz .loopH<br>
-<br>
- RET<br>
-%endmacro<br>
-<br>
-;--------------------------------------------------------------------------------------------------------------<br>
-; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;--------------------------------------------------------------------------------------------------------------<br>
- FILTER_VER_LUMA_SP 4, 4<br>
- FILTER_VER_LUMA_SP 8, 8<br>
- FILTER_VER_LUMA_SP 8, 4<br>
- FILTER_VER_LUMA_SP 4, 8<br>
- FILTER_VER_LUMA_SP 16, 16<br>
- FILTER_VER_LUMA_SP 16, 8<br>
- FILTER_VER_LUMA_SP 8, 16<br>
- FILTER_VER_LUMA_SP 16, 12<br>
- FILTER_VER_LUMA_SP 12, 16<br>
- FILTER_VER_LUMA_SP 16, 4<br>
- FILTER_VER_LUMA_SP 4, 16<br>
- FILTER_VER_LUMA_SP 32, 32<br>
- FILTER_VER_LUMA_SP 32, 16<br>
- FILTER_VER_LUMA_SP 16, 32<br>
- FILTER_VER_LUMA_SP 32, 24<br>
- FILTER_VER_LUMA_SP 24, 32<br>
- FILTER_VER_LUMA_SP 32, 8<br>
- FILTER_VER_LUMA_SP 8, 32<br>
- FILTER_VER_LUMA_SP 64, 64<br>
- FILTER_VER_LUMA_SP 64, 32<br>
- FILTER_VER_LUMA_SP 32, 64<br>
- FILTER_VER_LUMA_SP 64, 48<br>
- FILTER_VER_LUMA_SP 48, 64<br>
- FILTER_VER_LUMA_SP 64, 16<br>
- FILTER_VER_LUMA_SP 16, 64<br>
-<br>
-; TODO: combin of U and V is more performance, but need more register<br>
-; TODO: use two path for height alignment to 4 and otherwise may improvement 10% performance, but code is more complex, so I disable it<br>
-INIT_XMM ssse3<br>
-cglobal chroma_p2s, 3, 7, 4<br>
-<br>
- ; load width and height<br>
- mov r3d, r3m<br>
- mov r4d, r4m<br>
-<br>
- ; load constant<br>
- mova m2, [pb_128]<br>
- mova m3, [tab_c_64_n64]<br>
-<br>
-.loopH:<br>
-<br>
- xor r5d, r5d<br>
-.loopW:<br>
- lea r6, [r0 + r5]<br>
-<br>
- movh m0, [r6]<br>
- punpcklbw m0, m2<br>
- pmaddubsw m0, m3<br>
-<br>
- movh m1, [r6 + r1]<br>
- punpcklbw m1, m2<br>
- pmaddubsw m1, m3<br>
-<br>
- add r5d, 8<br>
- cmp r5d, r3d<br>
- lea r6, [r2 + r5 * 2]<br>
- jg .width4<br>
- movu [r6 + FENC_STRIDE / 2 * 0 - 16], m0<br>
- movu [r6 + FENC_STRIDE / 2 * 2 - 16], m1<br>
- je .nextH<br>
- jmp .loopW<br>
-<br>
-.width4:<br>
- test r3d, 4<br>
- jz .width2<br>
- test r3d, 2<br>
- movh [r6 + FENC_STRIDE / 2 * 0 - 16], m0<br>
- movh [r6 + FENC_STRIDE / 2 * 2 - 16], m1<br>
- lea r6, [r6 + 8]<br>
- pshufd m0, m0, 2<br>
- pshufd m1, m1, 2<br>
- jz .nextH<br>
-<br>
-.width2:<br>
- movd [r6 + FENC_STRIDE / 2 * 0 - 16], m0<br>
- movd [r6 + FENC_STRIDE / 2 * 2 - 16], m1<br>
-<br>
-.nextH:<br>
- lea r0, [r0 + r1 * 2]<br>
- add r2, FENC_STRIDE / 2 * 4<br>
-<br>
- sub r4d, 2<br>
- jnz .loopH<br>
-<br>
- RET<br>
-<br>
-%macro PROCESS_CHROMA_SP_W4_4R 0<br>
- movq m0, [r0]<br>
- movq m1, [r0 + r1]<br>
- punpcklwd m0, m1 ;m0=[0 1]<br>
- pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1<br>
-<br>
- lea r0, [r0 + 2 * r1]<br>
- movq m4, [r0]<br>
- punpcklwd m1, m4 ;m1=[1 2]<br>
- pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2<br>
-<br>
- movq m5, [r0 + r1]<br>
- punpcklwd m4, m5 ;m4=[2 3]<br>
- pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3<br>
- pmaddwd m4, [r6 + 1 * 16]<br>
- paddd m0, m4 ;m0=[0+1+2+3] Row1 done<br>
-<br>
- lea r0, [r0 + 2 * r1]<br>
- movq m4, [r0]<br>
- punpcklwd m5, m4 ;m5=[3 4]<br>
- pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4<br>
- pmaddwd m5, [r6 + 1 * 16]<br>
- paddd m1, m5 ;m1 = [1+2+3+4] Row2<br>
-<br>
- movq m5, [r0 + r1]<br>
- punpcklwd m4, m5 ;m4=[4 5]<br>
- pmaddwd m4, [r6 + 1 * 16]<br>
- paddd m2, m4 ;m2=[2+3+4+5] Row3<br>
-<br>
- movq m4, [r0 + 2 * r1]<br>
- punpcklwd m5, m4 ;m5=[5 6]<br>
- pmaddwd m5, [r6 + 1 * 16]<br>
- paddd m3, m5 ;m3=[3+4+5+6] Row4<br>
-%endmacro<br>
-<br>
-;--------------------------------------------------------------------------------------------------------------<br>
-; void interp_4tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;--------------------------------------------------------------------------------------------------------------<br>
-%macro FILTER_VER_CHROMA_SP 2<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_vert_sp_%1x%2, 5, 7, 7 ,0-gprsize<br>
-<br>
- add r1d, r1d<br>
- sub r0, r1<br>
- shl r4d, 5<br>
-<br>
-%ifdef PIC<br>
- lea r5, [tab_ChromaCoeffV]<br>
- lea r6, [r5 + r4]<br>
-%else<br>
- lea r6, [tab_ChromaCoeffV + r4]<br>
-%endif<br>
-<br>
- mova m6, [tab_c_526336]<br>
-<br>
- mov dword [rsp], %2/4<br>
-<br>
-.loopH:<br>
- mov r4d, (%1/4)<br>
-.loopW:<br>
- PROCESS_CHROMA_SP_W4_4R<br>
-<br>
- paddd m0, m6<br>
- paddd m1, m6<br>
- paddd m2, m6<br>
- paddd m3, m6<br>
-<br>
- psrad m0, 12<br>
- psrad m1, 12<br>
- psrad m2, 12<br>
- psrad m3, 12<br>
-<br>
- packssdw m0, m1<br>
- packssdw m2, m3<br>
-<br>
- packuswb m0, m2<br>
-<br>
- movd [r2], m0<br>
- pextrd [r2 + r3], m0, 1<br>
- lea r5, [r2 + 2 * r3]<br>
- pextrd [r5], m0, 2<br>
- pextrd [r5 + r3], m0, 3<br>
-<br>
- lea r5, [4 * r1 - 2 * 4]<br>
- sub r0, r5<br>
- add r2, 4<br>
-<br>
- dec r4d<br>
- jnz .loopW<br>
-<br>
- lea r0, [r0 + 4 * r1 - 2 * %1]<br>
- lea r2, [r2 + 4 * r3 - %1]<br>
-<br>
- dec dword [rsp]<br>
- jnz .loopH<br>
-<br>
- RET<br>
-%endmacro<br>
-<br>
- FILTER_VER_CHROMA_SP 4, 4<br>
- FILTER_VER_CHROMA_SP 4, 8<br>
- FILTER_VER_CHROMA_SP 16, 16<br>
- FILTER_VER_CHROMA_SP 16, 8<br>
- FILTER_VER_CHROMA_SP 16, 12<br>
- FILTER_VER_CHROMA_SP 12, 16<br>
- FILTER_VER_CHROMA_SP 16, 4<br>
- FILTER_VER_CHROMA_SP 4, 16<br>
- FILTER_VER_CHROMA_SP 32, 32<br>
- FILTER_VER_CHROMA_SP 32, 16<br>
- FILTER_VER_CHROMA_SP 16, 32<br>
- FILTER_VER_CHROMA_SP 32, 24<br>
- FILTER_VER_CHROMA_SP 24, 32<br>
- FILTER_VER_CHROMA_SP 32, 8<br>
-<br>
- FILTER_VER_CHROMA_SP 16, 24<br>
- FILTER_VER_CHROMA_SP 16, 64<br>
- FILTER_VER_CHROMA_SP 12, 32<br>
- FILTER_VER_CHROMA_SP 4, 32<br>
- FILTER_VER_CHROMA_SP 32, 64<br>
- FILTER_VER_CHROMA_SP 32, 48<br>
- FILTER_VER_CHROMA_SP 24, 64<br>
-<br>
- FILTER_VER_CHROMA_SP 64, 64<br>
- FILTER_VER_CHROMA_SP 64, 32<br>
- FILTER_VER_CHROMA_SP 64, 48<br>
- FILTER_VER_CHROMA_SP 48, 64<br>
- FILTER_VER_CHROMA_SP 64, 16<br>
-<br>
-<br>
-%macro PROCESS_CHROMA_SP_W2_4R 1<br>
- movd m0, [r0]<br>
- movd m1, [r0 + r1]<br>
- punpcklwd m0, m1 ;m0=[0 1]<br>
-<br>
- lea r0, [r0 + 2 * r1]<br>
- movd m2, [r0]<br>
- punpcklwd m1, m2 ;m1=[1 2]<br>
- punpcklqdq m0, m1 ;m0=[0 1 1 2]<br>
- pmaddwd m0, [%1 + 0 *16] ;m0=[0+1 1+2] Row 1-2<br>
-<br>
- movd m1, [r0 + r1]<br>
- punpcklwd m2, m1 ;m2=[2 3]<br>
-<br>
- lea r0, [r0 + 2 * r1]<br>
- movd m3, [r0]<br>
- punpcklwd m1, m3 ;m2=[3 4]<br>
- punpcklqdq m2, m1 ;m2=[2 3 3 4]<br>
-<br>
- pmaddwd m4, m2, [%1 + 1 * 16] ;m4=[2+3 3+4] Row 1-2<br>
- pmaddwd m2, [%1 + 0 * 16] ;m2=[2+3 3+4] Row 3-4<br>
- paddd m0, m4 ;m0=[0+1+2+3 1+2+3+4] Row 1-2<br>
-<br>
- movd m1, [r0 + r1]<br>
- punpcklwd m3, m1 ;m3=[4 5]<br>
-<br>
- movd m4, [r0 + 2 * r1]<br>
- punpcklwd m1, m4 ;m1=[5 6]<br>
- punpcklqdq m3, m1 ;m2=[4 5 5 6]<br>
- pmaddwd m3, [%1 + 1 * 16] ;m3=[4+5 5+6] Row 3-4<br>
- paddd m2, m3 ;m2=[2+3+4+5 3+4+5+6] Row 3-4<br>
-%endmacro<br>
-<br>
-;-------------------------------------------------------------------------------------------------------------------<br>
-; void interp_4tap_vertical_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-------------------------------------------------------------------------------------------------------------------<br>
-%macro FILTER_VER_CHROMA_SP_W2_4R 2<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_vert_sp_%1x%2, 5, 6, 6<br>
-<br>
- add r1d, r1d<br>
- sub r0, r1<br>
- shl r4d, 5<br>
-<br>
-%ifdef PIC<br>
- lea r5, [tab_ChromaCoeffV]<br>
- lea r5, [r5 + r4]<br>
-%else<br>
- lea r5, [tab_ChromaCoeffV + r4]<br>
-%endif<br>
-<br>
- mova m5, [tab_c_526336]<br>
-<br>
- mov r4d, (%2/4)<br>
-<br>
-.loopH:<br>
- PROCESS_CHROMA_SP_W2_4R r5<br>
-<br>
- paddd m0, m5<br>
- paddd m2, m5<br>
-<br>
- psrad m0, 12<br>
- psrad m2, 12<br>
-<br>
- packssdw m0, m2<br>
- packuswb m0, m0<br>
-<br>
- pextrw [r2], m0, 0<br>
- pextrw [r2 + r3], m0, 1<br>
- lea r2, [r2 + 2 * r3]<br>
- pextrw [r2], m0, 2<br>
- pextrw [r2 + r3], m0, 3<br>
-<br>
- lea r2, [r2 + 2 * r3]<br>
-<br>
- dec r4d<br>
- jnz .loopH<br>
-<br>
- RET<br>
-%endmacro<br>
-<br>
-FILTER_VER_CHROMA_SP_W2_4R 2, 4<br>
-FILTER_VER_CHROMA_SP_W2_4R 2, 8<br>
-<br>
-FILTER_VER_CHROMA_SP_W2_4R 2, 16<br>
-<br>
-;--------------------------------------------------------------------------------------------------------------<br>
-; void interp_4tap_vert_sp_4x2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;--------------------------------------------------------------------------------------------------------------<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_vert_sp_4x2, 5, 6, 5<br>
-<br>
- add r1d, r1d<br>
- sub r0, r1<br>
- shl r4d, 5<br>
-<br>
-%ifdef PIC<br>
- lea r5, [tab_ChromaCoeffV]<br>
- lea r5, [r5 + r4]<br>
-%else<br>
- lea r5, [tab_ChromaCoeffV + r4]<br>
-%endif<br>
-<br>
- mova m4, [tab_c_526336]<br>
-<br>
- movq m0, [r0]<br>
- movq m1, [r0 + r1]<br>
- punpcklwd m0, m1 ;m0=[0 1]<br>
- pmaddwd m0, [r5 + 0 *16] ;m0=[0+1] Row1<br>
-<br>
- lea r0, [r0 + 2 * r1]<br>
- movq m2, [r0]<br>
- punpcklwd m1, m2 ;m1=[1 2]<br>
- pmaddwd m1, [r5 + 0 *16] ;m1=[1+2] Row2<br>
-<br>
- movq m3, [r0 + r1]<br>
- punpcklwd m2, m3 ;m4=[2 3]<br>
- pmaddwd m2, [r5 + 1 * 16]<br>
- paddd m0, m2 ;m0=[0+1+2+3] Row1 done<br>
- paddd m0, m4<br>
- psrad m0, 12<br>
-<br>
- movq m2, [r0 + 2 * r1]<br>
- punpcklwd m3, m2 ;m5=[3 4]<br>
- pmaddwd m3, [r5 + 1 * 16]<br>
- paddd m1, m3 ;m1 = [1+2+3+4] Row2 done<br>
- paddd m1, m4<br>
- psrad m1, 12<br>
-<br>
- packssdw m0, m1<br>
- packuswb m0, m0<br>
-<br>
- movd [r2], m0<br>
- pextrd [r2 + r3], m0, 1<br>
-<br>
- RET<br>
-<br>
-;-------------------------------------------------------------------------------------------------------------------<br>
-; void interp_4tap_vertical_sp_6x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;-------------------------------------------------------------------------------------------------------------------<br>
-%macro FILTER_VER_CHROMA_SP_W6_H4 2<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_vert_sp_6x%2, 5, 7, 7<br>
-<br>
- add r1d, r1d<br>
- sub r0, r1<br>
- shl r4d, 5<br>
-<br>
-%ifdef PIC<br>
- lea r5, [tab_ChromaCoeffV]<br>
- lea r6, [r5 + r4]<br>
-%else<br>
- lea r6, [tab_ChromaCoeffV + r4]<br>
-%endif<br>
-<br>
- mova m6, [tab_c_526336]<br>
-<br>
- mov r4d, %2/4<br>
-<br>
-.loopH:<br>
- PROCESS_CHROMA_SP_W4_4R<br>
-<br>
- paddd m0, m6<br>
- paddd m1, m6<br>
- paddd m2, m6<br>
- paddd m3, m6<br>
-<br>
- psrad m0, 12<br>
- psrad m1, 12<br>
- psrad m2, 12<br>
- psrad m3, 12<br>
-<br>
- packssdw m0, m1<br>
- packssdw m2, m3<br>
-<br>
- packuswb m0, m2<br>
-<br>
- movd [r2], m0<br>
- pextrd [r2 + r3], m0, 1<br>
- lea r5, [r2 + 2 * r3]<br>
- pextrd [r5], m0, 2<br>
- pextrd [r5 + r3], m0, 3<br>
-<br>
- lea r5, [4 * r1 - 2 * 4]<br>
- sub r0, r5<br>
- add r2, 4<br>
-<br>
- PROCESS_CHROMA_SP_W2_4R r6<br>
-<br>
- paddd m0, m6<br>
- paddd m2, m6<br>
-<br>
- psrad m0, 12<br>
- psrad m2, 12<br>
-<br>
- packssdw m0, m2<br>
- packuswb m0, m0<br>
-<br>
- pextrw [r2], m0, 0<br>
- pextrw [r2 + r3], m0, 1<br>
- lea r2, [r2 + 2 * r3]<br>
- pextrw [r2], m0, 2<br>
- pextrw [r2 + r3], m0, 3<br>
-<br>
- sub r0, 2 * 4<br>
- lea r2, [r2 + 2 * r3 - 4]<br>
-<br>
- dec r4d<br>
- jnz .loopH<br>
-<br>
- RET<br>
-%endmacro<br>
-<br>
-FILTER_VER_CHROMA_SP_W6_H4 6, 8<br>
-<br>
-FILTER_VER_CHROMA_SP_W6_H4 6, 16<br>
-<br>
-%macro PROCESS_CHROMA_SP_W8_2R 0<br>
- movu m1, [r0]<br>
- movu m3, [r0 + r1]<br>
- punpcklwd m0, m1, m3<br>
- pmaddwd m0, [r5 + 0 * 16] ;m0 = [0l+1l] Row1l<br>
- punpckhwd m1, m3<br>
- pmaddwd m1, [r5 + 0 * 16] ;m1 = [0h+1h] Row1h<br>
-<br>
- movu m4, [r0 + 2 * r1]<br>
- punpcklwd m2, m3, m4<br>
- pmaddwd m2, [r5 + 0 * 16] ;m2 = [1l+2l] Row2l<br>
- punpckhwd m3, m4<br>
- pmaddwd m3, [r5 + 0 * 16] ;m3 = [1h+2h] Row2h<br>
-<br>
- lea r0, [r0 + 2 * r1]<br>
- movu m5, [r0 + r1]<br>
- punpcklwd m6, m4, m5<br>
- pmaddwd m6, [r5 + 1 * 16] ;m6 = [2l+3l] Row1l<br>
- paddd m0, m6 ;m0 = [0l+1l+2l+3l] Row1l sum<br>
- punpckhwd m4, m5<br>
- pmaddwd m4, [r5 + 1 * 16] ;m6 = [2h+3h] Row1h<br>
- paddd m1, m4 ;m1 = [0h+1h+2h+3h] Row1h sum<br>
-<br>
- movu m4, [r0 + 2 * r1]<br>
- punpcklwd m6, m5, m4<br>
- pmaddwd m6, [r5 + 1 * 16] ;m6 = [3l+4l] Row2l<br>
- paddd m2, m6 ;m2 = [1l+2l+3l+4l] Row2l sum<br>
- punpckhwd m5, m4<br>
- pmaddwd m5, [r5 + 1 * 16] ;m1 = [3h+4h] Row2h<br>
- paddd m3, m5 ;m3 = [1h+2h+3h+4h] Row2h sum<br>
-%endmacro<br>
-<br>
-;--------------------------------------------------------------------------------------------------------------<br>
-; void interp_4tap_vert_sp_8x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
-;--------------------------------------------------------------------------------------------------------------<br>
-%macro FILTER_VER_CHROMA_SP_W8_H2 2<br>
-INIT_XMM sse2<br>
-cglobal interp_4tap_vert_sp_%1x%2, 5, 6, 8<br>
-<br>
- add r1d, r1d<br>
- sub r0, r1<br>
- shl r4d, 5<br>
-<br>
-%ifdef PIC<br>
- lea r5, [tab_ChromaCoeffV]<br>
- lea r5, [r5 + r4]<br>
-%else<br>
- lea r5, [tab_ChromaCoeffV + r4]<br>
-%endif<br>
-<br>
- mova m7, [tab_c_526336]<br>
-<br>
- mov r4d, %2/2<br>
-.loopH:<br>
- PROCESS_CHROMA_SP_W8_2R<br>
-<br>
- paddd m0, m7<br>
- paddd m1, m7<br>
- paddd m2, m7<br>
- paddd m3, m7<br>
-<br>
- psrad m0, 12<br>
- psrad m1, 12<br>
- psrad m2, 12<br>
- psrad m3, 12<br>
-<br>
- packssdw m0, m1<br>
- packssdw m2, m3<br>
-<br>
- packuswb m0, m2<br>
-<br>
- movlps [r2], m0<br>
- movhps [r2 + r3], m0<br>
-<br>
- lea r2, [r2 + 2 * r3]<br>
-<br>
- dec r4d<br>
- jnz .loopH<br>
-<br>
- RET<br>
-%endmacro<br>
-<br>
-FILTER_VER_CHROMA_SP_W8_H2 8, 2<br>
-FILTER_VER_CHROMA_SP_W8_H2 8, 4<br>
-FILTER_VER_CHROMA_SP_W8_H2 8, 6<br>
-FILTER_VER_CHROMA_SP_W8_H2 8, 8<br>
-FILTER_VER_CHROMA_SP_W8_H2 8, 16<br>
-FILTER_VER_CHROMA_SP_W8_H2 8, 32<br>
-<br>
-FILTER_VER_CHROMA_SP_W8_H2 8, 12<br>
-FILTER_VER_CHROMA_SP_W8_H2 8, 64<br>
-<br>
-<br>
-;-----------------------------------------------------------------------------------------------------------------------------<br>
-; void interp_4tap_horiz_ps_2x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)<br>
-;-----------------------------------------------------------------------------------------------------------------------------<br>
-%macro FILTER_HORIZ_CHROMA_2xN 2<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 4, src, srcstride, dst, dststride<br>
-%define coef2 m3<br>
-%define Tm0 m2<br>
-%define t1 m1<br>
-%define t0 m0<br>
-<br>
- dec srcq<br>
- mov r4d, r4m<br>
- add dststrided, dststrided<br>
-<br>
-%ifdef PIC<br>
- lea r6, [tab_ChromaCoeff]<br>
- movd coef2, [r6 + r4 * 4]<br>
-%else<br>
- movd coef2, [tab_ChromaCoeff + r4 * 4]<br>
-%endif<br>
-<br>
- pshufd coef2, coef2, 0<br>
- mova t1, [pw_2000]<br>
- mova Tm0, [tab_Tm]<br>
-<br>
- mov r4d, %2<br>
- cmp r5m, byte 0<br>
- je .loopH<br>
- sub srcq, srcstrideq<br>
- add r4d, 3<br>
-<br>
-.loopH:<br>
- movh t0, [srcq]<br>
- pshufb t0, t0, Tm0<br>
- pmaddubsw t0, coef2<br>
- phaddw t0, t0<br>
- psubw t0, t1<br>
- movd [dstq], t0<br>
-<br>
- lea srcq, [srcq + srcstrideq]<br>
- lea dstq, [dstq + dststrideq]<br>
-<br>
- dec r4d<br>
- jnz .loopH<br>
-<br>
- RET<br>
-%endmacro<br>
-<br>
-FILTER_HORIZ_CHROMA_2xN 2, 4<br>
-FILTER_HORIZ_CHROMA_2xN 2, 8<br>
-<br>
-FILTER_HORIZ_CHROMA_2xN 2, 16<br>
-<br>
-;-----------------------------------------------------------------------------------------------------------------------------<br>
-; void interp_4tap_horiz_ps_4x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)<br>
-;-----------------------------------------------------------------------------------------------------------------------------<br>
-%macro FILTER_HORIZ_CHROMA_4xN 2<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 4, src, srcstride, dst, dststride<br>
-%define coef2 m3<br>
-%define Tm0 m2<br>
-%define t1 m1<br>
-%define t0 m0<br>
-<br>
- dec srcq<br>
- mov r4d, r4m<br>
- add dststrided, dststrided<br>
-<br>
-%ifdef PIC<br>
- lea r6, [tab_ChromaCoeff]<br>
- movd coef2, [r6 + r4 * 4]<br>
-%else<br>
- movd coef2, [tab_ChromaCoeff + r4 * 4]<br>
-%endif<br>
-<br>
- pshufd coef2, coef2, 0<br>
- mova t1, [pw_2000]<br>
- mova Tm0, [tab_Tm]<br>
-<br>
- mov r4d, %2<br>
- cmp r5m, byte 0<br>
- je .loopH<br>
- sub srcq, srcstrideq<br>
- add r4d, 3<br>
-<br>
-.loopH:<br>
- movh t0, [srcq]<br>
- pshufb t0, t0, Tm0<br>
- pmaddubsw t0, coef2<br>
- phaddw t0, t0<br>
- psubw t0, t1<br>
- movlps [dstq], t0<br>
-<br>
- lea srcq, [srcq + srcstrideq]<br>
- lea dstq, [dstq + dststrideq]<br>
-<br>
- dec r4d<br>
- jnz .loopH<br>
- RET<br>
-%endmacro<br>
-<br>
-FILTER_HORIZ_CHROMA_4xN 4, 2<br>
-FILTER_HORIZ_CHROMA_4xN 4, 4<br>
-FILTER_HORIZ_CHROMA_4xN 4, 8<br>
-FILTER_HORIZ_CHROMA_4xN 4, 16<br>
-<br>
-FILTER_HORIZ_CHROMA_4xN 4, 32<br>
-<br>
-%macro PROCESS_CHROMA_W6 3<br>
- movu %1, [srcq]<br>
- pshufb %2, %1, Tm0<br>
- pmaddubsw %2, coef2<br>
- pshufb %1, %1, Tm1<br>
- pmaddubsw %1, coef2<br>
- phaddw %2, %1<br>
- psubw %2, %3<br>
- movh [dstq], %2<br>
- pshufd %2, %2, 2<br>
- movd [dstq + 8], %2<br>
-%endmacro<br>
-<br>
-%macro PROCESS_CHROMA_W12 3<br>
- movu %1, [srcq]<br>
- pshufb %2, %1, Tm0<br>
- pmaddubsw %2, coef2<br>
- pshufb %1, %1, Tm1<br>
- pmaddubsw %1, coef2<br>
- phaddw %2, %1<br>
- psubw %2, %3<br>
- movu [dstq], %2<br>
- movu %1, [srcq + 8]<br>
- pshufb %1, %1, Tm0<br>
- pmaddubsw %1, coef2<br>
- phaddw %1, %1<br>
- psubw %1, %3<br>
- movh [dstq + 16], %1<br>
-%endmacro<br>
-<br>
-;-----------------------------------------------------------------------------------------------------------------------------<br>
-; void interp_4tap_horiz_ps_6x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)<br>
-;-----------------------------------------------------------------------------------------------------------------------------<br>
-%macro FILTER_HORIZ_CHROMA 2<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 6, src, srcstride, dst, dststride<br>
-%define coef2 m5<br>
-%define Tm0 m4<br>
-%define Tm1 m3<br>
-%define t2 m2<br>
-%define t1 m1<br>
-%define t0 m0<br>
-<br>
- dec srcq<br>
- mov r4d, r4m<br>
- add dststrided, dststrided<br>
-<br>
-%ifdef PIC<br>
- lea r6, [tab_ChromaCoeff]<br>
- movd coef2, [r6 + r4 * 4]<br>
-%else<br>
- movd coef2, [tab_ChromaCoeff + r4 * 4]<br>
-%endif<br>
-<br>
- pshufd coef2, coef2, 0<br>
- mova t2, [pw_2000]<br>
- mova Tm0, [tab_Tm]<br>
- mova Tm1, [tab_Tm + 16]<br>
-<br>
- mov r4d, %2<br>
- cmp r5m, byte 0<br>
- je .loopH<br>
- sub srcq, srcstrideq<br>
- add r4d, 3<br>
-<br>
-.loopH:<br>
- PROCESS_CHROMA_W%1 t0, t1, t2<br>
- add srcq, srcstrideq<br>
- add dstq, dststrideq<br>
-<br>
- dec r4d<br>
- jnz .loopH<br>
-<br>
- RET<br>
-%endmacro<br>
-<br>
-FILTER_HORIZ_CHROMA 6, 8<br>
-FILTER_HORIZ_CHROMA 12, 16<br>
-<br>
-FILTER_HORIZ_CHROMA 6, 16<br>
-FILTER_HORIZ_CHROMA 12, 32<br>
-<br>
-%macro PROCESS_CHROMA_W8 3<br>
- movu %1, [srcq]<br>
- pshufb %2, %1, Tm0<br>
- pmaddubsw %2, coef2<br>
- pshufb %1, %1, Tm1<br>
- pmaddubsw %1, coef2<br>
- phaddw %2, %1<br>
- psubw %2, %3<br>
- movu [dstq], %2<br>
-%endmacro<br>
-<br>
-;-----------------------------------------------------------------------------------------------------------------------------<br>
-; void interp_4tap_horiz_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)<br>
-;-----------------------------------------------------------------------------------------------------------------------------<br>
-%macro FILTER_HORIZ_CHROMA_8xN 2<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 6, src, srcstride, dst, dststride<br>
-%define coef2 m5<br>
-%define Tm0 m4<br>
-%define Tm1 m3<br>
-%define t2 m2<br>
-%define t1 m1<br>
-%define t0 m0<br>
-<br>
- dec srcq<br>
- mov r4d, r4m<br>
- add dststrided, dststrided<br>
-<br>
-%ifdef PIC<br>
- lea r6, [tab_ChromaCoeff]<br>
- movd coef2, [r6 + r4 * 4]<br>
-%else<br>
- movd coef2, [tab_ChromaCoeff + r4 * 4]<br>
-%endif<br>
-<br>
- pshufd coef2, coef2, 0<br>
- mova t2, [pw_2000]<br>
- mova Tm0, [tab_Tm]<br>
- mova Tm1, [tab_Tm + 16]<br>
-<br>
- mov r4d, %2<br>
- cmp r5m, byte 0<br>
- je .loopH<br>
- sub srcq, srcstrideq<br>
- add r4d, 3<br>
-<br>
-.loopH:<br>
- PROCESS_CHROMA_W8 t0, t1, t2<br>
- add srcq, srcstrideq<br>
- add dstq, dststrideq<br>
-<br>
- dec r4d<br>
- jnz .loopH<br>
-<br>
- RET<br>
-%endmacro<br>
-<br>
-FILTER_HORIZ_CHROMA_8xN 8, 2<br>
-FILTER_HORIZ_CHROMA_8xN 8, 4<br>
-FILTER_HORIZ_CHROMA_8xN 8, 6<br>
-FILTER_HORIZ_CHROMA_8xN 8, 8<br>
-FILTER_HORIZ_CHROMA_8xN 8, 16<br>
-FILTER_HORIZ_CHROMA_8xN 8, 32<br>
-<br>
-FILTER_HORIZ_CHROMA_8xN 8, 12<br>
-FILTER_HORIZ_CHROMA_8xN 8, 64<br>
-<br>
-%macro PROCESS_CHROMA_W16 4<br>
- movu %1, [srcq]<br>
- pshufb %2, %1, Tm0<br>
- pmaddubsw %2, coef2<br>
- pshufb %1, %1, Tm1<br>
- pmaddubsw %1, coef2<br>
- phaddw %2, %1<br>
- movu %1, [srcq + 8]<br>
- pshufb %4, %1, Tm0<br>
- pmaddubsw %4, coef2<br>
- pshufb %1, %1, Tm1<br>
- pmaddubsw %1, coef2<br>
- phaddw %4, %1<br>
- psubw %2, %3<br>
- psubw %4, %3<br>
- movu [dstq], %2<br>
- movu [dstq + 16], %4<br>
-%endmacro<br>
-<br>
-%macro PROCESS_CHROMA_W24 4<br>
- movu %1, [srcq]<br>
- pshufb %2, %1, Tm0<br>
- pmaddubsw %2, coef2<br>
- pshufb %1, %1, Tm1<br>
- pmaddubsw %1, coef2<br>
- phaddw %2, %1<br>
- movu %1, [srcq + 8]<br>
- pshufb %4, %1, Tm0<br>
- pmaddubsw %4, coef2<br>
- pshufb %1, %1, Tm1<br>
- pmaddubsw %1, coef2<br>
- phaddw %4, %1<br>
- psubw %2, %3<br>
- psubw %4, %3<br>
- movu [dstq], %2<br>
- movu [dstq + 16], %4<br>
- movu %1, [srcq + 16]<br>
- pshufb %2, %1, Tm0<br>
- pmaddubsw %2, coef2<br>
- pshufb %1, %1, Tm1<br>
- pmaddubsw %1, coef2<br>
- phaddw %2, %1<br>
- psubw %2, %3<br>
- movu [dstq + 32], %2<br>
-%endmacro<br>
-<br>
-%macro PROCESS_CHROMA_W32 4<br>
- movu %1, [srcq]<br>
- pshufb %2, %1, Tm0<br>
- pmaddubsw %2, coef2<br>
- pshufb %1, %1, Tm1<br>
- pmaddubsw %1, coef2<br>
- phaddw %2, %1<br>
- movu %1, [srcq + 8]<br>
- pshufb %4, %1, Tm0<br>
- pmaddubsw %4, coef2<br>
- pshufb %1, %1, Tm1<br>
- pmaddubsw %1, coef2<br>
- phaddw %4, %1<br>
- psubw %2, %3<br>
- psubw %4, %3<br>
- movu [dstq], %2<br>
- movu [dstq + 16], %4<br>
- movu %1, [srcq + 16]<br>
- pshufb %2, %1, Tm0<br>
- pmaddubsw %2, coef2<br>
- pshufb %1, %1, Tm1<br>
- pmaddubsw %1, coef2<br>
- phaddw %2, %1<br>
- movu %1, [srcq + 24]<br>
- pshufb %4, %1, Tm0<br>
- pmaddubsw %4, coef2<br>
- pshufb %1, %1, Tm1<br>
- pmaddubsw %1, coef2<br>
- phaddw %4, %1<br>
- psubw %2, %3<br>
- psubw %4, %3<br>
- movu [dstq + 32], %2<br>
- movu [dstq + 48], %4<br>
-%endmacro<br>
-<br>
-%macro PROCESS_CHROMA_W16o 5<br>
- movu %1, [srcq + %5]<br>
- pshufb %2, %1, Tm0<br>
- pmaddubsw %2, coef2<br>
- pshufb %1, %1, Tm1<br>
- pmaddubsw %1, coef2<br>
- phaddw %2, %1<br>
- movu %1, [srcq + %5 + 8]<br>
- pshufb %4, %1, Tm0<br>
- pmaddubsw %4, coef2<br>
- pshufb %1, %1, Tm1<br>
- pmaddubsw %1, coef2<br>
- phaddw %4, %1<br>
- psubw %2, %3<br>
- psubw %4, %3<br>
- movu [dstq + %5 * 2], %2<br>
- movu [dstq + %5 * 2 + 16], %4<br>
-%endmacro<br>
-<br>
-%macro PROCESS_CHROMA_W48 4<br>
- PROCESS_CHROMA_W16o %1, %2, %3, %4, 0<br>
- PROCESS_CHROMA_W16o %1, %2, %3, %4, 16<br>
- PROCESS_CHROMA_W16o %1, %2, %3, %4, 32<br>
-%endmacro<br>
-<br>
-%macro PROCESS_CHROMA_W64 4<br>
- PROCESS_CHROMA_W16o %1, %2, %3, %4, 0<br>
- PROCESS_CHROMA_W16o %1, %2, %3, %4, 16<br>
- PROCESS_CHROMA_W16o %1, %2, %3, %4, 32<br>
- PROCESS_CHROMA_W16o %1, %2, %3, %4, 48<br>
-%endmacro<br>
-<br>
-;------------------------------------------------------------------------------------------------------------------------------<br>
-; void interp_4tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)<br>
-;------------------------------------------------------------------------------------------------------------------------------<br>
-%macro FILTER_HORIZ_CHROMA_WxN 2<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 7, src, srcstride, dst, dststride<br>
-%define coef2 m6<br>
-%define Tm0 m5<br>
-%define Tm1 m4<br>
-%define t3 m3<br>
-%define t2 m2<br>
-%define t1 m1<br>
-%define t0 m0<br>
-<br>
- dec srcq<br>
- mov r4d, r4m<br>
- add dststrided, dststrided<br>
-<br>
-%ifdef PIC<br>
- lea r6, [tab_ChromaCoeff]<br>
- movd coef2, [r6 + r4 * 4]<br>
-%else<br>
- movd coef2, [tab_ChromaCoeff + r4 * 4]<br>
-%endif<br>
-<br>
- pshufd coef2, coef2, 0<br>
- mova t2, [pw_2000]<br>
- mova Tm0, [tab_Tm]<br>
- mova Tm1, [tab_Tm + 16]<br>
-<br>
- mov r4d, %2<br>
- cmp r5m, byte 0<br>
- je .loopH<br>
- sub srcq, srcstrideq<br>
- add r4d, 3<br>
-<br>
-.loopH:<br>
- PROCESS_CHROMA_W%1 t0, t1, t2, t3<br>
- add srcq, srcstrideq<br>
- add dstq, dststrideq<br>
-<br>
- dec r4d<br>
- jnz .loopH<br>
-<br>
- RET<br>
-%endmacro<br>
-<br>
-FILTER_HORIZ_CHROMA_WxN 16, 4<br>
-FILTER_HORIZ_CHROMA_WxN 16, 8<br>
-FILTER_HORIZ_CHROMA_WxN 16, 12<br>
-FILTER_HORIZ_CHROMA_WxN 16, 16<br>
-FILTER_HORIZ_CHROMA_WxN 16, 32<br>
-FILTER_HORIZ_CHROMA_WxN 24, 32<br>
-FILTER_HORIZ_CHROMA_WxN 32, 8<br>
-FILTER_HORIZ_CHROMA_WxN 32, 16<br>
-FILTER_HORIZ_CHROMA_WxN 32, 24<br>
-FILTER_HORIZ_CHROMA_WxN 32, 32<br>
-<br>
-FILTER_HORIZ_CHROMA_WxN 16, 24<br>
-FILTER_HORIZ_CHROMA_WxN 16, 64<br>
-FILTER_HORIZ_CHROMA_WxN 24, 64<br>
-FILTER_HORIZ_CHROMA_WxN 32, 48<br>
-FILTER_HORIZ_CHROMA_WxN 32, 64<br>
-<br>
-FILTER_HORIZ_CHROMA_WxN 64, 64<br>
-FILTER_HORIZ_CHROMA_WxN 64, 32<br>
-FILTER_HORIZ_CHROMA_WxN 64, 48<br>
-FILTER_HORIZ_CHROMA_WxN 48, 64<br>
-FILTER_HORIZ_CHROMA_WxN 64, 16<br>
-<br>
-<br>
-;---------------------------------------------------------------------------------------------------------------<br>
-; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)<br>
-;---------------------------------------------------------------------------------------------------------------<br>
-%macro FILTER_V_PS_W16n 2<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_vert_ps_%1x%2, 4, 7, 8<br>
-<br>
- mov r4d, r4m<br>
- sub r0, r1<br>
- add r3d, r3d<br>
-<br>
-%ifdef PIC<br>
- lea r5, [tab_ChromaCoeff]<br>
- movd m0, [r5 + r4 * 4]<br>
-%else<br>
- movd m0, [tab_ChromaCoeff + r4 * 4]<br>
-%endif<br>
-<br>
- pshufb m1, m0, [tab_Vm]<br>
- pshufb m0, [tab_Vm + 16]<br>
- mov r4d, %2/2<br>
-<br>
-.loop:<br>
-<br>
- mov r6d, %1/16<br>
-<br>
-.loopW:<br>
-<br>
- movu m2, [r0]<br>
- movu m3, [r0 + r1]<br>
-<br>
- punpcklbw m4, m2, m3<br>
- punpckhbw m2, m3<br>
-<br>
- pmaddubsw m4, m1<br>
- pmaddubsw m2, m1<br>
-<br>
- lea r5, [r0 + 2 * r1]<br>
- movu m5, [r5]<br>
- movu m7, [r5 + r1]<br>
-<br>
- punpcklbw m6, m5, m7<br>
- pmaddubsw m6, m0<br>
- paddw m4, m6<br>
-<br>
- punpckhbw m6, m5, m7<br>
- pmaddubsw m6, m0<br>
- paddw m2, m6<br>
-<br>
- mova m6, [pw_2000]<br>
-<br>
- psubw m4, m6<br>
- psubw m2, m6<br>
-<br>
- movu [r2], m4<br>
- movu [r2 + 16], m2<br>
-<br>
- punpcklbw m4, m3, m5<br>
- punpckhbw m3, m5<br>
-<br>
- pmaddubsw m4, m1<br>
- pmaddubsw m3, m1<br>
-<br>
- movu m5, [r5 + 2 * r1]<br>
-<br>
- punpcklbw m2, m7, m5<br>
- punpckhbw m7, m5<br>
-<br>
- pmaddubsw m2, m0<br>
- pmaddubsw m7, m0<br>
-<br>
- paddw m4, m2<br>
- paddw m3, m7<br>
-<br>
- psubw m4, m6<br>
- psubw m3, m6<br>
-<br>
- movu [r2 + r3], m4<br>
- movu [r2 + r3 + 16], m3<br>
-<br>
- add r0, 16<br>
- add r2, 32<br>
- dec r6d<br>
- jnz .loopW<br>
-<br>
- lea r0, [r0 + r1 * 2 - %1]<br>
- lea r2, [r2 + r3 * 2 - %1 * 2]<br>
-<br>
- dec r4d<br>
- jnz .loop<br>
- RET<br>
-%endmacro<br>
-<br>
-FILTER_V_PS_W16n 64, 64<br>
-FILTER_V_PS_W16n 64, 32<br>
-FILTER_V_PS_W16n 64, 48<br>
-FILTER_V_PS_W16n 48, 64<br>
-FILTER_V_PS_W16n 64, 16<br>
-<br>
-<br>
-;------------------------------------------------------------------------------------------------------------<br>
-;void interp_4tap_vert_ps_2x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)<br>
-;------------------------------------------------------------------------------------------------------------<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_vert_ps_2x4, 4, 6, 7<br>
-<br>
- mov r4d, r4m<br>
- sub r0, r1<br>
- add r3d, r3d<br>
-<br>
-%ifdef PIC<br>
- lea r5, [tab_ChromaCoeff]<br>
- movd m0, [r5 + r4 * 4]<br>
-%else<br>
- movd m0, [tab_ChromaCoeff + r4 * 4]<br>
-%endif<br>
-<br>
- pshufb m0, [tab_Cm]<br>
-<br>
- lea r5, [3 * r1]<br>
-<br>
- movd m2, [r0]<br>
- movd m3, [r0 + r1]<br>
- movd m4, [r0 + 2 * r1]<br>
- movd m5, [r0 + r5]<br>
-<br>
- punpcklbw m2, m3<br>
- punpcklbw m6, m4, m5<br>
- punpcklbw m2, m6<br>
-<br>
- pmaddubsw m2, m0<br>
-<br>
- lea r0, [r0 + 4 * r1]<br>
- movd m6, [r0]<br>
-<br>
- punpcklbw m3, m4<br>
- punpcklbw m1, m5, m6<br>
- punpcklbw m3, m1<br>
-<br>
- pmaddubsw m3, m0<br>
- phaddw m2, m3<br>
-<br>
- mova m1, [pw_2000]<br>
-<br>
- psubw m2, m1<br>
-<br>
- movd [r2], m2<br>
- pextrd [r2 + r3], m2, 2<br>
-<br>
- movd m2, [r0 + r1]<br>
-<br>
- punpcklbw m4, m5<br>
- punpcklbw m3, m6, m2<br>
- punpcklbw m4, m3<br>
-<br>
- pmaddubsw m4, m0<br>
-<br>
- movd m3, [r0 + 2 * r1]<br>
-<br>
- punpcklbw m5, m6<br>
- punpcklbw m2, m3<br>
- punpcklbw m5, m2<br>
-<br>
- pmaddubsw m5, m0<br>
- phaddw m4, m5<br>
- psubw m4, m1<br>
-<br>
- lea r2, [r2 + 2 * r3]<br>
- movd [r2], m4<br>
- pextrd [r2 + r3], m4, 2<br>
-<br>
- RET<br>
-<br>
-;-------------------------------------------------------------------------------------------------------------<br>
-; void interp_4tap_vert_ps_2x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)<br>
-;-------------------------------------------------------------------------------------------------------------<br>
-%macro FILTER_V_PS_W2 2<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_vert_ps_2x%2, 4, 6, 8<br>
-<br>
- mov r4d, r4m<br>
- sub r0, r1<br>
- add r3d, r3d<br>
-<br>
-%ifdef PIC<br>
- lea r5, [tab_ChromaCoeff]<br>
- movd m0, [r5 + r4 * 4]<br>
-%else<br>
- movd m0, [tab_ChromaCoeff + r4 * 4]<br>
-%endif<br>
-<br>
- pshufb m0, [tab_Cm]<br>
-<br>
- mova m1, [pw_2000]<br>
- lea r5, [3 * r1]<br>
- mov r4d, %2/4<br>
-.loop:<br>
- movd m2, [r0]<br>
- movd m3, [r0 + r1]<br>
- movd m4, [r0 + 2 * r1]<br>
- movd m5, [r0 + r5]<br>
-<br>
- punpcklbw m2, m3<br>
- punpcklbw m6, m4, m5<br>
- punpcklbw m2, m6<br>
-<br>
- pmaddubsw m2, m0<br>
-<br>
- lea r0, [r0 + 4 * r1]<br>
- movd m6, [r0]<br>
-<br>
- punpcklbw m3, m4<br>
- punpcklbw m7, m5, m6<br>
- punpcklbw m3, m7<br>
-<br>
- pmaddubsw m3, m0<br>
-<br>
- phaddw m2, m3<br>
- psubw m2, m1<br>
-<br>
-<br>
- movd [r2], m2<br>
- pshufd m2, m2, 2<br>
- movd [r2 + r3], m2<br>
-<br>
- movd m2, [r0 + r1]<br>
-<br>
- punpcklbw m4, m5<br>
- punpcklbw m3, m6, m2<br>
- punpcklbw m4, m3<br>
-<br>
- pmaddubsw m4, m0<br>
-<br>
- movd m3, [r0 + 2 * r1]<br>
-<br>
- punpcklbw m5, m6<br>
- punpcklbw m2, m3<br>
- punpcklbw m5, m2<br>
-<br>
- pmaddubsw m5, m0<br>
-<br>
- phaddw m4, m5<br>
-<br>
- psubw m4, m1<br>
-<br>
- lea r2, [r2 + 2 * r3]<br>
- movd [r2], m4<br>
- pshufd m4 , m4 ,2<br>
- movd [r2 + r3], m4<br>
-<br>
- lea r2, [r2 + 2 * r3]<br>
-<br>
- dec r4d<br>
- jnz .loop<br>
-<br>
-RET<br>
-%endmacro<br>
-<br>
-FILTER_V_PS_W2 2, 8<br>
-<br>
-FILTER_V_PS_W2 2, 16<br>
-<br>
-;-----------------------------------------------------------------------------------------------------------------<br>
-; void interp_4tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)<br>
-;-----------------------------------------------------------------------------------------------------------------<br>
-%macro FILTER_VER_CHROMA_SS 2<br>
-INIT_XMM sse2<br>
-cglobal interp_4tap_vert_ss_%1x%2, 5, 7, 6 ,0-gprsize<br>
-<br>
- add r1d, r1d<br>
- add r3d, r3d<br>
- sub r0, r1<br>
- shl r4d, 5<br>
-<br>
-%ifdef PIC<br>
- lea r5, [tab_ChromaCoeffV]<br>
- lea r6, [r5 + r4]<br>
-%else<br>
- lea r6, [tab_ChromaCoeffV + r4]<br>
-%endif<br>
-<br>
- mov dword [rsp], %2/4<br>
-<br>
-.loopH:<br>
- mov r4d, (%1/4)<br>
-.loopW:<br>
- PROCESS_CHROMA_SP_W4_4R<br>
-<br>
- psrad m0, 6<br>
- psrad m1, 6<br>
- psrad m2, 6<br>
- psrad m3, 6<br>
-<br>
- packssdw m0, m1<br>
- packssdw m2, m3<br>
-<br>
- movlps [r2], m0<br>
- movhps [r2 + r3], m0<br>
- lea r5, [r2 + 2 * r3]<br>
- movlps [r5], m2<br>
- movhps [r5 + r3], m2<br>
-<br>
- lea r5, [4 * r1 - 2 * 4]<br>
- sub r0, r5<br>
- add r2, 2 * 4<br>
-<br>
- dec r4d<br>
- jnz .loopW<br>
-<br>
- lea r0, [r0 + 4 * r1 - 2 * %1]<br>
- lea r2, [r2 + 4 * r3 - 2 * %1]<br>
-<br>
- dec dword [rsp]<br>
- jnz .loopH<br>
-<br>
- RET<br>
-%endmacro<br>
-<br>
- FILTER_VER_CHROMA_SS 4, 4<br>
- FILTER_VER_CHROMA_SS 4, 8<br>
- FILTER_VER_CHROMA_SS 16, 16<br>
- FILTER_VER_CHROMA_SS 16, 8<br>
- FILTER_VER_CHROMA_SS 16, 12<br>
- FILTER_VER_CHROMA_SS 12, 16<br>
- FILTER_VER_CHROMA_SS 16, 4<br>
- FILTER_VER_CHROMA_SS 4, 16<br>
- FILTER_VER_CHROMA_SS 32, 32<br>
- FILTER_VER_CHROMA_SS 32, 16<br>
- FILTER_VER_CHROMA_SS 16, 32<br>
- FILTER_VER_CHROMA_SS 32, 24<br>
- FILTER_VER_CHROMA_SS 24, 32<br>
- FILTER_VER_CHROMA_SS 32, 8<br>
-<br>
- FILTER_VER_CHROMA_SS 16, 24<br>
- FILTER_VER_CHROMA_SS 12, 32<br>
- FILTER_VER_CHROMA_SS 4, 32<br>
- FILTER_VER_CHROMA_SS 32, 64<br>
- FILTER_VER_CHROMA_SS 16, 64<br>
- FILTER_VER_CHROMA_SS 32, 48<br>
- FILTER_VER_CHROMA_SS 24, 64<br>
-<br>
- FILTER_VER_CHROMA_SS 64, 64<br>
- FILTER_VER_CHROMA_SS 64, 32<br>
- FILTER_VER_CHROMA_SS 64, 48<br>
- FILTER_VER_CHROMA_SS 48, 64<br>
- FILTER_VER_CHROMA_SS 64, 16<br>
-<br>
-<br>
-;---------------------------------------------------------------------------------------------------------------------<br>
-; void interp_4tap_vertical_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)<br>
-;---------------------------------------------------------------------------------------------------------------------<br>
-%macro FILTER_VER_CHROMA_SS_W2_4R 2<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_vert_ss_%1x%2, 5, 6, 5<br>
-<br>
- add r1d, r1d<br>
- add r3d, r3d<br>
- sub r0, r1<br>
- shl r4d, 5<br>
-<br>
-%ifdef PIC<br>
- lea r5, [tab_ChromaCoeffV]<br>
- lea r5, [r5 + r4]<br>
-%else<br>
- lea r5, [tab_ChromaCoeffV + r4]<br>
-%endif<br>
-<br>
- mov r4d, (%2/4)<br>
-<br>
-.loopH:<br>
- PROCESS_CHROMA_SP_W2_4R r5<br>
-<br>
- psrad m0, 6<br>
- psrad m2, 6<br>
-<br>
- packssdw m0, m2<br>
-<br>
- movd [r2], m0<br>
- pextrd [r2 + r3], m0, 1<br>
- lea r2, [r2 + 2 * r3]<br>
- pextrd [r2], m0, 2<br>
- pextrd [r2 + r3], m0, 3<br>
-<br>
- lea r2, [r2 + 2 * r3]<br>
-<br>
- dec r4d<br>
- jnz .loopH<br>
-<br>
- RET<br>
-%endmacro<br>
-<br>
-FILTER_VER_CHROMA_SS_W2_4R 2, 4<br>
-FILTER_VER_CHROMA_SS_W2_4R 2, 8<br>
-<br>
-FILTER_VER_CHROMA_SS_W2_4R 2, 16<br>
-<br>
-;---------------------------------------------------------------------------------------------------------------<br>
-; void interp_4tap_vert_ss_4x2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)<br>
-;---------------------------------------------------------------------------------------------------------------<br>
-INIT_XMM sse2<br>
-cglobal interp_4tap_vert_ss_4x2, 5, 6, 4<br>
-<br>
- add r1d, r1d<br>
- add r3d, r3d<br>
- sub r0, r1<br>
- shl r4d, 5<br>
-<br>
-%ifdef PIC<br>
- lea r5, [tab_ChromaCoeffV]<br>
- lea r5, [r5 + r4]<br>
-%else<br>
- lea r5, [tab_ChromaCoeffV + r4]<br>
-%endif<br>
-<br>
- movq m0, [r0]<br>
- movq m1, [r0 + r1]<br>
- punpcklwd m0, m1 ;m0=[0 1]<br>
- pmaddwd m0, [r5 + 0 *16] ;m0=[0+1] Row1<br>
-<br>
- lea r0, [r0 + 2 * r1]<br>
- movq m2, [r0]<br>
- punpcklwd m1, m2 ;m1=[1 2]<br>
- pmaddwd m1, [r5 + 0 *16] ;m1=[1+2] Row2<br>
-<br>
- movq m3, [r0 + r1]<br>
- punpcklwd m2, m3 ;m2=[2 3]<br>
- pmaddwd m2, [r5 + 1 * 16]<br>
- paddd m0, m2 ;m0=[0+1+2+3] Row1 done<br>
- psrad m0, 6<br>
-<br>
- movq m2, [r0 + 2 * r1]<br>
- punpcklwd m3, m2 ;m3=[3 4]<br>
- pmaddwd m3, [r5 + 1 * 16]<br>
- paddd m1, m3 ;m1=[1+2+3+4] Row2 done<br>
- psrad m1, 6<br>
-<br>
- packssdw m0, m1<br>
-<br>
- movlps [r2], m0<br>
- movhps [r2 + r3], m0<br>
-<br>
- RET<br>
-<br>
-;-------------------------------------------------------------------------------------------------------------------<br>
-; void interp_4tap_vert_ss_6x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)<br>
-;-------------------------------------------------------------------------------------------------------------------<br>
-%macro FILTER_VER_CHROMA_SS_W6_H4 2<br>
-INIT_XMM sse4<br>
-cglobal interp_4tap_vert_ss_6x%2, 5, 7, 6<br>
-<br>
- add r1d, r1d<br>
- add r3d, r3d<br>
- sub r0, r1<br>
- shl r4d, 5<br>
-<br>
-%ifdef PIC<br>
- lea r5, [tab_ChromaCoeffV]<br>
- lea r6, [r5 + r4]<br>
-%else<br>
- lea r6, [tab_ChromaCoeffV + r4]<br>
-%endif<br>
-<br>
- mov r4d, %2/4<br>
-<br>
-.loopH:<br>
- PROCESS_CHROMA_SP_W4_4R<br>
-<br>
- psrad m0, 6<br>
- psrad m1, 6<br>
- psrad m2, 6<br>
- psrad m3, 6<br>
-<br>
- packssdw m0, m1<br>
- packssdw m2, m3<br>
-<br>
- movlps [r2], m0<br>
- movhps [r2 + r3], m0<br>
- lea r5, [r2 + 2 * r3]<br>
- movlps [r5], m2<br>
- movhps [r5 + r3], m2<br>
-<br>
- lea r5, [4 * r1 - 2 * 4]<br>
- sub r0, r5<br>
- add r2, 2 * 4<br>
-<br>
- PROCESS_CHROMA_SP_W2_4R r6<br>
-<br>
- psrad m0, 6<br>
- psrad m2, 6<br>
-<br>
- packssdw m0, m2<br>
-<br>
- movd [r2], m0<br>
- pextrd [r2 + r3], m0, 1<br>
- lea r2, [r2 + 2 * r3]<br>
- pextrd [r2], m0, 2<br>
- pextrd [r2 + r3], m0, 3<br>
-<br>
- sub r0, 2 * 4<br>
- lea r2, [r2 + 2 * r3 - 2 * 4]<br>
-<br>
- dec r4d<br>
- jnz .loopH<br>
-<br>
- RET<br>
-%endmacro<br>
-<br>
-FILTER_VER_CHROMA_SS_W6_H4 6, 8<br>
-<br>
-FILTER_VER_CHROMA_SS_W6_H4 6, 16<br>
-<br>
-<br>
-;----------------------------------------------------------------------------------------------------------------<br>
-; void interp_4tap_vert_ss_8x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)<br>
-;----------------------------------------------------------------------------------------------------------------<br>
-%macro FILTER_VER_CHROMA_SS_W8_H2 2<br>
-INIT_XMM sse2<br>
-cglobal interp_4tap_vert_ss_%1x%2, 5, 6, 7<br>
-<br>
- add r1d, r1d<br>
- add r3d, r3d<br>
- sub r0, r1<br>
- shl r4d, 5<br>
-<br>
-%ifdef PIC<br>
- lea r5, [tab_ChromaCoeffV]<br>
- lea r5, [r5 + r4]<br>
-%else<br>
- lea r5, [tab_ChromaCoeffV + r4]<br>
-%endif<br>
-<br>
- mov r4d, %2/2<br>
-.loopH:<br>
- PROCESS_CHROMA_SP_W8_2R<br>
-<br>
- psrad m0, 6<br>
- psrad m1, 6<br>
- psrad m2, 6<br>
- psrad m3, 6<br>
-<br>
- packssdw m0, m1<br>
- packssdw m2, m3<br>
-<br>
- movu [r2], m0<br>
- movu [r2 + r3], m2<br>
-<br>
- lea r2, [r2 + 2 * r3]<br>
-<br>
- dec r4d<br>
- jnz .loopH<br>
-<br>
- RET<br>
-%endmacro<br>
-<br>
-FILTER_VER_CHROMA_SS_W8_H2 8, 2<br>
-FILTER_VER_CHROMA_SS_W8_H2 8, 4<br>
-FILTER_VER_CHROMA_SS_W8_H2 8, 6<br>
-FILTER_VER_CHROMA_SS_W8_H2 8, 8<br>
-FILTER_VER_CHROMA_SS_W8_H2 8, 16<br>
-FILTER_VER_CHROMA_SS_W8_H2 8, 32<br>
-<br>
-FILTER_VER_CHROMA_SS_W8_H2 8, 12<br>
-FILTER_VER_CHROMA_SS_W8_H2 8, 64<br>
-<br>
-;-----------------------------------------------------------------------------------------------------------------<br>
-; void interp_8tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)<br>
-;-----------------------------------------------------------------------------------------------------------------<br>
-%macro FILTER_VER_LUMA_SS 2<br>
-INIT_XMM sse2<br>
-cglobal interp_8tap_vert_ss_%1x%2, 5, 7, 7 ,0-gprsize<br>
-<br>
- add r1d, r1d<br>
- add r3d, r3d<br>
- lea r5, [3 * r1]<br>
- sub r0, r5<br>
- shl r4d, 6<br>
-<br>
-%ifdef PIC<br>
- lea r5, [tab_LumaCoeffV]<br>
- lea r6, [r5 + r4]<br>
-%else<br>
- lea r6, [tab_LumaCoeffV + r4]<br>
-%endif<br>
-<br>
- mov dword [rsp], %2/4<br>
-.loopH:<br>
- mov r4d, (%1/4)<br>
-.loopW:<br>
- movq m0, [r0]<br>
- movq m1, [r0 + r1]<br>
- punpcklwd m0, m1 ;m0=[0 1]<br>
- pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1<br>
-<br>
- lea r0, [r0 + 2 * r1]<br>
- movq m4, [r0]<br>
- punpcklwd m1, m4 ;m1=[1 2]<br>
- pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2<br>
-<br>
- movq m5, [r0 + r1]<br>
- punpcklwd m4, m5 ;m4=[2 3]<br>
- pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3<br>
- pmaddwd m4, [r6 + 1 * 16]<br>
- paddd m0, m4 ;m0=[0+1+2+3] Row1<br>
-<br>
- lea r0, [r0 + 2 * r1]<br>
- movq m4, [r0]<br>
- punpcklwd m5, m4 ;m5=[3 4]<br>
- pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4<br>
- pmaddwd m5, [r6 + 1 * 16]<br>
- paddd m1, m5 ;m1 = [1+2+3+4] Row2<br>
-<br>
- movq m5, [r0 + r1]<br>
- punpcklwd m4, m5 ;m4=[4 5]<br>
- pmaddwd m6, m4, [r6 + 1 * 16]<br>
- paddd m2, m6 ;m2=[2+3+4+5] Row3<br>
- pmaddwd m4, [r6 + 2 * 16]<br>
- paddd m0, m4 ;m0=[0+1+2+3+4+5] Row1<br>
-<br>
- lea r0, [r0 + 2 * r1]<br>
- movq m4, [r0]<br>
- punpcklwd m5, m4 ;m5=[5 6]<br>
- pmaddwd m6, m5, [r6 + 1 * 16]<br>
- paddd m3, m6 ;m3=[3+4+5+6] Row4<br>
- pmaddwd m5, [r6 + 2 * 16]<br>
- paddd m1, m5 ;m1=[1+2+3+4+5+6] Row2<br>
-<br>
- movq m5, [r0 + r1]<br>
- punpcklwd m4, m5 ;m4=[6 7]<br>
- pmaddwd m6, m4, [r6 + 2 * 16]<br>
- paddd m2, m6 ;m2=[2+3+4+5+6+7] Row3<br>
- pmaddwd m4, [r6 + 3 * 16]<br>
- paddd m0, m4 ;m0=[0+1+2+3+4+5+6+7] Row1 end<br>
- psrad m0, 6<br>
-<br>
- lea r0, [r0 + 2 * r1]<br>
- movq m4, [r0]<br>
- punpcklwd m5, m4 ;m5=[7 8]<br>
- pmaddwd m6, m5, [r6 + 2 * 16]<br>
- paddd m3, m6 ;m3=[3+4+5+6+7+8] Row4<br>
- pmaddwd m5, [r6 + 3 * 16]<br>
- paddd m1, m5 ;m1=[1+2+3+4+5+6+7+8] Row2 end<br>
- psrad m1, 6<br>
-<br>
- packssdw m0, m1<br>
-<br>
- movlps [r2], m0<br>
- movhps [r2 + r3], m0<br>
-<br>
- movq m5, [r0 + r1]<br>
- punpcklwd m4, m5 ;m4=[8 9]<br>
- pmaddwd m4, [r6 + 3 * 16]<br>
- paddd m2, m4 ;m2=[2+3+4+5+6+7+8+9] Row3 end<br>
- psrad m2, 6<br>
-<br>
- movq m4, [r0 + 2 * r1]<br>
- punpcklwd m5, m4 ;m5=[9 10]<br>
- pmaddwd m5, [r6 + 3 * 16]<br>
- paddd m3, m5 ;m3=[3+4+5+6+7+8+9+10] Row4 end<br>
- psrad m3, 6<br>
-<br>
- packssdw m2, m3<br>
-<br>
- movlps [r2 + 2 * r3], m2<br>
- lea r5, [3 * r3]<br>
- movhps [r2 + r5], m2<br>
-<br>
- lea r5, [8 * r1 - 2 * 4]<br>
- sub r0, r5<br>
- add r2, 2 * 4<br>
-<br>
- dec r4d<br>
- jnz .loopW<br>
-<br>
- lea r0, [r0 + 4 * r1 - 2 * %1]<br>
- lea r2, [r2 + 4 * r3 - 2 * %1]<br>
-<br>
- dec dword [rsp]<br>
- jnz .loopH<br>
-<br>
- RET<br>
-%endmacro<br>
-<br>
- FILTER_VER_LUMA_SS 4, 4<br>
- FILTER_VER_LUMA_SS 8, 8<br>
- FILTER_VER_LUMA_SS 8, 4<br>
- FILTER_VER_LUMA_SS 4, 8<br>
- FILTER_VER_LUMA_SS 16, 16<br>
- FILTER_VER_LUMA_SS 16, 8<br>
- FILTER_VER_LUMA_SS 8, 16<br>
- FILTER_VER_LUMA_SS 16, 12<br>
- FILTER_VER_LUMA_SS 12, 16<br>
- FILTER_VER_LUMA_SS 16, 4<br>
- FILTER_VER_LUMA_SS 4, 16<br>
- FILTER_VER_LUMA_SS 32, 32<br>
- FILTER_VER_LUMA_SS 32, 16<br>
- FILTER_VER_LUMA_SS 16, 32<br>
- FILTER_VER_LUMA_SS 32, 24<br>
- FILTER_VER_LUMA_SS 24, 32<br>
- FILTER_VER_LUMA_SS 32, 8<br>
- FILTER_VER_LUMA_SS 8, 32<br>
- FILTER_VER_LUMA_SS 64, 64<br>
- FILTER_VER_LUMA_SS 64, 32<br>
- FILTER_VER_LUMA_SS 32, 64<br>
- FILTER_VER_LUMA_SS 64, 48<br>
- FILTER_VER_LUMA_SS 48, 64<br>
- FILTER_VER_LUMA_SS 64, 16<br>
- FILTER_VER_LUMA_SS 16, 64<br>
+<br>
+;--------------------------------------------------------------------------------------------------------------<br>
+; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;--------------------------------------------------------------------------------------------------------------<br>
+%macro IPFILTER_LUMA_PP_W8 2<br>
+INIT_XMM sse4<br>
+cglobal interp_8tap_horiz_pp_%1x%2, 4,6,7<br>
+ mov r4d, r4m<br>
+<br>
+%ifdef PIC<br>
+ lea r5, [tab_LumaCoeff]<br>
+ movh m3, [r5 + r4 * 8]<br>
+%else<br>
+ movh m3, [tab_LumaCoeff + r4 * 8]<br>
+%endif<br>
+ pshufd m0, m3, 0 ; m0 = coeff-L<br>
+ pshufd m1, m3, 0x55 ; m1 = coeff-H<br>
+ lea r5, [tab_Tm] ; r5 = shuffle<br>
+ mova m2, [pw_512] ; m2 = 512<br>
+<br>
+ mov r4d, %2<br>
+.loopH:<br>
+%assign x 0<br>
+%rep %1 / 8<br>
+ movu m3, [r0 - 3 + x] ; m3 = [F E D C B A 9 8 7 6 5 4 3 2 1 0]<br>
+ pshufb m4, m3, [r5 + 0*16] ; m4 = [6 5 4 3 5 4 3 2 4 3 2 1 3 2 1 0]<br>
+ pshufb m5, m3, [r5 + 1*16] ; m5 = [A 9 8 7 9 8 7 6 8 7 6 5 7 6 5 4]<br>
+ pshufb m3, [r5 + 2*16] ; m3 = [E D C B D C B A C B A 9 B A 9 8]<br>
+ pmaddubsw m4, m0<br>
+ pmaddubsw m6, m5, m1<br>
+ pmaddubsw m5, m0<br>
+ pmaddubsw m3, m1<br>
+ paddw m4, m6<br>
+ paddw m5, m3<br>
+ phaddw m4, m5<br>
+ pmulhrsw m4, m2<br>
+ packuswb m4, m4<br>
+ movh [r2 + x], m4<br>
+%assign x x+8<br>
+%endrep<br>
+<br>
+ add r0, r1<br>
+ add r2, r3<br>
+<br>
+ dec r4d<br>
+ jnz .loopH<br>
+ RET<br>
+%endmacro<br>
+<br>
+IPFILTER_LUMA_PP_W8 8, 4<br>
+IPFILTER_LUMA_PP_W8 8, 8<br>
+IPFILTER_LUMA_PP_W8 8, 16<br>
+IPFILTER_LUMA_PP_W8 8, 32<br>
+IPFILTER_LUMA_PP_W8 16, 4<br>
+IPFILTER_LUMA_PP_W8 16, 8<br>
+IPFILTER_LUMA_PP_W8 16, 12<br>
+IPFILTER_LUMA_PP_W8 16, 16<br>
+IPFILTER_LUMA_PP_W8 16, 32<br>
+IPFILTER_LUMA_PP_W8 16, 64<br>
+IPFILTER_LUMA_PP_W8 24, 32<br>
+IPFILTER_LUMA_PP_W8 32, 8<br>
+IPFILTER_LUMA_PP_W8 32, 16<br>
+IPFILTER_LUMA_PP_W8 32, 24<br>
+IPFILTER_LUMA_PP_W8 32, 32<br>
+IPFILTER_LUMA_PP_W8 32, 64<br>
+IPFILTER_LUMA_PP_W8 48, 64<br>
+IPFILTER_LUMA_PP_W8 64, 16<br>
+IPFILTER_LUMA_PP_W8 64, 32<br>
+IPFILTER_LUMA_PP_W8 64, 48<br>
+IPFILTER_LUMA_PP_W8 64, 64<br>
+<br>
+;----------------------------------------------------------------------------------------------------------------------------<br>
+; void interp_8tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)<br>
+;----------------------------------------------------------------------------------------------------------------------------<br>
+ IPFILTER_LUMA 4, 4, ps<br>
+ IPFILTER_LUMA 8, 8, ps<br>
+ IPFILTER_LUMA 8, 4, ps<br>
+ IPFILTER_LUMA 4, 8, ps<br>
+ IPFILTER_LUMA 16, 16, ps<br>
+ IPFILTER_LUMA 16, 8, ps<br>
+ IPFILTER_LUMA 8, 16, ps<br>
+ IPFILTER_LUMA 16, 12, ps<br>
+ IPFILTER_LUMA 12, 16, ps<br>
+ IPFILTER_LUMA 16, 4, ps<br>
+ IPFILTER_LUMA 4, 16, ps<br>
+ IPFILTER_LUMA 32, 32, ps<br>
+ IPFILTER_LUMA 32, 16, ps<br>
+ IPFILTER_LUMA 16, 32, ps<br>
+ IPFILTER_LUMA 32, 24, ps<br>
+ IPFILTER_LUMA 24, 32, ps<br>
+ IPFILTER_LUMA 32, 8, ps<br>
+ IPFILTER_LUMA 8, 32, ps<br>
+ IPFILTER_LUMA 64, 64, ps<br>
+ IPFILTER_LUMA 64, 32, ps<br>
+ IPFILTER_LUMA 32, 64, ps<br>
+ IPFILTER_LUMA 64, 48, ps<br>
+ IPFILTER_LUMA 48, 64, ps<br>
+ IPFILTER_LUMA 64, 16, ps<br>
+ IPFILTER_LUMA 16, 64, ps<br>
+<br>
+;-----------------------------------------------------------------------------<br>
+; Interpolate HV<br>
+;-----------------------------------------------------------------------------<br>
+%macro FILTER_HV8_START 7 ; (t0, t1, t2, t3, t4, off_src, off_coeff) -> (t3, t5), (t4, t1), [2]<br>
+ mova %5, [r0 + (%6 + 0) * 16]<br>
+ mova %1, [r0 + (%6 + 1) * 16]<br>
+ mova %2, [r0 + (%6 + 2) * 16]<br>
+ punpcklwd %3, %5, %1<br>
+ punpckhwd %5, %1<br>
+ pmaddwd %3, [r5 + (%7) * 16] ; R3 = L[0+1] -- Row 0<br>
+ pmaddwd %5, [r5 + (%7) * 16] ; R0 = H[0+1]<br>
+ punpcklwd %4, %1, %2<br>
+ punpckhwd %1, %2<br>
+ pmaddwd %4, [r5 + (%7) * 16] ; R4 = L[1+2] -- Row 1<br>
+ pmaddwd %1, [r5 + (%7) * 16] ; R1 = H[1+2]<br>
+%endmacro ; FILTER_HV8_START<br>
+<br>
+%macro FILTER_HV8_MID 10 ; (Row3, prevRow, sum0L, sum1L, sum0H, sum1H, t6, t7, off_src, off_coeff) -> [6]<br>
+ mova %8, [r0 + (%9 + 0) * 16]<br>
+ mova %1, [r0 + (%9 + 1) * 16]<br>
+ punpcklwd %7, %2, %8<br>
+ punpckhwd %2, %8<br>
+ pmaddwd %7, [r5 + %10 * 16]<br>
+ pmaddwd %2, [r5 + %10 * 16]<br>
+ paddd %3, %7 ; R3 = L[0+1+2+3] -- Row 0<br>
+ paddd %5, %2 ; R0 = H[0+1+2+3]<br>
+ punpcklwd %7, %8, %1<br>
+ punpckhwd %8, %1<br>
+ pmaddwd %7, [r5 + %10 * 16]<br>
+ pmaddwd %8, [r5 + %10 * 16]<br>
+ paddd %4, %7 ; R4 = L[1+2+3+4] -- Row 1<br>
+ paddd %6, %8 ; R1 = H[1+2+3+4]<br>
+%endmacro ; FILTER_HV8_MID<br>
+<br>
+; Round and Saturate<br>
+%macro FILTER_HV8_END 4 ; output in [1]<br>
+ paddd %1, [tab_c_526336]<br>
+ paddd %2, [tab_c_526336]<br>
+ paddd %3, [tab_c_526336]<br>
+ paddd %4, [tab_c_526336]<br>
+ psrad %1, 12<br>
+ psrad %2, 12<br>
+ psrad %3, 12<br>
+ psrad %4, 12<br>
+ packssdw %1, %2<br>
+ packssdw %3, %4<br>
+<br>
+ ; TODO: would merging be better? I think this way keeps the dependency chain short<br>
+ packuswb %1, %3<br>
+%endmacro ; FILTER_HV8_END<br>
+<br>
+;-----------------------------------------------------------------------------<br>
+; void interp_8tap_hv_pp_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY)<br>
+;-----------------------------------------------------------------------------<br>
+INIT_XMM ssse3<br>
+cglobal interp_8tap_hv_pp_8x8, 4, 7, 8, 0-15*16<br>
+%define coef m7<br>
+%define stk_buf rsp<br>
+<br>
+ mov r4d, r4m<br>
+ mov r5d, r5m<br>
+<br>
+%ifdef PIC<br>
+ lea r6, [tab_LumaCoeff]<br>
+ movh coef, [r6 + r4 * 8]<br>
+%else<br>
+ movh coef, [tab_LumaCoeff + r4 * 8]<br>
+%endif<br>
+ punpcklqdq coef, coef<br>
+<br>
+ ; move to row -3<br>
+ lea r6, [r1 + r1 * 2]<br>
+ sub r0, r6<br>
+<br>
+ xor r6, r6<br>
+ mov r4, rsp<br>
+<br>
+.loopH:<br>
+ FILTER_H8_W8 m0, m1, m2, m3, coef, [pw_512], [r0 - 3]<br>
+ psubw m1, [pw_2000]<br>
+ mova [r4], m1<br>
+<br>
+ add r0, r1<br>
+ add r4, 16<br>
+ inc r6<br>
+ cmp r6, 8+7<br>
+ jnz .loopH<br>
+<br>
+ ; ready for phase V<br>
+ ; all mN registers are free here<br>
+<br>
+ ; load coeff table<br>
+ shl r5, 6<br>
+ lea r6, [tab_LumaCoeffV]<br>
+ lea r5, [r5 + r6]<br>
+<br>
+ ; load intermediate buffer<br>
+ mov r0, stk_buf<br>
+<br>
+ ; register mapping<br>
+ ; r0 - src<br>
+ ; r5 - coeff<br>
+ ; r6 - loop_i<br>
+<br>
+ ; let's go<br>
+ xor r6, r6<br>
+<br>
+ ; TODO: this loop has more than 70 instructions; I think that is more than the Intel loop decode cache holds<br>
+.loopV:<br>
+<br>
+ FILTER_HV8_START m1, m2, m3, m4, m0, 0, 0<br>
+ FILTER_HV8_MID m6, m2, m3, m4, m0, m1, m7, m5, 3, 1<br>
+ FILTER_HV8_MID m5, m6, m3, m4, m0, m1, m7, m2, 5, 2<br>
+ FILTER_HV8_MID m6, m5, m3, m4, m0, m1, m7, m2, 7, 3<br>
+ FILTER_HV8_END m3, m0, m4, m1<br>
+<br>
+ movh [r2], m3<br>
+ movhps [r2 + r3], m3<br>
+<br>
+ lea r0, [r0 + 16 * 2]<br>
+ lea r2, [r2 + r3 * 2]<br>
+<br>
+ inc r6<br>
+ cmp r6, 8/2<br>
+ jnz .loopV<br>
+<br>
+ RET<br>
+<br>
+;-----------------------------------------------------------------------------<br>
+;void interp_4tap_vert_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-----------------------------------------------------------------------------<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_vert_pp_2x4, 4, 6, 8<br>
+<br>
+mov r4d, r4m<br>
+sub r0, r1<br>
+<br>
+%ifdef PIC<br>
+lea r5, [tab_ChromaCoeff]<br>
+movd m0, [r5 + r4 * 4]<br>
+%else<br>
+movd m0, [tab_ChromaCoeff + r4 * 4]<br>
+%endif<br>
+lea r4, [r1 * 3]<br>
+lea r5, [r0 + 4 * r1]<br>
+pshufb m0, [tab_Cm]<br>
+mova m1, [pw_512]<br>
+<br>
+movd m2, [r0]<br>
+movd m3, [r0 + r1]<br>
+movd m4, [r0 + 2 * r1]<br>
+movd m5, [r0 + r4]<br>
+<br>
+punpcklbw m2, m3<br>
+punpcklbw m6, m4, m5<br>
+punpcklbw m2, m6<br>
+<br>
+pmaddubsw m2, m0<br>
+<br>
+movd m6, [r5]<br>
+<br>
+punpcklbw m3, m4<br>
+punpcklbw m7, m5, m6<br>
+punpcklbw m3, m7<br>
+<br>
+pmaddubsw m3, m0<br>
+<br>
+phaddw m2, m3<br>
+<br>
+pmulhrsw m2, m1<br>
+<br>
+movd m7, [r5 + r1]<br>
+<br>
+punpcklbw m4, m5<br>
+punpcklbw m3, m6, m7<br>
+punpcklbw m4, m3<br>
+<br>
+pmaddubsw m4, m0<br>
+<br>
+movd m3, [r5 + 2 * r1]<br>
+<br>
+punpcklbw m5, m6<br>
+punpcklbw m7, m3<br>
+punpcklbw m5, m7<br>
+<br>
+pmaddubsw m5, m0<br>
+<br>
+phaddw m4, m5<br>
+<br>
+pmulhrsw m4, m1<br>
+packuswb m2, m4<br>
+<br>
+pextrw [r2], m2, 0<br>
+pextrw [r2 + r3], m2, 2<br>
+lea r2, [r2 + 2 * r3]<br>
+pextrw [r2], m2, 4<br>
+pextrw [r2 + r3], m2, 6<br>
+<br>
+RET<br>
+<br>
+;-----------------------------------------------------------------------------<br>
+; void interp_4tap_vert_pp_2x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-----------------------------------------------------------------------------<br>
+%macro FILTER_V4_W2_H4 2<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_vert_pp_2x%2, 4, 6, 8<br>
+<br>
+mov r4d, r4m<br>
+sub r0, r1<br>
+<br>
+%ifdef PIC<br>
+lea r5, [tab_ChromaCoeff]<br>
+movd m0, [r5 + r4 * 4]<br>
+%else<br>
+movd m0, [tab_ChromaCoeff + r4 * 4]<br>
+%endif<br>
+<br>
+pshufb m0, [tab_Cm]<br>
+<br>
+mova m1, [pw_512]<br>
+<br>
+mov r4d, %2<br>
+lea r5, [3 * r1]<br>
+<br>
+.loop:<br>
+movd m2, [r0]<br>
+movd m3, [r0 + r1]<br>
+movd m4, [r0 + 2 * r1]<br>
+movd m5, [r0 + r5]<br>
+<br>
+punpcklbw m2, m3<br>
+punpcklbw m6, m4, m5<br>
+punpcklbw m2, m6<br>
+<br>
+pmaddubsw m2, m0<br>
+<br>
+lea r0, [r0 + 4 * r1]<br>
+movd m6, [r0]<br>
+<br>
+punpcklbw m3, m4<br>
+punpcklbw m7, m5, m6<br>
+punpcklbw m3, m7<br>
+<br>
+pmaddubsw m3, m0<br>
+<br>
+phaddw m2, m3<br>
+<br>
+pmulhrsw m2, m1<br>
+<br>
+movd m7, [r0 + r1]<br>
+<br>
+punpcklbw m4, m5<br>
+punpcklbw m3, m6, m7<br>
+punpcklbw m4, m3<br>
+<br>
+pmaddubsw m4, m0<br>
+<br>
+movd m3, [r0 + 2 * r1]<br>
+<br>
+punpcklbw m5, m6<br>
+punpcklbw m7, m3<br>
+punpcklbw m5, m7<br>
+<br>
+pmaddubsw m5, m0<br>
+<br>
+phaddw m4, m5<br>
+<br>
+pmulhrsw m4, m1<br>
+packuswb m2, m4<br>
+<br>
+pextrw [r2], m2, 0<br>
+pextrw [r2 + r3], m2, 2<br>
+lea r2, [r2 + 2 * r3]<br>
+pextrw [r2], m2, 4<br>
+pextrw [r2 + r3], m2, 6<br>
+<br>
+lea r2, [r2 + 2 * r3]<br>
+<br>
+sub r4, 4<br>
+jnz .loop<br>
+RET<br>
+%endmacro<br>
+<br>
+FILTER_V4_W2_H4 2, 8<br>
+<br>
+FILTER_V4_W2_H4 2, 16<br>
+<br>
+;-----------------------------------------------------------------------------<br>
+; void interp_4tap_vert_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-----------------------------------------------------------------------------<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_vert_pp_4x2, 4, 6, 6<br>
+<br>
+mov r4d, r4m<br>
+sub r0, r1<br>
+<br>
+%ifdef PIC<br>
+lea r5, [tab_ChromaCoeff]<br>
+movd m0, [r5 + r4 * 4]<br>
+%else<br>
+movd m0, [tab_ChromaCoeff + r4 * 4]<br>
+%endif<br>
+<br>
+pshufb m0, [tab_Cm]<br>
+lea r5, [r0 + 2 * r1]<br>
+<br>
+movd m2, [r0]<br>
+movd m3, [r0 + r1]<br>
+movd m4, [r5]<br>
+movd m5, [r5 + r1]<br>
+<br>
+punpcklbw m2, m3<br>
+punpcklbw m1, m4, m5<br>
+punpcklbw m2, m1<br>
+<br>
+pmaddubsw m2, m0<br>
+<br>
+movd m1, [r0 + 4 * r1]<br>
+<br>
+punpcklbw m3, m4<br>
+punpcklbw m5, m1<br>
+punpcklbw m3, m5<br>
+<br>
+pmaddubsw m3, m0<br>
+<br>
+phaddw m2, m3<br>
+<br>
+pmulhrsw m2, [pw_512]<br>
+packuswb m2, m2<br>
+movd [r2], m2<br>
+pextrd [r2 + r3], m2, 1<br>
+<br>
+RET<br>
+<br>
+;-----------------------------------------------------------------------------<br>
+; void interp_4tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-----------------------------------------------------------------------------<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_vert_pp_4x4, 4, 6, 8<br>
+<br>
+mov r4d, r4m<br>
+sub r0, r1<br>
+<br>
+%ifdef PIC<br>
+lea r5, [tab_ChromaCoeff]<br>
+movd m0, [r5 + r4 * 4]<br>
+%else<br>
+movd m0, [tab_ChromaCoeff + r4 * 4]<br>
+%endif<br>
+<br>
+pshufb m0, [tab_Cm]<br>
+mova m1, [pw_512]<br>
+lea r5, [r0 + 4 * r1]<br>
+lea r4, [r1 * 3]<br>
+<br>
+movd m2, [r0]<br>
+movd m3, [r0 + r1]<br>
+movd m4, [r0 + 2 * r1]<br>
+movd m5, [r0 + r4]<br>
+<br>
+punpcklbw m2, m3<br>
+punpcklbw m6, m4, m5<br>
+punpcklbw m2, m6<br>
+<br>
+pmaddubsw m2, m0<br>
+<br>
+movd m6, [r5]<br>
+<br>
+punpcklbw m3, m4<br>
+punpcklbw m7, m5, m6<br>
+punpcklbw m3, m7<br>
+<br>
+pmaddubsw m3, m0<br>
+<br>
+phaddw m2, m3<br>
+<br>
+pmulhrsw m2, m1<br>
+<br>
+movd m7, [r5 + r1]<br>
+<br>
+punpcklbw m4, m5<br>
+punpcklbw m3, m6, m7<br>
+punpcklbw m4, m3<br>
+<br>
+pmaddubsw m4, m0<br>
+<br>
+movd m3, [r5 + 2 * r1]<br>
+<br>
+punpcklbw m5, m6<br>
+punpcklbw m7, m3<br>
+punpcklbw m5, m7<br>
+<br>
+pmaddubsw m5, m0<br>
+<br>
+phaddw m4, m5<br>
+<br>
+pmulhrsw m4, m1<br>
+<br>
+packuswb m2, m4<br>
+movd [r2], m2<br>
+pextrd [r2 + r3], m2, 1<br>
+lea r2, [r2 + 2 * r3]<br>
+pextrd [r2], m2, 2<br>
+pextrd [r2 + r3], m2, 3<br>
+<br>
+RET<br>
+<br>
+;-----------------------------------------------------------------------------<br>
+; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-----------------------------------------------------------------------------<br>
+%macro FILTER_V4_W4_H4 2<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8<br>
+<br>
+mov r4d, r4m<br>
+sub r0, r1<br>
+<br>
+%ifdef PIC<br>
+lea r5, [tab_ChromaCoeff]<br>
+movd m0, [r5 + r4 * 4]<br>
+%else<br>
+movd m0, [tab_ChromaCoeff + r4 * 4]<br>
+%endif<br>
+<br>
+pshufb m0, [tab_Cm]<br>
+<br>
+mova m1, [pw_512]<br>
+<br>
+mov r4d, %2<br>
+<br>
+lea r5, [3 * r1]<br>
+<br>
+.loop:<br>
+movd m2, [r0]<br>
+movd m3, [r0 + r1]<br>
+movd m4, [r0 + 2 * r1]<br>
+movd m5, [r0 + r5]<br>
+<br>
+punpcklbw m2, m3<br>
+punpcklbw m6, m4, m5<br>
+punpcklbw m2, m6<br>
+<br>
+pmaddubsw m2, m0<br>
+<br>
+lea r0, [r0 + 4 * r1]<br>
+movd m6, [r0]<br>
+<br>
+punpcklbw m3, m4<br>
+punpcklbw m7, m5, m6<br>
+punpcklbw m3, m7<br>
+<br>
+pmaddubsw m3, m0<br>
+<br>
+phaddw m2, m3<br>
+<br>
+pmulhrsw m2, m1<br>
+<br>
+movd m7, [r0 + r1]<br>
+<br>
+punpcklbw m4, m5<br>
+punpcklbw m3, m6, m7<br>
+punpcklbw m4, m3<br>
+<br>
+pmaddubsw m4, m0<br>
+<br>
+movd m3, [r0 + 2 * r1]<br>
+<br>
+punpcklbw m5, m6<br>
+punpcklbw m7, m3<br>
+punpcklbw m5, m7<br>
+<br>
+pmaddubsw m5, m0<br>
+<br>
+phaddw m4, m5<br>
+<br>
+pmulhrsw m4, m1<br>
+packuswb m2, m4<br>
+movd [r2], m2<br>
+pextrd [r2 + r3], m2, 1<br>
+lea r2, [r2 + 2 * r3]<br>
+pextrd [r2], m2, 2<br>
+pextrd [r2 + r3], m2, 3<br>
+<br>
+lea r2, [r2 + 2 * r3]<br>
+<br>
+sub r4, 4<br>
+jnz .loop<br>
+RET<br>
+%endmacro<br>
+<br>
+FILTER_V4_W4_H4 4, 8<br>
+FILTER_V4_W4_H4 4, 16<br>
+<br>
+FILTER_V4_W4_H4 4, 32<br>
+<br>
+%macro FILTER_V4_W8_H2 0<br>
+punpcklbw m1, m2<br>
+punpcklbw m7, m3, m0<br>
+<br>
+pmaddubsw m1, m6<br>
+pmaddubsw m7, m5<br>
+<br>
+paddw m1, m7<br>
+<br>
+pmulhrsw m1, m4<br>
+packuswb m1, m1<br>
+%endmacro<br>
+<br>
+%macro FILTER_V4_W8_H3 0<br>
+punpcklbw m2, m3<br>
+punpcklbw m7, m0, m1<br>
+<br>
+pmaddubsw m2, m6<br>
+pmaddubsw m7, m5<br>
+<br>
+paddw m2, m7<br>
+<br>
+pmulhrsw m2, m4<br>
+packuswb m2, m2<br>
+%endmacro<br>
+<br>
+%macro FILTER_V4_W8_H4 0<br>
+punpcklbw m3, m0<br>
+punpcklbw m7, m1, m2<br>
+<br>
+pmaddubsw m3, m6<br>
+pmaddubsw m7, m5<br>
+<br>
+paddw m3, m7<br>
+<br>
+pmulhrsw m3, m4<br>
+packuswb m3, m3<br>
+%endmacro<br>
+<br>
+%macro FILTER_V4_W8_H5 0<br>
+punpcklbw m0, m1<br>
+punpcklbw m7, m2, m3<br>
+<br>
+pmaddubsw m0, m6<br>
+pmaddubsw m7, m5<br>
+<br>
+paddw m0, m7<br>
+<br>
+pmulhrsw m0, m4<br>
+packuswb m0, m0<br>
+%endmacro<br>
+<br>
+%macro FILTER_V4_W8_8x2 2<br>
+FILTER_V4_W8 %1, %2<br>
+movq m0, [r0 + 4 * r1]<br>
+<br>
+FILTER_V4_W8_H2<br>
+<br>
+movh [r2 + r3], m1<br>
+%endmacro<br>
+<br>
+%macro FILTER_V4_W8_8x4 2<br>
+FILTER_V4_W8_8x2 %1, %2<br>
+;8x3<br>
+lea r6, [r0 + 4 * r1]<br>
+movq m1, [r6 + r1]<br>
+<br>
+FILTER_V4_W8_H3<br>
+<br>
+movh [r2 + 2 * r3], m2<br>
+<br>
+;8x4<br>
+movq m2, [r6 + 2 * r1]<br>
+<br>
+FILTER_V4_W8_H4<br>
+<br>
+lea r5, [r2 + 2 * r3]<br>
+movh [r5 + r3], m3<br>
+%endmacro<br>
+<br>
+%macro FILTER_V4_W8_8x6 2<br>
+FILTER_V4_W8_8x4 %1, %2<br>
+;8x5<br>
+lea r6, [r6 + 2 * r1]<br>
+movq m3, [r6 + r1]<br>
+<br>
+FILTER_V4_W8_H5<br>
+<br>
+movh [r2 + 4 * r3], m0<br>
+<br>
+;8x6<br>
+movq m0, [r0 + 8 * r1]<br>
+<br>
+FILTER_V4_W8_H2<br>
+<br>
+lea r5, [r2 + 4 * r3]<br>
+movh [r5 + r3], m1<br>
+%endmacro<br>
+<br>
+;-----------------------------------------------------------------------------<br>
+; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-----------------------------------------------------------------------------<br>
+%macro FILTER_V4_W8 2<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8<br>
+<br>
+mov r4d, r4m<br>
+<br>
+sub r0, r1<br>
+movq m0, [r0]<br>
+movq m1, [r0 + r1]<br>
+movq m2, [r0 + 2 * r1]<br>
+lea r5, [r0 + 2 * r1]<br>
+movq m3, [r5 + r1]<br>
+<br>
+punpcklbw m0, m1<br>
+punpcklbw m4, m2, m3<br>
+<br>
+%ifdef PIC<br>
+lea r6, [tab_ChromaCoeff]<br>
+movd m5, [r6 + r4 * 4]<br>
+%else<br>
+movd m5, [tab_ChromaCoeff + r4 * 4]<br>
+%endif<br>
+<br>
+pshufb m6, m5, [tab_Vm]<br>
+pmaddubsw m0, m6<br>
+<br>
+pshufb m5, [tab_Vm + 16]<br>
+pmaddubsw m4, m5<br>
+<br>
+paddw m0, m4<br>
+<br>
+mova m4, [pw_512]<br>
+<br>
+pmulhrsw m0, m4<br>
+packuswb m0, m0<br>
+movh [r2], m0<br>
+%endmacro<br>
+<br>
+;-----------------------------------------------------------------------------<br>
+; void interp_4tap_vert_pp_8x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-----------------------------------------------------------------------------<br>
+FILTER_V4_W8_8x2 8, 2<br>
+<br>
+RET<br>
+<br>
+;-----------------------------------------------------------------------------<br>
+; void interp_4tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-----------------------------------------------------------------------------<br>
+FILTER_V4_W8_8x4 8, 4<br>
+<br>
+RET<br>
+<br>
+;-----------------------------------------------------------------------------<br>
+; void interp_4tap_vert_pp_8x6(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-----------------------------------------------------------------------------<br>
+FILTER_V4_W8_8x6 8, 6<br>
+<br>
+RET<br>
+<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+; void interp_4tap_vert_ps_4x2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_vert_ps_4x2, 4, 6, 6<br>
+<br>
+mov r4d, r4m<br>
+sub r0, r1<br>
+add r3d, r3d<br>
+<br>
+%ifdef PIC<br>
+lea r5, [tab_ChromaCoeff]<br>
+movd m0, [r5 + r4 * 4]<br>
+%else<br>
+movd m0, [tab_ChromaCoeff + r4 * 4]<br>
+%endif<br>
+<br>
+pshufb m0, [tab_Cm]<br>
+<br>
+movd m2, [r0]<br>
+movd m3, [r0 + r1]<br>
+lea r5, [r0 + 2 * r1]<br>
+movd m4, [r5]<br>
+movd m5, [r5 + r1]<br>
+<br>
+punpcklbw m2, m3<br>
+punpcklbw m1, m4, m5<br>
+punpcklbw m2, m1<br>
+<br>
+pmaddubsw m2, m0<br>
+<br>
+movd m1, [r0 + 4 * r1]<br>
+<br>
+punpcklbw m3, m4<br>
+punpcklbw m5, m1<br>
+punpcklbw m3, m5<br>
+<br>
+pmaddubsw m3, m0<br>
+<br>
+phaddw m2, m3<br>
+<br>
+psubw m2, [pw_2000]<br>
+movh [r2], m2<br>
+movhps [r2 + r3], m2<br>
+<br>
+RET<br>
+<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+; void interp_4tap_vert_ps_4x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_vert_ps_4x4, 4, 6, 7<br>
+<br>
+ mov r4d, r4m<br>
+ sub r0, r1<br>
+ add r3d, r3d<br>
+<br>
+%ifdef PIC<br>
+ lea r5, [tab_ChromaCoeff]<br>
+ movd m0, [r5 + r4 * 4]<br>
+%else<br>
+ movd m0, [tab_ChromaCoeff + r4 * 4]<br>
+%endif<br>
+<br>
+ pshufb m0, [tab_Cm]<br>
+<br>
+ lea r4, [r1 * 3]<br>
+ lea r5, [r0 + 4 * r1]<br>
+<br>
+ movd m2, [r0]<br>
+ movd m3, [r0 + r1]<br>
+ movd m4, [r0 + 2 * r1]<br>
+ movd m5, [r0 + r4]<br>
+<br>
+ punpcklbw m2, m3<br>
+ punpcklbw m6, m4, m5<br>
+ punpcklbw m2, m6<br>
+<br>
+ pmaddubsw m2, m0<br>
+<br>
+ movd m6, [r5]<br>
+<br>
+ punpcklbw m3, m4<br>
+ punpcklbw m1, m5, m6<br>
+ punpcklbw m3, m1<br>
+<br>
+ pmaddubsw m3, m0<br>
+<br>
+ phaddw m2, m3<br>
+<br>
+ mova m1, [pw_2000]<br>
+<br>
+ psubw m2, m1<br>
+ movh [r2], m2<br>
+ movhps [r2 + r3], m2<br>
+<br>
+ movd m2, [r5 + r1]<br>
+<br>
+ punpcklbw m4, m5<br>
+ punpcklbw m3, m6, m2<br>
+ punpcklbw m4, m3<br>
+<br>
+ pmaddubsw m4, m0<br>
+<br>
+ movd m3, [r5 + 2 * r1]<br>
+<br>
+ punpcklbw m5, m6<br>
+ punpcklbw m2, m3<br>
+ punpcklbw m5, m2<br>
+<br>
+ pmaddubsw m5, m0<br>
+<br>
+ phaddw m4, m5<br>
+<br>
+ psubw m4, m1<br>
+ lea r2, [r2 + 2 * r3]<br>
+ movh [r2], m4<br>
+ movhps [r2 + r3], m4<br>
+<br>
+ RET<br>
+<br>
+;---------------------------------------------------------------------------------------------------------------<br>
+; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)<br>
+;---------------------------------------------------------------------------------------------------------------<br>
+%macro FILTER_V_PS_W4_H4 2<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8<br>
+<br>
+ mov r4d, r4m<br>
+ sub r0, r1<br>
+ add r3d, r3d<br>
+<br>
+%ifdef PIC<br>
+ lea r5, [tab_ChromaCoeff]<br>
+ movd m0, [r5 + r4 * 4]<br>
+%else<br>
+ movd m0, [tab_ChromaCoeff + r4 * 4]<br>
+%endif<br>
+<br>
+ pshufb m0, [tab_Cm]<br>
+<br>
+ mova m1, [pw_2000]<br>
+<br>
+ mov r4d, %2/4<br>
+ lea r5, [3 * r1]<br>
+<br>
+.loop:<br>
+ movd m2, [r0]<br>
+ movd m3, [r0 + r1]<br>
+ movd m4, [r0 + 2 * r1]<br>
+ movd m5, [r0 + r5]<br>
+<br>
+ punpcklbw m2, m3<br>
+ punpcklbw m6, m4, m5<br>
+ punpcklbw m2, m6<br>
+<br>
+ pmaddubsw m2, m0<br>
+<br>
+ lea r0, [r0 + 4 * r1]<br>
+ movd m6, [r0]<br>
+<br>
+ punpcklbw m3, m4<br>
+ punpcklbw m7, m5, m6<br>
+ punpcklbw m3, m7<br>
+<br>
+ pmaddubsw m3, m0<br>
+<br>
+ phaddw m2, m3<br>
+<br>
+ psubw m2, m1<br>
+ movh [r2], m2<br>
+ movhps [r2 + r3], m2<br>
+<br>
+ movd m2, [r0 + r1]<br>
+<br>
+ punpcklbw m4, m5<br>
+ punpcklbw m3, m6, m2<br>
+ punpcklbw m4, m3<br>
+<br>
+ pmaddubsw m4, m0<br>
+<br>
+ movd m3, [r0 + 2 * r1]<br>
+<br>
+ punpcklbw m5, m6<br>
+ punpcklbw m2, m3<br>
+ punpcklbw m5, m2<br>
+<br>
+ pmaddubsw m5, m0<br>
+<br>
+ phaddw m4, m5<br>
+<br>
+ psubw m4, m1<br>
+ lea r2, [r2 + 2 * r3]<br>
+ movh [r2], m4<br>
+ movhps [r2 + r3], m4<br>
+<br>
+ lea r2, [r2 + 2 * r3]<br>
+<br>
+ dec r4d<br>
+ jnz .loop<br>
+ RET<br>
+%endmacro<br>
+<br>
+FILTER_V_PS_W4_H4 4, 8<br>
+FILTER_V_PS_W4_H4 4, 16<br>
+<br>
+FILTER_V_PS_W4_H4 4, 32<br>
+<br>
+;--------------------------------------------------------------------------------------------------------------<br>
+; void interp_4tap_vert_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)<br>
+;--------------------------------------------------------------------------------------------------------------<br>
+%macro FILTER_V_PS_W8_H8_H16_H2 2<br>
+; Expands to interp_4tap_vert_ps_%1x%2: 4-tap vertical filter over 8-pixel-wide rows with<br>
+; 16-bit output. %2 is the row count; each .loopH pass produces two output rows.<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 7<br>
+<br>
+ ; r4 = coeffIdx (5th argument). src is stepped back one row so the taps cover rows -1..+2.<br>
+ mov r4d, r4m<br>
+ sub r0, r1<br>
+ ; dst is int16_t*, so the stride is doubled (presumably elements to bytes). The full register<br>
+ ; is used: "add r3d, r3d" would zero the upper half of the intptr_t stride on x86-64.<br>
+ add r3, r3<br>
+<br>
+%ifdef PIC<br>
+ lea r5, [tab_ChromaCoeff]<br>
+ movd m5, [r5 + r4 * 4]<br>
+%else<br>
+ movd m5, [tab_ChromaCoeff + r4 * 4]<br>
+%endif<br>
+<br>
+ ; m6 multiplies the interleaved first two tap rows, m5 the last two (shuffles from tab_Vm,<br>
+ ; defined elsewhere). m4 = pw_2000, subtracted from every 16-bit result.<br>
+ pshufb m6, m5, [tab_Vm]<br>
+ pshufb m5, [tab_Vm + 16]<br>
+ mova m4, [pw_2000]<br>
+<br>
+ ; r4 = loop counter (two rows per pass), r5 = 3 * srcStride.<br>
+ mov r4d, %2/2<br>
+ lea r5, [3 * r1]<br>
+<br>
+.loopH:<br>
+ movq m0, [r0]<br>
+ movq m1, [r0 + r1]<br>
+ movq m2, [r0 + 2 * r1]<br>
+ movq m3, [r0 + r5]<br>
+<br>
+ ; interleave neighbouring rows: m0 = rows 0|1, m1 = rows 1|2, m2 = rows 2|3<br>
+ punpcklbw m0, m1<br>
+ punpcklbw m1, m2<br>
+ punpcklbw m2, m3<br>
+<br>
+ pmaddubsw m0, m6<br>
+ pmaddubsw m2, m5<br>
+<br>
+ paddw m0, m2<br>
+<br>
+ psubw m0, m4<br>
+ movu [r2], m0<br>
+<br>
+ ; second output row needs one more source row (rows 3|4 for the last two taps)<br>
+ movq m0, [r0 + 4 * r1]<br>
+<br>
+ punpcklbw m3, m0<br>
+<br>
+ pmaddubsw m1, m6<br>
+ pmaddubsw m3, m5<br>
+<br>
+ paddw m1, m3<br>
+ psubw m1, m4<br>
+<br>
+ movu [r2 + r3], m1<br>
+<br>
+ lea r0, [r0 + 2 * r1]<br>
+ lea r2, [r2 + 2 * r3]<br>
+<br>
+ dec r4d<br>
+ jnz .loopH<br>
+<br>
+ RET<br>
+%endmacro<br>
+<br>
+FILTER_V_PS_W8_H8_H16_H2 8, 2<br>
+FILTER_V_PS_W8_H8_H16_H2 8, 4<br>
+FILTER_V_PS_W8_H8_H16_H2 8, 6<br>
+<br>
+FILTER_V_PS_W8_H8_H16_H2 8, 12<br>
+FILTER_V_PS_W8_H8_H16_H2 8, 64<br>
+<br>
+;--------------------------------------------------------------------------------------------------------------<br>
+; void interp_4tap_vert_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)<br>
+;--------------------------------------------------------------------------------------------------------------<br>
+%macro FILTER_V_PS_W8_H8_H16_H32 2<br>
+; Expands to interp_4tap_vert_ps_%1x%2: 4-tap vertical filter over 8-pixel-wide rows with<br>
+; 16-bit output. %2 is the row count; each .loop pass produces four output rows.<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8<br>
+<br>
+ ; r4 = coeffIdx (5th argument). src is stepped back one row so the taps cover rows -1..+2.<br>
+ mov r4d, r4m<br>
+ sub r0, r1<br>
+ ; dst is int16_t*, so the stride is doubled (presumably elements to bytes). The full register<br>
+ ; is used: "add r3d, r3d" would zero the upper half of the intptr_t stride on x86-64.<br>
+ add r3, r3<br>
+<br>
+%ifdef PIC<br>
+ lea r5, [tab_ChromaCoeff]<br>
+ movd m5, [r5 + r4 * 4]<br>
+%else<br>
+ movd m5, [tab_ChromaCoeff + r4 * 4]<br>
+%endif<br>
+<br>
+ ; m6 multiplies the interleaved first two tap rows, m5 the last two (shuffles from tab_Vm,<br>
+ ; defined elsewhere). m4 = pw_2000, subtracted from every 16-bit result.<br>
+ pshufb m6, m5, [tab_Vm]<br>
+ pshufb m5, [tab_Vm + 16]<br>
+ mova m4, [pw_2000]<br>
+<br>
+ ; r4 = loop counter (four rows per pass), r5 = 3 * srcStride.<br>
+ mov r4d, %2/4<br>
+ lea r5, [3 * r1]<br>
+<br>
+.loop:<br>
+ movq m0, [r0]<br>
+ movq m1, [r0 + r1]<br>
+ movq m2, [r0 + 2 * r1]<br>
+ movq m3, [r0 + r5]<br>
+<br>
+ ; interleave neighbouring rows: m0 = rows 0|1, m1 = rows 1|2, m2 = rows 2|3<br>
+ punpcklbw m0, m1<br>
+ punpcklbw m1, m2<br>
+ punpcklbw m2, m3<br>
+<br>
+ ; m7 is a scratch copy so m2 (rows 2|3) survives for the third output row<br>
+ pmaddubsw m0, m6<br>
+ pmaddubsw m7, m2, m5<br>
+<br>
+ paddw m0, m7<br>
+<br>
+ psubw m0, m4<br>
+ movu [r2], m0<br>
+<br>
+ lea r0, [r0 + 4 * r1]<br>
+ movq m0, [r0]<br>
+<br>
+ punpcklbw m3, m0<br>
+<br>
+ pmaddubsw m1, m6<br>
+ pmaddubsw m7, m3, m5<br>
+<br>
+ paddw m1, m7<br>
+<br>
+ psubw m1, m4<br>
+ movu [r2 + r3], m1<br>
+<br>
+ movq m1, [r0 + r1]<br>
+<br>
+ punpcklbw m0, m1<br>
+<br>
+ pmaddubsw m2, m6<br>
+ pmaddubsw m0, m5<br>
+<br>
+ paddw m2, m0<br>
+<br>
+ psubw m2, m4<br>
+ lea r2, [r2 + 2 * r3]<br>
+ movu [r2], m2<br>
+<br>
+ movq m2, [r0 + 2 * r1]<br>
+<br>
+ punpcklbw m1, m2<br>
+<br>
+ pmaddubsw m3, m6<br>
+ pmaddubsw m1, m5<br>
+<br>
+ paddw m3, m1<br>
+ psubw m3, m4<br>
+<br>
+ movu [r2 + r3], m3<br>
+<br>
+ lea r2, [r2 + 2 * r3]<br>
+<br>
+ dec r4d<br>
+ jnz .loop<br>
+ RET<br>
+%endmacro<br>
+<br>
+FILTER_V_PS_W8_H8_H16_H32 8, 8<br>
+FILTER_V_PS_W8_H8_H16_H32 8, 16<br>
+FILTER_V_PS_W8_H8_H16_H32 8, 32<br>
+<br>
+;------------------------------------------------------------------------------------------------------------<br>
+; void interp_4tap_vert_ps_6x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)<br>
+;------------------------------------------------------------------------------------------------------------<br>
+%macro FILTER_V_PS_W6 2<br>
+; Expands to interp_4tap_vert_ps_6x%2: 4-tap vertical filter over 6-pixel-wide rows with<br>
+; 16-bit output. Only %2 (row count) is used; each .loop pass produces four output rows.<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_vert_ps_6x%2, 4, 6, 8<br>
+<br>
+ ; r4 = coeffIdx (5th argument). src is stepped back one row so the taps cover rows -1..+2.<br>
+ mov r4d, r4m<br>
+ sub r0, r1<br>
+ ; dst is int16_t*, so the stride is doubled (presumably elements to bytes). The full register<br>
+ ; is used: "add r3d, r3d" would zero the upper half of the intptr_t stride on x86-64.<br>
+ add r3, r3<br>
+<br>
+%ifdef PIC<br>
+ lea r5, [tab_ChromaCoeff]<br>
+ movd m5, [r5 + r4 * 4]<br>
+%else<br>
+ movd m5, [tab_ChromaCoeff + r4 * 4]<br>
+%endif<br>
+<br>
+ ; m6 multiplies the interleaved first two tap rows, m5 the last two (shuffles from tab_Vm,<br>
+ ; defined elsewhere). m4 = pw_2000, subtracted from every 16-bit result.<br>
+ pshufb m6, m5, [tab_Vm]<br>
+ pshufb m5, [tab_Vm + 16]<br>
+ mova m4, [pw_2000]<br>
+ lea r5, [3 * r1]<br>
+ mov r4d, %2/4<br>
+<br>
+ ; Eight columns are computed per row but only six words are stored: movh writes words 0-3,<br>
+ ; then pshufd brings dword 2 (words 4-5) to the bottom for the movd at byte offset 8.<br>
+.loop:<br>
+ movq m0, [r0]<br>
+ movq m1, [r0 + r1]<br>
+ movq m2, [r0 + 2 * r1]<br>
+ movq m3, [r0 + r5]<br>
+<br>
+ punpcklbw m0, m1<br>
+ punpcklbw m1, m2<br>
+ punpcklbw m2, m3<br>
+<br>
+ pmaddubsw m0, m6<br>
+ pmaddubsw m7, m2, m5<br>
+<br>
+ paddw m0, m7<br>
+ psubw m0, m4<br>
+<br>
+ movh [r2], m0<br>
+ pshufd m0, m0, 2<br>
+ movd [r2 + 8], m0<br>
+<br>
+ lea r0, [r0 + 4 * r1]<br>
+ movq m0, [r0]<br>
+ punpcklbw m3, m0<br>
+<br>
+ pmaddubsw m1, m6<br>
+ pmaddubsw m7, m3, m5<br>
+<br>
+ paddw m1, m7<br>
+ psubw m1, m4<br>
+<br>
+ movh [r2 + r3], m1<br>
+ pshufd m1, m1, 2<br>
+ movd [r2 + r3 + 8], m1<br>
+<br>
+ movq m1, [r0 + r1]<br>
+ punpcklbw m0, m1<br>
+<br>
+ pmaddubsw m2, m6<br>
+ pmaddubsw m0, m5<br>
+<br>
+ paddw m2, m0<br>
+ psubw m2, m4<br>
+<br>
+ lea r2, [r2 + 2 * r3]<br>
+ movh [r2], m2<br>
+ pshufd m2, m2, 2<br>
+ movd [r2 + 8], m2<br>
+<br>
+ movq m2, [r0 + 2 * r1]<br>
+ punpcklbw m1, m2<br>
+<br>
+ pmaddubsw m3, m6<br>
+ pmaddubsw m1, m5<br>
+<br>
+ paddw m3, m1<br>
+ psubw m3, m4<br>
+<br>
+ movh [r2 + r3], m3<br>
+ pshufd m3, m3, 2<br>
+ movd [r2 + r3 + 8], m3<br>
+<br>
+ lea r2, [r2 + 2 * r3]<br>
+<br>
+ dec r4d<br>
+ jnz .loop<br>
+ RET<br>
+%endmacro<br>
+<br>
+FILTER_V_PS_W6 6, 8<br>
+FILTER_V_PS_W6 6, 16<br>
+<br>
+;---------------------------------------------------------------------------------------------------------------<br>
+; void interp_4tap_vert_ps_12x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)<br>
+;---------------------------------------------------------------------------------------------------------------<br>
+%macro FILTER_V_PS_W12 2<br>
+; Expands to interp_4tap_vert_ps_12x%2: 4-tap vertical filter over 12-pixel-wide rows with<br>
+; 16-bit output. Only %2 (row count) is used; each .loop pass produces two output rows.<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_vert_ps_12x%2, 4, 6, 8<br>
+<br>
+ ; r4 = coeffIdx (5th argument). src is stepped back one row so the taps cover rows -1..+2.<br>
+ mov r4d, r4m<br>
+ sub r0, r1<br>
+ ; dst is int16_t*, so the stride is doubled (presumably elements to bytes). The full register<br>
+ ; is used: "add r3d, r3d" would zero the upper half of the intptr_t stride on x86-64.<br>
+ add r3, r3<br>
+<br>
+%ifdef PIC<br>
+ lea r5, [tab_ChromaCoeff]<br>
+ movd m0, [r5 + r4 * 4]<br>
+%else<br>
+ movd m0, [tab_ChromaCoeff + r4 * 4]<br>
+%endif<br>
+<br>
+ ; m1 multiplies the interleaved first two tap rows, m0 the last two (shuffles from tab_Vm,<br>
+ ; defined elsewhere).<br>
+ pshufb m1, m0, [tab_Vm]<br>
+ pshufb m0, [tab_Vm + 16]<br>
+<br>
+ mov r4d, %2/2<br>
+<br>
+ ; Each source row is read as 16 bytes; the low-half sums give columns 0-7 (full store) and<br>
+ ; the high-half sums give columns 8-15, of which only 8-11 are stored (movh at byte 16).<br>
+.loop:<br>
+ movu m2, [r0]<br>
+ movu m3, [r0 + r1]<br>
+<br>
+ punpcklbw m4, m2, m3<br>
+ punpckhbw m2, m3<br>
+<br>
+ pmaddubsw m4, m1<br>
+ pmaddubsw m2, m1<br>
+<br>
+ lea r0, [r0 + 2 * r1]<br>
+ movu m5, [r0]<br>
+ movu m7, [r0 + r1]<br>
+<br>
+ punpcklbw m6, m5, m7<br>
+ pmaddubsw m6, m0<br>
+ paddw m4, m6<br>
+<br>
+ punpckhbw m6, m5, m7<br>
+ pmaddubsw m6, m0<br>
+ paddw m2, m6<br>
+<br>
+ ; m6 doubles as scratch above, so the pw_2000 offset is reloaded on every pass<br>
+ mova m6, [pw_2000]<br>
+<br>
+ psubw m4, m6<br>
+ psubw m2, m6<br>
+<br>
+ movu [r2], m4<br>
+ movh [r2 + 16], m2<br>
+<br>
+ punpcklbw m4, m3, m5<br>
+ punpckhbw m3, m5<br>
+<br>
+ pmaddubsw m4, m1<br>
+ pmaddubsw m3, m1<br>
+<br>
+ movu m2, [r0 + 2 * r1]<br>
+<br>
+ punpcklbw m5, m7, m2<br>
+ punpckhbw m7, m2<br>
+<br>
+ pmaddubsw m5, m0<br>
+ pmaddubsw m7, m0<br>
+<br>
+ paddw m4, m5<br>
+ paddw m3, m7<br>
+<br>
+ psubw m4, m6<br>
+ psubw m3, m6<br>
+<br>
+ movu [r2 + r3], m4<br>
+ movh [r2 + r3 + 16], m3<br>
+<br>
+ lea r2, [r2 + 2 * r3]<br>
+<br>
+ dec r4d<br>
+ jnz .loop<br>
+ RET<br>
+%endmacro<br>
+<br>
+FILTER_V_PS_W12 12, 16<br>
+FILTER_V_PS_W12 12, 32<br>
+<br>
+;---------------------------------------------------------------------------------------------------------------<br>
+; void interp_4tap_vert_ps_16x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)<br>
+;---------------------------------------------------------------------------------------------------------------<br>
+%macro FILTER_V_PS_W16 2<br>
+; Expands to interp_4tap_vert_ps_%1x%2: 4-tap vertical filter over 16-pixel-wide rows with<br>
+; 16-bit output. %2 is the row count; each .loop pass produces two output rows.<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8<br>
+<br>
+ ; r4 = coeffIdx (5th argument). src is stepped back one row so the taps cover rows -1..+2.<br>
+ mov r4d, r4m<br>
+ sub r0, r1<br>
+ ; dst is int16_t*, so the stride is doubled (presumably elements to bytes). The full register<br>
+ ; is used: "add r3d, r3d" would zero the upper half of the intptr_t stride on x86-64.<br>
+ add r3, r3<br>
+<br>
+%ifdef PIC<br>
+ lea r5, [tab_ChromaCoeff]<br>
+ movd m0, [r5 + r4 * 4]<br>
+%else<br>
+ movd m0, [tab_ChromaCoeff + r4 * 4]<br>
+%endif<br>
+<br>
+ ; m1 multiplies the interleaved first two tap rows, m0 the last two (shuffles from tab_Vm,<br>
+ ; defined elsewhere).<br>
+ pshufb m1, m0, [tab_Vm]<br>
+ pshufb m0, [tab_Vm + 16]<br>
+ mov r4d, %2/2<br>
+<br>
+ ; Low-half sums are columns 0-7 (stored at byte 0), high-half sums columns 8-15 (byte 16).<br>
+.loop:<br>
+ movu m2, [r0]<br>
+ movu m3, [r0 + r1]<br>
+<br>
+ punpcklbw m4, m2, m3<br>
+ punpckhbw m2, m3<br>
+<br>
+ pmaddubsw m4, m1<br>
+ pmaddubsw m2, m1<br>
+<br>
+ lea r0, [r0 + 2 * r1]<br>
+ movu m5, [r0]<br>
+ movu m7, [r0 + r1]<br>
+<br>
+ punpcklbw m6, m5, m7<br>
+ pmaddubsw m6, m0<br>
+ paddw m4, m6<br>
+<br>
+ punpckhbw m6, m5, m7<br>
+ pmaddubsw m6, m0<br>
+ paddw m2, m6<br>
+<br>
+ ; m6 doubles as scratch above, so the pw_2000 offset is reloaded on every pass<br>
+ mova m6, [pw_2000]<br>
+<br>
+ psubw m4, m6<br>
+ psubw m2, m6<br>
+<br>
+ movu [r2], m4<br>
+ movu [r2 + 16], m2<br>
+<br>
+ punpcklbw m4, m3, m5<br>
+ punpckhbw m3, m5<br>
+<br>
+ pmaddubsw m4, m1<br>
+ pmaddubsw m3, m1<br>
+<br>
+ movu m5, [r0 + 2 * r1]<br>
+<br>
+ punpcklbw m2, m7, m5<br>
+ punpckhbw m7, m5<br>
+<br>
+ pmaddubsw m2, m0<br>
+ pmaddubsw m7, m0<br>
+<br>
+ paddw m4, m2<br>
+ paddw m3, m7<br>
+<br>
+ psubw m4, m6<br>
+ psubw m3, m6<br>
+<br>
+ movu [r2 + r3], m4<br>
+ movu [r2 + r3 + 16], m3<br>
+<br>
+ lea r2, [r2 + 2 * r3]<br>
+<br>
+ dec r4d<br>
+ jnz .loop<br>
+ RET<br>
+%endmacro<br>
+<br>
+FILTER_V_PS_W16 16, 4<br>
+FILTER_V_PS_W16 16, 8<br>
+FILTER_V_PS_W16 16, 12<br>
+FILTER_V_PS_W16 16, 16<br>
+FILTER_V_PS_W16 16, 32<br>
+<br>
+FILTER_V_PS_W16 16, 24<br>
+FILTER_V_PS_W16 16, 64<br>
+<br>
+;--------------------------------------------------------------------------------------------------------------<br>
+; void interp_4tap_vert_ps_24x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)<br>
+;--------------------------------------------------------------------------------------------------------------<br>
+%macro FILTER_V4_PS_W24 2<br>
+; Expands to interp_4tap_vert_ps_24x%2: 4-tap vertical filter over 24-pixel-wide rows with<br>
+; 16-bit output. Only %2 (row count) is used; each .loop pass produces two output rows,<br>
+; first columns 0-15 (16-byte loads), then columns 16-23 (8-byte loads).<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_vert_ps_24x%2, 4, 6, 8<br>
+<br>
+ ; r4 = coeffIdx (5th argument). src is stepped back one row so the taps cover rows -1..+2.<br>
+ mov r4d, r4m<br>
+ sub r0, r1<br>
+ ; dst is int16_t*, so the stride is doubled (presumably elements to bytes). The full register<br>
+ ; is used: "add r3d, r3d" would zero the upper half of the intptr_t stride on x86-64.<br>
+ add r3, r3<br>
+<br>
+%ifdef PIC<br>
+ lea r5, [tab_ChromaCoeff]<br>
+ movd m0, [r5 + r4 * 4]<br>
+%else<br>
+ movd m0, [tab_ChromaCoeff + r4 * 4]<br>
+%endif<br>
+<br>
+ ; m1 multiplies the interleaved first two tap rows, m0 the last two (shuffles from tab_Vm,<br>
+ ; defined elsewhere).<br>
+ pshufb m1, m0, [tab_Vm]<br>
+ pshufb m0, [tab_Vm + 16]<br>
+<br>
+ mov r4d, %2/2<br>
+<br>
+.loop:<br>
+ movu m2, [r0]<br>
+ movu m3, [r0 + r1]<br>
+<br>
+ punpcklbw m4, m2, m3<br>
+ punpckhbw m2, m3<br>
+<br>
+ pmaddubsw m4, m1<br>
+ pmaddubsw m2, m1<br>
+<br>
+ ; r5 = src + 2 rows; it also becomes the next pass's r0<br>
+ lea r5, [r0 + 2 * r1]<br>
+<br>
+ movu m5, [r5]<br>
+ movu m7, [r5 + r1]<br>
+<br>
+ punpcklbw m6, m5, m7<br>
+ pmaddubsw m6, m0<br>
+ paddw m4, m6<br>
+<br>
+ punpckhbw m6, m5, m7<br>
+ pmaddubsw m6, m0<br>
+ paddw m2, m6<br>
+<br>
+ ; m6 = pw_2000 offset, kept for the rest of this pass<br>
+ mova m6, [pw_2000]<br>
+<br>
+ psubw m4, m6<br>
+ psubw m2, m6<br>
+<br>
+ movu [r2], m4<br>
+ movu [r2 + 16], m2<br>
+<br>
+ punpcklbw m4, m3, m5<br>
+ punpckhbw m3, m5<br>
+<br>
+ pmaddubsw m4, m1<br>
+ pmaddubsw m3, m1<br>
+<br>
+ movu m2, [r5 + 2 * r1]<br>
+<br>
+ punpcklbw m5, m7, m2<br>
+ punpckhbw m7, m2<br>
+<br>
+ pmaddubsw m5, m0<br>
+ pmaddubsw m7, m0<br>
+<br>
+ paddw m4, m5<br>
+ paddw m3, m7<br>
+<br>
+ psubw m4, m6<br>
+ psubw m3, m6<br>
+<br>
+ movu [r2 + r3], m4<br>
+ movu [r2 + r3 + 16], m3<br>
+<br>
+ ; columns 16-23: results land at byte offset 32 of each output row<br>
+ movq m2, [r0 + 16]<br>
+ movq m3, [r0 + r1 + 16]<br>
+ movq m4, [r5 + 16]<br>
+ movq m5, [r5 + r1 + 16]<br>
+<br>
+ punpcklbw m2, m3<br>
+ punpcklbw m7, m4, m5<br>
+<br>
+ pmaddubsw m2, m1<br>
+ pmaddubsw m7, m0<br>
+<br>
+ paddw m2, m7<br>
+ psubw m2, m6<br>
+<br>
+ movu [r2 + 32], m2<br>
+<br>
+ movq m2, [r5 + 2 * r1 + 16]<br>
+<br>
+ punpcklbw m3, m4<br>
+ punpcklbw m5, m2<br>
+<br>
+ pmaddubsw m3, m1<br>
+ pmaddubsw m5, m0<br>
+<br>
+ paddw m3, m5<br>
+ psubw m3, m6<br>
+<br>
+ movu [r2 + r3 + 32], m3<br>
+<br>
+ mov r0, r5<br>
+ lea r2, [r2 + 2 * r3]<br>
+<br>
+ dec r4d<br>
+ jnz .loop<br>
+ RET<br>
+%endmacro<br>
+<br>
+FILTER_V4_PS_W24 24, 32<br>
+<br>
+FILTER_V4_PS_W24 24, 64<br>
+<br>
+;---------------------------------------------------------------------------------------------------------------<br>
+; void interp_4tap_vert_ps_32x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)<br>
+;---------------------------------------------------------------------------------------------------------------<br>
+%macro FILTER_V_PS_W32 2<br>
+; Expands to interp_4tap_vert_ps_%1x%2: 4-tap vertical filter over 32-pixel-wide rows with<br>
+; 16-bit output. %2 is the row count; each .loop pass produces one output row as two<br>
+; 16-column halves.<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_vert_ps_%1x%2, 4, 6, 8<br>
+<br>
+ ; r4 = coeffIdx (5th argument). src is stepped back one row so the taps cover rows -1..+2.<br>
+ mov r4d, r4m<br>
+ sub r0, r1<br>
+ ; dst is int16_t*, so the stride is doubled (presumably elements to bytes). The full register<br>
+ ; is used: "add r3d, r3d" would zero the upper half of the intptr_t stride on x86-64.<br>
+ add r3, r3<br>
+<br>
+%ifdef PIC<br>
+ lea r5, [tab_ChromaCoeff]<br>
+ movd m0, [r5 + r4 * 4]<br>
+%else<br>
+ movd m0, [tab_ChromaCoeff + r4 * 4]<br>
+%endif<br>
+<br>
+ ; m1 multiplies the interleaved first two tap rows, m0 the last two (shuffles from tab_Vm,<br>
+ ; defined elsewhere). m7 = pw_2000, subtracted from every 16-bit result.<br>
+ pshufb m1, m0, [tab_Vm]<br>
+ pshufb m0, [tab_Vm + 16]<br>
+<br>
+ mova m7, [pw_2000]<br>
+<br>
+ mov r4d, %2<br>
+<br>
+.loop:<br>
+ ; columns 0-15<br>
+ movu m2, [r0]<br>
+ movu m3, [r0 + r1]<br>
+<br>
+ punpcklbw m4, m2, m3<br>
+ punpckhbw m2, m3<br>
+<br>
+ pmaddubsw m4, m1<br>
+ pmaddubsw m2, m1<br>
+<br>
+ ; r5 = src + 2 rows, the base for the last two taps<br>
+ lea r5, [r0 + 2 * r1]<br>
+ movu m3, [r5]<br>
+ movu m5, [r5 + r1]<br>
+<br>
+ punpcklbw m6, m3, m5<br>
+ punpckhbw m3, m5<br>
+<br>
+ pmaddubsw m6, m0<br>
+ pmaddubsw m3, m0<br>
+<br>
+ paddw m4, m6<br>
+ paddw m2, m3<br>
+<br>
+ psubw m4, m7<br>
+ psubw m2, m7<br>
+<br>
+ movu [r2], m4<br>
+ movu [r2 + 16], m2<br>
+<br>
+ ; columns 16-31<br>
+ movu m2, [r0 + 16]<br>
+ movu m3, [r0 + r1 + 16]<br>
+<br>
+ punpcklbw m4, m2, m3<br>
+ punpckhbw m2, m3<br>
+<br>
+ pmaddubsw m4, m1<br>
+ pmaddubsw m2, m1<br>
+<br>
+ movu m3, [r5 + 16]<br>
+ movu m5, [r5 + r1 + 16]<br>
+<br>
+ punpcklbw m6, m3, m5<br>
+ punpckhbw m3, m5<br>
+<br>
+ pmaddubsw m6, m0<br>
+ pmaddubsw m3, m0<br>
+<br>
+ paddw m4, m6<br>
+ paddw m2, m3<br>
+<br>
+ psubw m4, m7<br>
+ psubw m2, m7<br>
+<br>
+ movu [r2 + 32], m4<br>
+ movu [r2 + 48], m2<br>
+<br>
+ lea r0, [r0 + r1]<br>
+ lea r2, [r2 + r3]<br>
+<br>
+ dec r4d<br>
+ jnz .loop<br>
+ RET<br>
+%endmacro<br>
+<br>
+FILTER_V_PS_W32 32, 8<br>
+FILTER_V_PS_W32 32, 16<br>
+FILTER_V_PS_W32 32, 24<br>
+FILTER_V_PS_W32 32, 32<br>
+<br>
+FILTER_V_PS_W32 32, 48<br>
+FILTER_V_PS_W32 32, 64<br>
+<br>
+;-----------------------------------------------------------------------------<br>
+; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-----------------------------------------------------------------------------<br>
+%macro FILTER_V4_W8_H8_H16_H32 2<br>
+; Expands to interp_4tap_vert_pp_%1x%2: 4-tap vertical filter over 8-pixel-wide rows with<br>
+; 8-bit output. %2 is the row count; each .loop pass produces four output rows.<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8<br>
+<br>
+; r4 = coeffIdx (5th argument). src is stepped back one row so the taps cover rows -1..+2.<br>
+mov r4d, r4m<br>
+sub r0, r1<br>
+<br>
+%ifdef PIC<br>
+lea r5, [tab_ChromaCoeff]<br>
+movd m5, [r5 + r4 * 4]<br>
+%else<br>
+movd m5, [tab_ChromaCoeff + r4 * 4]<br>
+%endif<br>
+<br>
+; m6 multiplies the interleaved first two tap rows, m5 the last two (shuffles from tab_Vm,<br>
+; defined elsewhere). m4 = pw_512 for pmulhrsw; presumably words of 512, which would make it a<br>
+; rounded shift right by 6 - confirm against the constant's definition.<br>
+pshufb m6, m5, [tab_Vm]<br>
+pshufb m5, [tab_Vm + 16]<br>
+mova m4, [pw_512]<br>
+lea r5, [r1 * 3]<br>
+<br>
+; r4 = rows remaining; it is decremented by 4 at the bottom of the loop<br>
+mov r4d, %2<br>
+<br>
+.loop:<br>
+movq m0, [r0]<br>
+movq m1, [r0 + r1]<br>
+movq m2, [r0 + 2 * r1]<br>
+movq m3, [r0 + r5]<br>
+<br>
+; interleave neighbouring rows: m0 = rows 0|1, m1 = rows 1|2, m2 = rows 2|3<br>
+punpcklbw m0, m1<br>
+punpcklbw m1, m2<br>
+punpcklbw m2, m3<br>
+<br>
+pmaddubsw m0, m6<br>
+pmaddubsw m7, m2, m5<br>
+<br>
+paddw m0, m7<br>
+<br>
+; scale, saturate to unsigned bytes and store 8 pixels<br>
+pmulhrsw m0, m4<br>
+packuswb m0, m0<br>
+movh [r2], m0<br>
+<br>
+lea r0, [r0 + 4 * r1]<br>
+movq m0, [r0]<br>
+<br>
+punpcklbw m3, m0<br>
+<br>
+pmaddubsw m1, m6<br>
+pmaddubsw m7, m3, m5<br>
+<br>
+paddw m1, m7<br>
+<br>
+pmulhrsw m1, m4<br>
+packuswb m1, m1<br>
+movh [r2 + r3], m1<br>
+<br>
+movq m1, [r0 + r1]<br>
+<br>
+punpcklbw m0, m1<br>
+<br>
+pmaddubsw m2, m6<br>
+pmaddubsw m0, m5<br>
+<br>
+paddw m2, m0<br>
+<br>
+pmulhrsw m2, m4<br>
+<br>
+movq m7, [r0 + 2 * r1]<br>
+punpcklbw m1, m7<br>
+<br>
+pmaddubsw m3, m6<br>
+pmaddubsw m1, m5<br>
+<br>
+paddw m3, m1<br>
+<br>
+; third and fourth output rows are packed into one register: low 8 bytes / high 8 bytes<br>
+pmulhrsw m3, m4<br>
+packuswb m2, m3<br>
+<br>
+lea r2, [r2 + 2 * r3]<br>
+movh [r2], m2<br>
+movhps [r2 + r3], m2<br>
+<br>
+lea r2, [r2 + 2 * r3]<br>
+<br>
+sub r4, 4<br>
+jnz .loop<br>
+RET<br>
+%endmacro<br>
+<br>
+FILTER_V4_W8_H8_H16_H32 8, 8<br>
+FILTER_V4_W8_H8_H16_H32 8, 16<br>
+FILTER_V4_W8_H8_H16_H32 8, 32<br>
+<br>
+FILTER_V4_W8_H8_H16_H32 8, 12<br>
+FILTER_V4_W8_H8_H16_H32 8, 64<br>
+<br>
+<br>
+;-----------------------------------------------------------------------------<br>
+; void interp_4tap_vert_pp_6x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-----------------------------------------------------------------------------<br>
+%macro FILTER_V4_W6_H4 2<br>
+; Expands to interp_4tap_vert_pp_6x%2: 4-tap vertical filter over 6-pixel-wide rows with<br>
+; 8-bit output. Only %2 (row count) is used; each .loop pass produces four output rows.<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_vert_pp_6x%2, 4, 6, 8<br>
+<br>
+; r4 = coeffIdx (5th argument). src is stepped back one row so the taps cover rows -1..+2.<br>
+mov r4d, r4m<br>
+sub r0, r1<br>
+<br>
+%ifdef PIC<br>
+lea r5, [tab_ChromaCoeff]<br>
+movd m5, [r5 + r4 * 4]<br>
+%else<br>
+movd m5, [tab_ChromaCoeff + r4 * 4]<br>
+%endif<br>
+<br>
+; m6 multiplies the interleaved first two tap rows, m5 the last two (shuffles from tab_Vm,<br>
+; defined elsewhere). m4 = pw_512 for pmulhrsw (presumably a rounded shift right by 6 - confirm).<br>
+pshufb m6, m5, [tab_Vm]<br>
+pshufb m5, [tab_Vm + 16]<br>
+mova m4, [pw_512]<br>
+<br>
+; r4 = rows remaining (decremented by 4 per pass), r5 = 3 * srcStride<br>
+mov r4d, %2<br>
+lea r5, [3 * r1]<br>
+<br>
+; Eight columns are computed per row but only six bytes are stored: movd writes pixels 0-3<br>
+; and pextrw word 2 writes pixels 4-5 at byte offset 4.<br>
+.loop:<br>
+movq m0, [r0]<br>
+movq m1, [r0 + r1]<br>
+movq m2, [r0 + 2 * r1]<br>
+movq m3, [r0 + r5]<br>
+<br>
+punpcklbw m0, m1<br>
+punpcklbw m1, m2<br>
+punpcklbw m2, m3<br>
+<br>
+pmaddubsw m0, m6<br>
+pmaddubsw m7, m2, m5<br>
+<br>
+paddw m0, m7<br>
+<br>
+pmulhrsw m0, m4<br>
+packuswb m0, m0<br>
+movd [r2], m0<br>
+pextrw [r2 + 4], m0, 2<br>
+<br>
+lea r0, [r0 + 4 * r1]<br>
+<br>
+movq m0, [r0]<br>
+punpcklbw m3, m0<br>
+<br>
+pmaddubsw m1, m6<br>
+pmaddubsw m7, m3, m5<br>
+<br>
+paddw m1, m7<br>
+<br>
+pmulhrsw m1, m4<br>
+packuswb m1, m1<br>
+movd [r2 + r3], m1<br>
+pextrw [r2 + r3 + 4], m1, 2<br>
+<br>
+movq m1, [r0 + r1]<br>
+punpcklbw m7, m0, m1<br>
+<br>
+pmaddubsw m2, m6<br>
+pmaddubsw m7, m5<br>
+<br>
+paddw m2, m7<br>
+<br>
+pmulhrsw m2, m4<br>
+packuswb m2, m2<br>
+lea r2, [r2 + 2 * r3]<br>
+movd [r2], m2<br>
+pextrw [r2 + 4], m2, 2<br>
+<br>
+movq m2, [r0 + 2 * r1]<br>
+punpcklbw m1, m2<br>
+<br>
+pmaddubsw m3, m6<br>
+pmaddubsw m1, m5<br>
+<br>
+paddw m3, m1<br>
+<br>
+pmulhrsw m3, m4<br>
+packuswb m3, m3<br>
+<br>
+movd [r2 + r3], m3<br>
+pextrw [r2 + r3 + 4], m3, 2<br>
+<br>
+lea r2, [r2 + 2 * r3]<br>
+<br>
+sub r4, 4<br>
+jnz .loop<br>
+RET<br>
+%endmacro<br>
+<br>
+FILTER_V4_W6_H4 6, 8<br>
+<br>
+FILTER_V4_W6_H4 6, 16<br>
+<br>
+;-----------------------------------------------------------------------------<br>
+; void interp_4tap_vert_pp_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-----------------------------------------------------------------------------<br>
+%macro FILTER_V4_W12_H2 2<br>
+; Expands to interp_4tap_vert_pp_12x%2: 4-tap vertical filter over 12-pixel-wide rows with<br>
+; 8-bit output. Only %2 (row count) is used; each .loop pass produces two output rows.<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_vert_pp_12x%2, 4, 6, 8<br>
+<br>
+; r4 = coeffIdx (5th argument). src is stepped back one row so the taps cover rows -1..+2.<br>
+mov r4d, r4m<br>
+sub r0, r1<br>
+<br>
+%ifdef PIC<br>
+lea r5, [tab_ChromaCoeff]<br>
+movd m0, [r5 + r4 * 4]<br>
+%else<br>
+movd m0, [tab_ChromaCoeff + r4 * 4]<br>
+%endif<br>
+<br>
+; m1 multiplies the interleaved first two tap rows, m0 the last two (shuffles from tab_Vm,<br>
+; defined elsewhere).<br>
+pshufb m1, m0, [tab_Vm]<br>
+pshufb m0, [tab_Vm + 16]<br>
+<br>
+; r4 = rows remaining; it is decremented by 2 at the bottom of the loop<br>
+mov r4d, %2<br>
+<br>
+; Each source row is read as 16 bytes and 16 results are computed, but only 12 bytes are<br>
+; stored: movh writes pixels 0-7 and pextrd dword 2 writes pixels 8-11.<br>
+.loop:<br>
+movu m2, [r0]<br>
+movu m3, [r0 + r1]<br>
+<br>
+punpcklbw m4, m2, m3<br>
+punpckhbw m2, m3<br>
+<br>
+pmaddubsw m4, m1<br>
+pmaddubsw m2, m1<br>
+<br>
+lea r0, [r0 + 2 * r1]<br>
+movu m5, [r0]<br>
+movu m7, [r0 + r1]<br>
+<br>
+punpcklbw m6, m5, m7<br>
+pmaddubsw m6, m0<br>
+paddw m4, m6<br>
+<br>
+punpckhbw m6, m5, m7<br>
+pmaddubsw m6, m0<br>
+paddw m2, m6<br>
+<br>
+; m6 doubles as scratch above, so pw_512 is reloaded on every pass<br>
+mova m6, [pw_512]<br>
+<br>
+pmulhrsw m4, m6<br>
+pmulhrsw m2, m6<br>
+<br>
+packuswb m4, m2<br>
+<br>
+movh [r2], m4<br>
+pextrd [r2 + 8], m4, 2<br>
+<br>
+punpcklbw m4, m3, m5<br>
+punpckhbw m3, m5<br>
+<br>
+pmaddubsw m4, m1<br>
+pmaddubsw m3, m1<br>
+<br>
+movu m5, [r0 + 2 * r1]<br>
+<br>
+punpcklbw m2, m7, m5<br>
+punpckhbw m7, m5<br>
+<br>
+pmaddubsw m2, m0<br>
+pmaddubsw m7, m0<br>
+<br>
+paddw m4, m2<br>
+paddw m3, m7<br>
+<br>
+pmulhrsw m4, m6<br>
+pmulhrsw m3, m6<br>
+<br>
+packuswb m4, m3<br>
+<br>
+movh [r2 + r3], m4<br>
+pextrd [r2 + r3 + 8], m4, 2<br>
+<br>
+lea r2, [r2 + 2 * r3]<br>
+<br>
+sub r4, 2<br>
+jnz .loop<br>
+RET<br>
+%endmacro<br>
+<br>
+FILTER_V4_W12_H2 12, 16<br>
+<br>
+FILTER_V4_W12_H2 12, 32<br>
+<br>
+;-----------------------------------------------------------------------------<br>
+; void interp_4tap_vert_pp_16x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-----------------------------------------------------------------------------<br>
+%macro FILTER_V4_W16_H2 2<br>
+; Expands to interp_4tap_vert_pp_16x%2: 4-tap vertical filter over 16-pixel-wide rows with<br>
+; 8-bit output. Only %2 (row count) is used; each .loop pass produces two output rows.<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_vert_pp_16x%2, 4, 6, 8<br>
+<br>
+; r4 = coeffIdx (5th argument). src is stepped back one row so the taps cover rows -1..+2.<br>
+mov r4d, r4m<br>
+sub r0, r1<br>
+<br>
+%ifdef PIC<br>
+lea r5, [tab_ChromaCoeff]<br>
+movd m0, [r5 + r4 * 4]<br>
+%else<br>
+movd m0, [tab_ChromaCoeff + r4 * 4]<br>
+%endif<br>
+<br>
+; m1 multiplies the interleaved first two tap rows, m0 the last two (shuffles from tab_Vm,<br>
+; defined elsewhere).<br>
+pshufb m1, m0, [tab_Vm]<br>
+pshufb m0, [tab_Vm + 16]<br>
+<br>
+; r4 = loop counter, two rows per pass<br>
+mov r4d, %2/2<br>
+<br>
+.loop:<br>
+movu m2, [r0]<br>
+movu m3, [r0 + r1]<br>
+<br>
+; m4 accumulates columns 0-7, m2 columns 8-15<br>
+punpcklbw m4, m2, m3<br>
+punpckhbw m2, m3<br>
+<br>
+pmaddubsw m4, m1<br>
+pmaddubsw m2, m1<br>
+<br>
+lea r0, [r0 + 2 * r1]<br>
+movu m5, [r0]<br>
+movu m6, [r0 + r1]<br>
+<br>
+punpckhbw m7, m5, m6<br>
+pmaddubsw m7, m0<br>
+paddw m2, m7<br>
+<br>
+punpcklbw m7, m5, m6<br>
+pmaddubsw m7, m0<br>
+paddw m4, m7<br>
+<br>
+; m7 doubles as scratch above, so pw_512 is reloaded on every pass<br>
+mova m7, [pw_512]<br>
+<br>
+pmulhrsw m4, m7<br>
+pmulhrsw m2, m7<br>
+<br>
+packuswb m4, m2<br>
+<br>
+movu [r2], m4<br>
+<br>
+; second output row: rows 1|2 with the first tap pair, rows 3|4 with the second<br>
+punpcklbw m4, m3, m5<br>
+punpckhbw m3, m5<br>
+<br>
+pmaddubsw m4, m1<br>
+pmaddubsw m3, m1<br>
+<br>
+movu m5, [r0 + 2 * r1]<br>
+<br>
+punpcklbw m2, m6, m5<br>
+punpckhbw m6, m5<br>
+<br>
+pmaddubsw m2, m0<br>
+pmaddubsw m6, m0<br>
+<br>
+paddw m4, m2<br>
+paddw m3, m6<br>
+<br>
+pmulhrsw m4, m7<br>
+pmulhrsw m3, m7<br>
+<br>
+packuswb m4, m3<br>
+<br>
+movu [r2 + r3], m4<br>
+<br>
+lea r2, [r2 + 2 * r3]<br>
+<br>
+dec r4d<br>
+jnz .loop<br>
+RET<br>
+%endmacro<br>
+<br>
+FILTER_V4_W16_H2 16, 4<br>
+FILTER_V4_W16_H2 16, 8<br>
+FILTER_V4_W16_H2 16, 12<br>
+FILTER_V4_W16_H2 16, 16<br>
+FILTER_V4_W16_H2 16, 32<br>
+<br>
+FILTER_V4_W16_H2 16, 24<br>
+FILTER_V4_W16_H2 16, 64<br>
+<br>
+;-----------------------------------------------------------------------------<br>
+; void interp_4tap_vert_pp_24x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-----------------------------------------------------------------------------<br>
+%macro FILTER_V4_W24 2<br>
+; Expands to interp_4tap_vert_pp_24x%2: 4-tap vertical filter over 24-pixel-wide rows with<br>
+; 8-bit output. Only %2 (row count) is used; each .loop pass produces two output rows,<br>
+; first columns 0-15 (16-byte loads), then columns 16-23 (8-byte loads).<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_vert_pp_24x%2, 4, 6, 8<br>
+<br>
+; r4 = coeffIdx (5th argument). src is stepped back one row so the taps cover rows -1..+2.<br>
+mov r4d, r4m<br>
+sub r0, r1<br>
+<br>
+%ifdef PIC<br>
+lea r5, [tab_ChromaCoeff]<br>
+movd m0, [r5 + r4 * 4]<br>
+%else<br>
+movd m0, [tab_ChromaCoeff + r4 * 4]<br>
+%endif<br>
+<br>
+; m1 multiplies the interleaved first two tap rows, m0 the last two (shuffles from tab_Vm,<br>
+; defined elsewhere).<br>
+pshufb m1, m0, [tab_Vm]<br>
+pshufb m0, [tab_Vm + 16]<br>
+<br>
+; r4 = rows remaining; it is decremented by 2 at the bottom of the loop<br>
+mov r4d, %2<br>
+<br>
+.loop:<br>
+movu m2, [r0]<br>
+movu m3, [r0 + r1]<br>
+<br>
+punpcklbw m4, m2, m3<br>
+punpckhbw m2, m3<br>
+<br>
+pmaddubsw m4, m1<br>
+pmaddubsw m2, m1<br>
+<br>
+; r5 = src + 2 rows; it also becomes the next pass's r0<br>
+lea r5, [r0 + 2 * r1]<br>
+movu m5, [r5]<br>
+movu m7, [r5 + r1]<br>
+<br>
+punpcklbw m6, m5, m7<br>
+pmaddubsw m6, m0<br>
+paddw m4, m6<br>
+<br>
+punpckhbw m6, m5, m7<br>
+pmaddubsw m6, m0<br>
+paddw m2, m6<br>
+<br>
+; m6 = pw_512 for pmulhrsw, kept for the rest of this pass<br>
+mova m6, [pw_512]<br>
+<br>
+pmulhrsw m4, m6<br>
+pmulhrsw m2, m6<br>
+<br>
+packuswb m4, m2<br>
+<br>
+movu [r2], m4<br>
+<br>
+punpcklbw m4, m3, m5<br>
+punpckhbw m3, m5<br>
+<br>
+pmaddubsw m4, m1<br>
+pmaddubsw m3, m1<br>
+<br>
+movu m2, [r5 + 2 * r1]<br>
+<br>
+punpcklbw m5, m7, m2<br>
+punpckhbw m7, m2<br>
+<br>
+pmaddubsw m5, m0<br>
+pmaddubsw m7, m0<br>
+<br>
+paddw m4, m5<br>
+paddw m3, m7<br>
+<br>
+pmulhrsw m4, m6<br>
+pmulhrsw m3, m6<br>
+<br>
+packuswb m4, m3<br>
+<br>
+movu [r2 + r3], m4<br>
+<br>
+; columns 16-23: m2..m5 = source rows 0..3 (8 bytes each)<br>
+movq m2, [r0 + 16]<br>
+movq m3, [r0 + r1 + 16]<br>
+movq m4, [r5 + 16]<br>
+movq m5, [r5 + r1 + 16]<br>
+<br>
+; The rows 2|3 interleave goes to scratch m7 so that m3, m4 and m5 keep their rows for the<br>
+; second output row; no store happens in between, so reloading them would read the same data.<br>
+punpcklbw m2, m3<br>
+punpcklbw m7, m4, m5<br>
+<br>
+pmaddubsw m2, m1<br>
+pmaddubsw m7, m0<br>
+<br>
+paddw m2, m7<br>
+<br>
+pmulhrsw m2, m6<br>
+<br>
+; m7 = source row 4, the last tap of the second output row<br>
+movq m7, [r5 + 2 * r1 + 16]<br>
+<br>
+punpcklbw m3, m4<br>
+punpcklbw m5, m7<br>
+<br>
+pmaddubsw m3, m1<br>
+pmaddubsw m5, m0<br>
+<br>
+paddw m3, m5<br>
+<br>
+; both rows are packed into one register: low 8 bytes = first row, high 8 bytes = second<br>
+pmulhrsw m3, m6<br>
+packuswb m2, m3<br>
+<br>
+movh [r2 + 16], m2<br>
+movhps [r2 + r3 + 16], m2<br>
+<br>
+mov r0, r5<br>
+lea r2, [r2 + 2 * r3]<br>
+<br>
+sub r4, 2<br>
+jnz .loop<br>
+RET<br>
+%endmacro<br>
+<br>
+FILTER_V4_W24 24, 32<br>
+<br>
+FILTER_V4_W24 24, 64<br>
+<br>
+;-----------------------------------------------------------------------------<br>
+; void interp_4tap_vert_pp_32x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-----------------------------------------------------------------------------<br>
+%macro FILTER_V4_W32 2<br>
+; Expands to interp_4tap_vert_pp_%1x%2: 4-tap vertical filter over 32-pixel-wide rows with<br>
+; 8-bit output. %2 is the row count; each .loop pass produces one output row as two<br>
+; 16-column halves.<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_vert_pp_%1x%2, 4, 6, 8<br>
+<br>
+; r4 = coeffIdx (5th argument). src is stepped back one row so the taps cover rows -1..+2.<br>
+mov r4d, r4m<br>
+sub r0, r1<br>
+<br>
+%ifdef PIC<br>
+lea r5, [tab_ChromaCoeff]<br>
+movd m0, [r5 + r4 * 4]<br>
+%else<br>
+movd m0, [tab_ChromaCoeff + r4 * 4]<br>
+%endif<br>
+<br>
+; m1 multiplies the interleaved first two tap rows, m0 the last two (shuffles from tab_Vm,<br>
+; defined elsewhere). m7 = pw_512 for pmulhrsw (presumably a rounded shift right by 6 - confirm).<br>
+pshufb m1, m0, [tab_Vm]<br>
+pshufb m0, [tab_Vm + 16]<br>
+<br>
+mova m7, [pw_512]<br>
+<br>
+; r4 = rows remaining, one row per pass<br>
+mov r4d, %2<br>
+<br>
+.loop:<br>
+; columns 0-15<br>
+movu m2, [r0]<br>
+movu m3, [r0 + r1]<br>
+<br>
+punpcklbw m4, m2, m3<br>
+punpckhbw m2, m3<br>
+<br>
+pmaddubsw m4, m1<br>
+pmaddubsw m2, m1<br>
+<br>
+; r5 = src + 2 rows, the base for the last two taps<br>
+lea r5, [r0 + 2 * r1]<br>
+movu m3, [r5]<br>
+movu m5, [r5 + r1]<br>
+<br>
+punpcklbw m6, m3, m5<br>
+punpckhbw m3, m5<br>
+<br>
+pmaddubsw m6, m0<br>
+pmaddubsw m3, m0<br>
+<br>
+paddw m4, m6<br>
+paddw m2, m3<br>
+<br>
+pmulhrsw m4, m7<br>
+pmulhrsw m2, m7<br>
+<br>
+packuswb m4, m2<br>
+<br>
+movu [r2], m4<br>
+<br>
+; columns 16-31<br>
+movu m2, [r0 + 16]<br>
+movu m3, [r0 + r1 + 16]<br>
+<br>
+punpcklbw m4, m2, m3<br>
+punpckhbw m2, m3<br>
+<br>
+pmaddubsw m4, m1<br>
+pmaddubsw m2, m1<br>
+<br>
+movu m3, [r5 + 16]<br>
+movu m5, [r5 + r1 + 16]<br>
+<br>
+punpcklbw m6, m3, m5<br>
+punpckhbw m3, m5<br>
+<br>
+pmaddubsw m6, m0<br>
+pmaddubsw m3, m0<br>
+<br>
+paddw m4, m6<br>
+paddw m2, m3<br>
+<br>
+pmulhrsw m4, m7<br>
+pmulhrsw m2, m7<br>
+<br>
+packuswb m4, m2<br>
+<br>
+movu [r2 + 16], m4<br>
+<br>
+lea r0, [r0 + r1]<br>
+lea r2, [r2 + r3]<br>
+<br>
+dec r4<br>
+jnz .loop<br>
+RET<br>
+%endmacro<br>
+<br>
+FILTER_V4_W32 32, 8<br>
+FILTER_V4_W32 32, 16<br>
+FILTER_V4_W32 32, 24<br>
+FILTER_V4_W32 32, 32<br>
+<br>
+FILTER_V4_W32 32, 48<br>
+FILTER_V4_W32 32, 64<br>
+<br>
+<br>
+;-----------------------------------------------------------------------------<br>
+; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-----------------------------------------------------------------------------<br>
+%macro FILTER_V4_W16n_H2 2<br>
+; Expands to interp_4tap_vert_pp_%1x%2 for widths that are a multiple of 16: 4-tap vertical<br>
+; filter with 8-bit output. The outer .loop handles two rows per pass (%2/2 passes); the inner<br>
+; .loopW walks the row in 16-column strips (%1/16 strips).<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 8<br>
+<br>
+; r4 = coeffIdx (5th argument). src is stepped back one row so the taps cover rows -1..+2.<br>
+mov r4d, r4m<br>
+sub r0, r1<br>
+<br>
+%ifdef PIC<br>
+lea r5, [tab_ChromaCoeff]<br>
+movd m0, [r5 + r4 * 4]<br>
+%else<br>
+movd m0, [tab_ChromaCoeff + r4 * 4]<br>
+%endif<br>
+<br>
+; m1 multiplies the interleaved first two tap rows, m0 the last two (shuffles from tab_Vm,<br>
+; defined elsewhere).<br>
+pshufb m1, m0, [tab_Vm]<br>
+pshufb m0, [tab_Vm + 16]<br>
+<br>
+mov r4d, %2/2<br>
+<br>
+.loop:<br>
+<br>
+; r6 = 16-column strips left in this pair of rows<br>
+mov r6d, %1/16<br>
+<br>
+.loopW:<br>
+<br>
+movu m2, [r0]<br>
+movu m3, [r0 + r1]<br>
+<br>
+punpcklbw m4, m2, m3<br>
+punpckhbw m2, m3<br>
+<br>
+pmaddubsw m4, m1<br>
+pmaddubsw m2, m1<br>
+<br>
+; r5 = src + 2 rows, the base for the remaining taps<br>
+lea r5, [r0 + 2 * r1]<br>
+movu m5, [r5]<br>
+movu m6, [r5 + r1]<br>
+<br>
+punpckhbw m7, m5, m6<br>
+pmaddubsw m7, m0<br>
+paddw m2, m7<br>
+<br>
+punpcklbw m7, m5, m6<br>
+pmaddubsw m7, m0<br>
+paddw m4, m7<br>
+<br>
+; m7 doubles as scratch above, so pw_512 is reloaded for every strip<br>
+mova m7, [pw_512]<br>
+<br>
+pmulhrsw m4, m7<br>
+pmulhrsw m2, m7<br>
+<br>
+packuswb m4, m2<br>
+<br>
+movu [r2], m4<br>
+<br>
+punpcklbw m4, m3, m5<br>
+punpckhbw m3, m5<br>
+<br>
+pmaddubsw m4, m1<br>
+pmaddubsw m3, m1<br>
+<br>
+movu m5, [r5 + 2 * r1]<br>
+<br>
+punpcklbw m2, m6, m5<br>
+punpckhbw m6, m5<br>
+<br>
+pmaddubsw m2, m0<br>
+pmaddubsw m6, m0<br>
+<br>
+paddw m4, m2<br>
+paddw m3, m6<br>
+<br>
+pmulhrsw m4, m7<br>
+pmulhrsw m3, m7<br>
+<br>
+packuswb m4, m3<br>
+<br>
+movu [r2 + r3], m4<br>
+<br>
+add r0, 16<br>
+add r2, 16<br>
+dec r6d<br>
+jnz .loopW<br>
+<br>
+; rewind the %1 bytes walked by .loopW and step both pointers down two rows<br>
+lea r0, [r0 + r1 * 2 - %1]<br>
+lea r2, [r2 + r3 * 2 - %1]<br>
+<br>
+dec r4d<br>
+jnz .loop<br>
+RET<br>
+%endmacro<br>
+<br>
+FILTER_V4_W16n_H2 64, 64<br>
+FILTER_V4_W16n_H2 64, 32<br>
+FILTER_V4_W16n_H2 64, 48<br>
+FILTER_V4_W16n_H2 48, 64<br>
+FILTER_V4_W16n_H2 64, 16<br>
+<br>
+<br>
+;-----------------------------------------------------------------------------<br>
+; void filterConvertPelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)<br>
+;-----------------------------------------------------------------------------<br>
+INIT_XMM ssse3<br>
+cglobal luma_p2s, 3, 7, 6<br>
+ ; Converts 8-bit pixels to 16-bit values, four rows by eight columns per inner pass.<br>
+ ; r0 = src, r1 = srcStride, r2 = dst; dst rows are FENC_STRIDE * 2 bytes apart.<br>
+ ; The row loop steps by 4 and exits only on exactly zero, so height must be a multiple of 4.<br>
+<br>
+ ; load width and height<br>
+ mov r3d, r3m<br>
+ mov r4d, r4m<br>
+<br>
+ ; load constant<br>
+ ; Each pixel byte is paired with a byte of m4 and multiply-added against m5; presumably this<br>
+ ; yields pixel * 64 minus a fixed offset - confirm against pb_128 / tab_c_64_n64.<br>
+ mova m4, [pb_128]<br>
+ mova m5, [tab_c_64_n64]<br>
+<br>
+.loopH:<br>
+<br>
+ ; r5 = column offset within the current group of four rows<br>
+ xor r5d, r5d<br>
+.loopW:<br>
+ lea r6, [r0 + r5]<br>
+<br>
+ movh m0, [r6]<br>
+ punpcklbw m0, m4<br>
+ pmaddubsw m0, m5<br>
+<br>
+ movh m1, [r6 + r1]<br>
+ punpcklbw m1, m4<br>
+ pmaddubsw m1, m5<br>
+<br>
+ movh m2, [r6 + r1 * 2]<br>
+ punpcklbw m2, m4<br>
+ pmaddubsw m2, m5<br>
+<br>
+ lea r6, [r6 + r1 * 2]<br>
+ movh m3, [r6 + r1]<br>
+ punpcklbw m3, m4<br>
+ pmaddubsw m3, m5<br>
+<br>
+ ; r5 is advanced before the stores, hence the -16 byte correction in the addresses below.<br>
+ ; If the advanced offset exceeds width, only four columns remain: take the .width4 path.<br>
+ add r5, 8<br>
+ cmp r5, r3<br>
+ jg .width4<br>
+ movu [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0<br>
+ movu [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1<br>
+ movu [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2<br>
+ movu [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3<br>
+ ; the stores leave the flags from "cmp r5, r3" intact: equal means the row group is done<br>
+ je .nextH<br>
+ jmp .loopW<br>
+<br>
+.width4:<br>
+ ; store only the low four words of each row<br>
+ movh [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0<br>
+ movh [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1<br>
+ movh [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2<br>
+ movh [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3<br>
+<br>
+.nextH:<br>
+ ; advance src by four rows and dst by four rows of FENC_STRIDE * 2 bytes each<br>
+ lea r0, [r0 + r1 * 4]<br>
+ add r2, FENC_STRIDE * 8<br>
+<br>
+ sub r4d, 4<br>
+ jnz .loopH<br>
+<br>
+ RET<br>
+<br>
+%macro PROCESS_LUMA_W4_4R 0<br>
+ ; Accumulates an 8-tap vertical filter for four rows of four pixels.<br>
+ ; In: r0 = source pointer, r1 = source stride, r6 = base of four 16-byte coefficient blocks.<br>
+ ; Out: m4 = unscaled sums for output rows 1-2, m5 = unscaled sums for output rows 3-4.<br>
+ ; Clobbers m0, m1, m2 and leaves r0 advanced by eight rows.<br>
+ movd m0, [r0]<br>
+ movd m1, [r0 + r1]<br>
+ punpcklbw m2, m0, m1 ; m2=[0 1]<br>
+<br>
+ lea r0, [r0 + 2 * r1]<br>
+ movd m0, [r0]<br>
+ punpcklbw m1, m0 ; m1=[1 2]<br>
+ punpcklqdq m2, m1 ; m2=[0 1 1 2]<br>
+ pmaddubsw m4, m2, [r6 + 0 * 16] ; m4=[0+1 1+2]<br>
+<br>
+ movd m1, [r0 + r1]<br>
+ punpcklbw m5, m0, m1 ; m5=[2 3]<br>
+ lea r0, [r0 + 2 * r1]<br>
+ movd m0, [r0]<br>
+ punpcklbw m1, m0 ; m1=[3 4]<br>
+ punpcklqdq m5, m1 ; m5=[2 3 3 4]<br>
+ pmaddubsw m2, m5, [r6 + 1 * 16] ; m2=[2+3 3+4]<br>
+ paddw m4, m2 ; m4=[0+1+2+3 1+2+3+4] Row1-2<br>
+ pmaddubsw m5, [r6 + 0 * 16] ; m5=[2+3 3+4] Row3-4<br>
+<br>
+ movd m1, [r0 + r1]<br>
+ punpcklbw m2, m0, m1 ; m2=[4 5]<br>
+ lea r0, [r0 + 2 * r1]<br>
+ movd m0, [r0]<br>
+ punpcklbw m1, m0 ; m1=[5 6]<br>
+ punpcklqdq m2, m1 ; m2=[4 5 5 6]<br>
+ pmaddubsw m1, m2, [r6 + 2 * 16] ; m1=[4+5 5+6]<br>
+ paddw m4, m1 ; m4=[0+1+2+3+4+5 1+2+3+4+5+6] Row1-2<br>
+ pmaddubsw m2, [r6 + 1 * 16] ; m2=[4+5 5+6]<br>
+ paddw m5, m2 ; m5=[2+3+4+5 3+4+5+6] Row3-4<br>
+<br>
+ movd m1, [r0 + r1]<br>
+ punpcklbw m2, m0, m1 ; m2=[6 7]<br>
+ lea r0, [r0 + 2 * r1]<br>
+ movd m0, [r0]<br>
+ punpcklbw m1, m0 ; m1=[7 8]<br>
+ punpcklqdq m2, m1 ; m2=[6 7 7 8]<br>
+ pmaddubsw m1, m2, [r6 + 3 * 16] ; m1=[6+7 7+8]<br>
+ paddw m4, m1 ; m4=[0+1+2+3+4+5+6+7 1+2+3+4+5+6+7+8] Row1-2 end<br>
+ pmaddubsw m2, [r6 + 2 * 16] ; m2=[6+7 7+8]<br>
+ paddw m5, m2 ; m5=[2+3+4+5+6+7 3+4+5+6+7+8] Row3-4<br>
+<br>
+ movd m1, [r0 + r1]<br>
+ punpcklbw m2, m0, m1 ; m2=[8 9]<br>
+ movd m0, [r0 + 2 * r1]<br>
+ punpcklbw m1, m0 ; m1=[9 10]<br>
+ punpcklqdq m2, m1 ; m2=[8 9 9 10]<br>
+ pmaddubsw m2, [r6 + 3 * 16] ; m2=[8+9 9+10]<br>
+ paddw m5, m2 ; m5=[2+3+4+5+6+7+8+9 3+4+5+6+7+8+9+10] Row3-4 end<br>
+%endmacro<br>
+<br>
+%macro PROCESS_LUMA_W8_4R 0<br>
+ ; Accumulates an 8-tap vertical filter for four rows of eight pixels.<br>
+ ; In: r0 = source pointer, r1 = source stride, r6 = base of four 16-byte coefficient blocks.<br>
+ ; Out: unscaled sums in m7 (row 1), m6 (row 2), m5 (row 3), m4 (row 4).<br>
+ ; Clobbers m0, m1, m2 and leaves r0 advanced by eight rows.<br>
+ movq m0, [r0]<br>
+ movq m1, [r0 + r1]<br>
+ punpcklbw m0, m1<br>
+ pmaddubsw m7, m0, [r6 + 0 *16] ;m7=[0+1] Row1<br>
+<br>
+ lea r0, [r0 + 2 * r1]<br>
+ movq m0, [r0]<br>
+ punpcklbw m1, m0<br>
+ pmaddubsw m6, m1, [r6 + 0 *16] ;m6=[1+2] Row2<br>
+<br>
+ movq m1, [r0 + r1]<br>
+ punpcklbw m0, m1<br>
+ pmaddubsw m5, m0, [r6 + 0 *16] ;m5=[2+3] Row3<br>
+ pmaddubsw m0, [r6 + 1 * 16]<br>
+ paddw m7, m0 ;m7=[0+1+2+3] Row1<br>
+<br>
+ lea r0, [r0 + 2 * r1]<br>
+ movq m0, [r0]<br>
+ punpcklbw m1, m0<br>
+ pmaddubsw m4, m1, [r6 + 0 *16] ;m4=[3+4] Row4<br>
+ pmaddubsw m1, [r6 + 1 * 16]<br>
+ paddw m6, m1 ;m6 = [1+2+3+4] Row2<br>
+<br>
+ ; from here on each interleaved row pair feeds two output rows, via m2 and in place<br>
+ movq m1, [r0 + r1]<br>
+ punpcklbw m0, m1<br>
+ pmaddubsw m2, m0, [r6 + 1 * 16]<br>
+ pmaddubsw m0, [r6 + 2 * 16]<br>
+ paddw m7, m0 ;m7=[0+1+2+3+4+5] Row1<br>
+ paddw m5, m2 ;m5=[2+3+4+5] Row3<br>
+<br>
+ lea r0, [r0 + 2 * r1]<br>
+ movq m0, [r0]<br>
+ punpcklbw m1, m0<br>
+ pmaddubsw m2, m1, [r6 + 1 * 16]<br>
+ pmaddubsw m1, [r6 + 2 * 16]<br>
+ paddw m6, m1 ;m6=[1+2+3+4+5+6] Row2<br>
+ paddw m4, m2 ;m4=[3+4+5+6] Row4<br>
+<br>
+ movq m1, [r0 + r1]<br>
+ punpcklbw m0, m1<br>
+ pmaddubsw m2, m0, [r6 + 2 * 16]<br>
+ pmaddubsw m0, [r6 + 3 * 16]<br>
+ paddw m7, m0 ;m7=[0+1+2+3+4+5+6+7] Row1 end<br>
+ paddw m5, m2 ;m5=[2+3+4+5+6+7] Row3<br>
+<br>
+ lea r0, [r0 + 2 * r1]<br>
+ movq m0, [r0]<br>
+ punpcklbw m1, m0<br>
+ pmaddubsw m2, m1, [r6 + 2 * 16]<br>
+ pmaddubsw m1, [r6 + 3 * 16]<br>
+ paddw m6, m1 ;m6=[1+2+3+4+5+6+7+8] Row2 end<br>
+ paddw m4, m2 ;m4=[3+4+5+6+7+8] Row4<br>
+<br>
+ movq m1, [r0 + r1]<br>
+ punpcklbw m0, m1<br>
+ pmaddubsw m0, [r6 + 3 * 16]<br>
+ paddw m5, m0 ;m5=[2+3+4+5+6+7+8+9] Row3 end<br>
+<br>
+ movq m0, [r0 + 2 * r1]<br>
+ punpcklbw m1, m0<br>
+ pmaddubsw m1, [r6 + 3 * 16]<br>
+ paddw m4, m1 ;m4=[3+4+5+6+7+8+9+10] Row4 end<br>
+%endmacro<br>
+<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+; void interp_8tap_vert_%3_4x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+%macro FILTER_VER_LUMA_4xN 3<br>
+INIT_XMM sse4<br>
+cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 6<br>
+ lea r5, [3 * r1]<br>
+ sub r0, r5<br>
+ shl r4d, 6<br>
+%ifidn %3,ps<br>
+ add r3d, r3d<br>
+%endif<br>
+<br>
+%ifdef PIC<br>
+ lea r5, [tab_LumaCoeffVer]<br>
+ lea r6, [r5 + r4]<br>
+%else<br>
+ lea r6, [tab_LumaCoeffVer + r4]<br>
+%endif<br>
+<br>
+%ifidn %3,pp<br>
+ mova m3, [pw_512]<br>
+%else<br>
+ mova m3, [pw_2000]<br>
+%endif<br>
+<br>
+ mov r4d, %2/4<br>
+ lea r5, [4 * r1]<br>
+<br>
+.loopH:<br>
+ PROCESS_LUMA_W4_4R<br>
+<br>
+%ifidn %3,pp<br>
+ pmulhrsw m4, m3<br>
+ pmulhrsw m5, m3<br>
+<br>
+ packuswb m4, m5<br>
+<br>
+ movd [r2], m4<br>
+ pextrd [r2 + r3], m4, 1<br>
+ lea r2, [r2 + 2 * r3]<br>
+ pextrd [r2], m4, 2<br>
+ pextrd [r2 + r3], m4, 3<br>
+%else<br>
+ psubw m4, m3<br>
+ psubw m5, m3<br>
+<br>
+ movlps [r2], m4<br>
+ movhps [r2 + r3], m4<br>
+ lea r2, [r2 + 2 * r3]<br>
+ movlps [r2], m5<br>
+ movhps [r2 + r3], m5<br>
+%endif<br>
+<br>
+ sub r0, r5<br>
+ lea r2, [r2 + 2 * r3]<br>
+<br>
+ dec r4d<br>
+ jnz .loopH<br>
+<br>
+ RET<br>
+%endmacro<br>
+<br>
+<br>
+INIT_YMM avx2<br>
+cglobal interp_8tap_vert_pp_4x4, 4,6,8<br>
+ mov r4d, r4m<br>
+ lea r5, [r1 * 3]<br>
+ sub r0, r5<br>
+<br>
+ ; TODO: VPGATHERDD<br>
+ movd xm1, [r0] ; m1 = row0<br>
+ movd xm2, [r0 + r1] ; m2 = row1<br>
+ punpcklbw xm1, xm2 ; m1 = [13 03 12 02 11 01 10 00]<br>
+<br>
+ movd xm3, [r0 + r1 * 2] ; m3 = row2<br>
+ punpcklbw xm2, xm3 ; m2 = [23 13 22 12 21 11 20 10]<br>
+ movd xm4, [r0 + r5]<br>
+ punpcklbw xm3, xm4 ; m3 = [33 23 32 22 31 21 30 20]<br>
+ punpcklwd xm1, xm3 ; m1 = [33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00]<br>
+<br>
+ lea r0, [r0 + r1 * 4]<br>
+ movd xm5, [r0] ; m5 = row4<br>
+ punpcklbw xm4, xm5 ; m4 = [43 33 42 32 41 31 40 30]<br>
+ punpcklwd xm2, xm4 ; m2 = [43 33 23 13 42 32 22 12 41 31 21 11 40 30 20 10]<br>
+ vinserti128 m1, m1, xm2, 1 ; m1 = [43 33 23 13 42 32 22 12 41 31 21 11 40 30 20 10] - [33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00]<br>
+ movd xm2, [r0 + r1] ; m2 = row5<br>
+ punpcklbw xm5, xm2 ; m5 = [53 43 52 42 51 41 50 40]<br>
+ punpcklwd xm3, xm5 ; m3 = [53 43 33 23 52 42 32 22 51 41 31 21 50 40 30 20]<br>
+ movd xm6, [r0 + r1 * 2] ; m6 = row6<br>
+ punpcklbw xm2, xm6 ; m2 = [63 53 62 52 61 51 60 50]<br>
+ punpcklwd xm4, xm2 ; m4 = [63 53 43 33 62 52 42 32 61 51 41 31 60 50 40 30]<br>
+ vinserti128 m3, m3, xm4, 1 ; m3 = [63 53 43 33 62 52 42 32 61 51 41 31 60 50 40 30] - [53 43 33 23 52 42 32 22 51 41 31 21 50 40 30 20]<br>
+ movd xm4, [r0 + r5] ; m4 = row7<br>
+ punpcklbw xm6, xm4 ; m6 = [73 63 72 62 71 61 70 60]<br>
+ punpcklwd xm5, xm6 ; m5 = [73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40]<br>
+<br>
+ lea r0, [r0 + r1 * 4]<br>
+ movd xm7, [r0] ; m7 = row8<br>
+ punpcklbw xm4, xm7 ; m4 = [83 73 82 72 81 71 80 70]<br>
+ punpcklwd xm2, xm4 ; m2 = [83 73 63 53 82 72 62 52 81 71 61 51 80 70 60 50]<br>
+ vinserti128 m5, m5, xm2, 1 ; m5 = [83 73 63 53 82 72 62 52 81 71 61 51 80 70 60 50] - [73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40]<br>
+ movd xm2, [r0 + r1] ; m2 = row9<br>
+ punpcklbw xm7, xm2 ; m7 = [93 83 92 82 91 81 90 80]<br>
+ punpcklwd xm6, xm7 ; m6 = [93 83 73 63 92 82 72 62 91 81 71 61 90 80 70 60]<br>
+ movd xm7, [r0 + r1 * 2] ; m7 = rowA<br>
+ punpcklbw xm2, xm7 ; m2 = [A3 93 A2 92 A1 91 A0 90]<br>
+ punpcklwd xm4, xm2 ; m4 = [A3 93 83 73 A2 92 82 72 A1 91 81 71 A0 90 80 70]<br>
+ vinserti128 m6, m6, xm4, 1 ; m6 = [A3 93 83 73 A2 92 82 72 A1 91 81 71 A0 90 80 70] - [93 83 73 63 92 82 72 62 91 81 71 61 90 80 70 60]<br>
+<br>
+ ; load filter coeff: m0 = bytes 0-3 and m2 = bytes 4-7 of the 8-byte tab_LumaCoeff entry selected by r4, broadcast to every dword<br>
+%ifdef PIC<br>
+ lea r5, [tab_LumaCoeff]<br>
+ vpbroadcastd m0, [r5 + r4 * 8 + 0]<br>
+ vpbroadcastd m2, [r5 + r4 * 8 + 4]<br>
+%else<br>
+ vpbroadcastd m0, [tab_LumaCoeff + r4 * 8 + 0]<br>
+ vpbroadcastd m2, [tab_LumaCoeff + r4 * 8 + 4]<br>
+%endif<br>
+<br>
+ pmaddubsw m1, m0<br>
+ pmaddubsw m3, m0<br>
+ pmaddubsw m5, m2<br>
+ pmaddubsw m6, m2<br>
+ vbroadcasti128 m0, [pw_1]<br>
+ pmaddwd m1, m0<br>
+ pmaddwd m3, m0<br>
+ pmaddwd m5, m0<br>
+ pmaddwd m6, m0<br>
+ paddd m1, m5 ; m1 = DQWORD ROW[1 0]<br>
+ paddd m3, m6 ; m3 = DQWORD ROW[3 2]<br>
+ packssdw m1, m3 ; m1 = QWORD ROW[3 1 2 0]<br>
+<br>
+ ; TODO: does it overflow?<br>
+ pmulhrsw m1, [pw_512]<br>
+ vextracti128 xm2, m1, 1<br>
+ packuswb xm1, xm2 ; m1 = DWORD ROW[3 1 2 0]<br>
+ movd [r2], xm1<br>
+ pextrd [r2 + r3], xm1, 2<br>
+ pextrd [r2 + r3 * 2], xm1, 1<br>
+ lea r4, [r3 * 3]<br>
+ pextrd [r2 + r4], xm1, 3<br>
+ RET<br>
+<br>
+<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+; void interp_8tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+FILTER_VER_LUMA_4xN 4, 4, pp<br>
+<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+; void interp_8tap_vert_pp_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+FILTER_VER_LUMA_4xN 4, 8, pp<br>
+<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+; void interp_8tap_vert_pp_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+FILTER_VER_LUMA_4xN 4, 16, pp<br>
+<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+; void interp_8tap_vert_ps_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+FILTER_VER_LUMA_4xN 4, 4, ps<br>
+<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+; void interp_8tap_vert_ps_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+FILTER_VER_LUMA_4xN 4, 8, ps<br>
+<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+; void interp_8tap_vert_ps_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+FILTER_VER_LUMA_4xN 4, 16, ps<br>
+<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+; void interp_8tap_vert_%3_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+%macro FILTER_VER_LUMA_8xN 3<br>
+INIT_XMM sse4<br>
+cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8<br>
+ lea r5, [3 * r1]<br>
+ sub r0, r5<br>
+ shl r4d, 6<br>
+<br>
+%ifidn %3,ps<br>
+ add r3d, r3d<br>
+%endif<br>
+<br>
+%ifdef PIC<br>
+ lea r5, [tab_LumaCoeffVer]<br>
+ lea r6, [r5 + r4]<br>
+%else<br>
+ lea r6, [tab_LumaCoeffVer + r4]<br>
+%endif<br>
+<br>
+ %ifidn %3,pp<br>
+ mova m3, [pw_512]<br>
+%else<br>
+ mova m3, [pw_2000]<br>
+%endif<br>
+<br>
+ mov r4d, %2/4<br>
+ lea r5, [4 * r1]<br>
+<br>
+.loopH:<br>
+ PROCESS_LUMA_W8_4R<br>
+<br>
+%ifidn %3,pp<br>
+ pmulhrsw m7, m3<br>
+ pmulhrsw m6, m3<br>
+ pmulhrsw m5, m3<br>
+ pmulhrsw m4, m3<br>
+<br>
+ packuswb m7, m6<br>
+ packuswb m5, m4<br>
+<br>
+ movlps [r2], m7<br>
+ movhps [r2 + r3], m7<br>
+ lea r2, [r2 + 2 * r3]<br>
+ movlps [r2], m5<br>
+ movhps [r2 + r3], m5<br>
+%else<br>
+ psubw m7, m3<br>
+ psubw m6, m3<br>
+ psubw m5, m3<br>
+ psubw m4, m3<br>
+<br>
+ movu [r2], m7<br>
+ movu [r2 + r3], m6<br>
+ lea r2, [r2 + 2 * r3]<br>
+ movu [r2], m5<br>
+ movu [r2 + r3], m4<br>
+%endif<br>
+<br>
+ sub r0, r5<br>
+ lea r2, [r2 + 2 * r3]<br>
+<br>
+ dec r4d<br>
+ jnz .loopH<br>
+<br>
+ RET<br>
+%endmacro<br>
+<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+; void interp_8tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+FILTER_VER_LUMA_8xN 8, 4, pp<br>
+<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+; void interp_8tap_vert_pp_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+FILTER_VER_LUMA_8xN 8, 8, pp<br>
+<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+; void interp_8tap_vert_pp_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+FILTER_VER_LUMA_8xN 8, 16, pp<br>
+<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+; void interp_8tap_vert_pp_8x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+FILTER_VER_LUMA_8xN 8, 32, pp<br>
+<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+; void interp_8tap_vert_ps_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+FILTER_VER_LUMA_8xN 8, 4, ps<br>
+<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+; void interp_8tap_vert_ps_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+FILTER_VER_LUMA_8xN 8, 8, ps<br>
+<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+; void interp_8tap_vert_ps_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+FILTER_VER_LUMA_8xN 8, 16, ps<br>
+<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+; void interp_8tap_vert_ps_8x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+FILTER_VER_LUMA_8xN 8, 32, ps<br>
+<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+; void interp_8tap_vert_%3_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+%macro FILTER_VER_LUMA_12xN 3<br>
+INIT_XMM sse4<br>
+cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8<br>
+ lea r5, [3 * r1]<br>
+ sub r0, r5<br>
+ shl r4d, 6<br>
+%ifidn %3,ps<br>
+ add r3d, r3d<br>
+%endif<br>
+<br>
+%ifdef PIC<br>
+ lea r5, [tab_LumaCoeffVer]<br>
+ lea r6, [r5 + r4]<br>
+%else<br>
+ lea r6, [tab_LumaCoeffVer + r4]<br>
+%endif<br>
+<br>
+ %ifidn %3,pp<br>
+ mova m3, [pw_512]<br>
+%else<br>
+ mova m3, [pw_2000]<br>
+%endif<br>
+<br>
+ mov r4d, %2/4<br>
+<br>
+.loopH:<br>
+ PROCESS_LUMA_W8_4R<br>
+<br>
+%ifidn %3,pp<br>
+ pmulhrsw m7, m3<br>
+ pmulhrsw m6, m3<br>
+ pmulhrsw m5, m3<br>
+ pmulhrsw m4, m3<br>
+<br>
+ packuswb m7, m6<br>
+ packuswb m5, m4<br>
+<br>
+ movlps [r2], m7<br>
+ movhps [r2 + r3], m7<br>
+ lea r5, [r2 + 2 * r3]<br>
+ movlps [r5], m5<br>
+ movhps [r5 + r3], m5<br>
+%else<br>
+ psubw m7, m3<br>
+ psubw m6, m3<br>
+ psubw m5, m3<br>
+ psubw m4, m3<br>
+<br>
+ movu [r2], m7<br>
+ movu [r2 + r3], m6<br>
+ lea r5, [r2 + 2 * r3]<br>
+ movu [r5], m5<br>
+ movu [r5 + r3], m4<br>
+%endif<br>
+<br>
+ lea r5, [8 * r1 - 8]<br>
+ sub r0, r5<br>
+%ifidn %3,pp<br>
+ add r2, 8<br>
+%else<br>
+ add r2, 16<br>
+%endif<br>
+<br>
+ PROCESS_LUMA_W4_4R<br>
+<br>
+%ifidn %3,pp<br>
+ pmulhrsw m4, m3<br>
+ pmulhrsw m5, m3<br>
+<br>
+ packuswb m4, m5<br>
+<br>
+ movd [r2], m4<br>
+ pextrd [r2 + r3], m4, 1<br>
+ lea r5, [r2 + 2 * r3]<br>
+ pextrd [r5], m4, 2<br>
+ pextrd [r5 + r3], m4, 3<br>
+%else<br>
+ psubw m4, m3<br>
+ psubw m5, m3<br>
+<br>
+ movlps [r2], m4<br>
+ movhps [r2 + r3], m4<br>
+ lea r5, [r2 + 2 * r3]<br>
+ movlps [r5], m5<br>
+ movhps [r5 + r3], m5<br>
+%endif<br>
+<br>
+ lea r5, [4 * r1 + 8]<br>
+ sub r0, r5<br>
+%ifidn %3,pp<br>
+ lea r2, [r2 + 4 * r3 - 8]<br>
+%else<br>
+ lea r2, [r2 + 4 * r3 - 16]<br>
+%endif<br>
+<br>
+ dec r4d<br>
+ jnz .loopH<br>
+<br>
+ RET<br>
+%endmacro<br>
+<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+; void interp_8tap_vert_pp_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+FILTER_VER_LUMA_12xN 12, 16, pp<br>
+<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+; void interp_8tap_vert_ps_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+FILTER_VER_LUMA_12xN 12, 16, ps<br>
+<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+; void interp_8tap_vert_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+%macro FILTER_VER_LUMA 3<br>
+INIT_XMM sse4<br>
+cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8 ,0-gprsize<br>
+ lea r5, [3 * r1]<br>
+ sub r0, r5<br>
+ shl r4d, 6<br>
+%ifidn %3,ps<br>
+ add r3d, r3d<br>
+%endif<br>
+<br>
+%ifdef PIC<br>
+ lea r5, [tab_LumaCoeffVer]<br>
+ lea r6, [r5 + r4]<br>
+%else<br>
+ lea r6, [tab_LumaCoeffVer + r4]<br>
+%endif<br>
+<br>
+%ifidn %3,pp<br>
+ mova m3, [pw_512]<br>
+%else<br>
+ mova m3, [pw_2000]<br>
+%endif<br>
+ mov dword [rsp], %2/4<br>
+<br>
+.loopH:<br>
+ mov r4d, (%1/8)<br>
+.loopW:<br>
+ PROCESS_LUMA_W8_4R<br>
+%ifidn %3,pp<br>
+ pmulhrsw m7, m3<br>
+ pmulhrsw m6, m3<br>
+ pmulhrsw m5, m3<br>
+ pmulhrsw m4, m3<br>
+<br>
+ packuswb m7, m6<br>
+ packuswb m5, m4<br>
+<br>
+ movlps [r2], m7<br>
+ movhps [r2 + r3], m7<br>
+ lea r5, [r2 + 2 * r3]<br>
+ movlps [r5], m5<br>
+ movhps [r5 + r3], m5<br>
+%else<br>
+ psubw m7, m3<br>
+ psubw m6, m3<br>
+ psubw m5, m3<br>
+ psubw m4, m3<br>
+<br>
+ movu [r2], m7<br>
+ movu [r2 + r3], m6<br>
+ lea r5, [r2 + 2 * r3]<br>
+ movu [r5], m5<br>
+ movu [r5 + r3], m4<br>
+%endif<br>
+<br>
+ lea r5, [8 * r1 - 8]<br>
+ sub r0, r5<br>
+%ifidn %3,pp<br>
+ add r2, 8<br>
+%else<br>
+ add r2, 16<br>
+%endif<br>
+ dec r4d<br>
+ jnz .loopW<br>
+<br>
+ lea r0, [r0 + 4 * r1 - %1]<br>
+%ifidn %3,pp<br>
+ lea r2, [r2 + 4 * r3 - %1]<br>
+%else<br>
+ lea r2, [r2 + 4 * r3 - 2 * %1]<br>
+%endif<br>
+<br>
+ dec dword [rsp]<br>
+ jnz .loopH<br>
+<br>
+ RET<br>
+%endmacro<br>
+<br>
+FILTER_VER_LUMA 16, 4, pp<br>
+FILTER_VER_LUMA 16, 8, pp<br>
+FILTER_VER_LUMA 16, 12, pp<br>
+FILTER_VER_LUMA 16, 16, pp<br>
+FILTER_VER_LUMA 16, 32, pp<br>
+FILTER_VER_LUMA 16, 64, pp<br>
+FILTER_VER_LUMA 24, 32, pp<br>
+FILTER_VER_LUMA 32, 8, pp<br>
+FILTER_VER_LUMA 32, 16, pp<br>
+FILTER_VER_LUMA 32, 24, pp<br>
+FILTER_VER_LUMA 32, 32, pp<br>
+FILTER_VER_LUMA 32, 64, pp<br>
+FILTER_VER_LUMA 48, 64, pp<br>
+FILTER_VER_LUMA 64, 16, pp<br>
+FILTER_VER_LUMA 64, 32, pp<br>
+FILTER_VER_LUMA 64, 48, pp<br>
+FILTER_VER_LUMA 64, 64, pp<br>
+<br>
+FILTER_VER_LUMA 16, 4, ps<br>
+FILTER_VER_LUMA 16, 8, ps<br>
+FILTER_VER_LUMA 16, 12, ps<br>
+FILTER_VER_LUMA 16, 16, ps<br>
+FILTER_VER_LUMA 16, 32, ps<br>
+FILTER_VER_LUMA 16, 64, ps<br>
+FILTER_VER_LUMA 24, 32, ps<br>
+FILTER_VER_LUMA 32, 8, ps<br>
+FILTER_VER_LUMA 32, 16, ps<br>
+FILTER_VER_LUMA 32, 24, ps<br>
+FILTER_VER_LUMA 32, 32, ps<br>
+FILTER_VER_LUMA 32, 64, ps<br>
+FILTER_VER_LUMA 48, 64, ps<br>
+FILTER_VER_LUMA 64, 16, ps<br>
+FILTER_VER_LUMA 64, 32, ps<br>
+FILTER_VER_LUMA 64, 48, ps<br>
+FILTER_VER_LUMA 64, 64, ps<br>
+<br>
+%macro PROCESS_LUMA_SP_W4_4R 0<br>
+ movq m0, [r0]<br>
+ movq m1, [r0 + r1]<br>
+ punpcklwd m0, m1 ;m0=[0 1]<br>
+ pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1<br>
+<br>
+ lea r0, [r0 + 2 * r1]<br>
+ movq m4, [r0]<br>
+ punpcklwd m1, m4 ;m1=[1 2]<br>
+ pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2<br>
+<br>
+ movq m5, [r0 + r1]<br>
+ punpcklwd m4, m5 ;m4=[2 3]<br>
+ pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3<br>
+ pmaddwd m4, [r6 + 1 * 16]<br>
+ paddd m0, m4 ;m0=[0+1+2+3] Row1<br>
+<br>
+ lea r0, [r0 + 2 * r1]<br>
+ movq m4, [r0]<br>
+ punpcklwd m5, m4 ;m5=[3 4]<br>
+ pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4<br>
+ pmaddwd m5, [r6 + 1 * 16]<br>
+ paddd m1, m5 ;m1 = [1+2+3+4] Row2<br>
+<br>
+ movq m5, [r0 + r1]<br>
+ punpcklwd m4, m5 ;m4=[4 5]<br>
+ pmaddwd m6, m4, [r6 + 1 * 16]<br>
+ paddd m2, m6 ;m2=[2+3+4+5] Row3<br>
+ pmaddwd m4, [r6 + 2 * 16]<br>
+ paddd m0, m4 ;m0=[0+1+2+3+4+5] Row1<br>
+<br>
+ lea r0, [r0 + 2 * r1]<br>
+ movq m4, [r0]<br>
+ punpcklwd m5, m4 ;m5=[5 6]<br>
+ pmaddwd m6, m5, [r6 + 1 * 16]<br>
+ paddd m3, m6 ;m3=[3+4+5+6] Row4<br>
+ pmaddwd m5, [r6 + 2 * 16]<br>
+ paddd m1, m5 ;m1=[1+2+3+4+5+6] Row2<br>
+<br>
+ movq m5, [r0 + r1]<br>
+ punpcklwd m4, m5 ;m4=[6 7]<br>
+ pmaddwd m6, m4, [r6 + 2 * 16]<br>
+ paddd m2, m6 ;m2=[2+3+4+5+6+7] Row3<br>
+ pmaddwd m4, [r6 + 3 * 16]<br>
+ paddd m0, m4 ;m0=[0+1+2+3+4+5+6+7] Row1 end<br>
+<br>
+ lea r0, [r0 + 2 * r1]<br>
+ movq m4, [r0]<br>
+ punpcklwd m5, m4 ;m5=[7 8]<br>
+ pmaddwd m6, m5, [r6 + 2 * 16]<br>
+ paddd m3, m6 ;m3=[3+4+5+6+7+8] Row4<br>
+ pmaddwd m5, [r6 + 3 * 16]<br>
+ paddd m1, m5 ;m1=[1+2+3+4+5+6+7+8] Row2 end<br>
+<br>
+ movq m5, [r0 + r1]<br>
+ punpcklwd m4, m5 ;m4=[8 9]<br>
+ pmaddwd m4, [r6 + 3 * 16]<br>
+ paddd m2, m4 ;m2=[2+3+4+5+6+7+8+9] Row3 end<br>
+<br>
+ movq m4, [r0 + 2 * r1]<br>
+ punpcklwd m5, m4 ;m5=[9 10]<br>
+ pmaddwd m5, [r6 + 3 * 16]<br>
+ paddd m3, m5 ;m3=[3+4+5+6+7+8+9+10] Row4 end<br>
+%endmacro<br>
+<br>
+;--------------------------------------------------------------------------------------------------------------<br>
+; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;--------------------------------------------------------------------------------------------------------------<br>
+%macro FILTER_VER_LUMA_SP 2<br>
+INIT_XMM sse4<br>
+cglobal interp_8tap_vert_sp_%1x%2, 5, 7, 8 ,0-gprsize<br>
+<br>
+ add r1d, r1d<br>
+ lea r5, [r1 + 2 * r1]<br>
+ sub r0, r5<br>
+ shl r4d, 6<br>
+<br>
+%ifdef PIC<br>
+ lea r5, [tab_LumaCoeffV]<br>
+ lea r6, [r5 + r4]<br>
+%else<br>
+ lea r6, [tab_LumaCoeffV + r4]<br>
+%endif<br>
+<br>
+ mova m7, [tab_c_526336]<br>
+<br>
+ mov dword [rsp], %2/4<br>
+.loopH:<br>
+ mov r4d, (%1/4)<br>
+.loopW:<br>
+ PROCESS_LUMA_SP_W4_4R<br>
+<br>
+ paddd m0, m7<br>
+ paddd m1, m7<br>
+ paddd m2, m7<br>
+ paddd m3, m7<br>
+<br>
+ psrad m0, 12<br>
+ psrad m1, 12<br>
+ psrad m2, 12<br>
+ psrad m3, 12<br>
+<br>
+ packssdw m0, m1<br>
+ packssdw m2, m3<br>
+<br>
+ packuswb m0, m2<br>
+<br>
+ movd [r2], m0<br>
+ pextrd [r2 + r3], m0, 1<br>
+ lea r5, [r2 + 2 * r3]<br>
+ pextrd [r5], m0, 2<br>
+ pextrd [r5 + r3], m0, 3<br>
+<br>
+ lea r5, [8 * r1 - 2 * 4]<br>
+ sub r0, r5<br>
+ add r2, 4<br>
+<br>
+ dec r4d<br>
+ jnz .loopW<br>
+<br>
+ lea r0, [r0 + 4 * r1 - 2 * %1]<br>
+ lea r2, [r2 + 4 * r3 - %1]<br>
+<br>
+ dec dword [rsp]<br>
+ jnz .loopH<br>
+<br>
+ RET<br>
+%endmacro<br>
+<br>
+;--------------------------------------------------------------------------------------------------------------<br>
+; void interp_8tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;--------------------------------------------------------------------------------------------------------------<br>
+ FILTER_VER_LUMA_SP 4, 4<br>
+ FILTER_VER_LUMA_SP 8, 8<br>
+ FILTER_VER_LUMA_SP 8, 4<br>
+ FILTER_VER_LUMA_SP 4, 8<br>
+ FILTER_VER_LUMA_SP 16, 16<br>
+ FILTER_VER_LUMA_SP 16, 8<br>
+ FILTER_VER_LUMA_SP 8, 16<br>
+ FILTER_VER_LUMA_SP 16, 12<br>
+ FILTER_VER_LUMA_SP 12, 16<br>
+ FILTER_VER_LUMA_SP 16, 4<br>
+ FILTER_VER_LUMA_SP 4, 16<br>
+ FILTER_VER_LUMA_SP 32, 32<br>
+ FILTER_VER_LUMA_SP 32, 16<br>
+ FILTER_VER_LUMA_SP 16, 32<br>
+ FILTER_VER_LUMA_SP 32, 24<br>
+ FILTER_VER_LUMA_SP 24, 32<br>
+ FILTER_VER_LUMA_SP 32, 8<br>
+ FILTER_VER_LUMA_SP 8, 32<br>
+ FILTER_VER_LUMA_SP 64, 64<br>
+ FILTER_VER_LUMA_SP 64, 32<br>
+ FILTER_VER_LUMA_SP 32, 64<br>
+ FILTER_VER_LUMA_SP 64, 48<br>
+ FILTER_VER_LUMA_SP 48, 64<br>
+ FILTER_VER_LUMA_SP 64, 16<br>
+ FILTER_VER_LUMA_SP 16, 64<br>
+<br>
+; TODO: combining U and V would be faster, but needs more registers<br>
+; TODO: using two paths (height aligned to 4, and otherwise) may improve performance by about 10%, but the code is more complex, so it is disabled<br>
+INIT_XMM ssse3<br>
+cglobal chroma_p2s, 3, 7, 4<br>
+<br>
+ ; load width and height<br>
+ mov r3d, r3m<br>
+ mov r4d, r4m<br>
+<br>
+ ; load constant<br>
+ mova m2, [pb_128]<br>
+ mova m3, [tab_c_64_n64]<br>
+<br>
+.loopH:<br>
+<br>
+ xor r5d, r5d<br>
+.loopW:<br>
+ lea r6, [r0 + r5]<br>
+<br>
+ movh m0, [r6]<br>
+ punpcklbw m0, m2<br>
+ pmaddubsw m0, m3<br>
+<br>
+ movh m1, [r6 + r1]<br>
+ punpcklbw m1, m2<br>
+ pmaddubsw m1, m3<br>
+<br>
+ add r5d, 8<br>
+ cmp r5d, r3d<br>
+ lea r6, [r2 + r5 * 2]<br>
+ jg .width4<br>
+ movu [r6 + FENC_STRIDE / 2 * 0 - 16], m0<br>
+ movu [r6 + FENC_STRIDE / 2 * 2 - 16], m1<br>
+ je .nextH<br>
+ jmp .loopW<br>
+<br>
+.width4:<br>
+ test r3d, 4<br>
+ jz .width2<br>
+ test r3d, 2<br>
+ movh [r6 + FENC_STRIDE / 2 * 0 - 16], m0<br>
+ movh [r6 + FENC_STRIDE / 2 * 2 - 16], m1<br>
+ lea r6, [r6 + 8]<br>
+ pshufd m0, m0, 2<br>
+ pshufd m1, m1, 2<br>
+ jz .nextH<br>
+<br>
+.width2:<br>
+ movd [r6 + FENC_STRIDE / 2 * 0 - 16], m0<br>
+ movd [r6 + FENC_STRIDE / 2 * 2 - 16], m1<br>
+<br>
+.nextH:<br>
+ lea r0, [r0 + r1 * 2]<br>
+ add r2, FENC_STRIDE / 2 * 4<br>
+<br>
+ sub r4d, 2<br>
+ jnz .loopH<br>
+<br>
+ RET<br>
+<br>
+%macro PROCESS_CHROMA_SP_W4_4R 0<br>
+ movq m0, [r0]<br>
+ movq m1, [r0 + r1]<br>
+ punpcklwd m0, m1 ;m0=[0 1]<br>
+ pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1<br>
+<br>
+ lea r0, [r0 + 2 * r1]<br>
+ movq m4, [r0]<br>
+ punpcklwd m1, m4 ;m1=[1 2]<br>
+ pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2<br>
+<br>
+ movq m5, [r0 + r1]<br>
+ punpcklwd m4, m5 ;m4=[2 3]<br>
+ pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3<br>
+ pmaddwd m4, [r6 + 1 * 16]<br>
+ paddd m0, m4 ;m0=[0+1+2+3] Row1 done<br>
+<br>
+ lea r0, [r0 + 2 * r1]<br>
+ movq m4, [r0]<br>
+ punpcklwd m5, m4 ;m5=[3 4]<br>
+ pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4<br>
+ pmaddwd m5, [r6 + 1 * 16]<br>
+ paddd m1, m5 ;m1=[1+2+3+4] Row2 done<br>
+<br>
+ movq m5, [r0 + r1]<br>
+ punpcklwd m4, m5 ;m4=[4 5]<br>
+ pmaddwd m4, [r6 + 1 * 16]<br>
+ paddd m2, m4 ;m2=[2+3+4+5] Row3 done<br>
+<br>
+ movq m4, [r0 + 2 * r1]<br>
+ punpcklwd m5, m4 ;m5=[5 6]<br>
+ pmaddwd m5, [r6 + 1 * 16]<br>
+ paddd m3, m5 ;m3=[3+4+5+6] Row4 done<br>
+%endmacro<br>
+<br>
+;--------------------------------------------------------------------------------------------------------------<br>
+; void interp_4tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;--------------------------------------------------------------------------------------------------------------<br>
+%macro FILTER_VER_CHROMA_SP 2<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_vert_sp_%1x%2, 5, 7, 7 ,0-gprsize<br>
+<br>
+ add r1d, r1d<br>
+ sub r0, r1<br>
+ shl r4d, 5<br>
+<br>
+%ifdef PIC<br>
+ lea r5, [tab_ChromaCoeffV]<br>
+ lea r6, [r5 + r4]<br>
+%else<br>
+ lea r6, [tab_ChromaCoeffV + r4]<br>
+%endif<br>
+<br>
+ mova m6, [tab_c_526336]<br>
+<br>
+ mov dword [rsp], %2/4<br>
+<br>
+.loopH:<br>
+ mov r4d, (%1/4)<br>
+.loopW:<br>
+ PROCESS_CHROMA_SP_W4_4R<br>
+<br>
+ paddd m0, m6<br>
+ paddd m1, m6<br>
+ paddd m2, m6<br>
+ paddd m3, m6<br>
+<br>
+ psrad m0, 12<br>
+ psrad m1, 12<br>
+ psrad m2, 12<br>
+ psrad m3, 12<br>
+<br>
+ packssdw m0, m1<br>
+ packssdw m2, m3<br>
+<br>
+ packuswb m0, m2<br>
+<br>
+ movd [r2], m0<br>
+ pextrd [r2 + r3], m0, 1<br>
+ lea r5, [r2 + 2 * r3]<br>
+ pextrd [r5], m0, 2<br>
+ pextrd [r5 + r3], m0, 3<br>
+<br>
+ lea r5, [4 * r1 - 2 * 4]<br>
+ sub r0, r5<br>
+ add r2, 4<br>
+<br>
+ dec r4d<br>
+ jnz .loopW<br>
+<br>
+ lea r0, [r0 + 4 * r1 - 2 * %1]<br>
+ lea r2, [r2 + 4 * r3 - %1]<br>
+<br>
+ dec dword [rsp]<br>
+ jnz .loopH<br>
+<br>
+ RET<br>
+%endmacro<br>
+<br>
+ FILTER_VER_CHROMA_SP 4, 4<br>
+ FILTER_VER_CHROMA_SP 4, 8<br>
+ FILTER_VER_CHROMA_SP 16, 16<br>
+ FILTER_VER_CHROMA_SP 16, 8<br>
+ FILTER_VER_CHROMA_SP 16, 12<br>
+ FILTER_VER_CHROMA_SP 12, 16<br>
+ FILTER_VER_CHROMA_SP 16, 4<br>
+ FILTER_VER_CHROMA_SP 4, 16<br>
+ FILTER_VER_CHROMA_SP 32, 32<br>
+ FILTER_VER_CHROMA_SP 32, 16<br>
+ FILTER_VER_CHROMA_SP 16, 32<br>
+ FILTER_VER_CHROMA_SP 32, 24<br>
+ FILTER_VER_CHROMA_SP 24, 32<br>
+ FILTER_VER_CHROMA_SP 32, 8<br>
+<br>
+ FILTER_VER_CHROMA_SP 16, 24<br>
+ FILTER_VER_CHROMA_SP 16, 64<br>
+ FILTER_VER_CHROMA_SP 12, 32<br>
+ FILTER_VER_CHROMA_SP 4, 32<br>
+ FILTER_VER_CHROMA_SP 32, 64<br>
+ FILTER_VER_CHROMA_SP 32, 48<br>
+ FILTER_VER_CHROMA_SP 24, 64<br>
+<br>
+ FILTER_VER_CHROMA_SP 64, 64<br>
+ FILTER_VER_CHROMA_SP 64, 32<br>
+ FILTER_VER_CHROMA_SP 64, 48<br>
+ FILTER_VER_CHROMA_SP 48, 64<br>
+ FILTER_VER_CHROMA_SP 64, 16<br>
+<br>
+<br>
+%macro PROCESS_CHROMA_SP_W2_4R 1<br>
+ movd m0, [r0]<br>
+ movd m1, [r0 + r1]<br>
+ punpcklwd m0, m1 ;m0=[0 1]<br>
+<br>
+ lea r0, [r0 + 2 * r1]<br>
+ movd m2, [r0]<br>
+ punpcklwd m1, m2 ;m1=[1 2]<br>
+ punpcklqdq m0, m1 ;m0=[0 1 1 2]<br>
+ pmaddwd m0, [%1 + 0 *16] ;m0=[0+1 1+2] Row 1-2<br>
+<br>
+ movd m1, [r0 + r1]<br>
+ punpcklwd m2, m1 ;m2=[2 3]<br>
+<br>
+ lea r0, [r0 + 2 * r1]<br>
+ movd m3, [r0]<br>
+ punpcklwd m1, m3 ;m1=[3 4]<br>
+ punpcklqdq m2, m1 ;m2=[2 3 3 4]<br>
+<br>
+ pmaddwd m4, m2, [%1 + 1 * 16] ;m4=[2+3 3+4] Row 1-2<br>
+ pmaddwd m2, [%1 + 0 * 16] ;m2=[2+3 3+4] Row 3-4<br>
+ paddd m0, m4 ;m0=[0+1+2+3 1+2+3+4] Row 1-2 done<br>
+<br>
+ movd m1, [r0 + r1]<br>
+ punpcklwd m3, m1 ;m3=[4 5]<br>
+<br>
+ movd m4, [r0 + 2 * r1]<br>
+ punpcklwd m1, m4 ;m1=[5 6]<br>
+ punpcklqdq m3, m1 ;m3=[4 5 5 6]<br>
+ pmaddwd m3, [%1 + 1 * 16] ;m3=[4+5 5+6] Row 3-4<br>
+ paddd m2, m3 ;m2=[2+3+4+5 3+4+5+6] Row 3-4 done<br>
+%endmacro<br>
+<br>
+;-------------------------------------------------------------------------------------------------------------------<br>
+; void interp_4tap_vert_sp_%1x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-------------------------------------------------------------------------------------------------------------------<br>
+%macro FILTER_VER_CHROMA_SP_W2_4R 2<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_vert_sp_%1x%2, 5, 6, 6<br>
+<br>
+ add r1d, r1d<br>
+ sub r0, r1<br>
+ shl r4d, 5<br>
+<br>
+%ifdef PIC<br>
+ lea r5, [tab_ChromaCoeffV]<br>
+ lea r5, [r5 + r4]<br>
+%else<br>
+ lea r5, [tab_ChromaCoeffV + r4]<br>
+%endif<br>
+<br>
+ mova m5, [tab_c_526336]<br>
+<br>
+ mov r4d, (%2/4)<br>
+<br>
+.loopH:<br>
+ PROCESS_CHROMA_SP_W2_4R r5<br>
+<br>
+ paddd m0, m5<br>
+ paddd m2, m5<br>
+<br>
+ psrad m0, 12<br>
+ psrad m2, 12<br>
+<br>
+ packssdw m0, m2<br>
+ packuswb m0, m0<br>
+<br>
+ pextrw [r2], m0, 0<br>
+ pextrw [r2 + r3], m0, 1<br>
+ lea r2, [r2 + 2 * r3]<br>
+ pextrw [r2], m0, 2<br>
+ pextrw [r2 + r3], m0, 3<br>
+<br>
+ lea r2, [r2 + 2 * r3]<br>
+<br>
+ dec r4d<br>
+ jnz .loopH<br>
+<br>
+ RET<br>
+%endmacro<br>
+<br>
+FILTER_VER_CHROMA_SP_W2_4R 2, 4<br>
+FILTER_VER_CHROMA_SP_W2_4R 2, 8<br>
+<br>
+FILTER_VER_CHROMA_SP_W2_4R 2, 16<br>
+<br>
+;--------------------------------------------------------------------------------------------------------------<br>
+; void interp_4tap_vert_sp_4x2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;--------------------------------------------------------------------------------------------------------------<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_vert_sp_4x2, 5, 6, 5<br>
+<br>
+ add r1d, r1d<br>
+ sub r0, r1<br>
+ shl r4d, 5<br>
+<br>
+%ifdef PIC<br>
+ lea r5, [tab_ChromaCoeffV]<br>
+ lea r5, [r5 + r4]<br>
+%else<br>
+ lea r5, [tab_ChromaCoeffV + r4]<br>
+%endif<br>
+<br>
+ mova m4, [tab_c_526336]<br>
+<br>
+ movq m0, [r0]<br>
+ movq m1, [r0 + r1]<br>
+ punpcklwd m0, m1 ;m0=[0 1]<br>
+ pmaddwd m0, [r5 + 0 *16] ;m0=[0+1] Row1<br>
+<br>
+ lea r0, [r0 + 2 * r1]<br>
+ movq m2, [r0]<br>
+ punpcklwd m1, m2 ;m1=[1 2]<br>
+ pmaddwd m1, [r5 + 0 *16] ;m1=[1+2] Row2<br>
+<br>
+ movq m3, [r0 + r1]<br>
+ punpcklwd m2, m3 ;m2=[2 3]<br>
+ pmaddwd m2, [r5 + 1 * 16]<br>
+ paddd m0, m2 ;m0=[0+1+2+3] Row1 done<br>
+ paddd m0, m4<br>
+ psrad m0, 12<br>
+<br>
+ movq m2, [r0 + 2 * r1]<br>
+ punpcklwd m3, m2 ;m3=[3 4]<br>
+ pmaddwd m3, [r5 + 1 * 16]<br>
+ paddd m1, m3 ;m1 = [1+2+3+4] Row2 done<br>
+ paddd m1, m4<br>
+ psrad m1, 12<br>
+<br>
+ packssdw m0, m1<br>
+ packuswb m0, m0<br>
+<br>
+ movd [r2], m0<br>
+ pextrd [r2 + r3], m0, 1<br>
+<br>
+ RET<br>
+<br>
+;-------------------------------------------------------------------------------------------------------------------<br>
+; void interp_4tap_vert_sp_6x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;-------------------------------------------------------------------------------------------------------------------<br>
+%macro FILTER_VER_CHROMA_SP_W6_H4 2<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_vert_sp_6x%2, 5, 7, 7<br>
+<br>
+ add r1d, r1d<br>
+ sub r0, r1<br>
+ shl r4d, 5<br>
+<br>
+%ifdef PIC<br>
+ lea r5, [tab_ChromaCoeffV]<br>
+ lea r6, [r5 + r4]<br>
+%else<br>
+ lea r6, [tab_ChromaCoeffV + r4]<br>
+%endif<br>
+<br>
+ mova m6, [tab_c_526336]<br>
+<br>
+ mov r4d, %2/4<br>
+<br>
+.loopH:<br>
+ PROCESS_CHROMA_SP_W4_4R<br>
+<br>
+ paddd m0, m6<br>
+ paddd m1, m6<br>
+ paddd m2, m6<br>
+ paddd m3, m6<br>
+<br>
+ psrad m0, 12<br>
+ psrad m1, 12<br>
+ psrad m2, 12<br>
+ psrad m3, 12<br>
+<br>
+ packssdw m0, m1<br>
+ packssdw m2, m3<br>
+<br>
+ packuswb m0, m2<br>
+<br>
+ movd [r2], m0<br>
+ pextrd [r2 + r3], m0, 1<br>
+ lea r5, [r2 + 2 * r3]<br>
+ pextrd [r5], m0, 2<br>
+ pextrd [r5 + r3], m0, 3<br>
+<br>
+ lea r5, [4 * r1 - 2 * 4]<br>
+ sub r0, r5<br>
+ add r2, 4<br>
+<br>
+ PROCESS_CHROMA_SP_W2_4R r6<br>
+<br>
+ paddd m0, m6<br>
+ paddd m2, m6<br>
+<br>
+ psrad m0, 12<br>
+ psrad m2, 12<br>
+<br>
+ packssdw m0, m2<br>
+ packuswb m0, m0<br>
+<br>
+ pextrw [r2], m0, 0<br>
+ pextrw [r2 + r3], m0, 1<br>
+ lea r2, [r2 + 2 * r3]<br>
+ pextrw [r2], m0, 2<br>
+ pextrw [r2 + r3], m0, 3<br>
+<br>
+ sub r0, 2 * 4<br>
+ lea r2, [r2 + 2 * r3 - 4]<br>
+<br>
+ dec r4d<br>
+ jnz .loopH<br>
+<br>
+ RET<br>
+%endmacro<br>
+<br>
+FILTER_VER_CHROMA_SP_W6_H4 6, 8<br>
+<br>
+FILTER_VER_CHROMA_SP_W6_H4 6, 16<br>
+<br>
+; PROCESS_CHROMA_SP_W8_2R: 4-tap vertical dword sums for 2 output rows of 8 int16 columns.<br>
+; In: r0 = first source row, r1 = source stride in bytes, r5 = two 16-byte tap-pair vectors.<br>
+; Out: m0/m1 = Row1 low/high 4 columns, m2/m3 = Row2 low/high 4 columns (no rounding or shift applied).<br>
+; Clobbers m4, m5, m6. r0 is advanced by two rows.<br>
+%macro PROCESS_CHROMA_SP_W8_2R 0<br>
+ movu m1, [r0]<br>
+ movu m3, [r0 + r1]<br>
+ punpcklwd m0, m1, m3<br>
+ pmaddwd m0, [r5 + 0 * 16] ;m0 = [0l+1l] Row1l<br>
+ punpckhwd m1, m3<br>
+ pmaddwd m1, [r5 + 0 * 16] ;m1 = [0h+1h] Row1h<br>
+<br>
+ movu m4, [r0 + 2 * r1]<br>
+ punpcklwd m2, m3, m4<br>
+ pmaddwd m2, [r5 + 0 * 16] ;m2 = [1l+2l] Row2l<br>
+ punpckhwd m3, m4<br>
+ pmaddwd m3, [r5 + 0 * 16] ;m3 = [1h+2h] Row2h<br>
+<br>
+ lea r0, [r0 + 2 * r1]<br>
+ movu m5, [r0 + r1]<br>
+ punpcklwd m6, m4, m5<br>
+ pmaddwd m6, [r5 + 1 * 16] ;m6 = [2l+3l] Row1l<br>
+ paddd m0, m6 ;m0 = [0l+1l+2l+3l] Row1l sum<br>
+ punpckhwd m4, m5<br>
+ pmaddwd m4, [r5 + 1 * 16] ;m4 = [2h+3h] Row1h<br>
+ paddd m1, m4 ;m1 = [0h+1h+2h+3h] Row1h sum<br>
+<br>
+ movu m4, [r0 + 2 * r1]<br>
+ punpcklwd m6, m5, m4<br>
+ pmaddwd m6, [r5 + 1 * 16] ;m6 = [3l+4l] Row2l<br>
+ paddd m2, m6 ;m2 = [1l+2l+3l+4l] Row2l sum<br>
+ punpckhwd m5, m4<br>
+ pmaddwd m5, [r5 + 1 * 16] ;m5 = [3h+4h] Row2h<br>
+ paddd m3, m5 ;m3 = [1h+2h+3h+4h] Row2h sum<br>
+%endmacro<br>
+<br>
+;--------------------------------------------------------------------------------------------------------------<br>
+; void interp_4tap_vert_sp_8x%2(int16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)<br>
+;<br>
+; Macro arguments: %1 = width used in the symbol name (the body always handles 8 columns), %2 = height.<br>
+; The loop runs %2/2 times and writes two 8-byte rows per pass.<br>
+;--------------------------------------------------------------------------------------------------------------<br>
+%macro FILTER_VER_CHROMA_SP_W8_H2 2<br>
+INIT_XMM sse2<br>
+cglobal interp_4tap_vert_sp_%1x%2, 5, 6, 8<br>
+<br>
+ ; srcStride is doubled because the source elements are 2-byte int16_t values<br>
+ add r1d, r1d<br>
+ sub r0, r1<br>
+ ; coeffIdx * 32 selects the coefficient set inside tab_ChromaCoeffV<br>
+ shl r4d, 5<br>
+<br>
+%ifdef PIC<br>
+ lea r5, [tab_ChromaCoeffV]<br>
+ lea r5, [r5 + r4]<br>
+%else<br>
+ lea r5, [tab_ChromaCoeffV + r4]<br>
+%endif<br>
+<br>
+ ; m7 = constant added to every sum before the shift by 12<br>
+ mova m7, [tab_c_526336]<br>
+<br>
+ mov r4d, %2/2<br>
+.loopH:<br>
+ ; leaves Row1 in m0/m1 and Row2 in m2/m3, and advances r0 by two rows<br>
+ PROCESS_CHROMA_SP_W8_2R<br>
+<br>
+ paddd m0, m7<br>
+ paddd m1, m7<br>
+ paddd m2, m7<br>
+ paddd m3, m7<br>
+<br>
+ psrad m0, 12<br>
+ psrad m1, 12<br>
+ psrad m2, 12<br>
+ psrad m3, 12<br>
+<br>
+ packssdw m0, m1<br>
+ packssdw m2, m3<br>
+<br>
+ packuswb m0, m2<br>
+<br>
+ ; low 8 bytes of m0 = Row1, high 8 bytes = Row2<br>
+ movlps [r2], m0<br>
+ movhps [r2 + r3], m0<br>
+<br>
+ lea r2, [r2 + 2 * r3]<br>
+<br>
+ dec r4d<br>
+ jnz .loopH<br>
+<br>
+ RET<br>
+%endmacro<br>
+<br>
+FILTER_VER_CHROMA_SP_W8_H2 8, 2<br>
+FILTER_VER_CHROMA_SP_W8_H2 8, 4<br>
+FILTER_VER_CHROMA_SP_W8_H2 8, 6<br>
+FILTER_VER_CHROMA_SP_W8_H2 8, 8<br>
+FILTER_VER_CHROMA_SP_W8_H2 8, 16<br>
+FILTER_VER_CHROMA_SP_W8_H2 8, 32<br>
+<br>
+FILTER_VER_CHROMA_SP_W8_H2 8, 12<br>
+FILTER_VER_CHROMA_SP_W8_H2 8, 64<br>
+<br>
+<br>
+;-----------------------------------------------------------------------------------------------------------------------------<br>
+; void interp_4tap_horiz_ps_2x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)<br>
+;<br>
+; Horizontal 4-tap filter, byte pixels in, int16_t out. Macro arguments: %1 = width (symbol name only), %2 = height.<br>
+; Two int16_t results are stored per row. When isRowExt is non-zero the loop starts one row above src<br>
+; and runs for %2 + 3 rows.<br>
+; tab_ChromaCoeff, tab_Tm and pw_2000 are defined outside this chunk.<br>
+;-----------------------------------------------------------------------------------------------------------------------------<br>
+%macro FILTER_HORIZ_CHROMA_2xN 2<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 4, src, srcstride, dst, dststride<br>
+%define coef2 m3<br>
+%define Tm0 m2<br>
+%define t1 m1<br>
+%define t0 m0<br>
+<br>
+ ; start one pixel to the left of src<br>
+ dec srcq<br>
+ mov r4d, r4m<br>
+ ; dstStride is doubled because the destination elements are 2-byte int16_t values<br>
+ add dststrided, dststrided<br>
+<br>
+ ; load the 4 bytes at tab_ChromaCoeff + coeffIdx * 4<br>
+%ifdef PIC<br>
+ lea r6, [tab_ChromaCoeff]<br>
+ movd coef2, [r6 + r4 * 4]<br>
+%else<br>
+ movd coef2, [tab_ChromaCoeff + r4 * 4]<br>
+%endif<br>
+<br>
+ ; replicate that dword into all four dwords of coef2<br>
+ pshufd coef2, coef2, 0<br>
+ mova t1, [pw_2000]<br>
+ mova Tm0, [tab_Tm]<br>
+<br>
+ ; r4d is reused as the row counter<br>
+ mov r4d, %2<br>
+ cmp r5m, byte 0<br>
+ je .loopH<br>
+ sub srcq, srcstrideq<br>
+ add r4d, 3<br>
+<br>
+.loopH:<br>
+ ; load 8 source bytes, rearrange them with the tab_Tm shuffle, multiply byte pairs by the<br>
+ ; coefficients and add adjacent words to complete each 4-tap sum, then subtract [pw_2000]<br>
+ movh t0, [srcq]<br>
+ pshufb t0, t0, Tm0<br>
+ pmaddubsw t0, coef2<br>
+ phaddw t0, t0<br>
+ psubw t0, t1<br>
+ ; store the first two int16_t results<br>
+ movd [dstq], t0<br>
+<br>
+ lea srcq, [srcq + srcstrideq]<br>
+ lea dstq, [dstq + dststrideq]<br>
+<br>
+ dec r4d<br>
+ jnz .loopH<br>
+<br>
+ RET<br>
+%endmacro<br>
+<br>
+FILTER_HORIZ_CHROMA_2xN 2, 4<br>
+FILTER_HORIZ_CHROMA_2xN 2, 8<br>
+<br>
+FILTER_HORIZ_CHROMA_2xN 2, 16<br>
+<br>
+;-----------------------------------------------------------------------------------------------------------------------------<br>
+; void interp_4tap_horiz_ps_4x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)<br>
+;<br>
+; Horizontal 4-tap filter, byte pixels in, int16_t out. Macro arguments: %1 = width (symbol name only), %2 = height.<br>
+; Four int16_t results are stored per row. When isRowExt is non-zero the loop starts one row above src<br>
+; and runs for %2 + 3 rows.<br>
+; tab_ChromaCoeff, tab_Tm and pw_2000 are defined outside this chunk.<br>
+;-----------------------------------------------------------------------------------------------------------------------------<br>
+%macro FILTER_HORIZ_CHROMA_4xN 2<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 4, src, srcstride, dst, dststride<br>
+%define coef2 m3<br>
+%define Tm0 m2<br>
+%define t1 m1<br>
+%define t0 m0<br>
+<br>
+ ; start one pixel to the left of src<br>
+ dec srcq<br>
+ mov r4d, r4m<br>
+ ; dstStride is doubled because the destination elements are 2-byte int16_t values<br>
+ add dststrided, dststrided<br>
+<br>
+ ; load the 4 bytes at tab_ChromaCoeff + coeffIdx * 4<br>
+%ifdef PIC<br>
+ lea r6, [tab_ChromaCoeff]<br>
+ movd coef2, [r6 + r4 * 4]<br>
+%else<br>
+ movd coef2, [tab_ChromaCoeff + r4 * 4]<br>
+%endif<br>
+<br>
+ ; replicate that dword into all four dwords of coef2<br>
+ pshufd coef2, coef2, 0<br>
+ mova t1, [pw_2000]<br>
+ mova Tm0, [tab_Tm]<br>
+<br>
+ ; r4d is reused as the row counter<br>
+ mov r4d, %2<br>
+ cmp r5m, byte 0<br>
+ je .loopH<br>
+ sub srcq, srcstrideq<br>
+ add r4d, 3<br>
+<br>
+.loopH:<br>
+ ; same per-row sequence as the 2xN variant; only the store width differs<br>
+ movh t0, [srcq]<br>
+ pshufb t0, t0, Tm0<br>
+ pmaddubsw t0, coef2<br>
+ phaddw t0, t0<br>
+ psubw t0, t1<br>
+ ; store four int16_t results (8 bytes)<br>
+ movlps [dstq], t0<br>
+<br>
+ lea srcq, [srcq + srcstrideq]<br>
+ lea dstq, [dstq + dststrideq]<br>
+<br>
+ dec r4d<br>
+ jnz .loopH<br>
+ RET<br>
+%endmacro<br>
+<br>
+FILTER_HORIZ_CHROMA_4xN 4, 2<br>
+FILTER_HORIZ_CHROMA_4xN 4, 4<br>
+FILTER_HORIZ_CHROMA_4xN 4, 8<br>
+FILTER_HORIZ_CHROMA_4xN 4, 16<br>
+<br>
+FILTER_HORIZ_CHROMA_4xN 4, 32<br>
+<br>
+; PROCESS_CHROMA_W6: one row of the horizontal 4-tap filter, 6 int16_t results written to dstq.<br>
+; %1 = scratch register (source bytes), %2 = result register, %3 = word vector subtracted from the sums.<br>
+; Uses srcq, dstq, Tm0, Tm1 and coef2 set up by the enclosing function.<br>
+%macro PROCESS_CHROMA_W6 3<br>
+ movu %1, [srcq]<br>
+ pshufb %2, %1, Tm0<br>
+ pmaddubsw %2, coef2<br>
+ pshufb %1, %1, Tm1<br>
+ pmaddubsw %1, coef2<br>
+ ; low 4 words come from the Tm0 half, high 4 words from the Tm1 half<br>
+ phaddw %2, %1<br>
+ psubw %2, %3<br>
+ ; results 0-3 first, then dword 2 (results 4-5) is moved down and stored at byte offset 8<br>
+ movh [dstq], %2<br>
+ pshufd %2, %2, 2<br>
+ movd [dstq + 8], %2<br>
+%endmacro<br>
+<br>
+; PROCESS_CHROMA_W12: one row of the horizontal 4-tap filter, 12 int16_t results written to dstq.<br>
+; %1 = scratch register (source bytes), %2 = result register, %3 = word vector subtracted from the sums.<br>
+; Uses srcq, dstq, Tm0, Tm1 and coef2 set up by the enclosing function.<br>
+%macro PROCESS_CHROMA_W12 3<br>
+ ; results 0-7<br>
+ movu %1, [srcq]<br>
+ pshufb %2, %1, Tm0<br>
+ pmaddubsw %2, coef2<br>
+ pshufb %1, %1, Tm1<br>
+ pmaddubsw %1, coef2<br>
+ phaddw %2, %1<br>
+ psubw %2, %3<br>
+ movu [dstq], %2<br>
+ ; results 8-11: only the Tm0 shuffle is needed, and only the low 8 bytes are stored<br>
+ movu %1, [srcq + 8]<br>
+ pshufb %1, %1, Tm0<br>
+ pmaddubsw %1, coef2<br>
+ phaddw %1, %1<br>
+ psubw %1, %3<br>
+ movh [dstq + 16], %1<br>
+%endmacro<br>
+<br>
+;-----------------------------------------------------------------------------------------------------------------------------<br>
+; void interp_4tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)<br>
+;<br>
+; Horizontal 4-tap filter, byte pixels in, int16_t out. Macro arguments: %1 = width, %2 = height.<br>
+; %1 selects the per-row helper PROCESS_CHROMA_W%1 (instantiated below for widths 6 and 12).<br>
+; When isRowExt is non-zero the loop starts one row above src and runs for %2 + 3 rows.<br>
+; tab_ChromaCoeff, tab_Tm and pw_2000 are defined outside this chunk.<br>
+;-----------------------------------------------------------------------------------------------------------------------------<br>
+%macro FILTER_HORIZ_CHROMA 2<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 6, src, srcstride, dst, dststride<br>
+%define coef2 m5<br>
+%define Tm0 m4<br>
+%define Tm1 m3<br>
+%define t2 m2<br>
+%define t1 m1<br>
+%define t0 m0<br>
+<br>
+ ; start one pixel to the left of src<br>
+ dec srcq<br>
+ mov r4d, r4m<br>
+ ; dstStride is doubled because the destination elements are 2-byte int16_t values<br>
+ add dststrided, dststrided<br>
+<br>
+ ; load the 4 bytes at tab_ChromaCoeff + coeffIdx * 4<br>
+%ifdef PIC<br>
+ lea r6, [tab_ChromaCoeff]<br>
+ movd coef2, [r6 + r4 * 4]<br>
+%else<br>
+ movd coef2, [tab_ChromaCoeff + r4 * 4]<br>
+%endif<br>
+<br>
+ ; replicate that dword into all four dwords of coef2<br>
+ pshufd coef2, coef2, 0<br>
+ mova t2, [pw_2000]<br>
+ mova Tm0, [tab_Tm]<br>
+ mova Tm1, [tab_Tm + 16]<br>
+<br>
+ ; r4d is reused as the row counter<br>
+ mov r4d, %2<br>
+ cmp r5m, byte 0<br>
+ je .loopH<br>
+ sub srcq, srcstrideq<br>
+ add r4d, 3<br>
+<br>
+.loopH:<br>
+ ; t0 = scratch, t1 = result, t2 = value subtracted from each sum<br>
+ PROCESS_CHROMA_W%1 t0, t1, t2<br>
+ add srcq, srcstrideq<br>
+ add dstq, dststrideq<br>
+<br>
+ dec r4d<br>
+ jnz .loopH<br>
+<br>
+ RET<br>
+%endmacro<br>
+<br>
+FILTER_HORIZ_CHROMA 6, 8<br>
+FILTER_HORIZ_CHROMA 12, 16<br>
+<br>
+FILTER_HORIZ_CHROMA 6, 16<br>
+FILTER_HORIZ_CHROMA 12, 32<br>
+<br>
+; PROCESS_CHROMA_W8: one row of the horizontal 4-tap filter, 8 int16_t results written to dstq.<br>
+; %1 = scratch register (source bytes), %2 = result register, %3 = word vector subtracted from the sums.<br>
+; Uses srcq, dstq, Tm0, Tm1 and coef2 set up by the enclosing function.<br>
+%macro PROCESS_CHROMA_W8 3<br>
+ movu %1, [srcq]<br>
+ pshufb %2, %1, Tm0<br>
+ pmaddubsw %2, coef2<br>
+ pshufb %1, %1, Tm1<br>
+ pmaddubsw %1, coef2<br>
+ ; low 4 words come from the Tm0 half, high 4 words from the Tm1 half<br>
+ phaddw %2, %1<br>
+ psubw %2, %3<br>
+ movu [dstq], %2<br>
+%endmacro<br>
+<br>
+;-----------------------------------------------------------------------------------------------------------------------------<br>
+; void interp_4tap_horiz_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)<br>
+;<br>
+; Horizontal 4-tap filter, byte pixels in, int16_t out. Macro arguments: %1 = width (symbol name only), %2 = height.<br>
+; Each row is produced by PROCESS_CHROMA_W8. When isRowExt is non-zero the loop starts one row above src<br>
+; and runs for %2 + 3 rows.<br>
+; tab_ChromaCoeff, tab_Tm and pw_2000 are defined outside this chunk.<br>
+;-----------------------------------------------------------------------------------------------------------------------------<br>
+%macro FILTER_HORIZ_CHROMA_8xN 2<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 6, src, srcstride, dst, dststride<br>
+%define coef2 m5<br>
+%define Tm0 m4<br>
+%define Tm1 m3<br>
+%define t2 m2<br>
+%define t1 m1<br>
+%define t0 m0<br>
+<br>
+ ; start one pixel to the left of src<br>
+ dec srcq<br>
+ mov r4d, r4m<br>
+ ; dstStride is doubled because the destination elements are 2-byte int16_t values<br>
+ add dststrided, dststrided<br>
+<br>
+ ; load the 4 bytes at tab_ChromaCoeff + coeffIdx * 4<br>
+%ifdef PIC<br>
+ lea r6, [tab_ChromaCoeff]<br>
+ movd coef2, [r6 + r4 * 4]<br>
+%else<br>
+ movd coef2, [tab_ChromaCoeff + r4 * 4]<br>
+%endif<br>
+<br>
+ ; replicate that dword into all four dwords of coef2<br>
+ pshufd coef2, coef2, 0<br>
+ mova t2, [pw_2000]<br>
+ mova Tm0, [tab_Tm]<br>
+ mova Tm1, [tab_Tm + 16]<br>
+<br>
+ ; r4d is reused as the row counter<br>
+ mov r4d, %2<br>
+ cmp r5m, byte 0<br>
+ je .loopH<br>
+ sub srcq, srcstrideq<br>
+ add r4d, 3<br>
+<br>
+.loopH:<br>
+ ; t0 = scratch, t1 = result, t2 = value subtracted from each sum<br>
+ PROCESS_CHROMA_W8 t0, t1, t2<br>
+ add srcq, srcstrideq<br>
+ add dstq, dststrideq<br>
+<br>
+ dec r4d<br>
+ jnz .loopH<br>
+<br>
+ RET<br>
+%endmacro<br>
+<br>
+FILTER_HORIZ_CHROMA_8xN 8, 2<br>
+FILTER_HORIZ_CHROMA_8xN 8, 4<br>
+FILTER_HORIZ_CHROMA_8xN 8, 6<br>
+FILTER_HORIZ_CHROMA_8xN 8, 8<br>
+FILTER_HORIZ_CHROMA_8xN 8, 16<br>
+FILTER_HORIZ_CHROMA_8xN 8, 32<br>
+<br>
+FILTER_HORIZ_CHROMA_8xN 8, 12<br>
+FILTER_HORIZ_CHROMA_8xN 8, 64<br>
+<br>
+; PROCESS_CHROMA_W16: one row of the horizontal 4-tap filter, 16 int16_t results written to dstq.<br>
+; %1 = scratch register (source bytes), %2 and %4 = result registers, %3 = word vector subtracted from the sums.<br>
+; Uses srcq, dstq, Tm0, Tm1 and coef2 set up by the enclosing function.<br>
+%macro PROCESS_CHROMA_W16 4<br>
+ ; results 0-7 into %2<br>
+ movu %1, [srcq]<br>
+ pshufb %2, %1, Tm0<br>
+ pmaddubsw %2, coef2<br>
+ pshufb %1, %1, Tm1<br>
+ pmaddubsw %1, coef2<br>
+ phaddw %2, %1<br>
+ ; results 8-15 into %4<br>
+ movu %1, [srcq + 8]<br>
+ pshufb %4, %1, Tm0<br>
+ pmaddubsw %4, coef2<br>
+ pshufb %1, %1, Tm1<br>
+ pmaddubsw %1, coef2<br>
+ phaddw %4, %1<br>
+ psubw %2, %3<br>
+ psubw %4, %3<br>
+ movu [dstq], %2<br>
+ movu [dstq + 16], %4<br>
+%endmacro<br>
+<br>
+; PROCESS_CHROMA_W24: one row of the horizontal 4-tap filter, 24 int16_t results written to dstq.<br>
+; %1 = scratch register (source bytes), %2 and %4 = result registers, %3 = word vector subtracted from the sums.<br>
+; Uses srcq, dstq, Tm0, Tm1 and coef2 set up by the enclosing function.<br>
+%macro PROCESS_CHROMA_W24 4<br>
+ ; results 0-7 into %2<br>
+ movu %1, [srcq]<br>
+ pshufb %2, %1, Tm0<br>
+ pmaddubsw %2, coef2<br>
+ pshufb %1, %1, Tm1<br>
+ pmaddubsw %1, coef2<br>
+ phaddw %2, %1<br>
+ ; results 8-15 into %4<br>
+ movu %1, [srcq + 8]<br>
+ pshufb %4, %1, Tm0<br>
+ pmaddubsw %4, coef2<br>
+ pshufb %1, %1, Tm1<br>
+ pmaddubsw %1, coef2<br>
+ phaddw %4, %1<br>
+ psubw %2, %3<br>
+ psubw %4, %3<br>
+ movu [dstq], %2<br>
+ movu [dstq + 16], %4<br>
+ ; results 16-23, reusing %2<br>
+ movu %1, [srcq + 16]<br>
+ pshufb %2, %1, Tm0<br>
+ pmaddubsw %2, coef2<br>
+ pshufb %1, %1, Tm1<br>
+ pmaddubsw %1, coef2<br>
+ phaddw %2, %1<br>
+ psubw %2, %3<br>
+ movu [dstq + 32], %2<br>
+%endmacro<br>
+<br>
+; PROCESS_CHROMA_W32: one row of the horizontal 4-tap filter, 32 int16_t results written to dstq.<br>
+; %1 = scratch register (source bytes), %2 and %4 = result registers, %3 = word vector subtracted from the sums.<br>
+; Uses srcq, dstq, Tm0, Tm1 and coef2 set up by the enclosing function.<br>
+%macro PROCESS_CHROMA_W32 4<br>
+ ; results 0-7 into %2<br>
+ movu %1, [srcq]<br>
+ pshufb %2, %1, Tm0<br>
+ pmaddubsw %2, coef2<br>
+ pshufb %1, %1, Tm1<br>
+ pmaddubsw %1, coef2<br>
+ phaddw %2, %1<br>
+ ; results 8-15 into %4<br>
+ movu %1, [srcq + 8]<br>
+ pshufb %4, %1, Tm0<br>
+ pmaddubsw %4, coef2<br>
+ pshufb %1, %1, Tm1<br>
+ pmaddubsw %1, coef2<br>
+ phaddw %4, %1<br>
+ psubw %2, %3<br>
+ psubw %4, %3<br>
+ movu [dstq], %2<br>
+ movu [dstq + 16], %4<br>
+ ; results 16-23 into %2<br>
+ movu %1, [srcq + 16]<br>
+ pshufb %2, %1, Tm0<br>
+ pmaddubsw %2, coef2<br>
+ pshufb %1, %1, Tm1<br>
+ pmaddubsw %1, coef2<br>
+ phaddw %2, %1<br>
+ ; results 24-31 into %4<br>
+ movu %1, [srcq + 24]<br>
+ pshufb %4, %1, Tm0<br>
+ pmaddubsw %4, coef2<br>
+ pshufb %1, %1, Tm1<br>
+ pmaddubsw %1, coef2<br>
+ phaddw %4, %1<br>
+ psubw %2, %3<br>
+ psubw %4, %3<br>
+ movu [dstq + 32], %2<br>
+ movu [dstq + 48], %4<br>
+%endmacro<br>
+<br>
+; PROCESS_CHROMA_W16o: same work as PROCESS_CHROMA_W16 but at a column offset.<br>
+; %1 = scratch register (source bytes), %2 and %4 = result registers, %3 = word vector subtracted from the sums,<br>
+; %5 = column offset: added as-is to srcq (1 byte per pixel) and doubled for dstq (2 bytes per result).<br>
+; Uses srcq, dstq, Tm0, Tm1 and coef2 set up by the enclosing function.<br>
+%macro PROCESS_CHROMA_W16o 5<br>
+ ; results %5 .. %5+7 into %2<br>
+ movu %1, [srcq + %5]<br>
+ pshufb %2, %1, Tm0<br>
+ pmaddubsw %2, coef2<br>
+ pshufb %1, %1, Tm1<br>
+ pmaddubsw %1, coef2<br>
+ phaddw %2, %1<br>
+ ; results %5+8 .. %5+15 into %4<br>
+ movu %1, [srcq + %5 + 8]<br>
+ pshufb %4, %1, Tm0<br>
+ pmaddubsw %4, coef2<br>
+ pshufb %1, %1, Tm1<br>
+ pmaddubsw %1, coef2<br>
+ phaddw %4, %1<br>
+ psubw %2, %3<br>
+ psubw %4, %3<br>
+ movu [dstq + %5 * 2], %2<br>
+ movu [dstq + %5 * 2 + 16], %4<br>
+%endmacro<br>
+<br>
+; PROCESS_CHROMA_W48: one 48-column row, built from three 16-column strips at column offsets 0, 16 and 32.<br>
+; Arguments %1..%4 are forwarded unchanged to PROCESS_CHROMA_W16o.<br>
+%macro PROCESS_CHROMA_W48 4<br>
+%assign %%strip 0<br>
+%rep 3<br>
+ PROCESS_CHROMA_W16o %1, %2, %3, %4, %%strip<br>
+%assign %%strip %%strip + 16<br>
+%endrep<br>
+%endmacro<br>
+<br>
+; PROCESS_CHROMA_W64: one 64-column row, built from four 16-column strips at column offsets 0, 16, 32 and 48.<br>
+; Arguments %1..%4 are forwarded unchanged to PROCESS_CHROMA_W16o.<br>
+%macro PROCESS_CHROMA_W64 4<br>
+%assign %%strip 0<br>
+%rep 4<br>
+ PROCESS_CHROMA_W16o %1, %2, %3, %4, %%strip<br>
+%assign %%strip %%strip + 16<br>
+%endrep<br>
+%endmacro<br>
+<br>
+;------------------------------------------------------------------------------------------------------------------------------<br>
+; void interp_4tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)<br>
+;<br>
+; Horizontal 4-tap filter, byte pixels in, int16_t out. Macro arguments: %1 = width, %2 = height.<br>
+; %1 selects the per-row helper PROCESS_CHROMA_W%1 (instantiated below for widths 16, 24, 32, 48 and 64).<br>
+; When isRowExt is non-zero the loop starts one row above src and runs for %2 + 3 rows.<br>
+; tab_ChromaCoeff, tab_Tm and pw_2000 are defined outside this chunk.<br>
+;------------------------------------------------------------------------------------------------------------------------------<br>
+%macro FILTER_HORIZ_CHROMA_WxN 2<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 7, src, srcstride, dst, dststride<br>
+%define coef2 m6<br>
+%define Tm0 m5<br>
+%define Tm1 m4<br>
+%define t3 m3<br>
+%define t2 m2<br>
+%define t1 m1<br>
+%define t0 m0<br>
+<br>
+ ; start one pixel to the left of src<br>
+ dec srcq<br>
+ mov r4d, r4m<br>
+ ; dstStride is doubled because the destination elements are 2-byte int16_t values<br>
+ add dststrided, dststrided<br>
+<br>
+ ; load the 4 bytes at tab_ChromaCoeff + coeffIdx * 4<br>
+%ifdef PIC<br>
+ lea r6, [tab_ChromaCoeff]<br>
+ movd coef2, [r6 + r4 * 4]<br>
+%else<br>
+ movd coef2, [tab_ChromaCoeff + r4 * 4]<br>
+%endif<br>
+<br>
+ ; replicate that dword into all four dwords of coef2<br>
+ pshufd coef2, coef2, 0<br>
+ mova t2, [pw_2000]<br>
+ mova Tm0, [tab_Tm]<br>
+ mova Tm1, [tab_Tm + 16]<br>
+<br>
+ ; r4d is reused as the row counter<br>
+ mov r4d, %2<br>
+ cmp r5m, byte 0<br>
+ je .loopH<br>
+ sub srcq, srcstrideq<br>
+ add r4d, 3<br>
+<br>
+.loopH:<br>
+ ; t0 = scratch, t1 and t3 = results, t2 = value subtracted from each sum<br>
+ PROCESS_CHROMA_W%1 t0, t1, t2, t3<br>
+ add srcq, srcstrideq<br>
+ add dstq, dststrideq<br>
+<br>
+ dec r4d<br>
+ jnz .loopH<br>
+<br>
+ RET<br>
+%endmacro<br>
+<br>
+FILTER_HORIZ_CHROMA_WxN 16, 4<br>
+FILTER_HORIZ_CHROMA_WxN 16, 8<br>
+FILTER_HORIZ_CHROMA_WxN 16, 12<br>
+FILTER_HORIZ_CHROMA_WxN 16, 16<br>
+FILTER_HORIZ_CHROMA_WxN 16, 32<br>
+FILTER_HORIZ_CHROMA_WxN 24, 32<br>
+FILTER_HORIZ_CHROMA_WxN 32, 8<br>
+FILTER_HORIZ_CHROMA_WxN 32, 16<br>
+FILTER_HORIZ_CHROMA_WxN 32, 24<br>
+FILTER_HORIZ_CHROMA_WxN 32, 32<br>
+<br>
+FILTER_HORIZ_CHROMA_WxN 16, 24<br>
+FILTER_HORIZ_CHROMA_WxN 16, 64<br>
+FILTER_HORIZ_CHROMA_WxN 24, 64<br>
+FILTER_HORIZ_CHROMA_WxN 32, 48<br>
+FILTER_HORIZ_CHROMA_WxN 32, 64<br>
+<br>
+FILTER_HORIZ_CHROMA_WxN 64, 64<br>
+FILTER_HORIZ_CHROMA_WxN 64, 32<br>
+FILTER_HORIZ_CHROMA_WxN 64, 48<br>
+FILTER_HORIZ_CHROMA_WxN 48, 64<br>
+FILTER_HORIZ_CHROMA_WxN 64, 16<br>
+<br>
+<br>
+;---------------------------------------------------------------------------------------------------------------<br>
+; void interp_4tap_vert_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)<br>
+;<br>
+; Vertical 4-tap filter, byte pixels in, int16_t out. Macro arguments: %1 = width, %2 = height.<br>
+; The outer loop runs %2/2 times (two output rows per pass); the inner loop runs %1/16 times (16 columns per pass).<br>
+; Registers: r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride in bytes, r4d = row-pair counter,<br>
+; r5 = scratch row pointer, r6d = column-strip counter.<br>
+; tab_ChromaCoeff, tab_Vm and pw_2000 are defined outside this chunk.<br>
+;---------------------------------------------------------------------------------------------------------------<br>
+%macro FILTER_V_PS_W16n 2<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_vert_ps_%1x%2, 4, 7, 8<br>
+<br>
+ mov r4d, r4m<br>
+ ; start one row above src<br>
+ sub r0, r1<br>
+ ; dstStride is doubled because the destination elements are 2-byte int16_t values<br>
+ add r3d, r3d<br>
+<br>
+%ifdef PIC<br>
+ lea r5, [tab_ChromaCoeff]<br>
+ movd m0, [r5 + r4 * 4]<br>
+%else<br>
+ movd m0, [tab_ChromaCoeff + r4 * 4]<br>
+%endif<br>
+<br>
+ ; m1 is applied to interleaved rows 0/1 and m0 to interleaved rows 2/3 below, so the two tab_Vm<br>
+ ; shuffles presumably replicate the first and second coefficient pair - confirm against the table<br>
+ pshufb m1, m0, [tab_Vm]<br>
+ pshufb m0, [tab_Vm + 16]<br>
+ mov r4d, %2/2<br>
+<br>
+.loop:<br>
+<br>
+ mov r6d, %1/16<br>
+<br>
+.loopW:<br>
+<br>
+ ; first output row: source rows 0..3 of this strip<br>
+ movu m2, [r0]<br>
+ movu m3, [r0 + r1]<br>
+<br>
+ punpcklbw m4, m2, m3<br>
+ punpckhbw m2, m3<br>
+<br>
+ pmaddubsw m4, m1<br>
+ pmaddubsw m2, m1<br>
+<br>
+ lea r5, [r0 + 2 * r1]<br>
+ movu m5, [r5]<br>
+ movu m7, [r5 + r1]<br>
+<br>
+ punpcklbw m6, m5, m7<br>
+ pmaddubsw m6, m0<br>
+ paddw m4, m6<br>
+<br>
+ punpckhbw m6, m5, m7<br>
+ pmaddubsw m6, m0<br>
+ paddw m2, m6<br>
+<br>
+ ; m6 was used as scratch above, so the constant is loaded here on every pass<br>
+ mova m6, [pw_2000]<br>
+<br>
+ psubw m4, m6<br>
+ psubw m2, m6<br>
+<br>
+ movu [r2], m4<br>
+ movu [r2 + 16], m2<br>
+<br>
+ ; second output row: source rows 1..4 (rows 1, 2 and 3 are still in m3, m5 and m7)<br>
+ punpcklbw m4, m3, m5<br>
+ punpckhbw m3, m5<br>
+<br>
+ pmaddubsw m4, m1<br>
+ pmaddubsw m3, m1<br>
+<br>
+ movu m5, [r5 + 2 * r1]<br>
+<br>
+ punpcklbw m2, m7, m5<br>
+ punpckhbw m7, m5<br>
+<br>
+ pmaddubsw m2, m0<br>
+ pmaddubsw m7, m0<br>
+<br>
+ paddw m4, m2<br>
+ paddw m3, m7<br>
+<br>
+ psubw m4, m6<br>
+ psubw m3, m6<br>
+<br>
+ movu [r2 + r3], m4<br>
+ movu [r2 + r3 + 16], m3<br>
+<br>
+ ; next strip: 16 source bytes and 32 destination bytes to the right<br>
+ add r0, 16<br>
+ add r2, 32<br>
+ dec r6d<br>
+ jnz .loopW<br>
+<br>
+ ; back to column 0 and down two rows<br>
+ lea r0, [r0 + r1 * 2 - %1]<br>
+ lea r2, [r2 + r3 * 2 - %1 * 2]<br>
+<br>
+ dec r4d<br>
+ jnz .loop<br>
+ RET<br>
+%endmacro<br>
+<br>
+FILTER_V_PS_W16n 64, 64<br>
+FILTER_V_PS_W16n 64, 32<br>
+FILTER_V_PS_W16n 64, 48<br>
+FILTER_V_PS_W16n 48, 64<br>
+FILTER_V_PS_W16n 64, 16<br>
+<br>
+<br>
+;------------------------------------------------------------------------------------------------------------<br>
+;void interp_4tap_vert_ps_2x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)<br>
+;<br>
+; Vertical 4-tap filter producing a 2x4 block: byte pixels in, int16_t out.<br>
+; Seven source rows are read, starting one row above src. Four bytes are loaded from each row, the bytes of<br>
+; four consecutive rows are interleaved, multiplied by the coefficients with pmaddubsw and combined with<br>
+; phaddw; [pw_2000] is subtracted and only the first two results of each row are stored.<br>
+; tab_ChromaCoeff, tab_Cm and pw_2000 are defined outside this chunk.<br>
+;------------------------------------------------------------------------------------------------------------<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_vert_ps_2x4, 4, 6, 7<br>
+<br>
+ mov r4d, r4m<br>
+ ; start one row above src<br>
+ sub r0, r1<br>
+ ; dstStride is doubled because the destination elements are 2-byte int16_t values<br>
+ add r3d, r3d<br>
+<br>
+%ifdef PIC<br>
+ lea r5, [tab_ChromaCoeff]<br>
+ movd m0, [r5 + r4 * 4]<br>
+%else<br>
+ movd m0, [tab_ChromaCoeff + r4 * 4]<br>
+%endif<br>
+<br>
+ ; presumably reorders the 4 coefficients to match the row interleave built below - confirm against tab_Cm<br>
+ pshufb m0, [tab_Cm]<br>
+<br>
+ lea r5, [3 * r1]<br>
+<br>
+ ; m2..m5 = source rows 0..3<br>
+ movd m2, [r0]<br>
+ movd m3, [r0 + r1]<br>
+ movd m4, [r0 + 2 * r1]<br>
+ movd m5, [r0 + r5]<br>
+<br>
+ ; output row 0 from source rows 0..3<br>
+ punpcklbw m2, m3<br>
+ punpcklbw m6, m4, m5<br>
+ punpcklbw m2, m6<br>
+<br>
+ pmaddubsw m2, m0<br>
+<br>
+ lea r0, [r0 + 4 * r1]<br>
+ ; m6 = source row 4<br>
+ movd m6, [r0]<br>
+<br>
+ ; output row 1 from source rows 1..4<br>
+ punpcklbw m3, m4<br>
+ punpcklbw m1, m5, m6<br>
+ punpcklbw m3, m1<br>
+<br>
+ pmaddubsw m3, m0<br>
+ ; low 4 words = output row 0, high 4 words = output row 1<br>
+ phaddw m2, m3<br>
+<br>
+ mova m1, [pw_2000]<br>
+<br>
+ psubw m2, m1<br>
+<br>
+ ; dword 0 = first two results of row 0, dword 2 = first two results of row 1<br>
+ movd [r2], m2<br>
+ pextrd [r2 + r3], m2, 2<br>
+<br>
+ ; m2 = source row 5<br>
+ movd m2, [r0 + r1]<br>
+<br>
+ ; output row 2 from source rows 2..5<br>
+ punpcklbw m4, m5<br>
+ punpcklbw m3, m6, m2<br>
+ punpcklbw m4, m3<br>
+<br>
+ pmaddubsw m4, m0<br>
+<br>
+ ; m3 = source row 6<br>
+ movd m3, [r0 + 2 * r1]<br>
+<br>
+ ; output row 3 from source rows 3..6<br>
+ punpcklbw m5, m6<br>
+ punpcklbw m2, m3<br>
+ punpcklbw m5, m2<br>
+<br>
+ pmaddubsw m5, m0<br>
+ phaddw m4, m5<br>
+ psubw m4, m1<br>
+<br>
+ lea r2, [r2 + 2 * r3]<br>
+ movd [r2], m4<br>
+ pextrd [r2 + r3], m4, 2<br>
+<br>
+ RET<br>
+<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+; void interp_4tap_vert_ps_2x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)<br>
+;<br>
+; Vertical 4-tap filter, byte pixels in, int16_t out, 2 columns wide.<br>
+; Macro arguments: %1 is not referenced by the body (the width 2 is fixed in the symbol name), %2 = height.<br>
+; The loop runs %2/4 times and writes four output rows per pass, using the same row-interleave scheme as<br>
+; interp_4tap_vert_ps_2x4.<br>
+; tab_ChromaCoeff, tab_Cm and pw_2000 are defined outside this chunk.<br>
+;-------------------------------------------------------------------------------------------------------------<br>
+%macro FILTER_V_PS_W2 2<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_vert_ps_2x%2, 4, 6, 8<br>
+<br>
+ mov r4d, r4m<br>
+ ; start one row above src<br>
+ sub r0, r1<br>
+ ; dstStride is doubled because the destination elements are 2-byte int16_t values<br>
+ add r3d, r3d<br>
+<br>
+%ifdef PIC<br>
+ lea r5, [tab_ChromaCoeff]<br>
+ movd m0, [r5 + r4 * 4]<br>
+%else<br>
+ movd m0, [tab_ChromaCoeff + r4 * 4]<br>
+%endif<br>
+<br>
+ ; presumably reorders the 4 coefficients to match the row interleave built below - confirm against tab_Cm<br>
+ pshufb m0, [tab_Cm]<br>
+<br>
+ mova m1, [pw_2000]<br>
+ lea r5, [3 * r1]<br>
+ ; r4d is reused as the loop counter: one pass per group of 4 rows<br>
+ mov r4d, %2/4<br>
+.loop:<br>
+ ; m2..m5 = source rows 0..3 of this group<br>
+ movd m2, [r0]<br>
+ movd m3, [r0 + r1]<br>
+ movd m4, [r0 + 2 * r1]<br>
+ movd m5, [r0 + r5]<br>
+<br>
+ ; output row 0 from source rows 0..3<br>
+ punpcklbw m2, m3<br>
+ punpcklbw m6, m4, m5<br>
+ punpcklbw m2, m6<br>
+<br>
+ pmaddubsw m2, m0<br>
+<br>
+ ; r0 moves down four rows; this is also the start row of the next pass<br>
+ lea r0, [r0 + 4 * r1]<br>
+ movd m6, [r0]<br>
+<br>
+ ; output row 1 from source rows 1..4<br>
+ punpcklbw m3, m4<br>
+ punpcklbw m7, m5, m6<br>
+ punpcklbw m3, m7<br>
+<br>
+ pmaddubsw m3, m0<br>
+<br>
+ phaddw m2, m3<br>
+ psubw m2, m1<br>
+<br>
+<br>
+ ; dword 0 = first two results of row 0; dword 2 (row 1) is moved down before its store<br>
+ movd [r2], m2<br>
+ pshufd m2, m2, 2<br>
+ movd [r2 + r3], m2<br>
+<br>
+ movd m2, [r0 + r1]<br>
+<br>
+ ; output row 2 from source rows 2..5<br>
+ punpcklbw m4, m5<br>
+ punpcklbw m3, m6, m2<br>
+ punpcklbw m4, m3<br>
+<br>
+ pmaddubsw m4, m0<br>
+<br>
+ movd m3, [r0 + 2 * r1]<br>
+<br>
+ ; output row 3 from source rows 3..6<br>
+ punpcklbw m5, m6<br>
+ punpcklbw m2, m3<br>
+ punpcklbw m5, m2<br>
+<br>
+ pmaddubsw m5, m0<br>
+<br>
+ phaddw m4, m5<br>
+<br>
+ psubw m4, m1<br>
+<br>
+ lea r2, [r2 + 2 * r3]<br>
+ movd [r2], m4<br>
+ pshufd m4 , m4 ,2<br>
+ movd [r2 + r3], m4<br>
+<br>
+ lea r2, [r2 + 2 * r3]<br>
+<br>
+ dec r4d<br>
+ jnz .loop<br>
+<br>
+RET<br>
+%endmacro<br>
+<br>
+FILTER_V_PS_W2 2, 8<br>
+<br>
+FILTER_V_PS_W2 2, 16<br>
+<br>
+;-----------------------------------------------------------------------------------------------------------------<br>
+; void interp_4tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)<br>
+;<br>
+; Vertical 4-tap filter, int16_t in, int16_t out. Macro arguments: %1 = width, %2 = height.<br>
+; The outer loop runs %2/4 times and the inner loop %1/4 times, so each inner pass writes a 4x4 tile.<br>
+; Sums are shifted right arithmetically by 6 (no rounding constant is added) and packed with signed saturation.<br>
+; The row-group counter lives in the stack slot at [rsp] reserved by the cglobal line; r4d counts tiles per row group.<br>
+; PROCESS_CHROMA_SP_W4_4R is defined outside this chunk. Judging from the code that follows it, it leaves four<br>
+; rows of dword sums in m0..m3 and advances r0 by four rows - TODO confirm against its definition.<br>
+;-----------------------------------------------------------------------------------------------------------------<br>
+%macro FILTER_VER_CHROMA_SS 2<br>
+INIT_XMM sse2<br>
+cglobal interp_4tap_vert_ss_%1x%2, 5, 7, 6 ,0-gprsize<br>
+<br>
+ ; both strides are doubled because source and destination elements are 2-byte int16_t values<br>
+ add r1d, r1d<br>
+ add r3d, r3d<br>
+ sub r0, r1<br>
+ ; coeffIdx * 32 selects the coefficient set inside tab_ChromaCoeffV<br>
+ shl r4d, 5<br>
+<br>
+%ifdef PIC<br>
+ lea r5, [tab_ChromaCoeffV]<br>
+ lea r6, [r5 + r4]<br>
+%else<br>
+ lea r6, [tab_ChromaCoeffV + r4]<br>
+%endif<br>
+<br>
+ mov dword [rsp], %2/4<br>
+<br>
+.loopH:<br>
+ mov r4d, (%1/4)<br>
+.loopW:<br>
+ PROCESS_CHROMA_SP_W4_4R<br>
+<br>
+ psrad m0, 6<br>
+ psrad m1, 6<br>
+ psrad m2, 6<br>
+ psrad m3, 6<br>
+<br>
+ packssdw m0, m1<br>
+ packssdw m2, m3<br>
+<br>
+ ; m0 = rows 1 and 2, m2 = rows 3 and 4 (8 bytes each)<br>
+ movlps [r2], m0<br>
+ movhps [r2 + r3], m0<br>
+ lea r5, [r2 + 2 * r3]<br>
+ movlps [r5], m2<br>
+ movhps [r5 + r3], m2<br>
+<br>
+ ; move src back up by 4 rows and 8 bytes (4 columns) to the right; dst moves 8 bytes right<br>
+ lea r5, [4 * r1 - 2 * 4]<br>
+ sub r0, r5<br>
+ add r2, 2 * 4<br>
+<br>
+ dec r4d<br>
+ jnz .loopW<br>
+<br>
+ ; back to column 0 (2 * %1 bytes) and down four rows<br>
+ lea r0, [r0 + 4 * r1 - 2 * %1]<br>
+ lea r2, [r2 + 4 * r3 - 2 * %1]<br>
+<br>
+ dec dword [rsp]<br>
+ jnz .loopH<br>
+<br>
+ RET<br>
+%endmacro<br>
+<br>
+ FILTER_VER_CHROMA_SS 4, 4<br>
+ FILTER_VER_CHROMA_SS 4, 8<br>
+ FILTER_VER_CHROMA_SS 16, 16<br>
+ FILTER_VER_CHROMA_SS 16, 8<br>
+ FILTER_VER_CHROMA_SS 16, 12<br>
+ FILTER_VER_CHROMA_SS 12, 16<br>
+ FILTER_VER_CHROMA_SS 16, 4<br>
+ FILTER_VER_CHROMA_SS 4, 16<br>
+ FILTER_VER_CHROMA_SS 32, 32<br>
+ FILTER_VER_CHROMA_SS 32, 16<br>
+ FILTER_VER_CHROMA_SS 16, 32<br>
+ FILTER_VER_CHROMA_SS 32, 24<br>
+ FILTER_VER_CHROMA_SS 24, 32<br>
+ FILTER_VER_CHROMA_SS 32, 8<br>
+<br>
+ FILTER_VER_CHROMA_SS 16, 24<br>
+ FILTER_VER_CHROMA_SS 12, 32<br>
+ FILTER_VER_CHROMA_SS 4, 32<br>
+ FILTER_VER_CHROMA_SS 32, 64<br>
+ FILTER_VER_CHROMA_SS 16, 64<br>
+ FILTER_VER_CHROMA_SS 32, 48<br>
+ FILTER_VER_CHROMA_SS 24, 64<br>
+<br>
+ FILTER_VER_CHROMA_SS 64, 64<br>
+ FILTER_VER_CHROMA_SS 64, 32<br>
+ FILTER_VER_CHROMA_SS 64, 48<br>
+ FILTER_VER_CHROMA_SS 48, 64<br>
+ FILTER_VER_CHROMA_SS 64, 16<br>
+<br>
+<br>
+;---------------------------------------------------------------------------------------------------------------------<br>
+; void interp_4tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)<br>
+;<br>
+; Vertical 4-tap filter, int16_t in, int16_t out, 2 columns wide.<br>
+; Macro arguments: %1 = width (symbol name only), %2 = height. The loop runs %2/4 times, four rows per pass.<br>
+; PROCESS_CHROMA_SP_W2_4R is defined outside this chunk. Judging from the stores below it leaves the dword<br>
+; sums of rows 1-2 in m0 and rows 3-4 in m2, and presumably advances r0 by four rows since r0 is not<br>
+; adjusted here - TODO confirm against its definition.<br>
+;---------------------------------------------------------------------------------------------------------------------<br>
+%macro FILTER_VER_CHROMA_SS_W2_4R 2<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_vert_ss_%1x%2, 5, 6, 5<br>
+<br>
+ ; both strides are doubled because source and destination elements are 2-byte int16_t values<br>
+ add r1d, r1d<br>
+ add r3d, r3d<br>
+ sub r0, r1<br>
+ ; coeffIdx * 32 selects the coefficient set inside tab_ChromaCoeffV<br>
+ shl r4d, 5<br>
+<br>
+%ifdef PIC<br>
+ lea r5, [tab_ChromaCoeffV]<br>
+ lea r5, [r5 + r4]<br>
+%else<br>
+ lea r5, [tab_ChromaCoeffV + r4]<br>
+%endif<br>
+<br>
+ mov r4d, (%2/4)<br>
+<br>
+.loopH:<br>
+ PROCESS_CHROMA_SP_W2_4R r5<br>
+<br>
+ psrad m0, 6<br>
+ psrad m2, 6<br>
+<br>
+ packssdw m0, m2<br>
+<br>
+ ; each dword of m0 now holds the two int16_t results of one row<br>
+ movd [r2], m0<br>
+ pextrd [r2 + r3], m0, 1<br>
+ lea r2, [r2 + 2 * r3]<br>
+ pextrd [r2], m0, 2<br>
+ pextrd [r2 + r3], m0, 3<br>
+<br>
+ lea r2, [r2 + 2 * r3]<br>
+<br>
+ dec r4d<br>
+ jnz .loopH<br>
+<br>
+ RET<br>
+%endmacro<br>
+<br>
+FILTER_VER_CHROMA_SS_W2_4R 2, 4<br>
+FILTER_VER_CHROMA_SS_W2_4R 2, 8<br>
+<br>
+FILTER_VER_CHROMA_SS_W2_4R 2, 16<br>
+<br>
+;---------------------------------------------------------------------------------------------------------------<br>
+; void interp_4tap_vert_ss_4x2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)<br>
+;<br>
+; Vertical 4-tap filter producing a 4x2 block: int16_t in, int16_t out.<br>
+; Registers: r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride, r4 = coeffIdx, r5 = coefficient pointer.<br>
+; Each output is the 4-tap dword sum shifted right arithmetically by 6 (no rounding constant is added),<br>
+; packed to int16_t with signed saturation.<br>
+;---------------------------------------------------------------------------------------------------------------<br>
+INIT_XMM sse2<br>
+cglobal interp_4tap_vert_ss_4x2, 5, 6, 4<br>
+<br>
+ ; both strides are doubled because source and destination elements are 2-byte int16_t values<br>
+ add r1d, r1d<br>
+ add r3d, r3d<br>
+ ; step back one row: the loads below read five consecutive rows starting here<br>
+ sub r0, r1<br>
+ ; coeffIdx * 32: two 16-byte tap-pair vectors are read per coefficient set<br>
+ shl r4d, 5<br>
+<br>
+%ifdef PIC<br>
+ lea r5, [tab_ChromaCoeffV]<br>
+ lea r5, [r5 + r4]<br>
+%else<br>
+ lea r5, [tab_ChromaCoeffV + r4]<br>
+%endif<br>
+<br>
+ movq m0, [r0]<br>
+ movq m1, [r0 + r1]<br>
+ punpcklwd m0, m1 ;m0=[0 1]<br>
+ pmaddwd m0, [r5 + 0 *16] ;m0=[0+1] Row1<br>
+<br>
+ lea r0, [r0 + 2 * r1]<br>
+ movq m2, [r0]<br>
+ punpcklwd m1, m2 ;m1=[1 2]<br>
+ pmaddwd m1, [r5 + 0 *16] ;m1=[1+2] Row2<br>
+<br>
+ movq m3, [r0 + r1]<br>
+ punpcklwd m2, m3 ;m2=[2 3]<br>
+ pmaddwd m2, [r5 + 1 * 16]<br>
+ paddd m0, m2 ;m0=[0+1+2+3] Row1 done<br>
+ psrad m0, 6<br>
+<br>
+ movq m2, [r0 + 2 * r1]<br>
+ punpcklwd m3, m2 ;m3=[3 4]<br>
+ pmaddwd m3, [r5 + 1 * 16]<br>
+ paddd m1, m3 ;m1=[1+2+3+4] Row2 done<br>
+ psrad m1, 6<br>
+<br>
+ ; low 8 bytes = Row1, high 8 bytes = Row2<br>
+ packssdw m0, m1<br>
+<br>
+ movlps [r2], m0<br>
+ movhps [r2 + r3], m0<br>
+<br>
+ RET<br>
+<br>
+;-------------------------------------------------------------------------------------------------------------------<br>
+; void interp_4tap_vert_ss_6x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)<br>
+;<br>
+; Vertical 4-tap filter, int16_t in, int16_t out, 6 columns wide.<br>
+; Macro arguments: %1 is not referenced by the body (the width 6 is fixed in the symbol name), %2 = height.<br>
+; The loop runs %2/4 times; each pass writes 4 rows: columns 0-3 first, then columns 4-5.<br>
+; PROCESS_CHROMA_SP_W4_4R and PROCESS_CHROMA_SP_W2_4R are defined outside this chunk. Judging from the code<br>
+; that follows them they leave dword row sums in m0..m3 (W4) or m0/m2 (W2) and advance r0 by four rows -<br>
+; TODO confirm against their definitions.<br>
+;-------------------------------------------------------------------------------------------------------------------<br>
+%macro FILTER_VER_CHROMA_SS_W6_H4 2<br>
+INIT_XMM sse4<br>
+cglobal interp_4tap_vert_ss_6x%2, 5, 7, 6<br>
+<br>
+ ; both strides are doubled because source and destination elements are 2-byte int16_t values<br>
+ add r1d, r1d<br>
+ add r3d, r3d<br>
+ sub r0, r1<br>
+ ; coeffIdx * 32 selects the coefficient set inside tab_ChromaCoeffV<br>
+ shl r4d, 5<br>
+<br>
+%ifdef PIC<br>
+ lea r5, [tab_ChromaCoeffV]<br>
+ lea r6, [r5 + r4]<br>
+%else<br>
+ lea r6, [tab_ChromaCoeffV + r4]<br>
+%endif<br>
+<br>
+ ; r4d is reused as the loop counter: one pass per group of 4 rows<br>
+ mov r4d, %2/4<br>
+<br>
+.loopH:<br>
+ PROCESS_CHROMA_SP_W4_4R<br>
+<br>
+ psrad m0, 6<br>
+ psrad m1, 6<br>
+ psrad m2, 6<br>
+ psrad m3, 6<br>
+<br>
+ packssdw m0, m1<br>
+ packssdw m2, m3<br>
+<br>
+ ; m0 = rows 1 and 2, m2 = rows 3 and 4 (8 bytes each)<br>
+ movlps [r2], m0<br>
+ movhps [r2 + r3], m0<br>
+ lea r5, [r2 + 2 * r3]<br>
+ movlps [r5], m2<br>
+ movhps [r5 + r3], m2<br>
+<br>
+ ; move src back up by 4 rows and 8 bytes (4 columns) to the right; dst moves 8 bytes right<br>
+ lea r5, [4 * r1 - 2 * 4]<br>
+ sub r0, r5<br>
+ add r2, 2 * 4<br>
+<br>
+ PROCESS_CHROMA_SP_W2_4R r6<br>
+<br>
+ psrad m0, 6<br>
+ psrad m2, 6<br>
+<br>
+ packssdw m0, m2<br>
+<br>
+ ; each dword of m0 now holds the two int16_t results (columns 4-5) of one row<br>
+ movd [r2], m0<br>
+ pextrd [r2 + r3], m0, 1<br>
+ lea r2, [r2 + 2 * r3]<br>
+ pextrd [r2], m0, 2<br>
+ pextrd [r2 + r3], m0, 3<br>
+<br>
+ ; undo the column offsets: src back by 8 bytes, dst down 2 more rows and back by 8 bytes<br>
+ sub r0, 2 * 4<br>
+ lea r2, [r2 + 2 * r3 - 2 * 4]<br>
+<br>
+ dec r4d<br>
+ jnz .loopH<br>
+<br>
+ RET<br>
+%endmacro<br>
+<br>
+FILTER_VER_CHROMA_SS_W6_H4 6, 8<br>
+<br>
+FILTER_VER_CHROMA_SS_W6_H4 6, 16<br>
+<br>
+<br>
+;----------------------------------------------------------------------------------------------------------------<br>
+; void interp_4tap_vert_ss_8x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)<br>
+;<br>
+; Vertical 4-tap filter, int16_t in, int16_t out, 8 columns wide.<br>
+; Macro arguments: %1 = width used in the symbol name (the body always handles 8 columns), %2 = height.<br>
+; The loop runs %2/2 times and writes two 16-byte rows per pass. Sums are shifted right arithmetically by 6<br>
+; (no rounding constant is added) and packed with signed saturation.<br>
+;----------------------------------------------------------------------------------------------------------------<br>
+%macro FILTER_VER_CHROMA_SS_W8_H2 2<br>
+INIT_XMM sse2<br>
+cglobal interp_4tap_vert_ss_%1x%2, 5, 6, 7<br>
+<br>
+ ; both strides are doubled because source and destination elements are 2-byte int16_t values<br>
+ add r1d, r1d<br>
+ add r3d, r3d<br>
+ sub r0, r1<br>
+ ; coeffIdx * 32 selects the coefficient set inside tab_ChromaCoeffV<br>
+ shl r4d, 5<br>
+<br>
+%ifdef PIC<br>
+ lea r5, [tab_ChromaCoeffV]<br>
+ lea r5, [r5 + r4]<br>
+%else<br>
+ lea r5, [tab_ChromaCoeffV + r4]<br>
+%endif<br>
+<br>
+ mov r4d, %2/2<br>
+.loopH:<br>
+ ; leaves Row1 in m0/m1 and Row2 in m2/m3, and advances r0 by two rows<br>
+ PROCESS_CHROMA_SP_W8_2R<br>
+<br>
+ psrad m0, 6<br>
+ psrad m1, 6<br>
+ psrad m2, 6<br>
+ psrad m3, 6<br>
+<br>
+ packssdw m0, m1<br>
+ packssdw m2, m3<br>
+<br>
+ movu [r2], m0<br>
+ movu [r2 + r3], m2<br>
+<br>
+ lea r2, [r2 + 2 * r3]<br>
+<br>
+ dec r4d<br>
+ jnz .loopH<br>
+<br>
+ RET<br>
+%endmacro<br>
+<br>
+FILTER_VER_CHROMA_SS_W8_H2 8, 2<br>
+FILTER_VER_CHROMA_SS_W8_H2 8, 4<br>
+FILTER_VER_CHROMA_SS_W8_H2 8, 6<br>
+FILTER_VER_CHROMA_SS_W8_H2 8, 8<br>
+FILTER_VER_CHROMA_SS_W8_H2 8, 16<br>
+FILTER_VER_CHROMA_SS_W8_H2 8, 32<br>
+<br>
+FILTER_VER_CHROMA_SS_W8_H2 8, 12<br>
+FILTER_VER_CHROMA_SS_W8_H2 8, 64<br>
+<br>
+;-----------------------------------------------------------------------------------------------------------------<br>
+; void interp_8tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)<br>
+;<br>
+; Vertical 8-tap filter, int16_t in, int16_t out. Macro arguments: %1 = width, %2 = height.<br>
+; The outer loop runs %2/4 times and the inner loop %1/4 times, so each inner pass writes a 4x4 tile<br>
+; from 11 source rows, starting three rows above src.<br>
+; Registers: r0 = src, r1 = srcStride in bytes, r2 = dst, r3 = dstStride in bytes, r4d = tile counter,<br>
+; r5 = scratch, r6 = coefficient pointer (four 16-byte tap-pair vectors); the row-group counter is at [rsp].<br>
+; Sums are shifted right arithmetically by 6 (no rounding constant is added) and packed with signed saturation.<br>
+; tab_LumaCoeffV is defined outside this chunk.<br>
+;-----------------------------------------------------------------------------------------------------------------<br>
+%macro FILTER_VER_LUMA_SS 2<br>
+INIT_XMM sse2<br>
+cglobal interp_8tap_vert_ss_%1x%2, 5, 7, 7 ,0-gprsize<br>
+<br>
+ ; both strides are doubled because source and destination elements are 2-byte int16_t values<br>
+ add r1d, r1d<br>
+ add r3d, r3d<br>
+ ; start three rows above src<br>
+ lea r5, [3 * r1]<br>
+ sub r0, r5<br>
+ ; coeffIdx * 64: four 16-byte tap-pair vectors are read per coefficient set<br>
+ shl r4d, 6<br>
+<br>
+%ifdef PIC<br>
+ lea r5, [tab_LumaCoeffV]<br>
+ lea r6, [r5 + r4]<br>
+%else<br>
+ lea r6, [tab_LumaCoeffV + r4]<br>
+%endif<br>
+<br>
+ mov dword [rsp], %2/4<br>
+.loopH:<br>
+ mov r4d, (%1/4)<br>
+.loopW:<br>
+ ; m0..m3 accumulate output rows 1..4; m4, m5 and m6 hold interleaved source-row pairs and products<br>
+ movq m0, [r0]<br>
+ movq m1, [r0 + r1]<br>
+ punpcklwd m0, m1 ;m0=[0 1]<br>
+ pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1<br>
+<br>
+ lea r0, [r0 + 2 * r1]<br>
+ movq m4, [r0]<br>
+ punpcklwd m1, m4 ;m1=[1 2]<br>
+ pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2<br>
+<br>
+ movq m5, [r0 + r1]<br>
+ punpcklwd m4, m5 ;m4=[2 3]<br>
+ pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3<br>
+ pmaddwd m4, [r6 + 1 * 16]<br>
+ paddd m0, m4 ;m0=[0+1+2+3] Row1<br>
+<br>
+ lea r0, [r0 + 2 * r1]<br>
+ movq m4, [r0]<br>
+ punpcklwd m5, m4 ;m5=[3 4]<br>
+ pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4<br>
+ pmaddwd m5, [r6 + 1 * 16]<br>
+ paddd m1, m5 ;m1 = [1+2+3+4] Row2<br>
+<br>
+ movq m5, [r0 + r1]<br>
+ punpcklwd m4, m5 ;m4=[4 5]<br>
+ pmaddwd m6, m4, [r6 + 1 * 16]<br>
+ paddd m2, m6 ;m2=[2+3+4+5] Row3<br>
+ pmaddwd m4, [r6 + 2 * 16]<br>
+ paddd m0, m4 ;m0=[0+1+2+3+4+5] Row1<br>
+<br>
+ lea r0, [r0 + 2 * r1]<br>
+ movq m4, [r0]<br>
+ punpcklwd m5, m4 ;m5=[5 6]<br>
+ pmaddwd m6, m5, [r6 + 1 * 16]<br>
+ paddd m3, m6 ;m3=[3+4+5+6] Row4<br>
+ pmaddwd m5, [r6 + 2 * 16]<br>
+ paddd m1, m5 ;m1=[1+2+3+4+5+6] Row2<br>
+<br>
+ movq m5, [r0 + r1]<br>
+ punpcklwd m4, m5 ;m4=[6 7]<br>
+ pmaddwd m6, m4, [r6 + 2 * 16]<br>
+ paddd m2, m6 ;m2=[2+3+4+5+6+7] Row3<br>
+ pmaddwd m4, [r6 + 3 * 16]<br>
+ paddd m0, m4 ;m0=[0+1+2+3+4+5+6+7] Row1 end<br>
+ psrad m0, 6<br>
+<br>
+ lea r0, [r0 + 2 * r1]<br>
+ movq m4, [r0]<br>
+ punpcklwd m5, m4 ;m5=[7 8]<br>
+ pmaddwd m6, m5, [r6 + 2 * 16]<br>
+ paddd m3, m6 ;m3=[3+4+5+6+7+8] Row4<br>
+ pmaddwd m5, [r6 + 3 * 16]<br>
+ paddd m1, m5 ;m1=[1+2+3+4+5+6+7+8] Row2 end<br>
+ psrad m1, 6<br>
+<br>
+ packssdw m0, m1<br>
+<br>
+ movlps [r2], m0<br>
+ movhps [r2 + r3], m0<br>
+<br>
+ movq m5, [r0 + r1]<br>
+ punpcklwd m4, m5 ;m4=[8 9]<br>
+ pmaddwd m4, [r6 + 3 * 16]<br>
+ paddd m2, m4 ;m2=[2+3+4+5+6+7+8+9] Row3 end<br>
+ psrad m2, 6<br>
+<br>
+ movq m4, [r0 + 2 * r1]<br>
+ punpcklwd m5, m4 ;m5=[9 10]<br>
+ pmaddwd m5, [r6 + 3 * 16]<br>
+ paddd m3, m5 ;m3=[3+4+5+6+7+8+9+10] Row4 end<br>
+ psrad m3, 6<br>
+<br>
+ packssdw m2, m3<br>
+<br>
+ movlps [r2 + 2 * r3], m2<br>
+ lea r5, [3 * r3]<br>
+ movhps [r2 + r5], m2<br>
+<br>
+ ; r0 was advanced by 8 rows above: go back up and 8 bytes (4 columns) to the right<br>
+ lea r5, [8 * r1 - 2 * 4]<br>
+ sub r0, r5<br>
+ add r2, 2 * 4<br>
+<br>
+ dec r4d<br>
+ jnz .loopW<br>
+<br>
+ ; back to column 0 (2 * %1 bytes) and down four rows<br>
+ lea r0, [r0 + 4 * r1 - 2 * %1]<br>
+ lea r2, [r2 + 4 * r3 - 2 * %1]<br>
+<br>
+ dec dword [rsp]<br>
+ jnz .loopH<br>
+<br>
+ RET<br>
+%endmacro<br>
+<br>
+ FILTER_VER_LUMA_SS 4, 4<br>
+ FILTER_VER_LUMA_SS 8, 8<br>
+ FILTER_VER_LUMA_SS 8, 4<br>
+ FILTER_VER_LUMA_SS 4, 8<br>
+ FILTER_VER_LUMA_SS 16, 16<br>
+ FILTER_VER_LUMA_SS 16, 8<br>
+ FILTER_VER_LUMA_SS 8, 16<br>
+ FILTER_VER_LUMA_SS 16, 12<br>
+ FILTER_VER_LUMA_SS 12, 16<br>
+ FILTER_VER_LUMA_SS 16, 4<br>
+ FILTER_VER_LUMA_SS 4, 16<br>
+ FILTER_VER_LUMA_SS 32, 32<br>
+ FILTER_VER_LUMA_SS 32, 16<br>
+ FILTER_VER_LUMA_SS 16, 32<br>
+ FILTER_VER_LUMA_SS 32, 24<br>
+ FILTER_VER_LUMA_SS 24, 32<br>
+ FILTER_VER_LUMA_SS 32, 8<br>
+ FILTER_VER_LUMA_SS 8, 32<br>
+ FILTER_VER_LUMA_SS 64, 64<br>
+ FILTER_VER_LUMA_SS 64, 32<br>
+ FILTER_VER_LUMA_SS 32, 64<br>
+ FILTER_VER_LUMA_SS 64, 48<br>
+ FILTER_VER_LUMA_SS 48, 64<br>
+ FILTER_VER_LUMA_SS 64, 16<br>
+ FILTER_VER_LUMA_SS 16, 64<br>
</blockquote></div><br></div>