[x265] [PATCH] asm: 10bpp code for chroma interpolation filters
murugan at multicorewareinc.com
murugan at multicorewareinc.com
Wed Feb 26 10:29:28 CET 2014
# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1393404493 -19800
# Wed Feb 26 14:18:13 2014 +0530
# Node ID 36f88a9db55e1ccd25e9cffc8560af89408fcc8f
# Parent b47fc23c75dfecd72d2c47b4e528d793228654be
asm: 10bpp code for chroma interpolation filters
diff -r b47fc23c75df -r 36f88a9db55e source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Feb 26 03:26:00 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Feb 26 14:18:13 2014 +0530
@@ -718,6 +718,68 @@
#define SETUP_INTRA_ANG32(mode, fno, cpu) \
p.intra_pred[BLOCK_32x32][mode] = x265_intra_pred_ang32_ ## fno ## _ ## cpu;
+#define SETUP_CHROMA_VERT_FUNC_DEF(W, H, cpu) \
+ p.chroma[X265_CSP_I420].filter_vss[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu; \
+ p.chroma[X265_CSP_I420].filter_vsp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu;
+
+/* Expands SETUP_CHROMA_VERT_FUNC_DEF once per chroma block size listed below.
+   No trailing semicolon: the invocation supplies it, as with CHROMA_HORIZ_FILTERS. */
+#define CHROMA_VERT_FILTERS(cpu) \
+ SETUP_CHROMA_VERT_FUNC_DEF(4, 4, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(4, 2, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(8, 8, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(8, 4, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(4, 8, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(8, 6, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(8, 2, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(16, 16, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(16, 8, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(8, 16, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(16, 12, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(12, 16, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(16, 4, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(4, 16, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(32, 32, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(32, 16, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(16, 32, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(32, 24, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(24, 32, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(32, 8, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(8, 32, cpu)
+
+/* Expands SETUP_CHROMA_VERT_FUNC_DEF for the 2x4, 2x8 and 6x8 chroma block sizes.
+   No trailing semicolon: the invocation supplies it, as with CHROMA_HORIZ_FILTERS. */
+#define CHROMA_VERT_FILTERS_SSE4(cpu) \
+ SETUP_CHROMA_VERT_FUNC_DEF(2, 4, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(2, 8, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(6, 8, cpu)
+
+#define SETUP_CHROMA_HORIZ_FUNC_DEF(W, H, cpu) \
+ p.chroma[X265_CSP_I420].filter_hpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu; \
+ p.chroma[X265_CSP_I420].filter_hps[CHROMA_ ## W ## x ## H] = x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu;
+
+#define CHROMA_HORIZ_FILTERS(cpu) \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(4, 4, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(4, 2, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(2, 4, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(8, 8, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(8, 4, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(4, 8, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(8, 6, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(6, 8, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(8, 2, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(2, 8, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(16, 16, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(16, 8, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(8, 16, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(16, 12, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(12, 16, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(16, 4, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(4, 16, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(32, 32, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(32, 16, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(16, 32, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(32, 24, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(24, 32, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(32, 8, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(8, 32, cpu)
+
namespace x265 {
// private x265 namespace
void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask)
@@ -802,6 +864,9 @@
CHROMA_BLOCKCOPY(_sse2);
LUMA_BLOCKCOPY(_sse2);
+ CHROMA_VERT_FILTERS(_sse2);
+ p.chroma_p2s[X265_CSP_I420] = x265_chroma_p2s_sse2;
+
p.blockfill_s[BLOCK_4x4] = x265_blockfill_s_4x4_sse2;
p.blockfill_s[BLOCK_8x8] = x265_blockfill_s_8x8_sse2;
p.blockfill_s[BLOCK_16x16] = x265_blockfill_s_16x16_sse2;
@@ -842,6 +907,8 @@
LUMA_ADDAVG(_sse4);
CHROMA_ADDAVG(_sse4);
LUMA_FILTERS(_sse4);
+ CHROMA_HORIZ_FILTERS(_sse4);
+ CHROMA_VERT_FILTERS_SSE4(_sse4);
p.dct[DCT_8x8] = x265_dct8_sse4;
p.quant = x265_quant_sse4;
diff -r b47fc23c75df -r 36f88a9db55e source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm Wed Feb 26 03:26:00 2014 +0530
+++ b/source/common/x86/ipfilter16.asm Wed Feb 26 14:18:13 2014 +0530
@@ -2,6 +2,7 @@
;* Copyright (C) 2013 x265 project
;*
;* Authors: Nabajit Deka <nabajit at multicorewareinc.com>
+;* Murugan Vairavel <murugan at multicorewareinc.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
@@ -26,10 +27,50 @@
SECTION_RODATA 32
-tab_LumaCoeff: dw 0, 0, 0, 64, 0, 0, 0, 0
- dw -1, 4, -10, 58, 17, -5, 1, 0
- dw -1, 4, -11, 40, 40, -11, 4, -1
- dw 0, 1, -5, 17, 58, -10, 4, -1
+tab_c_32: times 4 dd 32
+tab_c_n32768: times 4 dd -32768
+tab_c_524800: times 4 dd 524800
+tab_c_n8192: times 8 dw -8192
+
+tab_Tm16: db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
+
+tab_ChromaCoeff: dw 0, 64, 0, 0
+ dw -2, 58, 10, -2
+ dw -4, 54, 16, -2
+ dw -6, 46, 28, -4
+ dw -4, 36, 36, -4
+ dw -4, 28, 46, -6
+ dw -2, 16, 54, -4
+ dw -2, 10, 58, -2
+
+tab_ChromaCoeffV: times 4 dw 0, 64
+ times 4 dw 0, 0
+
+ times 4 dw -2, 58
+ times 4 dw 10, -2
+
+ times 4 dw -4, 54
+ times 4 dw 16, -2
+
+ times 4 dw -6, 46
+ times 4 dw 28, -4
+
+ times 4 dw -4, 36
+ times 4 dw 36, -4
+
+ times 4 dw -4, 28
+ times 4 dw 46, -6
+
+ times 4 dw -2, 16
+ times 4 dw 54, -4
+
+ times 4 dw -2, 10
+ times 4 dw 58, -2
+
+tab_LumaCoeff: dw 0, 0, 0, 64, 0, 0, 0, 0
+ dw -1, 4, -10, 58, 17, -5, 1, 0
+ dw -1, 4, -11, 40, 40, -11, 4, -1
+ dw 0, 1, -5, 17, 58, -10, 4, -1
SECTION .text
@@ -721,3 +762,1423 @@
; void interp_8tap_horiz_ps_24x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
;----------------------------------------------------------------------------------------------------------------------------
FILTER_HOR_LUMA_W24 24, 32, ps
+
+%macro FILTER_W2_2 1 ; %1 = pp|ps; filters rows [r0] and [r0 + r1] of a 2-wide block into [r2] and [r2 + r3]; uses m0, m1, m2 (and m6, m7 for pp) as set up by FILTER_CHROMA_H
+ movu m3, [r0] ; first row: 8 source words
+ pshufb m3, m3, m2 ; mask tab_Tm16 yields words 0-3 and words 1-4 (two overlapping 4-tap windows)
+ pmaddwd m3, m0 ; multiply by the taps, adding adjacent pairs
+ movu m4, [r0 + r1] ; second row, same treatment
+ pshufb m4, m4, m2
+ pmaddwd m4, m0
+ phaddd m3, m4 ; complete the 4-tap sums: first row in dwords 0-1, second row in dwords 2-3
+ paddd m3, m1 ; add the rounding/offset constant
+%ifidn %1, pp
+ psrad m3, 6
+ packusdw m3, m3 ; narrow to words with unsigned saturation
+ CLIPW m3, m7, m6 ; clamp between m7 and m6 (zero and pw_pixel_max in FILTER_CHROMA_H)
+%else
+ psrad m3, 2
+ packssdw m3, m3 ; narrow to words with signed saturation
+%endif
+ movd [r2], m3 ; words 0-1: the two outputs of the first row
+ pextrd [r2 + r3], m3, 1 ; words 2-3: the two outputs of the second row
+%endmacro
+
+%macro FILTER_W4_2 1 ; %1 = pp|ps; filters rows [r0] and [r0 + r1] of a 4-wide block into [r2] and [r2 + r3]; uses m0, m1, m2 (and m6, m7 for pp) as set up by FILTER_CHROMA_H
+ movu m3, [r0] ; first row, windows for outputs 0-1
+ pshufb m3, m3, m2
+ pmaddwd m3, m0
+ movu m4, [r0 + 4] ; first row, two words further on: windows for outputs 2-3
+ pshufb m4, m4, m2
+ pmaddwd m4, m0
+ phaddd m3, m4 ; m3 = the four 4-tap sums of the first row
+ paddd m3, m1 ; add the rounding/offset constant
+
+ movu m5, [r0 + r1] ; second row, windows for outputs 0-1
+ pshufb m5, m5, m2
+ pmaddwd m5, m0
+ movu m4, [r0 + r1 + 4] ; second row, windows for outputs 2-3
+ pshufb m4, m4, m2
+ pmaddwd m4, m0
+ phaddd m5, m4 ; m5 = the four 4-tap sums of the second row
+ paddd m5, m1
+%ifidn %1, pp
+ psrad m3, 6
+ psrad m5, 6
+ packusdw m3, m5 ; first row in the low qword, second row in the high qword
+ CLIPW m3, m7, m6 ; clamp between m7 and m6 (zero and pw_pixel_max in FILTER_CHROMA_H)
+%else
+ psrad m3, 2
+ psrad m5, 2
+ packssdw m3, m5 ; first row in the low qword, second row in the high qword
+%endif
+ movh [r2], m3 ; store the first row (4 words)
+ movhps [r2 + r3], m3 ; store the second row (4 words)
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+; void interp_4tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+;
+; FILTER_CHROMA_H width, height, type, gpr_count, xmm_count, table_gpr
+;   %1 = block width (2 or 4); selects FILTER_W%1_2
+;   %2 = block height; each FILTER_W%1_2 expansion filters two rows
+;   %3 = pp (add 32, >> 6, clamp to [0, pw_pixel_max]) or ps (add -32768, >> 2, signed saturate)
+;   %4 = GPR count given to cglobal, %5 = XMM count given to cglobal
+;   %6 = number of the GPR that holds the tab_ChromaCoeff address when PIC is defined
+; For ps with isRowExt != 0, three extra rows are produced, starting one row above src.
+;-----------------------------------------------------------------------------
+%macro FILTER_CHROMA_H 6
+INIT_XMM sse4
+cglobal interp_4tap_horiz_%3_%1x%2, 4, %4, %5
+
+ add r3, r3 ; dstStride in bytes (16-bit elements)
+ add r1, r1 ; srcStride in bytes (16-bit elements)
+ sub r0, 2 ; start one sample to the left of the output position
+ mov r4d, r4m
+ add r4d, r4d ; coeffIdx * 2; scaled by 4 below = 8 bytes per table row
+
+%ifdef PIC
+ lea r%6, [tab_ChromaCoeff]
+ movh m0, [r%6 + r4 * 4]
+%else
+ movh m0, [tab_ChromaCoeff + r4 * 4] ; same table as the PIC path
+%endif
+
+ punpcklqdq m0, m0 ; the four taps repeated in both qwords
+ mova m2, [tab_Tm16] ; pshufb mask used by FILTER_W%1_2
+
+%ifidn %3, ps
+ mova m1, [tab_c_n32768]
+ cmp r5m, byte 0 ; isRowExt
+ je .skip
+ sub r0, r1 ; back up one source row
+ movu m3, [r0] ; filter that single row inline
+ pshufb m3, m3, m2
+ pmaddwd m3, m0
+
+ %if %1 == 4
+ movu m4, [r0 + 4]
+ pshufb m4, m4, m2
+ pmaddwd m4, m0
+ phaddd m3, m4
+ %else
+ phaddd m3, m3
+ %endif
+
+ paddd m3, m1
+ psrad m3, 2
+ packssdw m3, m3
+
+ %if %1 == 2
+ movd [r2], m3
+ %else
+ movh [r2], m3
+ %endif
+
+ add r0, r1
+ add r2, r3
+ FILTER_W%1_2 %3 ; two more extension rows
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+
+.skip:
+
+%else ;%ifidn %3, ps
+ pxor m7, m7 ; lower clamp bound for CLIPW
+ mova m6, [pw_pixel_max] ; upper clamp bound for CLIPW
+ mova m1, [tab_c_32]
+%endif ;%ifidn %3, ps
+
+ FILTER_W%1_2 %3 ; first two rows of the block
+
+%rep (%2/2) - 1
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ FILTER_W%1_2 %3
+%endrep
+
+RET
+%endmacro
+
+FILTER_CHROMA_H 2, 4, pp, 6, 8, 5
+FILTER_CHROMA_H 2, 8, pp, 6, 8, 5
+FILTER_CHROMA_H 4, 2, pp, 6, 8, 5
+FILTER_CHROMA_H 4, 4, pp, 6, 8, 5
+FILTER_CHROMA_H 4, 8, pp, 6, 8, 5
+FILTER_CHROMA_H 4, 16, pp, 6, 8, 5
+
+FILTER_CHROMA_H 2, 4, ps, 7, 5, 6
+FILTER_CHROMA_H 2, 8, ps, 7, 5, 6
+FILTER_CHROMA_H 4, 2, ps, 7, 6, 6
+FILTER_CHROMA_H 4, 4, ps, 7, 6, 6
+FILTER_CHROMA_H 4, 8, ps, 7, 6, 6
+FILTER_CHROMA_H 4, 16, ps, 7, 6, 6
+
+%macro FILTER_W6_1 1 ; %1 = pp|ps; filters one 6-wide row from [r0] into [r2]; uses m0, m1, m2 (and m6, m7 for pp) as set up by IPFILTER_CHROMA
+ movu m3, [r0] ; windows for outputs 0-1
+ pshufb m3, m3, m2
+ pmaddwd m3, m0
+ movu m4, [r0 + 4] ; windows for outputs 2-3
+ pshufb m4, m4, m2
+ pmaddwd m4, m0
+ phaddd m3, m4 ; m3 = sums for outputs 0-3
+ paddd m3, m1
+
+ movu m4, [r0 + 8] ; windows for outputs 4-5
+ pshufb m4, m4, m2
+ pmaddwd m4, m0
+ phaddd m4, m4 ; sums for outputs 4-5, duplicated in both halves
+ paddd m4, m1
+%ifidn %1, pp
+ psrad m3, 6
+ psrad m4, 6
+ packusdw m3, m4
+ CLIPW m3, m6, m7 ; clamp between m6 and m7 (zero and pw_pixel_max in IPFILTER_CHROMA)
+%else
+ psrad m3, 2
+ psrad m4, 2
+ packssdw m3, m4
+%endif
+ movh [r2], m3 ; outputs 0-3
+ pextrd [r2 + 8], m3, 2 ; outputs 4-5 (dword 2 of the packed result)
+%endmacro
+
+cglobal chroma_filter_pp_6x1_internal
+ FILTER_W6_1 pp
+ ret
+
+cglobal chroma_filter_ps_6x1_internal
+ FILTER_W6_1 ps
+ ret
+
+%macro FILTER_W8_1 1
+ movu m3, [r0]
+ pshufb m3, m3, m2
+ pmaddwd m3, m0
+ movu m4, [r0 + 4]
+ pshufb m4, m4, m2
+ pmaddwd m4, m0
+ phaddd m3, m4
+ paddd m3, m1
+
+ movu m5, [r0 + 8]
+ pshufb m5, m5, m2
+ pmaddwd m5, m0
+ movu m4, [r0 + 12]
+ pshufb m4, m4, m2
+ pmaddwd m4, m0
+ phaddd m5, m4
+ paddd m5, m1
+%ifidn %1, pp
+ psrad m3, 6
+ psrad m5, 6
+ packusdw m3, m5
+ CLIPW m3, m6, m7
+%else
+ psrad m3, 2
+ psrad m5, 2
+ packssdw m3, m5
+%endif
+ movh [r2], m3
+ movhps [r2 + 8], m3
+%endmacro
+
+cglobal chroma_filter_pp_8x1_internal
+ FILTER_W8_1 pp
+ ret
+
+cglobal chroma_filter_ps_8x1_internal
+ FILTER_W8_1 ps
+ ret
+
+%macro FILTER_W12_1 1
+ movu m3, [r0]
+ pshufb m3, m3, m2
+ pmaddwd m3, m0
+ movu m4, [r0 + 4]
+ pshufb m4, m4, m2
+ pmaddwd m4, m0
+ phaddd m3, m4
+ paddd m3, m1
+
+ movu m5, [r0 + 8]
+ pshufb m5, m5, m2
+ pmaddwd m5, m0
+ movu m4, [r0 + 12]
+ pshufb m4, m4, m2
+ pmaddwd m4, m0
+ phaddd m5, m4
+ paddd m5, m1
+%ifidn %1, pp
+ psrad m3, 6
+ psrad m5, 6
+ packusdw m3, m5
+ CLIPW m3, m6, m7
+%else
+ psrad m3, 2
+ psrad m5, 2
+ packssdw m3, m5
+%endif
+ movh [r2], m3
+ movhps [r2 + 8], m3
+
+ movu m3, [r0 + 16]
+ pshufb m3, m3, m2
+ pmaddwd m3, m0
+ movu m4, [r0 + 20]
+ pshufb m4, m4, m2
+ pmaddwd m4, m0
+ phaddd m3, m4
+ paddd m3, m1
+
+%ifidn %1, pp
+ psrad m3, 6
+ packusdw m3, m3
+ CLIPW m3, m6, m7
+%else
+ psrad m3, 2
+ packssdw m3, m3
+%endif
+ movh [r2 + 16], m3
+%endmacro
+
+cglobal chroma_filter_pp_12x1_internal
+ FILTER_W12_1 pp
+ ret
+
+cglobal chroma_filter_ps_12x1_internal
+ FILTER_W12_1 ps
+ ret
+
+%macro FILTER_W16_1 1
+ movu m3, [r0]
+ pshufb m3, m3, m2
+ pmaddwd m3, m0
+ movu m4, [r0 + 4]
+ pshufb m4, m4, m2
+ pmaddwd m4, m0
+ phaddd m3, m4
+ paddd m3, m1
+
+ movu m5, [r0 + 8]
+ pshufb m5, m5, m2
+ pmaddwd m5, m0
+ movu m4, [r0 + 12]
+ pshufb m4, m4, m2
+ pmaddwd m4, m0
+ phaddd m5, m4
+ paddd m5, m1
+%ifidn %1, pp
+ psrad m3, 6
+ psrad m5, 6
+ packusdw m3, m5
+ CLIPW m3, m6, m7
+%else
+ psrad m3, 2
+ psrad m5, 2
+ packssdw m3, m5
+%endif
+ movh [r2], m3
+ movhps [r2 + 8], m3
+
+ movu m3, [r0 + 16]
+ pshufb m3, m3, m2
+ pmaddwd m3, m0
+ movu m4, [r0 + 20]
+ pshufb m4, m4, m2
+ pmaddwd m4, m0
+ phaddd m3, m4
+ paddd m3, m1
+
+ movu m5, [r0 + 24]
+ pshufb m5, m5, m2
+ pmaddwd m5, m0
+ movu m4, [r0 + 28]
+ pshufb m4, m4, m2
+ pmaddwd m4, m0
+ phaddd m5, m4
+ paddd m5, m1
+%ifidn %1, pp
+ psrad m3, 6
+ psrad m5, 6
+ packusdw m3, m5
+ CLIPW m3, m6, m7
+%else
+ psrad m3, 2
+ psrad m5, 2
+ packssdw m3, m5
+%endif
+ movh [r2 + 16], m3
+ movhps [r2 + 24], m3
+%endmacro
+
+cglobal chroma_filter_pp_16x1_internal
+ FILTER_W16_1 pp
+ ret
+
+cglobal chroma_filter_ps_16x1_internal
+ FILTER_W16_1 ps
+ ret
+
+%macro FILTER_W24_1 1
+ movu m3, [r0]
+ pshufb m3, m3, m2
+ pmaddwd m3, m0
+ movu m4, [r0 + 4]
+ pshufb m4, m4, m2
+ pmaddwd m4, m0
+ phaddd m3, m4
+ paddd m3, m1
+
+ movu m5, [r0 + 8]
+ pshufb m5, m5, m2
+ pmaddwd m5, m0
+ movu m4, [r0 + 12]
+ pshufb m4, m4, m2
+ pmaddwd m4, m0
+ phaddd m5, m4
+ paddd m5, m1
+%ifidn %1, pp
+ psrad m3, 6
+ psrad m5, 6
+ packusdw m3, m5
+ CLIPW m3, m6, m7
+%else
+ psrad m3, 2
+ psrad m5, 2
+ packssdw m3, m5
+%endif
+ movh [r2], m3
+ movhps [r2 + 8], m3
+
+ movu m3, [r0 + 16]
+ pshufb m3, m3, m2
+ pmaddwd m3, m0
+ movu m4, [r0 + 20]
+ pshufb m4, m4, m2
+ pmaddwd m4, m0
+ phaddd m3, m4
+ paddd m3, m1
+
+ movu m5, [r0 + 24]
+ pshufb m5, m5, m2
+ pmaddwd m5, m0
+ movu m4, [r0 + 28]
+ pshufb m4, m4, m2
+ pmaddwd m4, m0
+ phaddd m5, m4
+ paddd m5, m1
+%ifidn %1, pp
+ psrad m3, 6
+ psrad m5, 6
+ packusdw m3, m5
+ CLIPW m3, m6, m7
+%else
+ psrad m3, 2
+ psrad m5, 2
+ packssdw m3, m5
+%endif
+ movh [r2 + 16], m3
+ movhps [r2 + 24], m3
+
+ movu m3, [r0 + 32]
+ pshufb m3, m3, m2
+ pmaddwd m3, m0
+ movu m4, [r0 + 36]
+ pshufb m4, m4, m2
+ pmaddwd m4, m0
+ phaddd m3, m4
+ paddd m3, m1
+
+ movu m5, [r0 + 40]
+ pshufb m5, m5, m2
+ pmaddwd m5, m0
+ movu m4, [r0 + 44]
+ pshufb m4, m4, m2
+ pmaddwd m4, m0
+ phaddd m5, m4
+ paddd m5, m1
+%ifidn %1, pp
+ psrad m3, 6
+ psrad m5, 6
+ packusdw m3, m5
+ CLIPW m3, m6, m7
+%else
+ psrad m3, 2
+ psrad m5, 2
+ packssdw m3, m5
+%endif
+ movh [r2 + 32], m3
+ movhps [r2 + 40], m3
+%endmacro
+
+cglobal chroma_filter_pp_24x1_internal
+ FILTER_W24_1 pp
+ ret
+
+cglobal chroma_filter_ps_24x1_internal
+ FILTER_W24_1 ps
+ ret
+
+%macro FILTER_W32_1 1
+ movu m3, [r0]
+ pshufb m3, m3, m2
+ pmaddwd m3, m0
+ movu m4, [r0 + 4]
+ pshufb m4, m4, m2
+ pmaddwd m4, m0
+ phaddd m3, m4
+ paddd m3, m1
+
+ movu m5, [r0 + 8]
+ pshufb m5, m5, m2
+ pmaddwd m5, m0
+ movu m4, [r0 + 12]
+ pshufb m4, m4, m2
+ pmaddwd m4, m0
+ phaddd m5, m4
+ paddd m5, m1
+%ifidn %1, pp
+ psrad m3, 6
+ psrad m5, 6
+ packusdw m3, m5
+ CLIPW m3, m6, m7
+%else
+ psrad m3, 2
+ psrad m5, 2
+ packssdw m3, m5
+%endif
+ movh [r2], m3
+ movhps [r2 + 8], m3
+
+ movu m3, [r0 + 16]
+ pshufb m3, m3, m2
+ pmaddwd m3, m0
+ movu m4, [r0 + 20]
+ pshufb m4, m4, m2
+ pmaddwd m4, m0
+ phaddd m3, m4
+ paddd m3, m1
+
+ movu m5, [r0 + 24]
+ pshufb m5, m5, m2
+ pmaddwd m5, m0
+ movu m4, [r0 + 28]
+ pshufb m4, m4, m2
+ pmaddwd m4, m0
+ phaddd m5, m4
+ paddd m5, m1
+%ifidn %1, pp
+ psrad m3, 6
+ psrad m5, 6
+ packusdw m3, m5
+ CLIPW m3, m6, m7
+%else
+ psrad m3, 2
+ psrad m5, 2
+ packssdw m3, m5
+%endif
+ movh [r2 + 16], m3
+ movhps [r2 + 24], m3
+
+ movu m3, [r0 + 32]
+ pshufb m3, m3, m2
+ pmaddwd m3, m0
+ movu m4, [r0 + 36]
+ pshufb m4, m4, m2
+ pmaddwd m4, m0
+ phaddd m3, m4
+ paddd m3, m1
+
+ movu m5, [r0 + 40]
+ pshufb m5, m5, m2
+ pmaddwd m5, m0
+ movu m4, [r0 + 44]
+ pshufb m4, m4, m2
+ pmaddwd m4, m0
+ phaddd m5, m4
+ paddd m5, m1
+%ifidn %1, pp
+ psrad m3, 6
+ psrad m5, 6
+ packusdw m3, m5
+ CLIPW m3, m6, m7
+%else
+ psrad m3, 2
+ psrad m5, 2
+ packssdw m3, m5
+%endif
+ movh [r2 + 32], m3
+ movhps [r2 + 40], m3
+
+ movu m3, [r0 + 48]
+ pshufb m3, m3, m2
+ pmaddwd m3, m0
+ movu m4, [r0 + 52]
+ pshufb m4, m4, m2
+ pmaddwd m4, m0
+ phaddd m3, m4
+ paddd m3, m1
+
+ movu m5, [r0 + 56]
+ pshufb m5, m5, m2
+ pmaddwd m5, m0
+ movu m4, [r0 + 60]
+ pshufb m4, m4, m2
+ pmaddwd m4, m0
+ phaddd m5, m4
+ paddd m5, m1
+%ifidn %1, pp
+ psrad m3, 6
+ psrad m5, 6
+ packusdw m3, m5
+ CLIPW m3, m6, m7
+%else
+ psrad m3, 2
+ psrad m5, 2
+ packssdw m3, m5
+%endif
+ movh [r2 + 48], m3
+ movhps [r2 + 56], m3
+%endmacro
+
+cglobal chroma_filter_pp_32x1_internal
+ FILTER_W32_1 pp
+ ret
+
+cglobal chroma_filter_ps_32x1_internal
+ FILTER_W32_1 ps
+ ret
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx); the ps variant also reads a sixth argument (r5m) that enables three extra rows
+;-----------------------------------------------------------------------------
+; IPFILTER_CHROMA width, height, pp|ps, table_gpr (PIC only), gpr_count, xmm_count; each row is filtered by chroma_filter_%3_%1x1_internal
+INIT_XMM sse4
+%macro IPFILTER_CHROMA 6
+cglobal interp_4tap_horiz_%3_%1x%2, 4, %5, %6
+
+ add r3, r3 ; dstStride * 2
+ add r1, r1 ; srcStride * 2
+ sub r0, 2 ; start one 16-bit sample to the left
+ mov r4d, r4m
+ add r4d, r4d ; coeffIdx * 2; scaled by 4 below = 8 bytes per tab_ChromaCoeff row
+
+%ifdef PIC
+ lea r%4, [tab_ChromaCoeff]
+ movh m0, [r%4 + r4 * 4]
+%else
+ movh m0, [tab_ChromaCoeff + r4 * 4]
+%endif
+
+ punpcklqdq m0, m0 ; the four taps repeated in both qwords
+ mova m2, [tab_Tm16] ; pshufb mask used by the row helpers
+
+%ifidn %3, ps
+ mova m1, [tab_c_n32768]
+ cmp r5m, byte 0 ; sixth argument: row-extension flag
+ je .skip
+ sub r0, r1 ; begin one source row earlier
+ call chroma_filter_%3_%1x1_internal ; extension row 1
+ add r0, r1
+ add r2, r3
+ call chroma_filter_%3_%1x1_internal ; extension row 2
+ add r0, r1
+ add r2, r3
+ call chroma_filter_%3_%1x1_internal ; extension row 3
+ add r0, r1
+ add r2, r3
+.skip:
+%else
+ mova m1, [tab_c_32]
+ pxor m6, m6 ; lower clamp bound for CLIPW in the row helpers
+ mova m7, [pw_pixel_max] ; upper clamp bound for CLIPW in the row helpers
+%endif
+
+ call chroma_filter_%3_%1x1_internal ; first row of the block
+%rep %2 - 1
+ add r0, r1
+ add r2, r3
+ call chroma_filter_%3_%1x1_internal
+%endrep
+RET
+%endmacro
+IPFILTER_CHROMA 6, 8, pp, 5, 6, 8
+IPFILTER_CHROMA 8, 2, pp, 5, 6, 8
+IPFILTER_CHROMA 8, 4, pp, 5, 6, 8
+IPFILTER_CHROMA 8, 6, pp, 5, 6, 8
+IPFILTER_CHROMA 8, 8, pp, 5, 6, 8
+IPFILTER_CHROMA 8, 16, pp, 5, 6, 8
+IPFILTER_CHROMA 8, 32, pp, 5, 6, 8
+IPFILTER_CHROMA 12, 16, pp, 5, 6, 8
+IPFILTER_CHROMA 16, 4, pp, 5, 6, 8
+IPFILTER_CHROMA 16, 8, pp, 5, 6, 8
+IPFILTER_CHROMA 16, 12, pp, 5, 6, 8
+IPFILTER_CHROMA 16, 16, pp, 5, 6, 8
+IPFILTER_CHROMA 16, 32, pp, 5, 6, 8
+IPFILTER_CHROMA 24, 32, pp, 5, 6, 8
+IPFILTER_CHROMA 32, 8, pp, 5, 6, 8
+IPFILTER_CHROMA 32, 16, pp, 5, 6, 8
+IPFILTER_CHROMA 32, 24, pp, 5, 6, 8
+IPFILTER_CHROMA 32, 32, pp, 5, 6, 8
+
+IPFILTER_CHROMA 6, 8, ps, 6, 7, 6
+IPFILTER_CHROMA 8, 2, ps, 6, 7, 6
+IPFILTER_CHROMA 8, 4, ps, 6, 7, 6
+IPFILTER_CHROMA 8, 6, ps, 6, 7, 6
+IPFILTER_CHROMA 8, 8, ps, 6, 7, 6
+IPFILTER_CHROMA 8, 16, ps, 6, 7, 6
+IPFILTER_CHROMA 8, 32, ps, 6, 7, 6
+IPFILTER_CHROMA 12, 16, ps, 6, 7, 6
+IPFILTER_CHROMA 16, 4, ps, 6, 7, 6
+IPFILTER_CHROMA 16, 8, ps, 6, 7, 6
+IPFILTER_CHROMA 16, 12, ps, 6, 7, 6
+IPFILTER_CHROMA 16, 16, ps, 6, 7, 6
+IPFILTER_CHROMA 16, 32, ps, 6, 7, 6
+IPFILTER_CHROMA 24, 32, ps, 6, 7, 6
+IPFILTER_CHROMA 32, 8, ps, 6, 7, 6
+IPFILTER_CHROMA 32, 16, ps, 6, 7, 6
+IPFILTER_CHROMA 32, 24, ps, 6, 7, 6
+IPFILTER_CHROMA 32, 32, ps, 6, 7, 6
+
+
+%macro PROCESS_CHROMA_SP_W4_4R 0 ; 4-tap vertical sums for a 4-wide, 4-row tile: reads 7 source rows from r0 (stride r1), taps at [r6] and [r6 + 16]; leaves rows 1-4 as dwords in m0-m3 and r0 advanced by 4 rows
+ movq m0, [r0]
+ movq m1, [r0 + r1]
+ punpcklwd m0, m1 ;m0=[0 1]
+ pmaddwd m0, [r6 + 0 *16] ;m0=[0+1] Row1
+
+ lea r0, [r0 + 2 * r1] ; r0 -> source row 2
+ movq m4, [r0]
+ punpcklwd m1, m4 ;m1=[1 2]
+ pmaddwd m1, [r6 + 0 *16] ;m1=[1+2] Row2
+
+ movq m5, [r0 + r1]
+ punpcklwd m4, m5 ;m4=[2 3]
+ pmaddwd m2, m4, [r6 + 0 *16] ;m2=[2+3] Row3
+ pmaddwd m4, [r6 + 1 * 16]
+ paddd m0, m4 ;m0=[0+1+2+3] Row1 done
+
+ lea r0, [r0 + 2 * r1] ; r0 -> source row 4
+ movq m4, [r0]
+ punpcklwd m5, m4 ;m5=[3 4]
+ pmaddwd m3, m5, [r6 + 0 *16] ;m3=[3+4] Row4
+ pmaddwd m5, [r6 + 1 * 16]
+ paddd m1, m5 ;m1 = [1+2+3+4] Row2
+
+ movq m5, [r0 + r1]
+ punpcklwd m4, m5 ;m4=[4 5]
+ pmaddwd m4, [r6 + 1 * 16]
+ paddd m2, m4 ;m2=[2+3+4+5] Row3
+
+ movq m4, [r0 + 2 * r1]
+ punpcklwd m5, m4 ;m5=[5 6]
+ pmaddwd m5, [r6 + 1 * 16]
+ paddd m3, m5 ;m3=[3+4+5+6] Row4
+%endmacro
+
+;-----------------------------------------------------------------------------------------------------------------
+; void interp_4tap_vert_%3_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+; FILTER_VER_CHROMA_SS width, height, type, xmm_count -- despite the name, type (%3) is instantiated as ss, ps, sp and pp; the block is processed as 4x4 tiles
+;-----------------------------------------------------------------------------------------------------------------
+%macro FILTER_VER_CHROMA_SS 4
+INIT_XMM sse2
+cglobal interp_4tap_vert_%3_%1x%2, 5, 7, %4 ,0-1
+
+ add r1d, r1d ; srcStride * 2
+ add r3d, r3d ; dstStride * 2
+ sub r0, r1 ; start one row above src
+ shl r4d, 5 ; coeffIdx * 32: each tab_ChromaCoeffV entry is two 16-byte rows
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeffV]
+ lea r6, [r5 + r4]
+%else
+ lea r6, [tab_ChromaCoeffV + r4]
+%endif
+
+ mov byte [rsp], %2/4 ; count of 4-row groups, kept on the stack
+
+%ifnidn %3, ss
+ %ifnidn %3, ps
+ mova m7, [pw_pixel_max] ; upper clamp bound (pp and sp)
+ %ifidn %3, pp
+ mova m6, [tab_c_32] ; pp rounding
+ %else
+ mova m6, [tab_c_524800] ; sp offset
+ %endif
+ %else
+ mova m6, [tab_c_n32768] ; ps offset
+ %endif
+%endif
+
+.loopH ; one iteration per group of 4 rows
+ mov r4d, (%1/4) ; count of 4-column tiles in a row group
+.loopW ; one iteration per 4x4 tile
+ PROCESS_CHROMA_SP_W4_4R
+
+%ifidn %3, ss
+ psrad m0, 6
+ psrad m1, 6
+ psrad m2, 6
+ psrad m3, 6
+
+ packssdw m0, m1
+ packssdw m2, m3
+%elifidn %3, ps
+ paddd m0, m6
+ paddd m1, m6
+ paddd m2, m6
+ paddd m3, m6
+ psrad m0, 2
+ psrad m1, 2
+ psrad m2, 2
+ psrad m3, 2
+
+ packssdw m0, m1
+ packssdw m2, m3
+%else
+ paddd m0, m6
+ paddd m1, m6
+ paddd m2, m6
+ paddd m3, m6
+ %ifidn %3, pp
+ psrad m0, 6
+ psrad m1, 6
+ psrad m2, 6
+ psrad m3, 6
+ %else
+ psrad m0, 10
+ psrad m1, 10
+ psrad m2, 10
+ psrad m3, 10
+ %endif
+ packssdw m0, m1 ; signed pack, then clamp to [0, pw_pixel_max] below
+ packssdw m2, m3
+ pxor m5, m5 ; zero rebuilt here because m5 is overwritten by PROCESS_CHROMA_SP_W4_4R
+ CLIPW m0, m5, m7
+ CLIPW m2, m5, m7
+%endif
+
+ movh [r2], m0 ; output row 1
+ movhps [r2 + r3], m0 ; output row 2
+ lea r5, [r2 + 2 * r3]
+ movh [r5], m2 ; output row 3
+ movhps [r5 + r3], m2 ; output row 4
+
+ lea r5, [4 * r1 - 2 * 4]
+ sub r0, r5 ; undo the 4-row advance and step 4 samples (8 bytes) right
+ add r2, 2 * 4 ; dst: 4 samples right
+
+ dec r4d
+ jnz .loopW
+
+ lea r0, [r0 + 4 * r1 - 2 * %1] ; back to the left edge, 4 rows down
+ lea r2, [r2 + 4 * r3 - 2 * %1]
+
+ dec byte [rsp]
+ jnz .loopH
+
+ RET
+%endmacro
+
+ FILTER_VER_CHROMA_SS 4, 4, ss, 6
+ FILTER_VER_CHROMA_SS 4, 8, ss, 6
+ FILTER_VER_CHROMA_SS 16, 16, ss, 6
+ FILTER_VER_CHROMA_SS 16, 8, ss, 6
+ FILTER_VER_CHROMA_SS 16, 12, ss, 6
+ FILTER_VER_CHROMA_SS 12, 16, ss, 6
+ FILTER_VER_CHROMA_SS 16, 4, ss, 6
+ FILTER_VER_CHROMA_SS 4, 16, ss, 6
+ FILTER_VER_CHROMA_SS 32, 32, ss, 6
+ FILTER_VER_CHROMA_SS 32, 16, ss, 6
+ FILTER_VER_CHROMA_SS 16, 32, ss, 6
+ FILTER_VER_CHROMA_SS 32, 24, ss, 6
+ FILTER_VER_CHROMA_SS 24, 32, ss, 6
+ FILTER_VER_CHROMA_SS 32, 8, ss, 6
+
+ FILTER_VER_CHROMA_SS 4, 4, ps, 7
+ FILTER_VER_CHROMA_SS 4, 8, ps, 7
+ FILTER_VER_CHROMA_SS 16, 16, ps, 7
+ FILTER_VER_CHROMA_SS 16, 8, ps, 7
+ FILTER_VER_CHROMA_SS 16, 12, ps, 7
+ FILTER_VER_CHROMA_SS 12, 16, ps, 7
+ FILTER_VER_CHROMA_SS 16, 4, ps, 7
+ FILTER_VER_CHROMA_SS 4, 16, ps, 7
+ FILTER_VER_CHROMA_SS 32, 32, ps, 7
+ FILTER_VER_CHROMA_SS 32, 16, ps, 7
+ FILTER_VER_CHROMA_SS 16, 32, ps, 7
+ FILTER_VER_CHROMA_SS 32, 24, ps, 7
+ FILTER_VER_CHROMA_SS 24, 32, ps, 7
+ FILTER_VER_CHROMA_SS 32, 8, ps, 7
+
+ FILTER_VER_CHROMA_SS 4, 4, sp, 8
+ FILTER_VER_CHROMA_SS 4, 8, sp, 8
+ FILTER_VER_CHROMA_SS 16, 16, sp, 8
+ FILTER_VER_CHROMA_SS 16, 8, sp, 8
+ FILTER_VER_CHROMA_SS 16, 12, sp, 8
+ FILTER_VER_CHROMA_SS 12, 16, sp, 8
+ FILTER_VER_CHROMA_SS 16, 4, sp, 8
+ FILTER_VER_CHROMA_SS 4, 16, sp, 8
+ FILTER_VER_CHROMA_SS 32, 32, sp, 8
+ FILTER_VER_CHROMA_SS 32, 16, sp, 8
+ FILTER_VER_CHROMA_SS 16, 32, sp, 8
+ FILTER_VER_CHROMA_SS 32, 24, sp, 8
+ FILTER_VER_CHROMA_SS 24, 32, sp, 8
+ FILTER_VER_CHROMA_SS 32, 8, sp, 8
+
+ FILTER_VER_CHROMA_SS 4, 4, pp, 8
+ FILTER_VER_CHROMA_SS 4, 8, pp, 8
+ FILTER_VER_CHROMA_SS 16, 16, pp, 8
+ FILTER_VER_CHROMA_SS 16, 8, pp, 8
+ FILTER_VER_CHROMA_SS 16, 12, pp, 8
+ FILTER_VER_CHROMA_SS 12, 16, pp, 8
+ FILTER_VER_CHROMA_SS 16, 4, pp, 8
+ FILTER_VER_CHROMA_SS 4, 16, pp, 8
+ FILTER_VER_CHROMA_SS 32, 32, pp, 8
+ FILTER_VER_CHROMA_SS 32, 16, pp, 8
+ FILTER_VER_CHROMA_SS 16, 32, pp, 8
+ FILTER_VER_CHROMA_SS 32, 24, pp, 8
+ FILTER_VER_CHROMA_SS 24, 32, pp, 8
+ FILTER_VER_CHROMA_SS 32, 8, pp, 8
+
+%macro PROCESS_CHROMA_SP_W2_4R 1 ; %1 = register holding the tap address; 4-tap vertical sums for a 2-wide, 4-row tile: rows 1-2 end up in m0, rows 3-4 in m2, r0 advanced by 4 rows
+ movd m0, [r0]
+ movd m1, [r0 + r1]
+ punpcklwd m0, m1 ;m0=[0 1]
+
+ lea r0, [r0 + 2 * r1]
+ movd m2, [r0]
+ punpcklwd m1, m2 ;m1=[1 2]
+ punpcklqdq m0, m1 ;m0=[0 1 1 2]
+ pmaddwd m0, [%1 + 0 *16] ;m0=[0+1 1+2] Row 1-2
+
+ movd m1, [r0 + r1]
+ punpcklwd m2, m1 ;m2=[2 3]
+
+ lea r0, [r0 + 2 * r1]
+ movd m3, [r0]
+ punpcklwd m1, m3 ;m1=[3 4]
+ punpcklqdq m2, m1 ;m2=[2 3 3 4]
+
+ pmaddwd m4, m2, [%1 + 1 * 16] ;m4=[2+3 3+4] Row 1-2
+ pmaddwd m2, [%1 + 0 * 16] ;m2=[2+3 3+4] Row 3-4
+ paddd m0, m4 ;m0=[0+1+2+3 1+2+3+4] Row 1-2
+
+ movd m1, [r0 + r1]
+ punpcklwd m3, m1 ;m3=[4 5]
+
+ movd m4, [r0 + 2 * r1]
+ punpcklwd m1, m4 ;m1=[5 6]
+ punpcklqdq m3, m1 ;m3=[4 5 5 6]
+ pmaddwd m3, [%1 + 1 * 16] ;m3=[4+5 5+6] Row 3-4
+ paddd m2, m3 ;m2=[2+3+4+5 3+4+5+6] Row 3-4
+%endmacro
+
+;---------------------------------------------------------------------------------------------------------------------
+; void interp_4tap_vert_%2_2x%1(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+; FILTER_VER_CHROMA_W2 height, type (ss|pp|ps|sp), xmm_count
+;---------------------------------------------------------------------------------------------------------------------
+%macro FILTER_VER_CHROMA_W2 3
+INIT_XMM sse4
+cglobal interp_4tap_vert_%2_2x%1, 5, 6, %3
+
+ add r1d, r1d ; srcStride * 2
+ add r3d, r3d ; dstStride * 2
+ sub r0, r1 ; start one row above src
+ shl r4d, 5 ; coeffIdx * 32: each tab_ChromaCoeffV entry is two 16-byte rows
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeffV]
+ lea r5, [r5 + r4]
+%else
+ lea r5, [tab_ChromaCoeffV + r4]
+%endif
+
+ mov r4d, (%1/4) ; loop count: 4 rows per iteration
+%ifnidn %2, ss
+ %ifnidn %2, ps
+ pxor m7, m7 ; lower clamp bound (pp and sp)
+ mova m6, [pw_pixel_max] ; upper clamp bound (pp and sp)
+ %ifidn %2, pp
+ mova m5, [tab_c_32] ; pp rounding
+ %else
+ mova m5, [tab_c_524800] ; sp offset
+ %endif
+ %else
+ mova m5, [tab_c_n32768] ; ps offset
+ %endif
+%endif
+
+.loopH ; one iteration per 4 output rows
+ PROCESS_CHROMA_SP_W2_4R r5
+%ifidn %2, ss
+ psrad m0, 6
+ psrad m2, 6
+ packssdw m0, m2
+%elifidn %2, ps
+ paddd m0, m5
+ paddd m2, m5
+ psrad m0, 2
+ psrad m2, 2
+ packssdw m0, m2
+%else
+ paddd m0, m5
+ paddd m2, m5
+ %ifidn %2, pp
+ psrad m0, 6
+ psrad m2, 6
+ %else
+ psrad m0, 10
+ psrad m2, 10
+ %endif
+ packusdw m0, m2
+ CLIPW m0, m7, m6
+%endif
+
+ movd [r2], m0 ; output row 1 (2 words)
+ pextrd [r2 + r3], m0, 1 ; output row 2
+ lea r2, [r2 + 2 * r3]
+ pextrd [r2], m0, 2 ; output row 3
+ pextrd [r2 + r3], m0, 3 ; output row 4
+
+ lea r2, [r2 + 2 * r3] ; dst: next group of 4 rows (r0 was advanced inside PROCESS_CHROMA_SP_W2_4R)
+
+ dec r4d
+ jnz .loopH
+
+ RET
+%endmacro
+
+FILTER_VER_CHROMA_W2 4, ss, 5
+FILTER_VER_CHROMA_W2 8, ss, 5
+
+FILTER_VER_CHROMA_W2 4, pp, 8
+FILTER_VER_CHROMA_W2 8, pp, 8
+
+FILTER_VER_CHROMA_W2 4, ps, 6
+FILTER_VER_CHROMA_W2 8, ps, 6
+
+FILTER_VER_CHROMA_W2 4, sp, 8
+FILTER_VER_CHROMA_W2 8, sp, 8
+
+;---------------------------------------------------------------------------------------------------------------
+; void interp_4tap_vert_%1_4x2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;
+; FILTER_VER_CHROMA_W4 type, xmm_count
+;   %1 = ss (>> 6), ps (add -32768, >> 2), pp (add 32, >> 6, clamp) or sp (add 524800, >> 10, clamp)
+;   %2 = XMM register count given to cglobal
+; Declared with INIT_XMM sse2, so the body is restricted to SSE2 instructions.
+;---------------------------------------------------------------------------------------------------------------
+%macro FILTER_VER_CHROMA_W4 2
+INIT_XMM sse2
+cglobal interp_4tap_vert_%1_4x2, 5, 6, %2
+
+ add r1d, r1d ; srcStride * 2
+ add r3d, r3d ; dstStride * 2
+ sub r0, r1 ; start one row above src
+ shl r4d, 5 ; coeffIdx * 32: each tab_ChromaCoeffV entry is two 16-byte rows
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeffV]
+ lea r5, [r5 + r4]
+%else
+ lea r5, [tab_ChromaCoeffV + r4]
+%endif
+
+%ifnidn %1, ss
+ %ifnidn %1, ps
+ pxor m6, m6 ; lower clamp bound (pp and sp)
+ mova m5, [pw_pixel_max] ; upper clamp bound (pp and sp)
+ %ifidn %1, pp
+ mova m4, [tab_c_32]
+ %else
+ mova m4, [tab_c_524800]
+ %endif
+ %else
+ mova m4, [tab_c_n32768]
+ %endif
+%endif
+
+ movh m0, [r0]
+ movh m1, [r0 + r1]
+ punpcklwd m0, m1 ;m0=[0 1]
+ pmaddwd m0, [r5 + 0 *16] ;m0=[0+1] Row1
+
+ lea r0, [r0 + 2 * r1]
+ movh m2, [r0]
+ punpcklwd m1, m2 ;m1=[1 2]
+ pmaddwd m1, [r5 + 0 *16] ;m1=[1+2] Row2
+
+ movh m3, [r0 + r1]
+ punpcklwd m2, m3 ;m2=[2 3]
+ pmaddwd m2, [r5 + 1 * 16]
+ paddd m0, m2 ;m0=[0+1+2+3] Row1 done
+
+ movh m2, [r0 + 2 * r1]
+ punpcklwd m3, m2 ;m3=[3 4]
+ pmaddwd m3, [r5 + 1 * 16]
+ paddd m1, m3 ;m1=[1+2+3+4] Row2 done
+
+%ifidn %1, ss
+ psrad m0, 6
+ psrad m1, 6
+ packssdw m0, m1
+%elifidn %1, ps
+ paddd m0, m4
+ paddd m1, m4
+ psrad m0, 2
+ psrad m1, 2
+ packssdw m0, m1
+%else
+ paddd m0, m4
+ paddd m1, m4
+ %ifidn %1, pp
+ psrad m0, 6
+ psrad m1, 6
+ %else
+ psrad m0, 10
+ psrad m1, 10
+ %endif
+ packssdw m0, m1 ; SSE2 signed pack (packusdw would need SSE4.1); CLIPW below clamps to [0, pw_pixel_max]
+ CLIPW m0, m6, m5
+%endif
+
+ movh [r2], m0 ; output row 1
+ movhps [r2 + r3], m0 ; output row 2
+
+ RET
+%endmacro
+
+FILTER_VER_CHROMA_W4 ss, 4
+FILTER_VER_CHROMA_W4 pp, 7
+FILTER_VER_CHROMA_W4 ps, 5
+FILTER_VER_CHROMA_W4 sp, 7
+
+;-------------------------------------------------------------------------------------------------------------------
+; void interp_4tap_vert_%1_6x8(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------------
+%macro FILTER_VER_CHROMA_W6 2
+INIT_XMM sse4
+cglobal interp_4tap_vert_%1_6x8, 5, 7, %2
+
+ add r1d, r1d
+ add r3d, r3d
+ sub r0, r1
+ shl r4d, 5
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeffV]
+ lea r6, [r5 + r4]
+%else
+ lea r6, [tab_ChromaCoeffV + r4]
+%endif
+
+ mov r4d, 8/4
+
+%ifnidn %1, ss
+ %ifnidn %1, ps
+ mova m7, [pw_pixel_max]
+ %ifidn %1, pp
+ mova m6, [tab_c_32]
+ %else
+ mova m6, [tab_c_524800]
+ %endif
+ %else
+ mova m6, [tab_c_n32768]
+ %endif
+%endif
+
+.loopH
+ PROCESS_CHROMA_SP_W4_4R
+
+%ifidn %1, ss
+ psrad m0, 6
+ psrad m1, 6
+ psrad m2, 6
+ psrad m3, 6
+
+ packssdw m0, m1
+ packssdw m2, m3
+%elifidn %1, ps
+ paddd m0, m6
+ paddd m1, m6
+ paddd m2, m6
+ paddd m3, m6
+ psrad m0, 2
+ psrad m1, 2
+ psrad m2, 2
+ psrad m3, 2
+
+ packssdw m0, m1
+ packssdw m2, m3
+%else
+ paddd m0, m6
+ paddd m1, m6
+ paddd m2, m6
+ paddd m3, m6
+ %ifidn %1, pp
+ psrad m0, 6
+ psrad m1, 6
+ psrad m2, 6
+ psrad m3, 6
+ %else
+ psrad m0, 10
+ psrad m1, 10
+ psrad m2, 10
+ psrad m3, 10
+ %endif
+ packssdw m0, m1
+ packssdw m2, m3
+ pxor m5, m5
+ CLIPW m0, m5, m7
+ CLIPW m2, m5, m7
+%endif
+
+ movh [r2], m0
+ movhps [r2 + r3], m0
+ lea r5, [r2 + 2 * r3]
+ movh [r5], m2
+ movhps [r5 + r3], m2
+
+ lea r5, [4 * r1 - 2 * 4]
+ sub r0, r5
+ add r2, 2 * 4
+
+ PROCESS_CHROMA_SP_W2_4R r6
+
+%ifidn %1, ss
+ psrad m0, 6
+ psrad m2, 6
+ packssdw m0, m2
+%elifidn %1, ps
+ paddd m0, m6
+ paddd m2, m6
+ psrad m0, 2
+ psrad m2, 2
+ packssdw m0, m2
+%else
+ paddd m0, m6
+ paddd m2, m6
+ %ifidn %1, pp
+ psrad m0, 6
+ psrad m2, 6
+ %else
+ psrad m0, 10
+ psrad m2, 10
+ %endif
+ packusdw m0, m2
+ CLIPW m0, m5, m7
+%endif
+
+ movd [r2], m0
+ pextrd [r2 + r3], m0, 1
+ lea r2, [r2 + 2 * r3]
+ pextrd [r2], m0, 2
+ pextrd [r2 + r3], m0, 3
+
+ sub r0, 2 * 4
+ lea r2, [r2 + 2 * r3 - 2 * 4]
+
+ dec r4d
+ jnz .loopH
+
+ RET
+%endmacro
+
+FILTER_VER_CHROMA_W6 ss, 6
+FILTER_VER_CHROMA_W6 ps, 7
+FILTER_VER_CHROMA_W6 sp, 8
+FILTER_VER_CHROMA_W6 pp, 8
+
+%macro PROCESS_CHROMA_SP_W8_2R 0 ; 4-tap vertical sums for an 8-wide, 2-row tile: taps at [r5] and [r5 + 16]; row 1 ends up in m0 (low) / m1 (high), row 2 in m2 / m3, r0 advanced by 2 rows
+ movu m1, [r0]
+ movu m3, [r0 + r1]
+ punpcklwd m0, m1, m3
+ pmaddwd m0, [r5 + 0 * 16] ;m0 = [0l+1l] Row1l
+ punpckhwd m1, m3
+ pmaddwd m1, [r5 + 0 * 16] ;m1 = [0h+1h] Row1h
+
+ movu m4, [r0 + 2 * r1]
+ punpcklwd m2, m3, m4
+ pmaddwd m2, [r5 + 0 * 16] ;m2 = [1l+2l] Row2l
+ punpckhwd m3, m4
+ pmaddwd m3, [r5 + 0 * 16] ;m3 = [1h+2h] Row2h
+
+ lea r0, [r0 + 2 * r1] ; r0 -> source row 2
+ movu m5, [r0 + r1]
+ punpcklwd m6, m4, m5
+ pmaddwd m6, [r5 + 1 * 16] ;m6 = [2l+3l] Row1l
+ paddd m0, m6 ;m0 = [0l+1l+2l+3l] Row1l sum
+ punpckhwd m4, m5
+ pmaddwd m4, [r5 + 1 * 16] ;m4 = [2h+3h] Row1h
+ paddd m1, m4 ;m1 = [0h+1h+2h+3h] Row1h sum
+
+ movu m4, [r0 + 2 * r1]
+ punpcklwd m6, m5, m4
+ pmaddwd m6, [r5 + 1 * 16] ;m6 = [3l+4l] Row2l
+ paddd m2, m6 ;m2 = [1l+2l+3l+4l] Row2l sum
+ punpckhwd m5, m4
+ pmaddwd m5, [r5 + 1 * 16] ;m5 = [3h+4h] Row2h
+ paddd m3, m5 ;m3 = [1h+2h+3h+4h] Row2h sum
+%endmacro
+
+;----------------------------------------------------------------------------------------------------------------
+; void interp_4tap_vert_%3_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;----------------------------------------------------------------------------------------------------------------
+%macro FILTER_VER_CHROMA_W8 4
+INIT_XMM sse2
+cglobal interp_4tap_vert_%3_%1x%2, 5, 6, %4
+
+ add r1d, r1d
+ add r3d, r3d
+ sub r0, r1
+ shl r4d, 5
+
+%ifdef PIC
+ lea r5, [tab_ChromaCoeffV]
+ lea r5, [r5 + r4]
+%else
+ lea r5, [tab_ChromaCoeffV + r4]
+%endif
+
+ mov r4d, %2/2
+
+%ifidn %3, pp
+ mova m7, [tab_c_32]
+%elifidn %3, sp
+ mova m7, [tab_c_524800]
+%elifidn %3, ps
+ mova m7, [tab_c_n32768]
+%endif
+
+.loopH
+ PROCESS_CHROMA_SP_W8_2R
+
+%ifidn %3, ss
+ psrad m0, 6
+ psrad m1, 6
+ psrad m2, 6
+ psrad m3, 6
+
+ packssdw m0, m1
+ packssdw m2, m3
+%elifidn %3, ps
+ paddd m0, m7
+ paddd m1, m7
+ paddd m2, m7
+ paddd m3, m7
+ psrad m0, 2
+ psrad m1, 2
+ psrad m2, 2
+ psrad m3, 2
+
+ packssdw m0, m1
+ packssdw m2, m3
+%else
+ paddd m0, m7
+ paddd m1, m7
+ paddd m2, m7
+ paddd m3, m7
+ %ifidn %3, pp
+ psrad m0, 6
+ psrad m1, 6
+ psrad m2, 6
+ psrad m3, 6
+ %else
+ psrad m0, 10
+ psrad m1, 10
+ psrad m2, 10
+ psrad m3, 10
+ %endif
+ packssdw m0, m1
+ packssdw m2, m3
+ pxor m5, m5
+ mova m6, [pw_pixel_max]
+ CLIPW m0, m5, m6
+ CLIPW m2, m5, m6
+%endif
+
+ movu [r2], m0
+ movu [r2 + r3], m2
+
+ lea r2, [r2 + 2 * r3]
+
+ dec r4d
+ jnz .loopH
+
+ RET
+%endmacro
+
+FILTER_VER_CHROMA_W8 8, 2, ss, 7
+FILTER_VER_CHROMA_W8 8, 4, ss, 7
+FILTER_VER_CHROMA_W8 8, 6, ss, 7
+FILTER_VER_CHROMA_W8 8, 8, ss, 7
+FILTER_VER_CHROMA_W8 8, 16, ss, 7
+FILTER_VER_CHROMA_W8 8, 32, ss, 7
+
+FILTER_VER_CHROMA_W8 8, 2, sp, 8
+FILTER_VER_CHROMA_W8 8, 4, sp, 8
+FILTER_VER_CHROMA_W8 8, 6, sp, 8
+FILTER_VER_CHROMA_W8 8, 8, sp, 8
+FILTER_VER_CHROMA_W8 8, 16, sp, 8
+FILTER_VER_CHROMA_W8 8, 32, sp, 8
+
+FILTER_VER_CHROMA_W8 8, 2, ps, 8
+FILTER_VER_CHROMA_W8 8, 4, ps, 8
+FILTER_VER_CHROMA_W8 8, 6, ps, 8
+FILTER_VER_CHROMA_W8 8, 8, ps, 8
+FILTER_VER_CHROMA_W8 8, 16, ps, 8
+FILTER_VER_CHROMA_W8 8, 32, ps, 8
+
+FILTER_VER_CHROMA_W8 8, 2, pp, 8
+FILTER_VER_CHROMA_W8 8, 4, pp, 8
+FILTER_VER_CHROMA_W8 8, 6, pp, 8
+FILTER_VER_CHROMA_W8 8, 8, pp, 8
+FILTER_VER_CHROMA_W8 8, 16, pp, 8
+FILTER_VER_CHROMA_W8 8, 32, pp, 8
+
+
+
+INIT_XMM sse2
+cglobal chroma_p2s, 3, 7, 3 ; r0 = src, r1 = srcStride, r2 = dst; width and height are read from r3m / r4m
+ ; Writes (src << 4) - 8192 as 16-bit words, two rows per pass, into rows spaced FENC_STRIDE / 2 * 2 bytes apart.
+ ; load width and height
+ mov r3d, r3m
+ mov r4d, r4m
+ add r1, r1 ; srcStride * 2
+
+ ; load constant
+ mova m2, [tab_c_n8192]
+
+.loopH: ; one iteration per pair of rows
+
+ xor r5d, r5d ; column index, in samples
+.loopW:
+ lea r6, [r0 + r5 * 2]
+
+ movu m0, [r6] ; 8 samples of the first row
+ psllw m0, 4
+ paddw m0, m2
+
+ movu m1, [r6 + r1] ; 8 samples of the second row
+ psllw m1, 4
+ paddw m1, m2
+
+ add r5d, 8
+ cmp r5d, r3d ; flags are consumed by jg here and by je after the stores
+ lea r6, [r2 + r5 * 2] ; dst address just past these 8 samples, hence the -16 below
+ jg .width4 ; fewer than 8 samples remain in the row
+ movu [r6 + FENC_STRIDE / 2 * 0 - 16], m0
+ movu [r6 + FENC_STRIDE / 2 * 2 - 16], m1
+ je .nextH ; row width reached exactly
+ jmp .loopW
+
+.width4:
+ test r3d, 4
+ jz .width2
+ test r3d, 2 ; result is consumed by the jz after the stores and shuffles below
+ movh [r6 + FENC_STRIDE / 2 * 0 - 16], m0 ; store 4 samples
+ movh [r6 + FENC_STRIDE / 2 * 2 - 16], m1
+ lea r6, [r6 + 8]
+ pshufd m0, m0, 2 ; move dword 2 (samples 4-5) to the low dword
+ pshufd m1, m1, 2
+ jz .nextH
+
+.width2:
+ movd [r6 + FENC_STRIDE / 2 * 0 - 16], m0 ; store 2 samples
+ movd [r6 + FENC_STRIDE / 2 * 2 - 16], m1
+
+.nextH:
+ lea r0, [r0 + r1 * 2] ; src: two rows down
+ add r2, FENC_STRIDE / 2 * 4 ; dst: two rows down
+
+ sub r4d, 2 ; NOTE(review): the loop ends only when this reaches exactly zero, so height is presumably even -- confirm with callers
+ jnz .loopH
+
+ RET
diff -r b47fc23c75df -r 36f88a9db55e source/common/x86/ipfilter8.h
--- a/source/common/x86/ipfilter8.h Wed Feb 26 03:26:00 2014 +0530
+++ b/source/common/x86/ipfilter8.h Wed Feb 26 14:18:13 2014 +0530
@@ -24,38 +24,6 @@
#ifndef X265_IPFILTER8_H
#define X265_IPFILTER8_H
-#define SETUP_CHROMA_FUNC_DEF(W, H, cpu) \
- void x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); \
- void x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
- void x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); \
- void x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx);
-
-#define CHROMA_FILTERS(cpu) \
- SETUP_CHROMA_FUNC_DEF(4, 4, cpu); \
- SETUP_CHROMA_FUNC_DEF(4, 2, cpu); \
- SETUP_CHROMA_FUNC_DEF(2, 4, cpu); \
- SETUP_CHROMA_FUNC_DEF(8, 8, cpu); \
- SETUP_CHROMA_FUNC_DEF(8, 4, cpu); \
- SETUP_CHROMA_FUNC_DEF(4, 8, cpu); \
- SETUP_CHROMA_FUNC_DEF(8, 6, cpu); \
- SETUP_CHROMA_FUNC_DEF(6, 8, cpu); \
- SETUP_CHROMA_FUNC_DEF(8, 2, cpu); \
- SETUP_CHROMA_FUNC_DEF(2, 8, cpu); \
- SETUP_CHROMA_FUNC_DEF(16, 16, cpu); \
- SETUP_CHROMA_FUNC_DEF(16, 8, cpu); \
- SETUP_CHROMA_FUNC_DEF(8, 16, cpu); \
- SETUP_CHROMA_FUNC_DEF(16, 12, cpu); \
- SETUP_CHROMA_FUNC_DEF(12, 16, cpu); \
- SETUP_CHROMA_FUNC_DEF(16, 4, cpu); \
- SETUP_CHROMA_FUNC_DEF(4, 16, cpu); \
- SETUP_CHROMA_FUNC_DEF(32, 32, cpu); \
- SETUP_CHROMA_FUNC_DEF(32, 16, cpu); \
- SETUP_CHROMA_FUNC_DEF(16, 32, cpu); \
- SETUP_CHROMA_FUNC_DEF(32, 24, cpu); \
- SETUP_CHROMA_FUNC_DEF(24, 32, cpu); \
- SETUP_CHROMA_FUNC_DEF(32, 8, cpu); \
- SETUP_CHROMA_FUNC_DEF(8, 32, cpu)
-
#define SETUP_LUMA_FUNC_DEF(W, H, cpu) \
void x265_interp_8tap_horiz_pp_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); \
void x265_interp_8tap_horiz_ps_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
@@ -149,6 +117,119 @@
SETUP_LUMA_SS_FUNC_DEF(64, 16, cpu); \
SETUP_LUMA_SS_FUNC_DEF(16, 64, cpu);
+#if HIGH_BIT_DEPTH
+// HIGH_BIT_DEPTH build: per-size prototypes for the four 4-tap vertical chroma filters (ss, sp, pp, ps).
+#define SETUP_CHROMA_VERT_FUNC_DEF(W, H, cpu) \
+ void x265_interp_4tap_vert_ss_ ## W ## x ## H ## cpu(int16_t * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx); \
+ void x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu(int16_t * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); \
+ void x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); \
+ void x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx);
+// Block sizes for which the vertical filters are declared; instantiated below with the _sse2 suffix.
+#define CHROMA_VERT_FILTERS(cpu) \
+ SETUP_CHROMA_VERT_FUNC_DEF(4, 4, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(4, 2, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(8, 8, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(8, 4, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(4, 8, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(8, 6, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(8, 2, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(16, 16, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(16, 8, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(8, 16, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(16, 12, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(12, 16, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(16, 4, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(4, 16, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(32, 32, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(32, 16, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(16, 32, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(32, 24, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(24, 32, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(32, 8, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(8, 32, cpu)
+// Additional 2- and 6-wide vertical sizes, kept separate; instantiated below with the _sse4 suffix.
+#define CHROMA_VERT_FILTERS_SSE4(cpu) \
+ SETUP_CHROMA_VERT_FUNC_DEF(2, 4, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(2, 8, cpu); \
+ SETUP_CHROMA_VERT_FUNC_DEF(6, 8, cpu);
+// Per-size prototypes for the two 4-tap horizontal chroma filters (pp, ps); ps takes an extra isRowExt argument.
+#define SETUP_CHROMA_HORIZ_FUNC_DEF(W, H, cpu) \
+ void x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); \
+ void x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+// Block sizes for which the horizontal filters are declared; instantiated below with the _sse4 suffix.
+#define CHROMA_HORIZ_FILTERS(cpu) \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(4, 4, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(4, 2, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(2, 4, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(8, 8, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(8, 4, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(4, 8, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(8, 6, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(6, 8, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(8, 2, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(2, 8, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(16, 16, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(16, 8, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(8, 16, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(16, 12, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(12, 16, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(16, 4, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(4, 16, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(32, 32, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(32, 16, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(16, 32, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(32, 24, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(24, 32, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(32, 8, cpu); \
+ SETUP_CHROMA_HORIZ_FUNC_DEF(8, 32, cpu)
+// NOTE(review): "p2s" presumably stands for pixel-to-short (pixel source, int16_t destination) - confirm.
+void x265_chroma_p2s_sse2(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
+// Emit the prototypes: vertical filters as _sse2 (plus the narrow _sse4 sizes), horizontal filters as _sse4.
+CHROMA_VERT_FILTERS(_sse2);
+CHROMA_HORIZ_FILTERS(_sse4);
+CHROMA_VERT_FILTERS_SSE4(_sse4);
+// The helper macros are only needed to emit the prototypes above; undefine them again.
+#undef CHROMA_VERT_FILTERS_SSE4
+#undef CHROMA_VERT_FILTERS
+#undef SETUP_CHROMA_VERT_FUNC_DEF
+#undef CHROMA_HORIZ_FILTERS
+#undef SETUP_CHROMA_HORIZ_FUNC_DEF
+
+#else
+// Non-HIGH_BIT_DEPTH build: per-size prototypes for the 4-tap horizontal and vertical pp/ps chroma filters.
+#define SETUP_CHROMA_FUNC_DEF(W, H, cpu) \
+ void x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); \
+ void x265_interp_4tap_horiz_ps_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx, int isRowExt); \
+ void x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx); \
+ void x265_interp_4tap_vert_ps_ ## W ## x ## H ## cpu(pixel * src, intptr_t srcStride, int16_t * dst, intptr_t dstStride, int coeffIdx);
+// Block sizes for which SETUP_CHROMA_FUNC_DEF declares its four prototypes.
+#define CHROMA_FILTERS(cpu) \
+ SETUP_CHROMA_FUNC_DEF(4, 4, cpu); \
+ SETUP_CHROMA_FUNC_DEF(4, 2, cpu); \
+ SETUP_CHROMA_FUNC_DEF(2, 4, cpu); \
+ SETUP_CHROMA_FUNC_DEF(8, 8, cpu); \
+ SETUP_CHROMA_FUNC_DEF(8, 4, cpu); \
+ SETUP_CHROMA_FUNC_DEF(4, 8, cpu); \
+ SETUP_CHROMA_FUNC_DEF(8, 6, cpu); \
+ SETUP_CHROMA_FUNC_DEF(6, 8, cpu); \
+ SETUP_CHROMA_FUNC_DEF(8, 2, cpu); \
+ SETUP_CHROMA_FUNC_DEF(2, 8, cpu); \
+ SETUP_CHROMA_FUNC_DEF(16, 16, cpu); \
+ SETUP_CHROMA_FUNC_DEF(16, 8, cpu); \
+ SETUP_CHROMA_FUNC_DEF(8, 16, cpu); \
+ SETUP_CHROMA_FUNC_DEF(16, 12, cpu); \
+ SETUP_CHROMA_FUNC_DEF(12, 16, cpu); \
+ SETUP_CHROMA_FUNC_DEF(16, 4, cpu); \
+ SETUP_CHROMA_FUNC_DEF(4, 16, cpu); \
+ SETUP_CHROMA_FUNC_DEF(32, 32, cpu); \
+ SETUP_CHROMA_FUNC_DEF(32, 16, cpu); \
+ SETUP_CHROMA_FUNC_DEF(16, 32, cpu); \
+ SETUP_CHROMA_FUNC_DEF(32, 24, cpu); \
+ SETUP_CHROMA_FUNC_DEF(24, 32, cpu); \
+ SETUP_CHROMA_FUNC_DEF(32, 8, cpu); \
+ SETUP_CHROMA_FUNC_DEF(8, 32, cpu)
+
+
#define SETUP_CHROMA_SP_FUNC_DEF(W, H, cpu) \
void x265_interp_4tap_vert_sp_ ## W ## x ## H ## cpu(int16_t * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int coeffIdx);
@@ -216,28 +297,32 @@
CHROMA_SP_FILTERS_SSE4(_sse4);
CHROMA_SS_FILTERS(_sse2);
CHROMA_SS_FILTERS_SSE4(_sse4);
+
+void x265_chroma_p2s_ssse3(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
+void x265_chroma_p2s_i444_ssse3(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
+
+#undef SETUP_CHROMA_FUNC_DEF
+#undef SETUP_CHROMA_SP_FUNC_DEF
+#undef SETUP_CHROMA_SS_FUNC_DEF
+#undef CHROMA_FILTERS
+#undef CHROMA_SP_FILTERS
+#undef CHROMA_SS_FILTERS
+#undef CHROMA_SS_FILTERS_SSE4
+#undef CHROMA_SP_FILTERS_SSE4
+#endif
+
LUMA_FILTERS(_sse4);
LUMA_SP_FILTERS(_sse4);
LUMA_SS_FILTERS(_sse2);
void x265_interp_8tap_hv_pp_8x8_ssse3(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int idxX, int idxY);
void x265_luma_p2s_ssse3(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
-void x265_chroma_p2s_ssse3(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
-void x265_chroma_p2s_i444_ssse3(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height);
-#undef SETUP_CHROMA_FUNC_DEF
-#undef SETUP_CHROMA_SP_FUNC_DEF
-#undef SETUP_CHROMA_SS_FUNC_DEF
+#undef LUMA_FILTERS
+#undef LUMA_SP_FILTERS
+#undef LUMA_SS_FILTERS
#undef SETUP_LUMA_FUNC_DEF
#undef SETUP_LUMA_SP_FUNC_DEF
#undef SETUP_LUMA_SS_FUNC_DEF
-#undef CHROMA_FILTERS
-#undef CHROMA_SP_FILTERS
-#undef CHROMA_SS_FILTERS
-#undef LUMA_FILTERS
-#undef LUMA_SP_FILTERS
-#undef LUMA_SS_FILTERS
-#undef CHROMA_SS_FILTERS_SSE4
-#undef CHROMA_SP_FILTERS_SSE4
#endif // ifndef X265_MC_H
diff -r b47fc23c75df -r 36f88a9db55e source/test/ipfilterharness.cpp
--- a/source/test/ipfilterharness.cpp Wed Feb 26 03:26:00 2014 +0530
+++ b/source/test/ipfilterharness.cpp Wed Feb 26 14:18:13 2014 +0530
@@ -169,7 +169,7 @@
for (int coeffIdx = 0; coeffIdx < 8; coeffIdx++)
{
- rand_srcStride = rand() % 100; // Randomly generated srcStride
+ rand_srcStride = rand() % 100 + 2; // Randomly generated srcStride
rand_dstStride = rand() % 100 + 32; // Randomly generated dstStride
opt(pixel_test_buff[index] + 3 * rand_srcStride,
@@ -237,7 +237,7 @@
{
for (int isRowExt = 0; isRowExt < 2; isRowExt++) // 0 : Interpolate W x H, 1 : Interpolate W x (H + 7)
{
- rand_srcStride = rand() % 100; // Randomly generated srcStride
+ rand_srcStride = rand() % 100 + 2; // Randomly generated srcStride
rand_dstStride = rand() % 100; // Randomly generated dstStride
ref(pixel_test_buff[index] + 3 * rand_srcStride,
More information about the x265-devel
mailing list