[x265] [PATCH] asm: interp_8tap_vert_pX sse2
David T Yuen
dtyx265 at gmail.com
Wed May 27 03:44:16 CEST 2015
# HG changeset patch
# User David T Yuen <dtyx265 at gmail.com>
# Date 1432691029 25200
# Node ID 20df9b085d013253edf8d57e10d8eb1630d9927a
# Parent 8ddc790790a46de9ceadea388f6271acdb3012ed
asm: interp_8tap_vert_pX sse2
This replaces c code for all of interp_8tap_vert pp and ps.
64-bit
./test/TestBench --testbench interp | grep luma_vp
luma_vpp[ 8x8] 4.05x 3365.34 13642.61
luma_vps[ 8x8] 3.39x 3405.18 11557.50
luma_vpp[16x16] 4.44x 13734.67 60968.89
luma_vps[16x16] 4.66x 12885.35 60093.25
luma_vpp[32x32] 4.38x 53248.80 233228.12
luma_vps[32x32] 4.52x 51850.27 234333.50
luma_vpp[64x64] 4.38x 211368.58 925816.75
luma_vps[64x64] 4.53x 205176.34 928840.56
luma_vpp[ 8x4] 3.95x 1794.99 7098.63
luma_vps[ 8x4] 3.60x 1682.49 6050.91
luma_vpp[ 4x8] 3.95x 1825.61 7205.84
luma_vps[ 4x8] 3.57x 1672.88 5971.07
luma_vpp[ 16x8] 4.58x 6665.79 30530.09
luma_vps[ 16x8] 4.71x 6485.83 30546.27
luma_vpp[ 8x16] 4.11x 6645.59 27331.23
luma_vps[ 8x16] 3.41x 6727.70 22927.42
luma_vpp[32x16] 4.43x 26393.79 116796.19
luma_vps[32x16] 4.57x 25664.11 117334.80
luma_vpp[16x32] 4.44x 27290.70 121266.76
luma_vps[16x32] 4.69x 25765.72 120752.98
luma_vpp[64x32] 4.36x 106282.16 463708.72
luma_vps[64x32] 4.57x 102739.57 469175.25
luma_vpp[32x64] 4.46x 105607.29 471000.06
luma_vps[32x64] 4.56x 102941.20 468978.53
luma_vpp[16x12] 4.58x 9966.30 45611.04
luma_vps[16x12] 4.66x 9685.46 45172.76
luma_vpp[12x16] 4.59x 10265.67 47102.20
luma_vps[12x16] 3.47x 9785.28 34002.64
luma_vpp[ 16x4] 4.57x 3499.96 16002.48
luma_vps[ 16x4] 4.50x 3394.99 15282.42
luma_vpp[ 4x16] 4.11x 3585.63 14732.50
luma_vps[ 4x16] 3.63x 3261.74 11837.95
luma_vpp[32x24] 4.43x 39517.40 175126.44
luma_vps[32x24] 4.50x 39051.13 175843.83
luma_vpp[24x32] 4.34x 40722.79 176931.92
luma_vps[24x32] 4.64x 38717.61 179836.58
luma_vpp[ 32x8] 4.45x 13204.99 58768.52
luma_vps[ 32x8] 4.59x 12835.27 58936.89
luma_vpp[ 8x32] 4.08x 13205.08 53922.46
luma_vps[ 8x32] 3.52x 12885.64 45357.47
luma_vpp[64x48] 4.38x 158803.11 695535.19
luma_vps[64x48] 4.50x 153975.67 692405.81
luma_vpp[48x64] 4.37x 158383.92 692080.56
luma_vps[48x64] 4.51x 154145.81 694486.56
luma_vpp[64x16] 4.27x 53708.59 229365.14
luma_vps[64x16] 4.45x 51893.83 230883.55
luma_vpp[16x64] 4.49x 54126.62 242815.41
luma_vps[16x64] 4.67x 51715.24 241677.61
diff -r 8ddc790790a4 -r 20df9b085d01 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue May 26 10:33:56 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue May 26 18:43:49 2015 -0700
@@ -1576,7 +1576,8 @@
ALL_CHROMA_420_PU(filter_vps, interp_4tap_vert_ps, sse2);
ALL_CHROMA_422_PU(filter_vps, interp_4tap_vert_ps, sse2);
ALL_CHROMA_444_PU(filter_vps, interp_4tap_vert_ps, sse2);
-
+ ALL_LUMA_PU(luma_vpp, interp_8tap_vert_pp, sse2);
+ ALL_LUMA_PU(luma_vps, interp_8tap_vert_ps, sse2);
#else
p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].filter_vpp = x265_interp_4tap_vert_pp_2x4_sse2;
p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].filter_vpp = x265_interp_4tap_vert_pp_2x8_sse2;
diff -r 8ddc790790a4 -r 20df9b085d01 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Tue May 26 10:33:56 2015 +0530
+++ b/source/common/x86/ipfilter8.asm Tue May 26 18:43:49 2015 -0700
@@ -985,6 +985,365 @@
IPFILTER_LUMA_sse2 64, 16, ps
IPFILTER_LUMA_sse2 16, 64, ps
+%macro PROCESS_LUMA_W4_4R_sse2 0
+ movd m2, [r0]
+ movd m7, [r0 + r1]
+ punpcklbw m2, m7 ; m2=[0 1]
+
+ lea r0, [r0 + 2 * r1]
+ movd m3, [r0]
+ punpcklbw m7, m3 ; m7=[1 2]
+ punpcklbw m2, m0
+ punpcklbw m7, m0
+ pmaddwd m2, [r6 + 0 * 32]
+ pmaddwd m7, [r6 + 0 * 32]
+ packssdw m2, m7 ; m2=[0+1 1+2]
+
+ movd m7, [r0 + r1]
+ punpcklbw m3, m7 ; m3=[2 3]
+ lea r0, [r0 + 2 * r1]
+ movd m5, [r0]
+ punpcklbw m7, m5 ; m7=[3 4]
+ punpcklbw m3, m0
+ punpcklbw m7, m0
+ pmaddwd m4, m3, [r6 + 1 * 32]
+ pmaddwd m6, m7, [r6 + 1 * 32]
+ packssdw m4, m6 ; m4=[2+3 3+4]
+ paddw m2, m4 ; m2=[0+1+2+3 1+2+3+4] Row1-2
+ pmaddwd m3, [r6 + 0 * 32]
+ pmaddwd m7, [r6 + 0 * 32]
+ packssdw m3, m7 ; m3=[2+3 3+4] Row3-4
+
+ movd m7, [r0 + r1]
+ punpcklbw m5, m7 ; m5=[4 5]
+ lea r0, [r0 + 2 * r1]
+ movd m4, [r0]
+ punpcklbw m7, m4 ; m7=[5 6]
+ punpcklbw m5, m0
+ punpcklbw m7, m0
+ pmaddwd m6, m5, [r6 + 2 * 32]
+ pmaddwd m8, m7, [r6 + 2 * 32]
+ packssdw m6, m8 ; m6=[4+5 5+6]
+ paddw m2, m6 ; m2=[0+1+2+3+4+5 1+2+3+4+5+6] Row1-2
+ pmaddwd m5, [r6 + 1 * 32]
+ pmaddwd m7, [r6 + 1 * 32]
+ packssdw m5, m7 ; m5=[4+5 5+6]
+ paddw m3, m5 ; m3=[2+3+4+5 3+4+5+6] Row3-4
+
+ movd m7, [r0 + r1]
+ punpcklbw m4, m7 ; m4=[6 7]
+ lea r0, [r0 + 2 * r1]
+ movd m5, [r0]
+ punpcklbw m7, m5 ; m7=[7 8]
+ punpcklbw m4, m0
+ punpcklbw m7, m0
+ pmaddwd m6, m4, [r6 + 3 * 32]
+ pmaddwd m8, m7, [r6 + 3 * 32]
+ packssdw m6, m8 ; m7=[6+7 7+8]
+ paddw m2, m6 ; m2=[0+1+2+3+4+5+6+7 1+2+3+4+5+6+7+8] Row1-2 end
+ pmaddwd m4, [r6 + 2 * 32]
+ pmaddwd m7, [r6 + 2 * 32]
+ packssdw m4, m7 ; m4=[6+7 7+8]
+ paddw m3, m4 ; m3=[2+3+4+5+6+7 3+4+5+6+7+8] Row3-4
+
+ movd m7, [r0 + r1]
+ punpcklbw m5, m7 ; m5=[8 9]
+ movd m4, [r0 + 2 * r1]
+ punpcklbw m7, m4 ; m7=[9 10]
+ punpcklbw m5, m0
+ punpcklbw m7, m0
+ pmaddwd m5, [r6 + 3 * 32]
+ pmaddwd m7, [r6 + 3 * 32]
+ packssdw m5, m7 ; m5=[8+9 9+10]
+ paddw m3, m5 ; m3=[2+3+4+5+6+7+8+9 3+4+5+6+7+8+9+10] Row3-4 end
+%endmacro
+
+%macro PROCESS_LUMA_W8_4R_sse2 0
+ movq m7, [r0]
+ movq m6, [r0 + r1]
+ punpcklbw m7, m6
+ punpcklbw m2, m7, m0
+ punpckhbw m7, m0
+ pmaddwd m2, [r6 + 0 * 32]
+ pmaddwd m7, [r6 + 0 * 32]
+ packssdw m2, m7 ; m2=[0+1] Row1
+
+ lea r0, [r0 + 2 * r1]
+ movq m7, [r0]
+ punpcklbw m6, m7
+ punpcklbw m3, m6, m0
+ punpckhbw m6, m0
+ pmaddwd m3, [r6 + 0 * 32]
+ pmaddwd m6, [r6 + 0 * 32]
+ packssdw m3, m6 ; m3=[1+2] Row2
+
+ movq m6, [r0 + r1]
+ punpcklbw m7, m6
+ punpckhbw m8, m7, m0
+ punpcklbw m7, m0
+ pmaddwd m4, m7, [r6 + 0 * 32]
+ pmaddwd m9, m8, [r6 + 0 * 32]
+ packssdw m4, m9 ; m4=[2+3] Row3
+ pmaddwd m7, [r6 + 1 * 32]
+ pmaddwd m8, [r6 + 1 * 32]
+ packssdw m7, m8
+ paddw m2, m7 ; m2=[0+1+2+3] Row1
+
+ lea r0, [r0 + 2 * r1]
+ movq m10, [r0]
+ punpcklbw m6, m10
+ punpckhbw m8, m6, m0
+ punpcklbw m6, m0
+ pmaddwd m5, m6, [r6 + 0 * 32]
+ pmaddwd m9, m8, [r6 + 0 * 32]
+ packssdw m5, m9 ; m5=[3+4] Row4
+ pmaddwd m6, [r6 + 1 * 32]
+ pmaddwd m8, [r6 + 1 * 32]
+ packssdw m6, m8
+ paddw m3, m6 ; m3 = [1+2+3+4] Row2
+
+ movq m6, [r0 + r1]
+ punpcklbw m10, m6
+ punpckhbw m8, m10, m0
+ punpcklbw m10, m0
+ pmaddwd m7, m10, [r6 + 1 * 32]
+ pmaddwd m9, m8, [r6 + 1 * 32]
+ packssdw m7, m9
+ pmaddwd m10, [r6 + 2 * 32]
+ pmaddwd m8, [r6 + 2 * 32]
+ packssdw m10, m8
+ paddw m2, m10 ; m2=[0+1+2+3+4+5] Row1
+ paddw m4, m7 ; m4=[2+3+4+5] Row3
+
+ lea r0, [r0 + 2 * r1]
+ movq m10, [r0]
+ punpcklbw m6, m10
+ punpckhbw m8, m6, m0
+ punpcklbw m6, m0
+ pmaddwd m7, m6, [r6 + 1 * 32]
+ pmaddwd m9, m8, [r6 + 1 * 32]
+ packssdw m7, m9
+ pmaddwd m6, [r6 + 2 * 32]
+ pmaddwd m8, [r6 + 2 * 32]
+ packssdw m6, m8
+ paddw m3, m6 ; m3=[1+2+3+4+5+6] Row2
+ paddw m5, m7 ; m5=[3+4+5+6] Row4
+
+ movq m6, [r0 + r1]
+ punpcklbw m10, m6
+ punpckhbw m8, m10, m0
+ punpcklbw m10, m0
+ pmaddwd m7, m10, [r6 + 2 * 32]
+ pmaddwd m9, m8, [r6 + 2 * 32]
+ packssdw m7, m9
+ pmaddwd m10, [r6 + 3 * 32]
+ pmaddwd m8, [r6 + 3 * 32]
+ packssdw m10, m8
+ paddw m2, m10 ; m2=[0+1+2+3+4+5+6+7] Row1 end
+ paddw m4, m7 ; m4=[2+3+4+5+6+7] Row3
+
+ lea r0, [r0 + 2 * r1]
+ movq m10, [r0]
+ punpcklbw m6, m10
+ punpckhbw m8, m6, m0
+ punpcklbw m6, m0
+ pmaddwd m7, m6, [r6 + 2 * 32]
+ pmaddwd m9, m8, [r6 + 2 * 32]
+ packssdw m7, m9
+ pmaddwd m6, [r6 + 3 * 32]
+ pmaddwd m8, [r6 + 3 * 32]
+ packssdw m6, m8
+ paddw m3, m6 ; m3=[1+2+3+4+5+6+7+8] Row2 end
+ paddw m5, m7 ; m5=[3+4+5+6+7+8] Row4
+
+ movq m6, [r0 + r1]
+ punpcklbw m10, m6
+ punpckhbw m8, m10, m0
+ punpcklbw m10, m0
+ pmaddwd m8, [r6 + 3 * 32]
+ pmaddwd m10, [r6 + 3 * 32]
+ packssdw m10, m8
+ paddw m4, m10 ; m4=[2+3+4+5+6+7+8+9] Row3 end
+
+ movq m10, [r0 + 2 * r1]
+ punpcklbw m6, m10
+ punpckhbw m8, m6, m0
+ punpcklbw m6, m0
+ pmaddwd m8, [r6 + 3 * 32]
+ pmaddwd m6, [r6 + 3 * 32]
+ packssdw m6, m8
+ paddw m5, m6 ; m5=[3+4+5+6+7+8+9+10] Row4 end
+%endmacro
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_%3_4x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+%macro FILTER_VER_LUMA_sse2 3
+INIT_XMM sse2
+cglobal interp_8tap_vert_%3_%1x%2, 5, 8, 11
+ lea r5, [3 * r1]
+ sub r0, r5
+ shl r4d, 7
+
+%ifdef PIC
+ lea r6, [pw_LumaCoeffVer]
+ add r6, r4
+%else
+ lea r6, [pw_LumaCoeffVer + r4]
+%endif
+
+%ifidn %3,pp
+ mova m1, [pw_32]
+%else
+ mova m1, [pw_2000]
+ add r3d, r3d
+%endif
+
+ mov r4d, %2/4
+ lea r5, [3 * r3]
+ pxor m0, m0
+
+.loopH:
+%assign x 0
+%rep (%1 / 8)
+ PROCESS_LUMA_W8_4R_sse2
+
+%ifidn %3,pp
+ paddw m2, m1
+ paddw m3, m1
+ paddw m4, m1
+ paddw m5, m1
+ psraw m2, 6
+ psraw m3, 6
+ psraw m4, 6
+ psraw m5, 6
+
+ packuswb m2, m3
+ packuswb m4, m5
+
+ movh [r2 + x], m2
+ movhps [r2 + r3 + x], m2
+ movh [r2 + 2 * r3 + x], m4
+ movhps [r2 + r5 + x], m4
+%else
+ psubw m2, m1
+ psubw m3, m1
+ psubw m4, m1
+ psubw m5, m1
+
+ movu [r2 + (2*x)], m2
+ movu [r2 + r3 + (2*x)], m3
+ movu [r2 + 2 * r3 + (2*x)], m4
+ movu [r2 + r5 + (2*x)], m5
+%endif
+%assign x x+8
+%if %1 > 8
+ lea r7, [8 * r1 - 8]
+ sub r0, r7
+%endif
+%endrep
+
+%rep (%1 % 8)/4
+ PROCESS_LUMA_W4_4R_sse2
+
+%ifidn %3,pp
+ paddw m2, m1
+ psraw m2, 6
+ paddw m3, m1
+ psraw m3, 6
+
+ packuswb m2, m3
+
+ movd [r2 + x], m2
+ psrldq m2, 4
+ movd [r2 + r3 + x], m2
+ psrldq m2, 4
+ movd [r2 + 2 * r3 + x], m2
+ psrldq m2, 4
+ movd [r2 + r5 + x], m2
+%else
+ psubw m2, m1
+ psubw m3, m1
+
+ movh [r2 + (2*x)], m2
+ movhps [r2 + r3 + (2*x)], m2
+ movh [r2 + 2 * r3 + (2*x)], m3
+ movhps [r2 + r5 + (2*x)], m3
+%endif
+%endrep
+
+ lea r2, [r2 + 4 * r3]
+%if %1 <= 8
+ lea r7, [4 * r1]
+ sub r0, r7
+%elif %1 == 12
+ lea r7, [4 * r1 + 8]
+ sub r0, r7
+%else
+ lea r0, [r0 + 4 * r1 - %1]
+%endif
+
+ dec r4d
+ jnz .loopH
+
+ RET
+
+%endmacro
+
+%if ARCH_X86_64
+ FILTER_VER_LUMA_sse2 4, 4, pp
+ FILTER_VER_LUMA_sse2 4, 8, pp
+ FILTER_VER_LUMA_sse2 4, 16, pp
+ FILTER_VER_LUMA_sse2 8, 4, pp
+ FILTER_VER_LUMA_sse2 8, 8, pp
+ FILTER_VER_LUMA_sse2 8, 16, pp
+ FILTER_VER_LUMA_sse2 8, 32, pp
+ FILTER_VER_LUMA_sse2 12, 16, pp
+ FILTER_VER_LUMA_sse2 16, 4, pp
+ FILTER_VER_LUMA_sse2 16, 8, pp
+ FILTER_VER_LUMA_sse2 16, 12, pp
+ FILTER_VER_LUMA_sse2 16, 16, pp
+ FILTER_VER_LUMA_sse2 16, 32, pp
+ FILTER_VER_LUMA_sse2 16, 64, pp
+ FILTER_VER_LUMA_sse2 24, 32, pp
+ FILTER_VER_LUMA_sse2 32, 8, pp
+ FILTER_VER_LUMA_sse2 32, 16, pp
+ FILTER_VER_LUMA_sse2 32, 24, pp
+ FILTER_VER_LUMA_sse2 32, 32, pp
+ FILTER_VER_LUMA_sse2 32, 64, pp
+ FILTER_VER_LUMA_sse2 48, 64, pp
+ FILTER_VER_LUMA_sse2 64, 16, pp
+ FILTER_VER_LUMA_sse2 64, 32, pp
+ FILTER_VER_LUMA_sse2 64, 48, pp
+ FILTER_VER_LUMA_sse2 64, 64, pp
+
+ FILTER_VER_LUMA_sse2 4, 4, ps
+ FILTER_VER_LUMA_sse2 4, 8, ps
+ FILTER_VER_LUMA_sse2 4, 16, ps
+ FILTER_VER_LUMA_sse2 8, 4, ps
+ FILTER_VER_LUMA_sse2 8, 8, ps
+ FILTER_VER_LUMA_sse2 8, 16, ps
+ FILTER_VER_LUMA_sse2 8, 32, ps
+ FILTER_VER_LUMA_sse2 12, 16, ps
+ FILTER_VER_LUMA_sse2 16, 4, ps
+ FILTER_VER_LUMA_sse2 16, 8, ps
+ FILTER_VER_LUMA_sse2 16, 12, ps
+ FILTER_VER_LUMA_sse2 16, 16, ps
+ FILTER_VER_LUMA_sse2 16, 32, ps
+ FILTER_VER_LUMA_sse2 16, 64, ps
+ FILTER_VER_LUMA_sse2 24, 32, ps
+ FILTER_VER_LUMA_sse2 32, 8, ps
+ FILTER_VER_LUMA_sse2 32, 16, ps
+ FILTER_VER_LUMA_sse2 32, 24, ps
+ FILTER_VER_LUMA_sse2 32, 32, ps
+ FILTER_VER_LUMA_sse2 32, 64, ps
+ FILTER_VER_LUMA_sse2 48, 64, ps
+ FILTER_VER_LUMA_sse2 64, 16, ps
+ FILTER_VER_LUMA_sse2 64, 32, ps
+ FILTER_VER_LUMA_sse2 64, 48, ps
+ FILTER_VER_LUMA_sse2 64, 64, ps
+%endif
+
%macro WORD_TO_DOUBLE 1
%if ARCH_X86_64
punpcklbw %1, m8
diff -r 8ddc790790a4 -r 20df9b085d01 source/common/x86/ipfilter8.h
--- a/source/common/x86/ipfilter8.h Tue May 26 10:33:56 2015 +0530
+++ b/source/common/x86/ipfilter8.h Tue May 26 18:43:49 2015 -0700
@@ -1027,6 +1027,56 @@
void x265_interp_4tap_vert_ps_64x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
void x265_interp_4tap_vert_ps_64x48_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
void x265_interp_4tap_vert_ps_64x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_4x4_sse2(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_4x8_sse2(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_4x16_sse2(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_8x4_sse2(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_8x8_sse2(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_8x16_sse2(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_8x32_sse2(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_12x16_sse2(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_16x4_sse2(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_16x8_sse2(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_16x12_sse2(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_16x16_sse2(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_16x32_sse2(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_16x64_sse2(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_24x32_sse2(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_32x8_sse2(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_32x16_sse2(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_32x24_sse2(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_32x32_sse2(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_32x64_sse2(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_48x64_sse2(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_64x16_sse2(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_64x32_sse2(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_64x48_sse2(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_pp_64x64_sse2(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_4x4_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_4x8_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_4x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_8x4_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_8x8_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_8x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_8x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_12x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_16x4_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_16x8_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_16x12_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_16x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_16x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_16x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_24x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_32x8_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_32x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_32x24_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_32x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_32x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_48x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_64x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_64x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_64x48_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_vert_ps_64x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
#endif
#undef LUMA_FILTERS
#undef LUMA_SP_FILTERS
More information about the x265-devel
mailing list