[x265] [PATCH] asm: interp_4tap_horiz_pX sse3 10-bit
dtyx265 at gmail.com
dtyx265 at gmail.com
Tue Jun 2 17:59:35 CEST 2015
# HG changeset patch
# User David T Yuen <dtyx265 at gmail.com>
# Date 1433260747 25200
# Node ID fcfba27ecf0b9dac8da123da8cdcac75763496f3
# Parent 0f0d88319f7cc96661eef3c3dcc1befcf60354f3
asm: interp_4tap_horiz_pX sse3 10-bit
This replaces the C code for all of the 4tap_horiz pp and ps primitives.
64-bit
./test/TestBench --testbench interp | grep chroma_hp;bp
chroma_hpp[ 4x4] 2.23x 854.98 1902.48
chroma_hps[ 4x4] 2.05x 1354.99 2778.18
chroma_hpp[ 8x8] 2.57x 3235.00 8320.13
chroma_hps[ 8x8] 2.14x 3982.52 8520.44
chroma_hpp[16x16] 2.58x 12467.41 32145.56
chroma_hps[16x16] 2.08x 13478.99 28010.00
chroma_hpp[32x32] 2.85x 49318.75 140794.05
chroma_hps[32x32] 5.19x 49005.44 254361.41
chroma_hpp[ 4x2] 2.22x 457.50 1014.99
chroma_hps[ 4x2] 2.01x 1004.99 2020.98
chroma_hpp[ 2x4] 2.12x 512.50 1087.49
chroma_hps[ 2x4] 1.85x 820.00 1520.58
chroma_hpp[ 8x4] 2.55x 1655.00 4225.89
chroma_hps[ 8x4] 2.09x 2572.48 5365.30
chroma_hpp[ 4x8] 2.56x 1625.01 4167.64
chroma_hps[ 4x8] 2.16x 2057.50 4449.99
chroma_hpp[ 16x8] 2.56x 6275.00 16070.69
chroma_hps[ 16x8] 2.11x 7840.00 16520.78
chroma_hpp[ 8x16] 2.62x 6395.00 16779.55
chroma_hps[ 8x16] 2.13x 6832.50 14520.42
chroma_hpp[32x16] 2.84x 24707.68 70049.97
chroma_hps[32x16] 5.19x 26659.50 138467.00
chroma_hpp[16x32] 2.58x 24917.14 64318.72
chroma_hps[16x32] 2.07x 24819.08 51370.22
chroma_hpp[ 8x6] 2.57x 2450.00 6293.12
chroma_hps[ 8x6] 2.10x 3282.56 6900.30
chroma_hpp[ 6x8] 2.57x 2429.24 6240.71
chroma_hps[ 6x8] 2.07x 3160.13 6550.00
chroma_hpp[ 8x2] 2.19x 872.52 1909.97
chroma_hps[ 8x2] 2.06x 1877.50 3865.74
chroma_hpp[ 2x8] 2.16x 927.49 2005.01
chroma_hps[ 2x8] 2.01x 1227.50 2470.27
chroma_hpp[16x12] 2.58x 9371.36 24152.49
chroma_hps[16x12] 2.10x 10665.00 22398.23
chroma_hpp[12x16] 2.65x 9215.00 24465.62
chroma_hps[12x16] 2.13x 10122.57 21550.70
chroma_hpp[ 16x4] 2.54x 3170.00 8065.96
chroma_hps[ 16x4] 2.06x 5020.00 10350.29
chroma_hpp[ 4x16] 2.60x 3175.00 8270.00
chroma_hps[ 4x16] 2.17x 3450.00 7490.13
chroma_hpp[32x24] 2.85x 36976.00 105213.45
chroma_hps[32x24] 5.27x 37801.41 199137.09
chroma_hpp[24x32] 2.88x 37093.77 106880.20
chroma_hps[24x32] 5.24x 36958.88 193609.19
chroma_hpp[ 32x8] 2.84x 12360.07 35049.98
chroma_hps[ 32x8] 5.24x 15477.46 81162.80
chroma_hpp[ 8x32] 2.59x 12721.46 32910.11
chroma_hps[ 8x32] 2.12x 12527.55 26520.00
chroma_hpp[ 4x8] 2.56x 1627.50 4165.00
chroma_hps[ 4x8] 2.16x 2057.50 4449.98
chroma_hpp[ 8x16] 2.59x 6405.00 16590.00
chroma_hps[ 8x16] 2.12x 6847.50 14520.00
chroma_hpp[16x32] 2.58x 24927.11 64341.25
chroma_hps[16x32] 2.07x 24812.88 51371.34
chroma_hpp[32x64] 2.86x 98547.48 281379.03
chroma_hps[32x64] 5.26x 93765.83 493587.72
chroma_hpp[ 4x4] 2.21x 859.99 1900.00
chroma_hps[ 4x4] 2.06x 1350.00 2780.58
chroma_hpp[ 2x8] 2.16x 927.50 2007.47
chroma_hps[ 2x8] 2.01x 1227.50 2469.99
chroma_hpp[ 8x8] 2.57x 3240.00 8317.64
chroma_hps[ 8x8] 2.13x 3992.50 8520.16
chroma_hpp[ 4x16] 2.60x 3180.10 8270.48
chroma_hps[ 4x16] 2.17x 3457.50 7490.13
chroma_hpp[16x16] 2.58x 12480.00 32143.02
chroma_hps[16x16] 2.08x 13485.08 28010.00
chroma_hpp[ 8x32] 2.59x 12722.10 32910.00
chroma_hps[ 8x32] 2.12x 12527.69 26520.00
chroma_hpp[32x32] 2.85x 49298.53 140412.20
chroma_hps[32x32] 5.23x 49040.32 256335.03
chroma_hpp[16x64] 2.58x 49773.44 128440.18
chroma_hps[16x64] 2.07x 47499.16 98369.34
chroma_hpp[ 8x12] 2.60x 4820.00 12510.08
chroma_hps[ 8x12] 2.12x 5427.50 11520.14
chroma_hpp[ 6x16] 2.60x 4787.50 12430.28
chroma_hps[ 6x16] 2.06x 5396.98 11110.14
chroma_hpp[ 8x4] 2.54x 1660.01 4223.30
chroma_hps[ 8x4] 2.09x 2572.50 5365.00
chroma_hpp[ 2x16] 2.56x 1795.00 4590.00
chroma_hps[ 2x16] 1.99x 2045.00 4070.41
chroma_hpp[16x24] 2.58x 18678.77 48220.21
chroma_hps[16x24] 2.07x 19167.97 39690.28
chroma_hpp[12x32] 2.65x 18382.11 48780.00
chroma_hps[12x32] 2.13x 18610.71 39617.65
chroma_hpp[ 16x8] 2.56x 6280.00 16067.94
chroma_hps[ 16x8] 2.11x 7840.00 16521.46
chroma_hpp[ 4x32] 2.60x 6260.00 16270.26
chroma_hps[ 4x32] 2.17x 6257.50 13569.99
chroma_hpp[32x48] 2.85x 73959.77 211129.89
chroma_hps[32x48] 5.25x 71408.45 374964.56
chroma_hpp[24x64] 2.89x 74166.45 214010.20
chroma_hps[24x64] 5.26x 70630.15 371645.31
chroma_hpp[32x16] 2.85x 24694.61 70257.82
chroma_hps[32x16] 5.17x 26643.62 137760.22
chroma_hpp[ 8x64] 2.59x 25427.29 65796.38
chroma_hps[ 8x64] 2.12x 23935.77 50771.55
chroma_hpp[ 4x4] 2.21x 860.00 1900.00
chroma_hps[ 4x4] 2.06x 1349.99 2780.30
chroma_hpp[ 8x8] 2.57x 3240.00 8317.98
chroma_hps[ 8x8] 2.13x 3992.53 8520.13
chroma_hpp[16x16] 2.58x 12479.99 32142.50
chroma_hps[16x16] 2.08x 13480.09 28010.00
chroma_hpp[32x32] 2.85x 49309.57 140403.56
chroma_hps[32x32] 5.25x 49011.53 257550.23
chroma_hpp[64x64] 2.38x 231651.14 551080.75
chroma_hps[64x64] 4.38x 223849.25 980566.25
chroma_hpp[ 8x4] 2.54x 1664.99 4223.43
chroma_hps[ 8x4] 2.09x 2572.53 5365.68
chroma_hpp[ 4x8] 2.56x 1627.50 4165.15
chroma_hps[ 4x8] 2.16x 2057.50 4449.99
chroma_hpp[ 16x8] 2.56x 6270.00 16067.78
chroma_hps[ 16x8] 2.08x 7845.10 16330.42
chroma_hpp[ 8x16] 2.59x 6400.06 16590.34
chroma_hps[ 8x16] 2.13x 6832.48 14520.28
chroma_hpp[32x16] 2.83x 24749.50 70050.38
chroma_hps[32x16] 5.23x 26652.20 139517.83
chroma_hpp[16x32] 2.58x 24927.05 64321.18
chroma_hps[16x32] 2.05x 25078.26 51370.84
chroma_hpp[64x32] 2.81x 98087.36 275238.38
chroma_hps[64x32] 5.24x 97654.80 511554.97
chroma_hpp[32x64] 2.85x 98561.22 281336.09
chroma_hps[32x64] 5.24x 93774.05 491252.31
chroma_hpp[16x12] 2.58x 9370.03 24150.00
chroma_hps[16x12] 2.08x 10664.99 22170.42
chroma_hpp[12x16] 2.65x 9215.23 24463.34
chroma_hps[12x16] 2.13x 10122.50 21550.28
chroma_hpp[ 16x4] 2.54x 3180.00 8063.44
chroma_hps[ 16x4] 2.06x 5020.00 10350.29
chroma_hpp[ 4x16] 2.60x 3175.00 8270.13
chroma_hps[ 4x16] 2.17x 3450.00 7490.27
chroma_hpp[32x24] 2.84x 37023.25 105046.36
chroma_hps[32x24] 5.14x 38119.38 195780.92
chroma_hpp[24x32] 2.87x 37152.77 106763.15
chroma_hps[24x32] 5.27x 37088.59 195467.23
chroma_hpp[ 32x8] 2.84x 12360.00 35050.15
chroma_hps[ 32x8] 5.27x 15434.85 81414.41
chroma_hpp[ 8x32] 2.59x 12720.11 32910.00
chroma_hps[ 8x32] 2.07x 12805.16 26520.33
chroma_hpp[64x48] 2.81x 147267.20 413479.91
chroma_hps[64x48] 5.26x 142341.56 748049.62
chroma_hpp[48x64] 2.83x 147596.70 417422.91
chroma_hps[48x64] 5.19x 140484.31 729314.38
chroma_hpp[64x16] 2.80x 49059.62 137424.61
chroma_hps[64x16] 5.24x 52911.45 277367.06
chroma_hpp[16x64] 2.57x 49811.68 128246.03
chroma_hps[16x64] 2.07x 47422.34 98353.62
diff -r 0f0d88319f7c -r fcfba27ecf0b source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Jun 02 12:35:53 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Jun 02 08:59:07 2015 -0700
@@ -935,6 +935,15 @@
p.frameInitLowres = x265_frame_init_lowres_core_sse2;
}
+ if (cpuMask & X265_CPU_SSE3) // only when the CPU mask reports SSE3
+ {
+ ALL_CHROMA_420_PU(filter_hpp, interp_4tap_horiz_pp, sse3); // filter_hpp <- interp_4tap_horiz_pp sse3 routines (4:2:0 table; macro defined outside this hunk)
+ ALL_CHROMA_422_PU(filter_hpp, interp_4tap_horiz_pp, sse3); // same for the 4:2:2 table
+ ALL_CHROMA_444_PU(filter_hpp, interp_4tap_horiz_pp, sse3); // same for the 4:4:4 table
+ ALL_CHROMA_420_PU(filter_hps, interp_4tap_horiz_ps, sse3); // filter_hps <- interp_4tap_horiz_ps sse3 routines (4:2:0 table)
+ ALL_CHROMA_422_PU(filter_hps, interp_4tap_horiz_ps, sse3); // same for the 4:2:2 table
+ ALL_CHROMA_444_PU(filter_hps, interp_4tap_horiz_ps, sse3); // same for the 4:4:4 table
+ }
if (cpuMask & X265_CPU_SSSE3)
{
p.scale1D_128to64 = x265_scale1D_128to64_ssse3;
diff -r 0f0d88319f7c -r fcfba27ecf0b source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm Tue Jun 02 12:35:53 2015 +0530
+++ b/source/common/x86/ipfilter16.asm Tue Jun 02 08:59:07 2015 -0700
@@ -564,6 +564,395 @@
FILTER_VER_LUMA_sse2 ps, 64, 16
FILTER_VER_LUMA_sse2 ps, 16, 64
+%macro FILTERH_W2_4_sse3 2 ; %1 = byte offset within the row, %2 = pp or ps; filters 2 pixels on each of 4 rows; scratch: m2-m5
+ movh m3, [r0 + %1] ; row 0: low half = 4 words starting at the offset
+ movhps m3, [r0 + %1 + 2] ; high half = 4 words starting one word (2 bytes) later
+ pmaddwd m3, m0 ; word products with m0, adjacent pairs summed -> two partial dword sums per output pixel
+ movh m4, [r0 + r1 + %1] ; row 1, same layout
+ movhps m4, [r0 + r1 + %1 + 2]
+ pmaddwd m4, m0
+ pshufd m2, m3, q2301 ; swap the dwords inside each pair (assuming x86inc's qABCD shuffle constants)
+ paddd m3, m2 ; every dword now holds the complete 4-term sum of its pixel
+ pshufd m2, m4, q2301
+ paddd m4, m2
+ pshufd m3, m3, q3120 ; reorder to [p0, p1, p0, p1]
+ pshufd m4, m4, q3120
+ punpcklqdq m3, m4 ; m3 = row 0 {p0, p1} | row 1 {p0, p1}
+ paddd m3, m1 ; add the offset held in m1
+ movh m5, [r0 + 2 * r1 + %1] ; row 2
+ movhps m5, [r0 + 2 * r1 + %1 + 2]
+ pmaddwd m5, m0
+ movh m4, [r0 + r4 + %1] ; row 3; FILTER_HOR_CHROMA_sse3 sets r4 = 3 * r1 before using this macro
+ movhps m4, [r0 + r4 + %1 + 2]
+ pmaddwd m4, m0
+ pshufd m2, m5, q2301
+ paddd m5, m2
+ pshufd m2, m4, q2301
+ paddd m4, m2
+ pshufd m5, m5, q3120
+ pshufd m4, m4, q3120
+ punpcklqdq m5, m4 ; m5 = row 2 {p0, p1} | row 3 {p0, p1}
+ paddd m5, m1
+%ifidn %2, pp
+ psrad m3, 6 ; pp: arithmetic shift right by 6
+ psrad m5, 6
+ packssdw m3, m5 ; 8 signed words: one pair per row, rows 0..3
+ CLIPW m3, m7, m6 ; CLIPW is defined outside this hunk; presumably clamps each word to [m7, m6]
+%else
+ psrad m3, 2 ; ps: arithmetic shift right by 2, no clamp
+ psrad m5, 2
+ packssdw m3, m5
+%endif
+ movd [r2 + %1], m3 ; store 4 bytes (2 words) for row 0
+ psrldq m3, 4 ; shift the next row's pair down to the low dword
+ movd [r2 + r3 + %1], m3 ; row 1
+ psrldq m3, 4
+ movd [r2 + r3 * 2 + %1], m3 ; row 2
+ psrldq m3, 4
+ movd [r2 + r5 + %1], m3 ; row 3; FILTER_HOR_CHROMA_sse3 sets r5 = 3 * r3
+%endmacro
+
+%macro FILTERH_W2_3_sse3 1 ; %1 = byte offset within the row; filters 2 pixels on each of 3 rows with the >> 2 (ps) scaling only; scratch: m2-m5
+ movh m3, [r0 + %1] ; row 0: low half = 4 words starting at the offset
+ movhps m3, [r0 + %1 + 2] ; high half = 4 words starting one word later
+ pmaddwd m3, m0 ; two partial dword sums per output pixel
+ movh m4, [r0 + r1 + %1] ; row 1
+ movhps m4, [r0 + r1 + %1 + 2]
+ pmaddwd m4, m0
+ pshufd m2, m3, q2301 ; swap the dwords inside each pair (assuming x86inc's qABCD shuffle constants)
+ paddd m3, m2 ; complete 4-term sum per pixel, duplicated within each pair
+ pshufd m2, m4, q2301
+ paddd m4, m2
+ pshufd m3, m3, q3120 ; reorder to [p0, p1, p0, p1]
+ pshufd m4, m4, q3120
+ punpcklqdq m3, m4 ; m3 = row 0 {p0, p1} | row 1 {p0, p1}
+ paddd m3, m1 ; add the offset held in m1
+
+ movh m5, [r0 + 2 * r1 + %1] ; row 2
+ movhps m5, [r0 + 2 * r1 + %1 + 2]
+ pmaddwd m5, m0
+
+ pshufd m2, m5, q2301
+ paddd m5, m2
+ pshufd m5, m5, q3120 ; row 2 {p0, p1} in the low qword (repeated in the high qword)
+ paddd m5, m1
+
+ psrad m3, 2 ; fixed shift by 2; this macro has no pp path
+ psrad m5, 2
+ packssdw m3, m5 ; words: row 0 pair, row 1 pair, row 2 pair (then the row 2 pair again)
+
+ movd [r2 + %1], m3 ; store 4 bytes (2 words) for row 0
+ psrldq m3, 4
+ movd [r2 + r3 + %1], m3 ; row 1
+ psrldq m3, 4
+ movd [r2 + r3 * 2 + %1], m3 ; row 2
+%endmacro
+
+%macro FILTERH_W4_2_sse3 2 ; %1 = byte offset within the row, %2 = pp or ps; filters 4 pixels on each of 2 rows; scratch: m2-m5
+ movh m3, [r0 + %1] ; row 0, pixels 0-1: low half = 4 words at the offset
+ movhps m3, [r0 + %1 + 2] ; high half = 4 words starting one word later
+ pmaddwd m3, m0 ; two partial dword sums per output pixel
+ movh m4, [r0 + %1 + 4] ; row 0, pixels 2-3 (two words further on)
+ movhps m4, [r0 + %1 + 6]
+ pmaddwd m4, m0
+ pshufd m2, m3, q2301 ; swap the dwords inside each pair (assuming x86inc's qABCD shuffle constants)
+ paddd m3, m2 ; complete 4-term sum per pixel
+ pshufd m2, m4, q2301
+ paddd m4, m2
+ pshufd m3, m3, q3120 ; reorder to [p0, p1, p0, p1]
+ pshufd m4, m4, q3120 ; reorder to [p2, p3, p2, p3]
+ punpcklqdq m3, m4 ; m3 = row 0 {p0, p1, p2, p3}
+ paddd m3, m1 ; add the offset held in m1
+
+ movh m5, [r0 + r1 + %1] ; row 1, pixels 0-1
+ movhps m5, [r0 + r1 + %1 + 2]
+ pmaddwd m5, m0
+ movh m4, [r0 + r1 + %1 + 4] ; row 1, pixels 2-3
+ movhps m4, [r0 + r1 + %1 + 6]
+ pmaddwd m4, m0
+ pshufd m2, m5, q2301
+ paddd m5, m2
+ pshufd m2, m4, q2301
+ paddd m4, m2
+ pshufd m5, m5, q3120
+ pshufd m4, m4, q3120
+ punpcklqdq m5, m4 ; m5 = row 1 {p0, p1, p2, p3}
+ paddd m5, m1
+%ifidn %2, pp
+ psrad m3, 6 ; pp: arithmetic shift right by 6
+ psrad m5, 6
+ packssdw m3, m5 ; low 4 words = row 0, high 4 words = row 1
+ CLIPW m3, m7, m6 ; CLIPW is defined outside this hunk; presumably clamps each word to [m7, m6]
+%else
+ psrad m3, 2 ; ps: arithmetic shift right by 2, no clamp
+ psrad m5, 2
+ packssdw m3, m5
+%endif
+ movh [r2 + %1], m3 ; store 8 bytes (4 words) for row 0
+ movhps [r2 + r3 + %1], m3 ; store 8 bytes for row 1
+%endmacro
+
+%macro FILTERH_W4_1_sse3 1 ; %1 = byte offset within the row; filters 4 pixels of the row at r0 + 2 * r1 with the >> 2 (ps) scaling only; scratch: m2-m4
+ movh m3, [r0 + 2 * r1 + %1] ; pixels 0-1: low half = 4 words at the offset, two rows below r0
+ movhps m3, [r0 + 2 * r1 + %1 + 2] ; high half = 4 words starting one word later
+ pmaddwd m3, m0 ; two partial dword sums per output pixel
+ movh m4, [r0 + 2 * r1 + %1 + 4] ; pixels 2-3
+ movhps m4, [r0 + 2 * r1 + %1 + 6]
+ pmaddwd m4, m0
+ pshufd m2, m3, q2301 ; swap the dwords inside each pair (assuming x86inc's qABCD shuffle constants)
+ paddd m3, m2 ; complete 4-term sum per pixel
+ pshufd m2, m4, q2301
+ paddd m4, m2
+ pshufd m3, m3, q3120 ; reorder to [p0, p1, p0, p1]
+ pshufd m4, m4, q3120 ; reorder to [p2, p3, p2, p3]
+ punpcklqdq m3, m4 ; m3 = {p0, p1, p2, p3}
+ paddd m3, m1 ; add the offset held in m1
+
+ psrad m3, 2 ; fixed shift by 2; this macro has no pp path
+ packssdw m3, m3 ; 4 result words in the low qword
+ movh [r2 + r3 * 2 + %1], m3 ; store 8 bytes two rows below r2
+%endmacro
+
+%macro FILTERH_W8_1_sse3 2 ; %1 = byte offset within the row, %2 = pp or ps; filters 8 pixels of the row at r0; scratch: m2-m5
+ movh m3, [r0 + %1] ; pixels 0-1: low half = 4 words at the offset
+ movhps m3, [r0 + %1 + 2] ; high half = 4 words starting one word later
+ pmaddwd m3, m0 ; two partial dword sums per output pixel
+ movh m4, [r0 + %1 + 4] ; pixels 2-3
+ movhps m4, [r0 + %1 + 6]
+ pmaddwd m4, m0
+ pshufd m2, m3, q2301 ; swap the dwords inside each pair (assuming x86inc's qABCD shuffle constants)
+ paddd m3, m2 ; complete 4-term sum per pixel
+ pshufd m2, m4, q2301
+ paddd m4, m2
+ pshufd m3, m3, q3120 ; reorder to [p0, p1, p0, p1]
+ pshufd m4, m4, q3120 ; reorder to [p2, p3, p2, p3]
+ punpcklqdq m3, m4 ; m3 = {p0, p1, p2, p3}
+ paddd m3, m1 ; add the offset held in m1
+
+ movh m5, [r0 + %1 + 8] ; pixels 4-5
+ movhps m5, [r0 + %1 + 10]
+ pmaddwd m5, m0
+ movh m4, [r0 + %1 + 12] ; pixels 6-7
+ movhps m4, [r0 + %1 + 14]
+ pmaddwd m4, m0
+ pshufd m2, m5, q2301
+ paddd m5, m2
+ pshufd m2, m4, q2301
+ paddd m4, m2
+ pshufd m5, m5, q3120
+ pshufd m4, m4, q3120
+ punpcklqdq m5, m4 ; m5 = {p4, p5, p6, p7}
+ paddd m5, m1
+%ifidn %2, pp
+ psrad m3, 6 ; pp: arithmetic shift right by 6
+ psrad m5, 6
+ packssdw m3, m5 ; 8 signed result words
+ CLIPW m3, m7, m6 ; CLIPW is defined outside this hunk; presumably clamps each word to [m7, m6]
+%else
+ psrad m3, 2 ; ps: arithmetic shift right by 2, no clamp
+ psrad m5, 2
+ packssdw m3, m5
+%endif
+ movdqu [r2 + %1], m3 ; unaligned 16-byte store of the 8 results
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx); the ps variant also reads a 6th argument through r5m (non-zero = emit 3 extra rows starting one row above src)
+;-----------------------------------------------------------------------------
+%macro FILTER_HOR_CHROMA_sse3 3 ; %1 = block width, %2 = block height, %3 = pp or ps
+INIT_XMM sse3
+cglobal interp_4tap_horiz_%3_%1x%2, 4, 7, 8
+ add r3, r3 ; double dstStride (presumably element stride -> byte stride for 16-bit samples)
+ add r1, r1 ; double srcStride likewise
+ sub r0, 2 ; step src back 2 bytes (one 16-bit sample) before filtering
+ mov r4d, r4m ; r4 = coeffIdx (5th argument)
+ add r4d, r4d ; coeffIdx * 2; with the * 4 scale below, each table entry is 8 bytes
+
+%ifdef PIC
+ lea r6, [tab_ChromaCoeff] ; PIC: take the table address first, then index it
+ movddup m0, [r6 + r4 * 4] ; m0 = selected 8-byte entry duplicated into both halves
+%else
+ movddup m0, [tab_ChromaCoeff + r4 * 4] ; m0 = selected 8-byte entry duplicated into both halves
+%endif
+
+%ifidn %3, ps
+ mova m1, [tab_c_n32768] ; m1 = offset the ps paths add before their >> 2
+ cmp r5m, byte 0 ; test the 6th argument; only lea (which leaves flags alone) sits between here and the je
+%if %1 <= 6
+ lea r4, [r1 * 3] ; r4 = 3 * (doubled) srcStride; coeffIdx is no longer needed
+ lea r5, [r3 * 3] ; r5 = 3 * (doubled) dstStride
+%endif
+ je .skip ; 6th argument == 0: no extra rows
+ sub r0, r1 ; begin one row above src
+%if %1 <= 6
+%assign y 1 ; narrow widths produce all 3 extra rows in a single pass
+%else
+%assign y 3 ; wider blocks take one pass per extra row
+%endif
+%assign z 0
+%rep y
+%assign x 0
+%rep %1/8
+ FILTERH_W8_1_sse3 x, %3 ; 8 pixels of the current row
+%assign x x+16
+%endrep
+%if %1 == 4 || (%1 == 6 && z == 0) || (%1 == 12 && z == 0)
+ FILTERH_W4_2_sse3 x, %3 ; 4-wide column, rows 0 and 1
+ FILTERH_W4_1_sse3 x ; 4-wide column, row 2
+%assign x x+8
+%endif
+%if %1 == 2 || (%1 == 6 && z == 0)
+ FILTERH_W2_3_sse3 x ; 2-wide column, rows 0-2
+%endif
+%if %1 <= 6
+ lea r0, [r0 + r4] ; move src and dst past the 3 rows just written
+ lea r2, [r2 + r5]
+%else
+ lea r0, [r0 + r1] ; move src and dst down one row
+ lea r2, [r2 + r3]
+%endif
+%assign z z+1
+%endrep
+.skip:
+%elifidn %3, pp
+ pxor m7, m7 ; m7 = 0, second operand of the CLIPW calls in the helper macros
+ mova m6, [pw_pixel_max] ; m6 = pw_pixel_max, third operand of those CLIPW calls
+ mova m1, [tab_c_32] ; m1 = offset the pp paths add before their >> 6
+%if %1 == 2 || %1 == 6
+ lea r4, [r1 * 3] ; 3-row offsets needed by FILTERH_W2_4_sse3
+ lea r5, [r3 * 3]
+%endif
+%endif
+
+%if %1 == 2
+%assign y %2/4 ; width 2: each pass covers 4 rows
+%elif %1 <= 6
+%assign y %2/2 ; width 4 or 6: each pass covers 2 rows
+%else
+%assign y %2 ; wider blocks: one row per pass
+%endif
+%assign z 0
+%rep y
+%assign x 0
+%rep %1/8
+ FILTERH_W8_1_sse3 x, %3 ; 8 pixels of the current row
+%assign x x+16
+%endrep
+%if %1 == 4 || %1 == 6 || (%1 == 12 && (z % 2) == 0)
+ FILTERH_W4_2_sse3 x, %3 ; 4-wide column over 2 rows (width 12: on even passes only, covering this row and the next)
+%assign x x+8
+%endif
+%if %1 == 2 || (%1 == 6 && (z % 2) == 0)
+ FILTERH_W2_4_sse3 x, %3 ; 2-wide column over 4 rows (width 6: on even passes only)
+%endif
+%assign z z+1
+%if z < y
+%if %1 == 2
+ lea r0, [r0 + 4 * r1] ; advance 4 rows
+ lea r2, [r2 + 4 * r3]
+%elif %1 <= 6
+ lea r0, [r0 + 2 * r1] ; advance 2 rows
+ lea r2, [r2 + 2 * r3]
+%else
+ lea r0, [r0 + r1] ; advance 1 row
+ lea r2, [r2 + r3]
+%endif
+%endif ;z < y (no pointer update after the final pass)
+%endrep
+
+RET
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+
+FILTER_HOR_CHROMA_sse3 2, 4, pp ; arguments: width, height, variant -> one interp_4tap_horiz_pp_WxH function per line
+FILTER_HOR_CHROMA_sse3 2, 8, pp
+FILTER_HOR_CHROMA_sse3 2, 16, pp
+FILTER_HOR_CHROMA_sse3 4, 2, pp
+FILTER_HOR_CHROMA_sse3 4, 4, pp
+FILTER_HOR_CHROMA_sse3 4, 8, pp
+FILTER_HOR_CHROMA_sse3 4, 16, pp
+FILTER_HOR_CHROMA_sse3 4, 32, pp
+FILTER_HOR_CHROMA_sse3 6, 8, pp
+FILTER_HOR_CHROMA_sse3 6, 16, pp
+FILTER_HOR_CHROMA_sse3 8, 2, pp
+FILTER_HOR_CHROMA_sse3 8, 4, pp
+FILTER_HOR_CHROMA_sse3 8, 6, pp
+FILTER_HOR_CHROMA_sse3 8, 8, pp
+FILTER_HOR_CHROMA_sse3 8, 12, pp
+FILTER_HOR_CHROMA_sse3 8, 16, pp
+FILTER_HOR_CHROMA_sse3 8, 32, pp
+FILTER_HOR_CHROMA_sse3 8, 64, pp
+FILTER_HOR_CHROMA_sse3 12, 16, pp
+FILTER_HOR_CHROMA_sse3 12, 32, pp
+FILTER_HOR_CHROMA_sse3 16, 4, pp
+FILTER_HOR_CHROMA_sse3 16, 8, pp
+FILTER_HOR_CHROMA_sse3 16, 12, pp
+FILTER_HOR_CHROMA_sse3 16, 16, pp
+FILTER_HOR_CHROMA_sse3 16, 24, pp
+FILTER_HOR_CHROMA_sse3 16, 32, pp
+FILTER_HOR_CHROMA_sse3 16, 64, pp
+FILTER_HOR_CHROMA_sse3 24, 32, pp
+FILTER_HOR_CHROMA_sse3 24, 64, pp
+FILTER_HOR_CHROMA_sse3 32, 8, pp
+FILTER_HOR_CHROMA_sse3 32, 16, pp
+FILTER_HOR_CHROMA_sse3 32, 24, pp
+FILTER_HOR_CHROMA_sse3 32, 32, pp
+FILTER_HOR_CHROMA_sse3 32, 48, pp
+FILTER_HOR_CHROMA_sse3 32, 64, pp
+FILTER_HOR_CHROMA_sse3 48, 64, pp
+FILTER_HOR_CHROMA_sse3 64, 16, pp
+FILTER_HOR_CHROMA_sse3 64, 32, pp
+FILTER_HOR_CHROMA_sse3 64, 48, pp
+FILTER_HOR_CHROMA_sse3 64, 64, pp
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx); the ps code also reads a 6th argument through r5m (non-zero = emit 3 extra rows starting one row above src)
+;-----------------------------------------------------------------------------
+
+FILTER_HOR_CHROMA_sse3 2, 4, ps ; arguments: width, height, variant -> one interp_4tap_horiz_ps_WxH function per line
+FILTER_HOR_CHROMA_sse3 2, 8, ps
+FILTER_HOR_CHROMA_sse3 2, 16, ps
+FILTER_HOR_CHROMA_sse3 4, 2, ps
+FILTER_HOR_CHROMA_sse3 4, 4, ps
+FILTER_HOR_CHROMA_sse3 4, 8, ps
+FILTER_HOR_CHROMA_sse3 4, 16, ps
+FILTER_HOR_CHROMA_sse3 4, 32, ps
+FILTER_HOR_CHROMA_sse3 6, 8, ps
+FILTER_HOR_CHROMA_sse3 6, 16, ps
+FILTER_HOR_CHROMA_sse3 8, 2, ps
+FILTER_HOR_CHROMA_sse3 8, 4, ps
+FILTER_HOR_CHROMA_sse3 8, 6, ps
+FILTER_HOR_CHROMA_sse3 8, 8, ps
+FILTER_HOR_CHROMA_sse3 8, 12, ps
+FILTER_HOR_CHROMA_sse3 8, 16, ps
+FILTER_HOR_CHROMA_sse3 8, 32, ps
+FILTER_HOR_CHROMA_sse3 8, 64, ps
+FILTER_HOR_CHROMA_sse3 12, 16, ps
+FILTER_HOR_CHROMA_sse3 12, 32, ps
+FILTER_HOR_CHROMA_sse3 16, 4, ps
+FILTER_HOR_CHROMA_sse3 16, 8, ps
+FILTER_HOR_CHROMA_sse3 16, 12, ps
+FILTER_HOR_CHROMA_sse3 16, 16, ps
+FILTER_HOR_CHROMA_sse3 16, 24, ps
+FILTER_HOR_CHROMA_sse3 16, 32, ps
+FILTER_HOR_CHROMA_sse3 16, 64, ps
+FILTER_HOR_CHROMA_sse3 24, 32, ps
+FILTER_HOR_CHROMA_sse3 24, 64, ps
+FILTER_HOR_CHROMA_sse3 32, 8, ps
+FILTER_HOR_CHROMA_sse3 32, 16, ps
+FILTER_HOR_CHROMA_sse3 32, 24, ps
+FILTER_HOR_CHROMA_sse3 32, 32, ps
+FILTER_HOR_CHROMA_sse3 32, 48, ps
+FILTER_HOR_CHROMA_sse3 32, 64, ps
+FILTER_HOR_CHROMA_sse3 48, 64, ps
+FILTER_HOR_CHROMA_sse3 64, 16, ps
+FILTER_HOR_CHROMA_sse3 64, 32, ps
+FILTER_HOR_CHROMA_sse3 64, 48, ps
+FILTER_HOR_CHROMA_sse3 64, 64, ps
+
;------------------------------------------------------------------------------------------------------------
; void interp_8tap_horiz_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;------------------------------------------------------------------------------------------------------------
More information about the x265-devel
mailing list