[x265] [PATCH] asm: interp_4tap_horiz_pX sse3 10-bit
chen
chenm003 at 163.com
Tue Jun 2 18:16:34 CEST 2015
Please replace movdqu with movu (the final store in FILTERH_W8_1_sse3).
Everything else looks fine.
At 2015-06-03 00:02:36,dtyx265 at gmail.com wrote:
># HG changeset patch
># User David T Yuen <dtyx265 at gmail.com>
># Date 1433260747 25200
># Node ID fcfba27ecf0b9dac8da123da8cdcac75763496f3
># Parent 0f0d88319f7cc96661eef3c3dcc1befcf60354f3
>asm: interp_4tap_horiz_pX sse3 10-bit
>
>This replaces c code for all of 4tap_horiz pp and ps.
>
>64-bit
>
>./test/TestBench --testbench interp | grep chroma_hp;bp
>chroma_hpp[ 4x4] 2.23x 854.98 1902.48
>chroma_hps[ 4x4] 2.05x 1354.99 2778.18
>chroma_hpp[ 8x8] 2.57x 3235.00 8320.13
>chroma_hps[ 8x8] 2.14x 3982.52 8520.44
>chroma_hpp[16x16] 2.58x 12467.41 32145.56
>chroma_hps[16x16] 2.08x 13478.99 28010.00
>chroma_hpp[32x32] 2.85x 49318.75 140794.05
>chroma_hps[32x32] 5.19x 49005.44 254361.41
>chroma_hpp[ 4x2] 2.22x 457.50 1014.99
>chroma_hps[ 4x2] 2.01x 1004.99 2020.98
>chroma_hpp[ 2x4] 2.12x 512.50 1087.49
>chroma_hps[ 2x4] 1.85x 820.00 1520.58
>chroma_hpp[ 8x4] 2.55x 1655.00 4225.89
>chroma_hps[ 8x4] 2.09x 2572.48 5365.30
>chroma_hpp[ 4x8] 2.56x 1625.01 4167.64
>chroma_hps[ 4x8] 2.16x 2057.50 4449.99
>chroma_hpp[ 16x8] 2.56x 6275.00 16070.69
>chroma_hps[ 16x8] 2.11x 7840.00 16520.78
>chroma_hpp[ 8x16] 2.62x 6395.00 16779.55
>chroma_hps[ 8x16] 2.13x 6832.50 14520.42
>chroma_hpp[32x16] 2.84x 24707.68 70049.97
>chroma_hps[32x16] 5.19x 26659.50 138467.00
>chroma_hpp[16x32] 2.58x 24917.14 64318.72
>chroma_hps[16x32] 2.07x 24819.08 51370.22
>chroma_hpp[ 8x6] 2.57x 2450.00 6293.12
>chroma_hps[ 8x6] 2.10x 3282.56 6900.30
>chroma_hpp[ 6x8] 2.57x 2429.24 6240.71
>chroma_hps[ 6x8] 2.07x 3160.13 6550.00
>chroma_hpp[ 8x2] 2.19x 872.52 1909.97
>chroma_hps[ 8x2] 2.06x 1877.50 3865.74
>chroma_hpp[ 2x8] 2.16x 927.49 2005.01
>chroma_hps[ 2x8] 2.01x 1227.50 2470.27
>chroma_hpp[16x12] 2.58x 9371.36 24152.49
>chroma_hps[16x12] 2.10x 10665.00 22398.23
>chroma_hpp[12x16] 2.65x 9215.00 24465.62
>chroma_hps[12x16] 2.13x 10122.57 21550.70
>chroma_hpp[ 16x4] 2.54x 3170.00 8065.96
>chroma_hps[ 16x4] 2.06x 5020.00 10350.29
>chroma_hpp[ 4x16] 2.60x 3175.00 8270.00
>chroma_hps[ 4x16] 2.17x 3450.00 7490.13
>chroma_hpp[32x24] 2.85x 36976.00 105213.45
>chroma_hps[32x24] 5.27x 37801.41 199137.09
>chroma_hpp[24x32] 2.88x 37093.77 106880.20
>chroma_hps[24x32] 5.24x 36958.88 193609.19
>chroma_hpp[ 32x8] 2.84x 12360.07 35049.98
>chroma_hps[ 32x8] 5.24x 15477.46 81162.80
>chroma_hpp[ 8x32] 2.59x 12721.46 32910.11
>chroma_hps[ 8x32] 2.12x 12527.55 26520.00
>chroma_hpp[ 4x8] 2.56x 1627.50 4165.00
>chroma_hps[ 4x8] 2.16x 2057.50 4449.98
>chroma_hpp[ 8x16] 2.59x 6405.00 16590.00
>chroma_hps[ 8x16] 2.12x 6847.50 14520.00
>chroma_hpp[16x32] 2.58x 24927.11 64341.25
>chroma_hps[16x32] 2.07x 24812.88 51371.34
>chroma_hpp[32x64] 2.86x 98547.48 281379.03
>chroma_hps[32x64] 5.26x 93765.83 493587.72
>chroma_hpp[ 4x4] 2.21x 859.99 1900.00
>chroma_hps[ 4x4] 2.06x 1350.00 2780.58
>chroma_hpp[ 2x8] 2.16x 927.50 2007.47
>chroma_hps[ 2x8] 2.01x 1227.50 2469.99
>chroma_hpp[ 8x8] 2.57x 3240.00 8317.64
>chroma_hps[ 8x8] 2.13x 3992.50 8520.16
>chroma_hpp[ 4x16] 2.60x 3180.10 8270.48
>chroma_hps[ 4x16] 2.17x 3457.50 7490.13
>chroma_hpp[16x16] 2.58x 12480.00 32143.02
>chroma_hps[16x16] 2.08x 13485.08 28010.00
>chroma_hpp[ 8x32] 2.59x 12722.10 32910.00
>chroma_hps[ 8x32] 2.12x 12527.69 26520.00
>chroma_hpp[32x32] 2.85x 49298.53 140412.20
>chroma_hps[32x32] 5.23x 49040.32 256335.03
>chroma_hpp[16x64] 2.58x 49773.44 128440.18
>chroma_hps[16x64] 2.07x 47499.16 98369.34
>chroma_hpp[ 8x12] 2.60x 4820.00 12510.08
>chroma_hps[ 8x12] 2.12x 5427.50 11520.14
>chroma_hpp[ 6x16] 2.60x 4787.50 12430.28
>chroma_hps[ 6x16] 2.06x 5396.98 11110.14
>chroma_hpp[ 8x4] 2.54x 1660.01 4223.30
>chroma_hps[ 8x4] 2.09x 2572.50 5365.00
>chroma_hpp[ 2x16] 2.56x 1795.00 4590.00
>chroma_hps[ 2x16] 1.99x 2045.00 4070.41
>chroma_hpp[16x24] 2.58x 18678.77 48220.21
>chroma_hps[16x24] 2.07x 19167.97 39690.28
>chroma_hpp[12x32] 2.65x 18382.11 48780.00
>chroma_hps[12x32] 2.13x 18610.71 39617.65
>chroma_hpp[ 16x8] 2.56x 6280.00 16067.94
>chroma_hps[ 16x8] 2.11x 7840.00 16521.46
>chroma_hpp[ 4x32] 2.60x 6260.00 16270.26
>chroma_hps[ 4x32] 2.17x 6257.50 13569.99
>chroma_hpp[32x48] 2.85x 73959.77 211129.89
>chroma_hps[32x48] 5.25x 71408.45 374964.56
>chroma_hpp[24x64] 2.89x 74166.45 214010.20
>chroma_hps[24x64] 5.26x 70630.15 371645.31
>chroma_hpp[32x16] 2.85x 24694.61 70257.82
>chroma_hps[32x16] 5.17x 26643.62 137760.22
>chroma_hpp[ 8x64] 2.59x 25427.29 65796.38
>chroma_hps[ 8x64] 2.12x 23935.77 50771.55
>chroma_hpp[ 4x4] 2.21x 860.00 1900.00
>chroma_hps[ 4x4] 2.06x 1349.99 2780.30
>chroma_hpp[ 8x8] 2.57x 3240.00 8317.98
>chroma_hps[ 8x8] 2.13x 3992.53 8520.13
>chroma_hpp[16x16] 2.58x 12479.99 32142.50
>chroma_hps[16x16] 2.08x 13480.09 28010.00
>chroma_hpp[32x32] 2.85x 49309.57 140403.56
>chroma_hps[32x32] 5.25x 49011.53 257550.23
>chroma_hpp[64x64] 2.38x 231651.14 551080.75
>chroma_hps[64x64] 4.38x 223849.25 980566.25
>chroma_hpp[ 8x4] 2.54x 1664.99 4223.43
>chroma_hps[ 8x4] 2.09x 2572.53 5365.68
>chroma_hpp[ 4x8] 2.56x 1627.50 4165.15
>chroma_hps[ 4x8] 2.16x 2057.50 4449.99
>chroma_hpp[ 16x8] 2.56x 6270.00 16067.78
>chroma_hps[ 16x8] 2.08x 7845.10 16330.42
>chroma_hpp[ 8x16] 2.59x 6400.06 16590.34
>chroma_hps[ 8x16] 2.13x 6832.48 14520.28
>chroma_hpp[32x16] 2.83x 24749.50 70050.38
>chroma_hps[32x16] 5.23x 26652.20 139517.83
>chroma_hpp[16x32] 2.58x 24927.05 64321.18
>chroma_hps[16x32] 2.05x 25078.26 51370.84
>chroma_hpp[64x32] 2.81x 98087.36 275238.38
>chroma_hps[64x32] 5.24x 97654.80 511554.97
>chroma_hpp[32x64] 2.85x 98561.22 281336.09
>chroma_hps[32x64] 5.24x 93774.05 491252.31
>chroma_hpp[16x12] 2.58x 9370.03 24150.00
>chroma_hps[16x12] 2.08x 10664.99 22170.42
>chroma_hpp[12x16] 2.65x 9215.23 24463.34
>chroma_hps[12x16] 2.13x 10122.50 21550.28
>chroma_hpp[ 16x4] 2.54x 3180.00 8063.44
>chroma_hps[ 16x4] 2.06x 5020.00 10350.29
>chroma_hpp[ 4x16] 2.60x 3175.00 8270.13
>chroma_hps[ 4x16] 2.17x 3450.00 7490.27
>chroma_hpp[32x24] 2.84x 37023.25 105046.36
>chroma_hps[32x24] 5.14x 38119.38 195780.92
>chroma_hpp[24x32] 2.87x 37152.77 106763.15
>chroma_hps[24x32] 5.27x 37088.59 195467.23
>chroma_hpp[ 32x8] 2.84x 12360.00 35050.15
>chroma_hps[ 32x8] 5.27x 15434.85 81414.41
>chroma_hpp[ 8x32] 2.59x 12720.11 32910.00
>chroma_hps[ 8x32] 2.07x 12805.16 26520.33
>chroma_hpp[64x48] 2.81x 147267.20 413479.91
>chroma_hps[64x48] 5.26x 142341.56 748049.62
>chroma_hpp[48x64] 2.83x 147596.70 417422.91
>chroma_hps[48x64] 5.19x 140484.31 729314.38
>chroma_hpp[64x16] 2.80x 49059.62 137424.61
>chroma_hps[64x16] 5.24x 52911.45 277367.06
>chroma_hpp[16x64] 2.57x 49811.68 128246.03
>chroma_hps[16x64] 2.07x 47422.34 98353.62
>
>diff -r 0f0d88319f7c -r fcfba27ecf0b source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp Tue Jun 02 12:35:53 2015 +0530
>+++ b/source/common/x86/asm-primitives.cpp Tue Jun 02 08:59:07 2015 -0700
>@@ -935,6 +935,15 @@
>
> p.frameInitLowres = x265_frame_init_lowres_core_sse2;
> }
>+ if (cpuMask & X265_CPU_SSE3)
>+ {
>+ ALL_CHROMA_420_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
>+ ALL_CHROMA_422_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
>+ ALL_CHROMA_444_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
>+ ALL_CHROMA_420_PU(filter_hps, interp_4tap_horiz_ps, sse3);
>+ ALL_CHROMA_422_PU(filter_hps, interp_4tap_horiz_ps, sse3);
>+ ALL_CHROMA_444_PU(filter_hps, interp_4tap_horiz_ps, sse3);
>+ }
> if (cpuMask & X265_CPU_SSSE3)
> {
> p.scale1D_128to64 = x265_scale1D_128to64_ssse3;
>diff -r 0f0d88319f7c -r fcfba27ecf0b source/common/x86/ipfilter16.asm
>--- a/source/common/x86/ipfilter16.asm Tue Jun 02 12:35:53 2015 +0530
>+++ b/source/common/x86/ipfilter16.asm Tue Jun 02 08:59:07 2015 -0700
>@@ -564,6 +564,395 @@
> FILTER_VER_LUMA_sse2 ps, 64, 16
> FILTER_VER_LUMA_sse2 ps, 16, 64
>
>+%macro FILTERH_W2_4_sse3 2
>+ movh m3, [r0 + %1]
>+ movhps m3, [r0 + %1 + 2]
>+ pmaddwd m3, m0
>+ movh m4, [r0 + r1 + %1]
>+ movhps m4, [r0 + r1 + %1 + 2]
>+ pmaddwd m4, m0
>+ pshufd m2, m3, q2301
>+ paddd m3, m2
>+ pshufd m2, m4, q2301
>+ paddd m4, m2
>+ pshufd m3, m3, q3120
>+ pshufd m4, m4, q3120
>+ punpcklqdq m3, m4
>+ paddd m3, m1
>+ movh m5, [r0 + 2 * r1 + %1]
>+ movhps m5, [r0 + 2 * r1 + %1 + 2]
>+ pmaddwd m5, m0
>+ movh m4, [r0 + r4 + %1]
>+ movhps m4, [r0 + r4 + %1 + 2]
>+ pmaddwd m4, m0
>+ pshufd m2, m5, q2301
>+ paddd m5, m2
>+ pshufd m2, m4, q2301
>+ paddd m4, m2
>+ pshufd m5, m5, q3120
>+ pshufd m4, m4, q3120
>+ punpcklqdq m5, m4
>+ paddd m5, m1
>+%ifidn %2, pp
>+ psrad m3, 6
>+ psrad m5, 6
>+ packssdw m3, m5
>+ CLIPW m3, m7, m6
>+%else
>+ psrad m3, 2
>+ psrad m5, 2
>+ packssdw m3, m5
>+%endif
>+ movd [r2 + %1], m3
>+ psrldq m3, 4
>+ movd [r2 + r3 + %1], m3
>+ psrldq m3, 4
>+ movd [r2 + r3 * 2 + %1], m3
>+ psrldq m3, 4
>+ movd [r2 + r5 + %1], m3
>+%endmacro
>+
>+%macro FILTERH_W2_3_sse3 1
>+ movh m3, [r0 + %1]
>+ movhps m3, [r0 + %1 + 2]
>+ pmaddwd m3, m0
>+ movh m4, [r0 + r1 + %1]
>+ movhps m4, [r0 + r1 + %1 + 2]
>+ pmaddwd m4, m0
>+ pshufd m2, m3, q2301
>+ paddd m3, m2
>+ pshufd m2, m4, q2301
>+ paddd m4, m2
>+ pshufd m3, m3, q3120
>+ pshufd m4, m4, q3120
>+ punpcklqdq m3, m4
>+ paddd m3, m1
>+
>+ movh m5, [r0 + 2 * r1 + %1]
>+ movhps m5, [r0 + 2 * r1 + %1 + 2]
>+ pmaddwd m5, m0
>+
>+ pshufd m2, m5, q2301
>+ paddd m5, m2
>+ pshufd m5, m5, q3120
>+ paddd m5, m1
>+
>+ psrad m3, 2
>+ psrad m5, 2
>+ packssdw m3, m5
>+
>+ movd [r2 + %1], m3
>+ psrldq m3, 4
>+ movd [r2 + r3 + %1], m3
>+ psrldq m3, 4
>+ movd [r2 + r3 * 2 + %1], m3
>+%endmacro
>+
>+%macro FILTERH_W4_2_sse3 2
>+ movh m3, [r0 + %1]
>+ movhps m3, [r0 + %1 + 2]
>+ pmaddwd m3, m0
>+ movh m4, [r0 + %1 + 4]
>+ movhps m4, [r0 + %1 + 6]
>+ pmaddwd m4, m0
>+ pshufd m2, m3, q2301
>+ paddd m3, m2
>+ pshufd m2, m4, q2301
>+ paddd m4, m2
>+ pshufd m3, m3, q3120
>+ pshufd m4, m4, q3120
>+ punpcklqdq m3, m4
>+ paddd m3, m1
>+
>+ movh m5, [r0 + r1 + %1]
>+ movhps m5, [r0 + r1 + %1 + 2]
>+ pmaddwd m5, m0
>+ movh m4, [r0 + r1 + %1 + 4]
>+ movhps m4, [r0 + r1 + %1 + 6]
>+ pmaddwd m4, m0
>+ pshufd m2, m5, q2301
>+ paddd m5, m2
>+ pshufd m2, m4, q2301
>+ paddd m4, m2
>+ pshufd m5, m5, q3120
>+ pshufd m4, m4, q3120
>+ punpcklqdq m5, m4
>+ paddd m5, m1
>+%ifidn %2, pp
>+ psrad m3, 6
>+ psrad m5, 6
>+ packssdw m3, m5
>+ CLIPW m3, m7, m6
>+%else
>+ psrad m3, 2
>+ psrad m5, 2
>+ packssdw m3, m5
>+%endif
>+ movh [r2 + %1], m3
>+ movhps [r2 + r3 + %1], m3
>+%endmacro
>+
>+%macro FILTERH_W4_1_sse3 1
>+ movh m3, [r0 + 2 * r1 + %1]
>+ movhps m3, [r0 + 2 * r1 + %1 + 2]
>+ pmaddwd m3, m0
>+ movh m4, [r0 + 2 * r1 + %1 + 4]
>+ movhps m4, [r0 + 2 * r1 + %1 + 6]
>+ pmaddwd m4, m0
>+ pshufd m2, m3, q2301
>+ paddd m3, m2
>+ pshufd m2, m4, q2301
>+ paddd m4, m2
>+ pshufd m3, m3, q3120
>+ pshufd m4, m4, q3120
>+ punpcklqdq m3, m4
>+ paddd m3, m1
>+
>+ psrad m3, 2
>+ packssdw m3, m3
>+ movh [r2 + r3 * 2 + %1], m3
>+%endmacro
>+
>+%macro FILTERH_W8_1_sse3 2
>+ movh m3, [r0 + %1]
>+ movhps m3, [r0 + %1 + 2]
>+ pmaddwd m3, m0
>+ movh m4, [r0 + %1 + 4]
>+ movhps m4, [r0 + %1 + 6]
>+ pmaddwd m4, m0
>+ pshufd m2, m3, q2301
>+ paddd m3, m2
>+ pshufd m2, m4, q2301
>+ paddd m4, m2
>+ pshufd m3, m3, q3120
>+ pshufd m4, m4, q3120
>+ punpcklqdq m3, m4
>+ paddd m3, m1
>+
>+ movh m5, [r0 + %1 + 8]
>+ movhps m5, [r0 + %1 + 10]
>+ pmaddwd m5, m0
>+ movh m4, [r0 + %1 + 12]
>+ movhps m4, [r0 + %1 + 14]
>+ pmaddwd m4, m0
>+ pshufd m2, m5, q2301
>+ paddd m5, m2
>+ pshufd m2, m4, q2301
>+ paddd m4, m2
>+ pshufd m5, m5, q3120
>+ pshufd m4, m4, q3120
>+ punpcklqdq m5, m4
>+ paddd m5, m1
>+%ifidn %2, pp
>+ psrad m3, 6
>+ psrad m5, 6
>+ packssdw m3, m5
>+ CLIPW m3, m7, m6
>+%else
>+ psrad m3, 2
>+ psrad m5, 2
>+ packssdw m3, m5
>+%endif
>+ movdqu [r2 + %1], m3
>+%endmacro
>+
>+;-----------------------------------------------------------------------------
>+; void interp_4tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
>+;-----------------------------------------------------------------------------
>+%macro FILTER_HOR_CHROMA_sse3 3
>+INIT_XMM sse3
>+cglobal interp_4tap_horiz_%3_%1x%2, 4, 7, 8
>+ add r3, r3
>+ add r1, r1
>+ sub r0, 2
>+ mov r4d, r4m
>+ add r4d, r4d
>+
>+%ifdef PIC
>+ lea r6, [tab_ChromaCoeff]
>+ movddup m0, [r6 + r4 * 4]
>+%else
>+ movddup m0, [tab_ChromaCoeff + r4 * 4]
>+%endif
>+
>+%ifidn %3, ps
>+ mova m1, [tab_c_n32768]
>+ cmp r5m, byte 0
>+%if %1 <= 6
>+ lea r4, [r1 * 3]
>+ lea r5, [r3 * 3]
>+%endif
>+ je .skip
>+ sub r0, r1
>+%if %1 <= 6
>+%assign y 1
>+%else
>+%assign y 3
>+%endif
>+%assign z 0
>+%rep y
>+%assign x 0
>+%rep %1/8
>+ FILTERH_W8_1_sse3 x, %3
>+%assign x x+16
>+%endrep
>+%if %1 == 4 || (%1 == 6 && z == 0) || (%1 == 12 && z == 0)
>+ FILTERH_W4_2_sse3 x, %3
>+ FILTERH_W4_1_sse3 x
>+%assign x x+8
>+%endif
>+%if %1 == 2 || (%1 == 6 && z == 0)
>+ FILTERH_W2_3_sse3 x
>+%endif
>+%if %1 <= 6
>+ lea r0, [r0 + r4]
>+ lea r2, [r2 + r5]
>+%else
>+ lea r0, [r0 + r1]
>+ lea r2, [r2 + r3]
>+%endif
>+%assign z z+1
>+%endrep
>+.skip:
>+%elifidn %3, pp
>+ pxor m7, m7
>+ mova m6, [pw_pixel_max]
>+ mova m1, [tab_c_32]
>+%if %1 == 2 || %1 == 6
>+ lea r4, [r1 * 3]
>+ lea r5, [r3 * 3]
>+%endif
>+%endif
>+
>+%if %1 == 2
>+%assign y %2/4
>+%elif %1 <= 6
>+%assign y %2/2
>+%else
>+%assign y %2
>+%endif
>+%assign z 0
>+%rep y
>+%assign x 0
>+%rep %1/8
>+ FILTERH_W8_1_sse3 x, %3
>+%assign x x+16
>+%endrep
>+%if %1 == 4 || %1 == 6 || (%1 == 12 && (z % 2) == 0)
>+ FILTERH_W4_2_sse3 x, %3
>+%assign x x+8
>+%endif
>+%if %1 == 2 || (%1 == 6 && (z % 2) == 0)
>+ FILTERH_W2_4_sse3 x, %3
>+%endif
>+%assign z z+1
>+%if z < y
>+%if %1 == 2
>+ lea r0, [r0 + 4 * r1]
>+ lea r2, [r2 + 4 * r3]
>+%elif %1 <= 6
>+ lea r0, [r0 + 2 * r1]
>+ lea r2, [r2 + 2 * r3]
>+%else
>+ lea r0, [r0 + r1]
>+ lea r2, [r2 + r3]
>+%endif
>+%endif ;z < y
>+%endrep
>+
>+RET
>+%endmacro
>+
>+;-----------------------------------------------------------------------------
>+; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
>+;-----------------------------------------------------------------------------
>+
>+FILTER_HOR_CHROMA_sse3 2, 4, pp
>+FILTER_HOR_CHROMA_sse3 2, 8, pp
>+FILTER_HOR_CHROMA_sse3 2, 16, pp
>+FILTER_HOR_CHROMA_sse3 4, 2, pp
>+FILTER_HOR_CHROMA_sse3 4, 4, pp
>+FILTER_HOR_CHROMA_sse3 4, 8, pp
>+FILTER_HOR_CHROMA_sse3 4, 16, pp
>+FILTER_HOR_CHROMA_sse3 4, 32, pp
>+FILTER_HOR_CHROMA_sse3 6, 8, pp
>+FILTER_HOR_CHROMA_sse3 6, 16, pp
>+FILTER_HOR_CHROMA_sse3 8, 2, pp
>+FILTER_HOR_CHROMA_sse3 8, 4, pp
>+FILTER_HOR_CHROMA_sse3 8, 6, pp
>+FILTER_HOR_CHROMA_sse3 8, 8, pp
>+FILTER_HOR_CHROMA_sse3 8, 12, pp
>+FILTER_HOR_CHROMA_sse3 8, 16, pp
>+FILTER_HOR_CHROMA_sse3 8, 32, pp
>+FILTER_HOR_CHROMA_sse3 8, 64, pp
>+FILTER_HOR_CHROMA_sse3 12, 16, pp
>+FILTER_HOR_CHROMA_sse3 12, 32, pp
>+FILTER_HOR_CHROMA_sse3 16, 4, pp
>+FILTER_HOR_CHROMA_sse3 16, 8, pp
>+FILTER_HOR_CHROMA_sse3 16, 12, pp
>+FILTER_HOR_CHROMA_sse3 16, 16, pp
>+FILTER_HOR_CHROMA_sse3 16, 24, pp
>+FILTER_HOR_CHROMA_sse3 16, 32, pp
>+FILTER_HOR_CHROMA_sse3 16, 64, pp
>+FILTER_HOR_CHROMA_sse3 24, 32, pp
>+FILTER_HOR_CHROMA_sse3 24, 64, pp
>+FILTER_HOR_CHROMA_sse3 32, 8, pp
>+FILTER_HOR_CHROMA_sse3 32, 16, pp
>+FILTER_HOR_CHROMA_sse3 32, 24, pp
>+FILTER_HOR_CHROMA_sse3 32, 32, pp
>+FILTER_HOR_CHROMA_sse3 32, 48, pp
>+FILTER_HOR_CHROMA_sse3 32, 64, pp
>+FILTER_HOR_CHROMA_sse3 48, 64, pp
>+FILTER_HOR_CHROMA_sse3 64, 16, pp
>+FILTER_HOR_CHROMA_sse3 64, 32, pp
>+FILTER_HOR_CHROMA_sse3 64, 48, pp
>+FILTER_HOR_CHROMA_sse3 64, 64, pp
>+
>+;-----------------------------------------------------------------------------
>+; void interp_4tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
>+;-----------------------------------------------------------------------------
>+
>+FILTER_HOR_CHROMA_sse3 2, 4, ps
>+FILTER_HOR_CHROMA_sse3 2, 8, ps
>+FILTER_HOR_CHROMA_sse3 2, 16, ps
>+FILTER_HOR_CHROMA_sse3 4, 2, ps
>+FILTER_HOR_CHROMA_sse3 4, 4, ps
>+FILTER_HOR_CHROMA_sse3 4, 8, ps
>+FILTER_HOR_CHROMA_sse3 4, 16, ps
>+FILTER_HOR_CHROMA_sse3 4, 32, ps
>+FILTER_HOR_CHROMA_sse3 6, 8, ps
>+FILTER_HOR_CHROMA_sse3 6, 16, ps
>+FILTER_HOR_CHROMA_sse3 8, 2, ps
>+FILTER_HOR_CHROMA_sse3 8, 4, ps
>+FILTER_HOR_CHROMA_sse3 8, 6, ps
>+FILTER_HOR_CHROMA_sse3 8, 8, ps
>+FILTER_HOR_CHROMA_sse3 8, 12, ps
>+FILTER_HOR_CHROMA_sse3 8, 16, ps
>+FILTER_HOR_CHROMA_sse3 8, 32, ps
>+FILTER_HOR_CHROMA_sse3 8, 64, ps
>+FILTER_HOR_CHROMA_sse3 12, 16, ps
>+FILTER_HOR_CHROMA_sse3 12, 32, ps
>+FILTER_HOR_CHROMA_sse3 16, 4, ps
>+FILTER_HOR_CHROMA_sse3 16, 8, ps
>+FILTER_HOR_CHROMA_sse3 16, 12, ps
>+FILTER_HOR_CHROMA_sse3 16, 16, ps
>+FILTER_HOR_CHROMA_sse3 16, 24, ps
>+FILTER_HOR_CHROMA_sse3 16, 32, ps
>+FILTER_HOR_CHROMA_sse3 16, 64, ps
>+FILTER_HOR_CHROMA_sse3 24, 32, ps
>+FILTER_HOR_CHROMA_sse3 24, 64, ps
>+FILTER_HOR_CHROMA_sse3 32, 8, ps
>+FILTER_HOR_CHROMA_sse3 32, 16, ps
>+FILTER_HOR_CHROMA_sse3 32, 24, ps
>+FILTER_HOR_CHROMA_sse3 32, 32, ps
>+FILTER_HOR_CHROMA_sse3 32, 48, ps
>+FILTER_HOR_CHROMA_sse3 32, 64, ps
>+FILTER_HOR_CHROMA_sse3 48, 64, ps
>+FILTER_HOR_CHROMA_sse3 64, 16, ps
>+FILTER_HOR_CHROMA_sse3 64, 32, ps
>+FILTER_HOR_CHROMA_sse3 64, 48, ps
>+FILTER_HOR_CHROMA_sse3 64, 64, ps
>+
> ;------------------------------------------------------------------------------------------------------------
> ; void interp_8tap_horiz_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
> ;------------------------------------------------------------------------------------------------------------
>_______________________________________________
>x265-devel mailing list
>x265-devel at videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150603/cf6cc359/attachment-0001.html>
More information about the x265-devel
mailing list