[x265] [PATCH] asm: interp_4tap_horiz_pX sse3 10-bit

dtyx265 at gmail.com dtyx265 at gmail.com
Tue Jun 2 17:59:35 CEST 2015


# HG changeset patch
# User David T Yuen <dtyx265 at gmail.com>
# Date 1433260747 25200
# Node ID fcfba27ecf0b9dac8da123da8cdcac75763496f3
# Parent  0f0d88319f7cc96661eef3c3dcc1befcf60354f3
asm: interp_4tap_horiz_pX sse3 10-bit

This replaces c code for all of 4tap_horiz pp and ps.

64-bit

./test/TestBench --testbench interp | grep chroma_hp
chroma_hpp[  4x4]	2.23x 	 854.98   	 1902.48
chroma_hps[  4x4]	2.05x 	 1354.99  	 2778.18
chroma_hpp[  8x8]	2.57x 	 3235.00  	 8320.13
chroma_hps[  8x8]	2.14x 	 3982.52  	 8520.44
chroma_hpp[16x16]	2.58x 	 12467.41 	 32145.56
chroma_hps[16x16]	2.08x 	 13478.99 	 28010.00
chroma_hpp[32x32]	2.85x 	 49318.75 	 140794.05
chroma_hps[32x32]	5.19x 	 49005.44 	 254361.41
chroma_hpp[  4x2]	2.22x 	 457.50   	 1014.99
chroma_hps[  4x2]	2.01x 	 1004.99  	 2020.98
chroma_hpp[  2x4]	2.12x 	 512.50   	 1087.49
chroma_hps[  2x4]	1.85x 	 820.00   	 1520.58
chroma_hpp[  8x4]	2.55x 	 1655.00  	 4225.89
chroma_hps[  8x4]	2.09x 	 2572.48  	 5365.30
chroma_hpp[  4x8]	2.56x 	 1625.01  	 4167.64
chroma_hps[  4x8]	2.16x 	 2057.50  	 4449.99
chroma_hpp[ 16x8]	2.56x 	 6275.00  	 16070.69
chroma_hps[ 16x8]	2.11x 	 7840.00  	 16520.78
chroma_hpp[ 8x16]	2.62x 	 6395.00  	 16779.55
chroma_hps[ 8x16]	2.13x 	 6832.50  	 14520.42
chroma_hpp[32x16]	2.84x 	 24707.68 	 70049.97
chroma_hps[32x16]	5.19x 	 26659.50 	 138467.00
chroma_hpp[16x32]	2.58x 	 24917.14 	 64318.72
chroma_hps[16x32]	2.07x 	 24819.08 	 51370.22
chroma_hpp[  8x6]	2.57x 	 2450.00  	 6293.12
chroma_hps[  8x6]	2.10x 	 3282.56  	 6900.30
chroma_hpp[  6x8]	2.57x 	 2429.24  	 6240.71
chroma_hps[  6x8]	2.07x 	 3160.13  	 6550.00
chroma_hpp[  8x2]	2.19x 	 872.52   	 1909.97
chroma_hps[  8x2]	2.06x 	 1877.50  	 3865.74
chroma_hpp[  2x8]	2.16x 	 927.49   	 2005.01
chroma_hps[  2x8]	2.01x 	 1227.50  	 2470.27
chroma_hpp[16x12]	2.58x 	 9371.36  	 24152.49
chroma_hps[16x12]	2.10x 	 10665.00 	 22398.23
chroma_hpp[12x16]	2.65x 	 9215.00  	 24465.62
chroma_hps[12x16]	2.13x 	 10122.57 	 21550.70
chroma_hpp[ 16x4]	2.54x 	 3170.00  	 8065.96
chroma_hps[ 16x4]	2.06x 	 5020.00  	 10350.29
chroma_hpp[ 4x16]	2.60x 	 3175.00  	 8270.00
chroma_hps[ 4x16]	2.17x 	 3450.00  	 7490.13
chroma_hpp[32x24]	2.85x 	 36976.00 	 105213.45
chroma_hps[32x24]	5.27x 	 37801.41 	 199137.09
chroma_hpp[24x32]	2.88x 	 37093.77 	 106880.20
chroma_hps[24x32]	5.24x 	 36958.88 	 193609.19
chroma_hpp[ 32x8]	2.84x 	 12360.07 	 35049.98
chroma_hps[ 32x8]	5.24x 	 15477.46 	 81162.80
chroma_hpp[ 8x32]	2.59x 	 12721.46 	 32910.11
chroma_hps[ 8x32]	2.12x 	 12527.55 	 26520.00
chroma_hpp[  4x8]	2.56x 	 1627.50  	 4165.00
chroma_hps[  4x8]	2.16x 	 2057.50  	 4449.98
chroma_hpp[ 8x16]	2.59x 	 6405.00  	 16590.00
chroma_hps[ 8x16]	2.12x 	 6847.50  	 14520.00
chroma_hpp[16x32]	2.58x 	 24927.11 	 64341.25
chroma_hps[16x32]	2.07x 	 24812.88 	 51371.34
chroma_hpp[32x64]	2.86x 	 98547.48 	 281379.03
chroma_hps[32x64]	5.26x 	 93765.83 	 493587.72
chroma_hpp[  4x4]	2.21x 	 859.99   	 1900.00
chroma_hps[  4x4]	2.06x 	 1350.00  	 2780.58
chroma_hpp[  2x8]	2.16x 	 927.50   	 2007.47
chroma_hps[  2x8]	2.01x 	 1227.50  	 2469.99
chroma_hpp[  8x8]	2.57x 	 3240.00  	 8317.64
chroma_hps[  8x8]	2.13x 	 3992.50  	 8520.16
chroma_hpp[ 4x16]	2.60x 	 3180.10  	 8270.48
chroma_hps[ 4x16]	2.17x 	 3457.50  	 7490.13
chroma_hpp[16x16]	2.58x 	 12480.00 	 32143.02
chroma_hps[16x16]	2.08x 	 13485.08 	 28010.00
chroma_hpp[ 8x32]	2.59x 	 12722.10 	 32910.00
chroma_hps[ 8x32]	2.12x 	 12527.69 	 26520.00
chroma_hpp[32x32]	2.85x 	 49298.53 	 140412.20
chroma_hps[32x32]	5.23x 	 49040.32 	 256335.03
chroma_hpp[16x64]	2.58x 	 49773.44 	 128440.18
chroma_hps[16x64]	2.07x 	 47499.16 	 98369.34
chroma_hpp[ 8x12]	2.60x 	 4820.00  	 12510.08
chroma_hps[ 8x12]	2.12x 	 5427.50  	 11520.14
chroma_hpp[ 6x16]	2.60x 	 4787.50  	 12430.28
chroma_hps[ 6x16]	2.06x 	 5396.98  	 11110.14
chroma_hpp[  8x4]	2.54x 	 1660.01  	 4223.30
chroma_hps[  8x4]	2.09x 	 2572.50  	 5365.00
chroma_hpp[ 2x16]	2.56x 	 1795.00  	 4590.00
chroma_hps[ 2x16]	1.99x 	 2045.00  	 4070.41
chroma_hpp[16x24]	2.58x 	 18678.77 	 48220.21
chroma_hps[16x24]	2.07x 	 19167.97 	 39690.28
chroma_hpp[12x32]	2.65x 	 18382.11 	 48780.00
chroma_hps[12x32]	2.13x 	 18610.71 	 39617.65
chroma_hpp[ 16x8]	2.56x 	 6280.00  	 16067.94
chroma_hps[ 16x8]	2.11x 	 7840.00  	 16521.46
chroma_hpp[ 4x32]	2.60x 	 6260.00  	 16270.26
chroma_hps[ 4x32]	2.17x 	 6257.50  	 13569.99
chroma_hpp[32x48]	2.85x 	 73959.77 	 211129.89
chroma_hps[32x48]	5.25x 	 71408.45 	 374964.56
chroma_hpp[24x64]	2.89x 	 74166.45 	 214010.20
chroma_hps[24x64]	5.26x 	 70630.15 	 371645.31
chroma_hpp[32x16]	2.85x 	 24694.61 	 70257.82
chroma_hps[32x16]	5.17x 	 26643.62 	 137760.22
chroma_hpp[ 8x64]	2.59x 	 25427.29 	 65796.38
chroma_hps[ 8x64]	2.12x 	 23935.77 	 50771.55
chroma_hpp[  4x4]	2.21x 	 860.00   	 1900.00
chroma_hps[  4x4]	2.06x 	 1349.99  	 2780.30
chroma_hpp[  8x8]	2.57x 	 3240.00  	 8317.98
chroma_hps[  8x8]	2.13x 	 3992.53  	 8520.13
chroma_hpp[16x16]	2.58x 	 12479.99 	 32142.50
chroma_hps[16x16]	2.08x 	 13480.09 	 28010.00
chroma_hpp[32x32]	2.85x 	 49309.57 	 140403.56
chroma_hps[32x32]	5.25x 	 49011.53 	 257550.23
chroma_hpp[64x64]	2.38x 	 231651.14 	 551080.75
chroma_hps[64x64]	4.38x 	 223849.25 	 980566.25
chroma_hpp[  8x4]	2.54x 	 1664.99  	 4223.43
chroma_hps[  8x4]	2.09x 	 2572.53  	 5365.68
chroma_hpp[  4x8]	2.56x 	 1627.50  	 4165.15
chroma_hps[  4x8]	2.16x 	 2057.50  	 4449.99
chroma_hpp[ 16x8]	2.56x 	 6270.00  	 16067.78
chroma_hps[ 16x8]	2.08x 	 7845.10  	 16330.42
chroma_hpp[ 8x16]	2.59x 	 6400.06  	 16590.34
chroma_hps[ 8x16]	2.13x 	 6832.48  	 14520.28
chroma_hpp[32x16]	2.83x 	 24749.50 	 70050.38
chroma_hps[32x16]	5.23x 	 26652.20 	 139517.83
chroma_hpp[16x32]	2.58x 	 24927.05 	 64321.18
chroma_hps[16x32]	2.05x 	 25078.26 	 51370.84
chroma_hpp[64x32]	2.81x 	 98087.36 	 275238.38
chroma_hps[64x32]	5.24x 	 97654.80 	 511554.97
chroma_hpp[32x64]	2.85x 	 98561.22 	 281336.09
chroma_hps[32x64]	5.24x 	 93774.05 	 491252.31
chroma_hpp[16x12]	2.58x 	 9370.03  	 24150.00
chroma_hps[16x12]	2.08x 	 10664.99 	 22170.42
chroma_hpp[12x16]	2.65x 	 9215.23  	 24463.34
chroma_hps[12x16]	2.13x 	 10122.50 	 21550.28
chroma_hpp[ 16x4]	2.54x 	 3180.00  	 8063.44
chroma_hps[ 16x4]	2.06x 	 5020.00  	 10350.29
chroma_hpp[ 4x16]	2.60x 	 3175.00  	 8270.13
chroma_hps[ 4x16]	2.17x 	 3450.00  	 7490.27
chroma_hpp[32x24]	2.84x 	 37023.25 	 105046.36
chroma_hps[32x24]	5.14x 	 38119.38 	 195780.92
chroma_hpp[24x32]	2.87x 	 37152.77 	 106763.15
chroma_hps[24x32]	5.27x 	 37088.59 	 195467.23
chroma_hpp[ 32x8]	2.84x 	 12360.00 	 35050.15
chroma_hps[ 32x8]	5.27x 	 15434.85 	 81414.41
chroma_hpp[ 8x32]	2.59x 	 12720.11 	 32910.00
chroma_hps[ 8x32]	2.07x 	 12805.16 	 26520.33
chroma_hpp[64x48]	2.81x 	 147267.20 	 413479.91
chroma_hps[64x48]	5.26x 	 142341.56 	 748049.62
chroma_hpp[48x64]	2.83x 	 147596.70 	 417422.91
chroma_hps[48x64]	5.19x 	 140484.31 	 729314.38
chroma_hpp[64x16]	2.80x 	 49059.62 	 137424.61
chroma_hps[64x16]	5.24x 	 52911.45 	 277367.06
chroma_hpp[16x64]	2.57x 	 49811.68 	 128246.03
chroma_hps[16x64]	2.07x 	 47422.34 	 98353.62

diff -r 0f0d88319f7c -r fcfba27ecf0b source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Jun 02 12:35:53 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Jun 02 08:59:07 2015 -0700
@@ -935,6 +935,15 @@
 
         p.frameInitLowres = x265_frame_init_lowres_core_sse2;
     }
+    if (cpuMask & X265_CPU_SSE3)
+    {
+        ALL_CHROMA_420_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
+        ALL_CHROMA_422_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
+        ALL_CHROMA_444_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
+        ALL_CHROMA_420_PU(filter_hps, interp_4tap_horiz_ps, sse3);
+        ALL_CHROMA_422_PU(filter_hps, interp_4tap_horiz_ps, sse3);
+        ALL_CHROMA_444_PU(filter_hps, interp_4tap_horiz_ps, sse3);
+    }
     if (cpuMask & X265_CPU_SSSE3)
     {
         p.scale1D_128to64 = x265_scale1D_128to64_ssse3;
diff -r 0f0d88319f7c -r fcfba27ecf0b source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm	Tue Jun 02 12:35:53 2015 +0530
+++ b/source/common/x86/ipfilter16.asm	Tue Jun 02 08:59:07 2015 -0700
@@ -564,6 +564,395 @@
     FILTER_VER_LUMA_sse2 ps, 64, 16
     FILTER_VER_LUMA_sse2 ps, 16, 64
 
+%macro FILTERH_W2_4_sse3 2 ; %1 = byte offset within the row, %2 = pp or ps; filters 2 outputs on each of 4 rows
+    movh        m3,     [r0 + %1]                   ; row 0: four 16-bit inputs for output 0
+    movhps      m3,     [r0 + %1 + 2]               ; row 0: four inputs for output 1 (window moved by one 16-bit sample)
+    pmaddwd     m3,     m0                          ; multiply by the words in m0 and add adjacent products -> 4 partial sums
+    movh        m4,     [r0 + r1 + %1]              ; row 1, same layout
+    movhps      m4,     [r0 + r1 + %1 + 2]
+    pmaddwd     m4,     m0
+    pshufd      m2,     m3,     q2301               ; swap the two dwords of each pair...
+    paddd       m3,     m2                          ; ...so every dword holds the full 4-term sum of its pair
+    pshufd      m2,     m4,     q2301
+    paddd       m4,     m2
+    pshufd      m3,     m3,     q3120               ; gather output 0 and output 1 into the low qword
+    pshufd      m4,     m4,     q3120
+    punpcklqdq  m3,     m4                          ; m3 = row 0 outputs | row 1 outputs
+    paddd       m3,     m1                          ; add the offset the caller placed in m1
+    movh        m5,     [r0 + 2 * r1 + %1]          ; row 2
+    movhps      m5,     [r0 + 2 * r1 + %1 + 2]
+    pmaddwd     m5,     m0
+    movh        m4,     [r0 + r4 + %1]              ; row 3: r4 must already hold 3 * r1
+    movhps      m4,     [r0 + r4 + %1 + 2]
+    pmaddwd     m4,     m0
+    pshufd      m2,     m5,     q2301
+    paddd       m5,     m2
+    pshufd      m2,     m4,     q2301
+    paddd       m4,     m2
+    pshufd      m5,     m5,     q3120
+    pshufd      m4,     m4,     q3120
+    punpcklqdq  m5,     m4                          ; m5 = row 2 outputs | row 3 outputs
+    paddd       m5,     m1
+%ifidn %2, pp
+    psrad       m3,     6                           ; pp: arithmetic shift right by 6
+    psrad       m5,     6
+    packssdw    m3,     m5                          ; 8 words: rows 0..3, two outputs each
+    CLIPW       m3,     m7,     m6                  ; CLIPW is defined outside this view; presumably clamps m3 to [m7, m6] - confirm
+%else
+    psrad       m3,     2                           ; ps: arithmetic shift right by 2, no clamp
+    psrad       m5,     2
+    packssdw    m3,     m5
+%endif
+    movd        [r2 + %1], m3                       ; store row 0 (two words)
+    psrldq      m3,     4                           ; bring the next row's pair down to the low dword
+    movd        [r2 + r3 + %1], m3                  ; row 1
+    psrldq      m3,     4
+    movd        [r2 + r3 * 2 + %1], m3              ; row 2
+    psrldq      m3,     4
+    movd        [r2 + r5 + %1], m3                  ; row 3: r5 must already hold 3 * r3
+%endmacro
+
+%macro FILTERH_W2_3_sse3 1 ; %1 = byte offset within the row; filters 2 outputs on each of 3 rows, always with the >> 2 (ps) shift
+    movh        m3,     [r0 + %1]                   ; row 0: four 16-bit inputs for output 0
+    movhps      m3,     [r0 + %1 + 2]               ; row 0: four inputs for output 1
+    pmaddwd     m3,     m0                          ; multiply by the words in m0 and add adjacent products
+    movh        m4,     [r0 + r1 + %1]              ; row 1
+    movhps      m4,     [r0 + r1 + %1 + 2]
+    pmaddwd     m4,     m0
+    pshufd      m2,     m3,     q2301               ; swap the dwords of each pair, then add: full 4-term sums
+    paddd       m3,     m2
+    pshufd      m2,     m4,     q2301
+    paddd       m4,     m2
+    pshufd      m3,     m3,     q3120               ; both outputs of the row into the low qword
+    pshufd      m4,     m4,     q3120
+    punpcklqdq  m3,     m4                          ; m3 = row 0 outputs | row 1 outputs
+    paddd       m3,     m1                          ; add the offset held in m1
+
+    movh        m5,     [r0 + 2 * r1 + %1]          ; row 2 (the third and last row)
+    movhps      m5,     [r0 + 2 * r1 + %1 + 2]
+    pmaddwd     m5,     m0
+
+    pshufd      m2,     m5,     q2301
+    paddd       m5,     m2
+    pshufd      m5,     m5,     q3120               ; row 2 outputs in the low qword (high qword is a duplicate)
+    paddd       m5,     m1
+
+    psrad       m3,     2
+    psrad       m5,     2
+    packssdw    m3,     m5                          ; words: row 0, row 1, row 2, then the duplicate (not stored)
+
+    movd        [r2 + %1], m3                       ; store row 0 (two words)
+    psrldq      m3,     4                           ; next row's pair down to the low dword
+    movd        [r2 + r3 + %1], m3                  ; row 1
+    psrldq      m3,     4
+    movd        [r2 + r3 * 2 + %1], m3              ; row 2
+%endmacro
+
+%macro FILTERH_W4_2_sse3 2 ; %1 = byte offset within the row, %2 = pp or ps; filters 4 outputs on each of 2 rows
+    movh        m3,     [r0 + %1]                   ; row 0: inputs for output 0
+    movhps      m3,     [r0 + %1 + 2]               ; row 0: inputs for output 1
+    pmaddwd     m3,     m0                          ; multiply by the words in m0 and add adjacent products
+    movh        m4,     [r0 + %1 + 4]               ; row 0: inputs for output 2
+    movhps      m4,     [r0 + %1 + 6]               ; row 0: inputs for output 3
+    pmaddwd     m4,     m0
+    pshufd      m2,     m3,     q2301               ; swap the dwords of each pair, then add: full 4-term sums
+    paddd       m3,     m2
+    pshufd      m2,     m4,     q2301
+    paddd       m4,     m2
+    pshufd      m3,     m3,     q3120               ; the two distinct sums into the low qword
+    pshufd      m4,     m4,     q3120
+    punpcklqdq  m3,     m4                          ; m3 = row 0, outputs 0..3 as dwords
+    paddd       m3,     m1                          ; add the offset held in m1
+
+    movh        m5,     [r0 + r1 + %1]              ; row 1, same sequence
+    movhps      m5,     [r0 + r1 + %1 + 2]
+    pmaddwd     m5,     m0
+    movh        m4,     [r0 + r1 + %1 + 4]
+    movhps      m4,     [r0 + r1 + %1 + 6]
+    pmaddwd     m4,     m0
+    pshufd      m2,     m5,     q2301
+    paddd       m5,     m2
+    pshufd      m2,     m4,     q2301
+    paddd       m4,     m2
+    pshufd      m5,     m5,     q3120
+    pshufd      m4,     m4,     q3120
+    punpcklqdq  m5,     m4                          ; m5 = row 1, outputs 0..3 as dwords
+    paddd       m5,     m1
+%ifidn %2, pp
+    psrad       m3,     6                           ; pp: arithmetic shift right by 6
+    psrad       m5,     6
+    packssdw    m3,     m5                          ; low 4 words = row 0, high 4 words = row 1
+    CLIPW       m3,     m7,     m6                  ; CLIPW is defined outside this view; presumably clamps m3 to [m7, m6] - confirm
+%else
+    psrad       m3,     2                           ; ps: arithmetic shift right by 2, no clamp
+    psrad       m5,     2
+    packssdw    m3,     m5
+%endif
+    movh        [r2 + %1], m3                       ; store row 0 (4 words)
+    movhps      [r2 + r3 + %1], m3                  ; store row 1 (4 words)
+%endmacro
+
+%macro FILTERH_W4_1_sse3 1 ; %1 = byte offset within the row; filters 4 outputs of the row at r0 + 2 * r1 only, with the >> 2 (ps) shift
+    movh        m3,     [r0 + 2 * r1 + %1]          ; inputs for output 0 (third row relative to r0)
+    movhps      m3,     [r0 + 2 * r1 + %1 + 2]      ; inputs for output 1
+    pmaddwd     m3,     m0                          ; multiply by the words in m0 and add adjacent products
+    movh        m4,     [r0 + 2 * r1 + %1 + 4]      ; inputs for output 2
+    movhps      m4,     [r0 + 2 * r1 + %1 + 6]      ; inputs for output 3
+    pmaddwd     m4,     m0
+    pshufd      m2,     m3,     q2301               ; swap the dwords of each pair, then add: full 4-term sums
+    paddd       m3,     m2
+    pshufd      m2,     m4,     q2301
+    paddd       m4,     m2
+    pshufd      m3,     m3,     q3120               ; the two distinct sums into the low qword
+    pshufd      m4,     m4,     q3120
+    punpcklqdq  m3,     m4                          ; m3 = outputs 0..3 as dwords
+    paddd       m3,     m1                          ; add the offset held in m1
+
+    psrad       m3,     2
+    packssdw    m3,     m3                          ; narrow to words; only the low 4 words are stored
+    movh        [r2 + r3 * 2 + %1], m3              ; store to the matching third destination row
+%endmacro
+
+%macro FILTERH_W8_1_sse3 2 ; %1 = byte offset within the row, %2 = pp or ps; filters 8 outputs of the single row at r0
+    movh        m3,     [r0 + %1]                   ; inputs for output 0
+    movhps      m3,     [r0 + %1 + 2]               ; inputs for output 1
+    pmaddwd     m3,     m0                          ; multiply by the words in m0 and add adjacent products
+    movh        m4,     [r0 + %1 + 4]               ; inputs for output 2
+    movhps      m4,     [r0 + %1 + 6]               ; inputs for output 3
+    pmaddwd     m4,     m0
+    pshufd      m2,     m3,     q2301               ; swap the dwords of each pair, then add: full 4-term sums
+    paddd       m3,     m2
+    pshufd      m2,     m4,     q2301
+    paddd       m4,     m2
+    pshufd      m3,     m3,     q3120               ; the two distinct sums into the low qword
+    pshufd      m4,     m4,     q3120
+    punpcklqdq  m3,     m4                          ; m3 = outputs 0..3 as dwords
+    paddd       m3,     m1                          ; add the offset held in m1
+
+    movh        m5,     [r0 + %1 + 8]               ; inputs for output 4
+    movhps      m5,     [r0 + %1 + 10]              ; inputs for output 5
+    pmaddwd     m5,     m0
+    movh        m4,     [r0 + %1 + 12]              ; inputs for output 6
+    movhps      m4,     [r0 + %1 + 14]              ; inputs for output 7
+    pmaddwd     m4,     m0
+    pshufd      m2,     m5,     q2301
+    paddd       m5,     m2
+    pshufd      m2,     m4,     q2301
+    paddd       m4,     m2
+    pshufd      m5,     m5,     q3120
+    pshufd      m4,     m4,     q3120
+    punpcklqdq  m5,     m4                          ; m5 = outputs 4..7 as dwords
+    paddd       m5,     m1
+%ifidn %2, pp
+    psrad       m3,     6                           ; pp: arithmetic shift right by 6
+    psrad       m5,     6
+    packssdw    m3,     m5                          ; 8 words = outputs 0..7
+    CLIPW       m3,     m7,     m6                  ; CLIPW is defined outside this view; presumably clamps m3 to [m7, m6] - confirm
+%else
+    psrad       m3,     2                           ; ps: arithmetic shift right by 2, no clamp
+    psrad       m5,     2
+    packssdw    m3,     m5
+%endif
+    movdqu      [r2 + %1], m3                       ; unaligned 16-byte store of the 8 results
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx); the ps build also tests a 6th argument (r5m)
+;-----------------------------------------------------------------------------
+%macro FILTER_HOR_CHROMA_sse3 3 ; %1 = block width, %2 = block height, %3 = pp or ps
+INIT_XMM sse3
+cglobal interp_4tap_horiz_%3_%1x%2, 4, 7, 8
+    add         r3,     r3                          ; dstStride *= 2: the stores below write 16-bit words
+    add         r1,     r1                          ; srcStride *= 2: the loads below read 16-bit words
+    sub         r0,     2                           ; step src back by one 16-bit sample
+    mov         r4d,    r4m                         ; r4 = coeffIdx (5th argument, not auto-loaded by cglobal)
+    add         r4d,    r4d                         ; r4 = coeffIdx * 2; with the *4 scale below this indexes 8-byte table rows
+
+%ifdef PIC
+    lea         r6,     [tab_ChromaCoeff]           ; PIC: take the table address first, then index it
+    movddup     m0,     [r6 + r4 * 4]               ; m0 = 8 bytes of the selected table row, copied into both qwords
+%else
+    movddup     m0,     [tab_ChromaCoeff + r4 * 4]  ; m0 = 8 bytes of the selected table row, copied into both qwords
+%endif
+
+%ifidn %3, ps
+    mova        m1,     [tab_c_n32768]              ; m1 = offset added before the >> 2 of the ps path
+    cmp         r5m,    byte 0                      ; test the 6th argument; the flags are consumed by the je below
+%if %1 <= 6
+    lea         r4,     [r1 * 3]                    ; r4 = 3 * source stride (coeffIdx no longer needed); lea keeps the flags
+    lea         r5,     [r3 * 3]                    ; r5 = 3 * destination stride
+%endif
+    je          .skip                               ; 6th argument == 0: no extra rows, go straight to the main loop
+    sub         r0,     r1                          ; otherwise start one row above src and emit 3 extra rows first
+%if %1 <= 6
+%assign y 1                                         ; narrow blocks: one pass covers all 3 extra rows
+%else
+%assign y 3                                         ; wider blocks: one extra row per pass
+%endif
+%assign z 0
+%rep y
+%assign x 0
+%rep %1/8
+    FILTERH_W8_1_sse3 x, %3
+%assign x x+16                                      ; 8 outputs = 16 bytes
+%endrep
+%if %1 == 4 || (%1 == 6 && z == 0) || (%1 == 12 && z == 0)
+    FILTERH_W4_2_sse3 x, %3                         ; rows 0 and 1 of the 4-wide column
+    FILTERH_W4_1_sse3 x                             ; row 2 of the same column
+%assign x x+8
+%endif
+%if %1 == 2 || (%1 == 6 && z == 0)
+    FILTERH_W2_3_sse3 x                             ; 2-wide column, all 3 rows
+%endif
+%if %1 <= 6
+    lea         r0,     [r0 + r4]                   ; advance source and destination by 3 rows
+    lea         r2,     [r2 + r5]
+%else
+    lea         r0,     [r0 + r1]                   ; advance source and destination by 1 row
+    lea         r2,     [r2 + r3]
+%endif
+%assign z z+1
+%endrep
+.skip:
+%elifidn %3, pp
+    pxor        m7,     m7                          ; m7 = 0, passed to CLIPW by the row macros
+    mova        m6,     [pw_pixel_max]              ; m6 = pw_pixel_max, passed to CLIPW by the row macros
+    mova        m1,     [tab_c_32]                  ; m1 = offset added before the >> 6 of the pp path
+%if %1 == 2 || %1 == 6
+    lea         r4,     [r1 * 3]                    ; r4/r5 = 3 * stride, required by FILTERH_W2_4_sse3
+    lea         r5,     [r3 * 3]
+%endif
+%endif
+
+%if %1 == 2
+%assign y %2/4                                      ; width 2: 4 rows per pass
+%elif %1 <= 6
+%assign y %2/2                                      ; widths 4 and 6: 2 rows per pass
+%else
+%assign y %2                                        ; wider blocks: 1 row per pass
+%endif
+%assign z 0
+%rep y
+%assign x 0
+%rep %1/8
+    FILTERH_W8_1_sse3 x, %3
+%assign x x+16                                      ; 8 outputs = 16 bytes
+%endrep
+%if %1 == 4 || %1 == 6 || (%1 == 12 && (z % 2) == 0)
+    FILTERH_W4_2_sse3 x, %3                         ; covers 2 rows, so width 12 (1 row per pass) needs it every other pass
+%assign x x+8
+%endif
+%if %1 == 2 || (%1 == 6 && (z % 2) == 0)
+    FILTERH_W2_4_sse3 x, %3                         ; covers 4 rows, so width 6 (2 rows per pass) needs it every other pass
+%endif
+%assign z z+1
+%if z < y
+%if %1 == 2
+    lea         r0,     [r0 + 4 * r1]               ; advance 4 rows
+    lea         r2,     [r2 + 4 * r3]
+%elif %1 <= 6
+    lea         r0,     [r0 + 2 * r1]               ; advance 2 rows
+    lea         r2,     [r2 + 2 * r3]
+%else
+    lea         r0,     [r0 + r1]                   ; advance 1 row
+    lea         r2,     [r2 + r3]
+%endif
+%endif ;z < y (pointer update is skipped after the final pass)
+%endrep
+
+RET
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+; Each line below expands FILTER_HOR_CHROMA_sse3 into one function: width, height, variant.
+FILTER_HOR_CHROMA_sse3 2, 4, pp
+FILTER_HOR_CHROMA_sse3 2, 8, pp
+FILTER_HOR_CHROMA_sse3 2, 16, pp
+FILTER_HOR_CHROMA_sse3 4, 2, pp
+FILTER_HOR_CHROMA_sse3 4, 4, pp
+FILTER_HOR_CHROMA_sse3 4, 8, pp
+FILTER_HOR_CHROMA_sse3 4, 16, pp
+FILTER_HOR_CHROMA_sse3 4, 32, pp
+FILTER_HOR_CHROMA_sse3 6, 8, pp
+FILTER_HOR_CHROMA_sse3 6, 16, pp
+FILTER_HOR_CHROMA_sse3 8, 2, pp
+FILTER_HOR_CHROMA_sse3 8, 4, pp
+FILTER_HOR_CHROMA_sse3 8, 6, pp
+FILTER_HOR_CHROMA_sse3 8, 8, pp
+FILTER_HOR_CHROMA_sse3 8, 12, pp
+FILTER_HOR_CHROMA_sse3 8, 16, pp
+FILTER_HOR_CHROMA_sse3 8, 32, pp
+FILTER_HOR_CHROMA_sse3 8, 64, pp
+FILTER_HOR_CHROMA_sse3 12, 16, pp
+FILTER_HOR_CHROMA_sse3 12, 32, pp
+FILTER_HOR_CHROMA_sse3 16, 4, pp
+FILTER_HOR_CHROMA_sse3 16, 8, pp
+FILTER_HOR_CHROMA_sse3 16, 12, pp
+FILTER_HOR_CHROMA_sse3 16, 16, pp
+FILTER_HOR_CHROMA_sse3 16, 24, pp
+FILTER_HOR_CHROMA_sse3 16, 32, pp
+FILTER_HOR_CHROMA_sse3 16, 64, pp
+FILTER_HOR_CHROMA_sse3 24, 32, pp
+FILTER_HOR_CHROMA_sse3 24, 64, pp
+FILTER_HOR_CHROMA_sse3 32, 8, pp
+FILTER_HOR_CHROMA_sse3 32, 16, pp
+FILTER_HOR_CHROMA_sse3 32, 24, pp
+FILTER_HOR_CHROMA_sse3 32, 32, pp
+FILTER_HOR_CHROMA_sse3 32, 48, pp
+FILTER_HOR_CHROMA_sse3 32, 64, pp
+FILTER_HOR_CHROMA_sse3 48, 64, pp
+FILTER_HOR_CHROMA_sse3 64, 16, pp
+FILTER_HOR_CHROMA_sse3 64, 32, pp
+FILTER_HOR_CHROMA_sse3 64, 48, pp
+FILTER_HOR_CHROMA_sse3 64, 64, pp
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+;-----------------------------------------------------------------------------
+; NOTE(review): the code only shows a 6th argument (r5m) tested against 0 and unclamped signed-word stores; the name isRowExt and type int16_t are assumed from x265's C prototype - confirm.
+FILTER_HOR_CHROMA_sse3 2, 4, ps
+FILTER_HOR_CHROMA_sse3 2, 8, ps
+FILTER_HOR_CHROMA_sse3 2, 16, ps
+FILTER_HOR_CHROMA_sse3 4, 2, ps
+FILTER_HOR_CHROMA_sse3 4, 4, ps
+FILTER_HOR_CHROMA_sse3 4, 8, ps
+FILTER_HOR_CHROMA_sse3 4, 16, ps
+FILTER_HOR_CHROMA_sse3 4, 32, ps
+FILTER_HOR_CHROMA_sse3 6, 8, ps
+FILTER_HOR_CHROMA_sse3 6, 16, ps
+FILTER_HOR_CHROMA_sse3 8, 2, ps
+FILTER_HOR_CHROMA_sse3 8, 4, ps
+FILTER_HOR_CHROMA_sse3 8, 6, ps
+FILTER_HOR_CHROMA_sse3 8, 8, ps
+FILTER_HOR_CHROMA_sse3 8, 12, ps
+FILTER_HOR_CHROMA_sse3 8, 16, ps
+FILTER_HOR_CHROMA_sse3 8, 32, ps
+FILTER_HOR_CHROMA_sse3 8, 64, ps
+FILTER_HOR_CHROMA_sse3 12, 16, ps
+FILTER_HOR_CHROMA_sse3 12, 32, ps
+FILTER_HOR_CHROMA_sse3 16, 4, ps
+FILTER_HOR_CHROMA_sse3 16, 8, ps
+FILTER_HOR_CHROMA_sse3 16, 12, ps
+FILTER_HOR_CHROMA_sse3 16, 16, ps
+FILTER_HOR_CHROMA_sse3 16, 24, ps
+FILTER_HOR_CHROMA_sse3 16, 32, ps
+FILTER_HOR_CHROMA_sse3 16, 64, ps
+FILTER_HOR_CHROMA_sse3 24, 32, ps
+FILTER_HOR_CHROMA_sse3 24, 64, ps
+FILTER_HOR_CHROMA_sse3 32, 8, ps
+FILTER_HOR_CHROMA_sse3 32, 16, ps
+FILTER_HOR_CHROMA_sse3 32, 24, ps
+FILTER_HOR_CHROMA_sse3 32, 32, ps
+FILTER_HOR_CHROMA_sse3 32, 48, ps
+FILTER_HOR_CHROMA_sse3 32, 64, ps
+FILTER_HOR_CHROMA_sse3 48, 64, ps
+FILTER_HOR_CHROMA_sse3 64, 16, ps
+FILTER_HOR_CHROMA_sse3 64, 32, ps
+FILTER_HOR_CHROMA_sse3 64, 48, ps
+FILTER_HOR_CHROMA_sse3 64, 64, ps
+
 ;------------------------------------------------------------------------------------------------------------
 ; void interp_8tap_horiz_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;------------------------------------------------------------------------------------------------------------


More information about the x265-devel mailing list