[x265] [PATCH] asm: interp_4tap_horiz_pX sse3 10-bit

chen chenm003 at 163.com
Tue Jun 2 18:16:34 CEST 2015


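Please use the x86inc `movu` macro instead of the raw `movdqu` instruction (the final store in FILTERH_W8_1_sse3).

The rest of the patch looks fine.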
 
At 2015-06-03 00:02:36,dtyx265 at gmail.com wrote:
># HG changeset patch
># User David T Yuen <dtyx265 at gmail.com>
># Date 1433260747 25200
># Node ID fcfba27ecf0b9dac8da123da8cdcac75763496f3
># Parent  0f0d88319f7cc96661eef3c3dcc1befcf60354f3
>asm: interp_4tap_horiz_pX sse3 10-bit
>
>This replaces c code for all of 4tap_horiz pp and ps.
>
>64-bit
>
>./test/TestBench --testbench interp | grep chroma_hp;bp
>chroma_hpp[  4x4]	2.23x 	 854.98   	 1902.48
>chroma_hps[  4x4]	2.05x 	 1354.99  	 2778.18
>chroma_hpp[  8x8]	2.57x 	 3235.00  	 8320.13
>chroma_hps[  8x8]	2.14x 	 3982.52  	 8520.44
>chroma_hpp[16x16]	2.58x 	 12467.41 	 32145.56
>chroma_hps[16x16]	2.08x 	 13478.99 	 28010.00
>chroma_hpp[32x32]	2.85x 	 49318.75 	 140794.05
>chroma_hps[32x32]	5.19x 	 49005.44 	 254361.41
>chroma_hpp[  4x2]	2.22x 	 457.50   	 1014.99
>chroma_hps[  4x2]	2.01x 	 1004.99  	 2020.98
>chroma_hpp[  2x4]	2.12x 	 512.50   	 1087.49
>chroma_hps[  2x4]	1.85x 	 820.00   	 1520.58
>chroma_hpp[  8x4]	2.55x 	 1655.00  	 4225.89
>chroma_hps[  8x4]	2.09x 	 2572.48  	 5365.30
>chroma_hpp[  4x8]	2.56x 	 1625.01  	 4167.64
>chroma_hps[  4x8]	2.16x 	 2057.50  	 4449.99
>chroma_hpp[ 16x8]	2.56x 	 6275.00  	 16070.69
>chroma_hps[ 16x8]	2.11x 	 7840.00  	 16520.78
>chroma_hpp[ 8x16]	2.62x 	 6395.00  	 16779.55
>chroma_hps[ 8x16]	2.13x 	 6832.50  	 14520.42
>chroma_hpp[32x16]	2.84x 	 24707.68 	 70049.97
>chroma_hps[32x16]	5.19x 	 26659.50 	 138467.00
>chroma_hpp[16x32]	2.58x 	 24917.14 	 64318.72
>chroma_hps[16x32]	2.07x 	 24819.08 	 51370.22
>chroma_hpp[  8x6]	2.57x 	 2450.00  	 6293.12
>chroma_hps[  8x6]	2.10x 	 3282.56  	 6900.30
>chroma_hpp[  6x8]	2.57x 	 2429.24  	 6240.71
>chroma_hps[  6x8]	2.07x 	 3160.13  	 6550.00
>chroma_hpp[  8x2]	2.19x 	 872.52   	 1909.97
>chroma_hps[  8x2]	2.06x 	 1877.50  	 3865.74
>chroma_hpp[  2x8]	2.16x 	 927.49   	 2005.01
>chroma_hps[  2x8]	2.01x 	 1227.50  	 2470.27
>chroma_hpp[16x12]	2.58x 	 9371.36  	 24152.49
>chroma_hps[16x12]	2.10x 	 10665.00 	 22398.23
>chroma_hpp[12x16]	2.65x 	 9215.00  	 24465.62
>chroma_hps[12x16]	2.13x 	 10122.57 	 21550.70
>chroma_hpp[ 16x4]	2.54x 	 3170.00  	 8065.96
>chroma_hps[ 16x4]	2.06x 	 5020.00  	 10350.29
>chroma_hpp[ 4x16]	2.60x 	 3175.00  	 8270.00
>chroma_hps[ 4x16]	2.17x 	 3450.00  	 7490.13
>chroma_hpp[32x24]	2.85x 	 36976.00 	 105213.45
>chroma_hps[32x24]	5.27x 	 37801.41 	 199137.09
>chroma_hpp[24x32]	2.88x 	 37093.77 	 106880.20
>chroma_hps[24x32]	5.24x 	 36958.88 	 193609.19
>chroma_hpp[ 32x8]	2.84x 	 12360.07 	 35049.98
>chroma_hps[ 32x8]	5.24x 	 15477.46 	 81162.80
>chroma_hpp[ 8x32]	2.59x 	 12721.46 	 32910.11
>chroma_hps[ 8x32]	2.12x 	 12527.55 	 26520.00
>chroma_hpp[  4x8]	2.56x 	 1627.50  	 4165.00
>chroma_hps[  4x8]	2.16x 	 2057.50  	 4449.98
>chroma_hpp[ 8x16]	2.59x 	 6405.00  	 16590.00
>chroma_hps[ 8x16]	2.12x 	 6847.50  	 14520.00
>chroma_hpp[16x32]	2.58x 	 24927.11 	 64341.25
>chroma_hps[16x32]	2.07x 	 24812.88 	 51371.34
>chroma_hpp[32x64]	2.86x 	 98547.48 	 281379.03
>chroma_hps[32x64]	5.26x 	 93765.83 	 493587.72
>chroma_hpp[  4x4]	2.21x 	 859.99   	 1900.00
>chroma_hps[  4x4]	2.06x 	 1350.00  	 2780.58
>chroma_hpp[  2x8]	2.16x 	 927.50   	 2007.47
>chroma_hps[  2x8]	2.01x 	 1227.50  	 2469.99
>chroma_hpp[  8x8]	2.57x 	 3240.00  	 8317.64
>chroma_hps[  8x8]	2.13x 	 3992.50  	 8520.16
>chroma_hpp[ 4x16]	2.60x 	 3180.10  	 8270.48
>chroma_hps[ 4x16]	2.17x 	 3457.50  	 7490.13
>chroma_hpp[16x16]	2.58x 	 12480.00 	 32143.02
>chroma_hps[16x16]	2.08x 	 13485.08 	 28010.00
>chroma_hpp[ 8x32]	2.59x 	 12722.10 	 32910.00
>chroma_hps[ 8x32]	2.12x 	 12527.69 	 26520.00
>chroma_hpp[32x32]	2.85x 	 49298.53 	 140412.20
>chroma_hps[32x32]	5.23x 	 49040.32 	 256335.03
>chroma_hpp[16x64]	2.58x 	 49773.44 	 128440.18
>chroma_hps[16x64]	2.07x 	 47499.16 	 98369.34
>chroma_hpp[ 8x12]	2.60x 	 4820.00  	 12510.08
>chroma_hps[ 8x12]	2.12x 	 5427.50  	 11520.14
>chroma_hpp[ 6x16]	2.60x 	 4787.50  	 12430.28
>chroma_hps[ 6x16]	2.06x 	 5396.98  	 11110.14
>chroma_hpp[  8x4]	2.54x 	 1660.01  	 4223.30
>chroma_hps[  8x4]	2.09x 	 2572.50  	 5365.00
>chroma_hpp[ 2x16]	2.56x 	 1795.00  	 4590.00
>chroma_hps[ 2x16]	1.99x 	 2045.00  	 4070.41
>chroma_hpp[16x24]	2.58x 	 18678.77 	 48220.21
>chroma_hps[16x24]	2.07x 	 19167.97 	 39690.28
>chroma_hpp[12x32]	2.65x 	 18382.11 	 48780.00
>chroma_hps[12x32]	2.13x 	 18610.71 	 39617.65
>chroma_hpp[ 16x8]	2.56x 	 6280.00  	 16067.94
>chroma_hps[ 16x8]	2.11x 	 7840.00  	 16521.46
>chroma_hpp[ 4x32]	2.60x 	 6260.00  	 16270.26
>chroma_hps[ 4x32]	2.17x 	 6257.50  	 13569.99
>chroma_hpp[32x48]	2.85x 	 73959.77 	 211129.89
>chroma_hps[32x48]	5.25x 	 71408.45 	 374964.56
>chroma_hpp[24x64]	2.89x 	 74166.45 	 214010.20
>chroma_hps[24x64]	5.26x 	 70630.15 	 371645.31
>chroma_hpp[32x16]	2.85x 	 24694.61 	 70257.82
>chroma_hps[32x16]	5.17x 	 26643.62 	 137760.22
>chroma_hpp[ 8x64]	2.59x 	 25427.29 	 65796.38
>chroma_hps[ 8x64]	2.12x 	 23935.77 	 50771.55
>chroma_hpp[  4x4]	2.21x 	 860.00   	 1900.00
>chroma_hps[  4x4]	2.06x 	 1349.99  	 2780.30
>chroma_hpp[  8x8]	2.57x 	 3240.00  	 8317.98
>chroma_hps[  8x8]	2.13x 	 3992.53  	 8520.13
>chroma_hpp[16x16]	2.58x 	 12479.99 	 32142.50
>chroma_hps[16x16]	2.08x 	 13480.09 	 28010.00
>chroma_hpp[32x32]	2.85x 	 49309.57 	 140403.56
>chroma_hps[32x32]	5.25x 	 49011.53 	 257550.23
>chroma_hpp[64x64]	2.38x 	 231651.14 	 551080.75
>chroma_hps[64x64]	4.38x 	 223849.25 	 980566.25
>chroma_hpp[  8x4]	2.54x 	 1664.99  	 4223.43
>chroma_hps[  8x4]	2.09x 	 2572.53  	 5365.68
>chroma_hpp[  4x8]	2.56x 	 1627.50  	 4165.15
>chroma_hps[  4x8]	2.16x 	 2057.50  	 4449.99
>chroma_hpp[ 16x8]	2.56x 	 6270.00  	 16067.78
>chroma_hps[ 16x8]	2.08x 	 7845.10  	 16330.42
>chroma_hpp[ 8x16]	2.59x 	 6400.06  	 16590.34
>chroma_hps[ 8x16]	2.13x 	 6832.48  	 14520.28
>chroma_hpp[32x16]	2.83x 	 24749.50 	 70050.38
>chroma_hps[32x16]	5.23x 	 26652.20 	 139517.83
>chroma_hpp[16x32]	2.58x 	 24927.05 	 64321.18
>chroma_hps[16x32]	2.05x 	 25078.26 	 51370.84
>chroma_hpp[64x32]	2.81x 	 98087.36 	 275238.38
>chroma_hps[64x32]	5.24x 	 97654.80 	 511554.97
>chroma_hpp[32x64]	2.85x 	 98561.22 	 281336.09
>chroma_hps[32x64]	5.24x 	 93774.05 	 491252.31
>chroma_hpp[16x12]	2.58x 	 9370.03  	 24150.00
>chroma_hps[16x12]	2.08x 	 10664.99 	 22170.42
>chroma_hpp[12x16]	2.65x 	 9215.23  	 24463.34
>chroma_hps[12x16]	2.13x 	 10122.50 	 21550.28
>chroma_hpp[ 16x4]	2.54x 	 3180.00  	 8063.44
>chroma_hps[ 16x4]	2.06x 	 5020.00  	 10350.29
>chroma_hpp[ 4x16]	2.60x 	 3175.00  	 8270.13
>chroma_hps[ 4x16]	2.17x 	 3450.00  	 7490.27
>chroma_hpp[32x24]	2.84x 	 37023.25 	 105046.36
>chroma_hps[32x24]	5.14x 	 38119.38 	 195780.92
>chroma_hpp[24x32]	2.87x 	 37152.77 	 106763.15
>chroma_hps[24x32]	5.27x 	 37088.59 	 195467.23
>chroma_hpp[ 32x8]	2.84x 	 12360.00 	 35050.15
>chroma_hps[ 32x8]	5.27x 	 15434.85 	 81414.41
>chroma_hpp[ 8x32]	2.59x 	 12720.11 	 32910.00
>chroma_hps[ 8x32]	2.07x 	 12805.16 	 26520.33
>chroma_hpp[64x48]	2.81x 	 147267.20 	 413479.91
>chroma_hps[64x48]	5.26x 	 142341.56 	 748049.62
>chroma_hpp[48x64]	2.83x 	 147596.70 	 417422.91
>chroma_hps[48x64]	5.19x 	 140484.31 	 729314.38
>chroma_hpp[64x16]	2.80x 	 49059.62 	 137424.61
>chroma_hps[64x16]	5.24x 	 52911.45 	 277367.06
>chroma_hpp[16x64]	2.57x 	 49811.68 	 128246.03
>chroma_hps[16x64]	2.07x 	 47422.34 	 98353.62
>
>diff -r 0f0d88319f7c -r fcfba27ecf0b source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp	Tue Jun 02 12:35:53 2015 +0530
>+++ b/source/common/x86/asm-primitives.cpp	Tue Jun 02 08:59:07 2015 -0700
>@@ -935,6 +935,15 @@
> 
>         p.frameInitLowres = x265_frame_init_lowres_core_sse2;
>     }
>+    if (cpuMask & X265_CPU_SSE3)
>+    {
>+        ALL_CHROMA_420_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
>+        ALL_CHROMA_422_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
>+        ALL_CHROMA_444_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
>+        ALL_CHROMA_420_PU(filter_hps, interp_4tap_horiz_ps, sse3);
>+        ALL_CHROMA_422_PU(filter_hps, interp_4tap_horiz_ps, sse3);
>+        ALL_CHROMA_444_PU(filter_hps, interp_4tap_horiz_ps, sse3);
>+    }
>     if (cpuMask & X265_CPU_SSSE3)
>     {
>         p.scale1D_128to64 = x265_scale1D_128to64_ssse3;
>diff -r 0f0d88319f7c -r fcfba27ecf0b source/common/x86/ipfilter16.asm
>--- a/source/common/x86/ipfilter16.asm	Tue Jun 02 12:35:53 2015 +0530
>+++ b/source/common/x86/ipfilter16.asm	Tue Jun 02 08:59:07 2015 -0700
>@@ -564,6 +564,395 @@
>     FILTER_VER_LUMA_sse2 ps, 64, 16
>     FILTER_VER_LUMA_sse2 ps, 16, 64
> 
>+%macro FILTERH_W2_4_sse3 2
>+    movh        m3,     [r0 + %1]
>+    movhps      m3,     [r0 + %1 + 2]
>+    pmaddwd     m3,     m0
>+    movh        m4,     [r0 + r1 + %1]
>+    movhps      m4,     [r0 + r1 + %1 + 2]
>+    pmaddwd     m4,     m0
>+    pshufd      m2,     m3,     q2301
>+    paddd       m3,     m2
>+    pshufd      m2,     m4,     q2301
>+    paddd       m4,     m2
>+    pshufd      m3,     m3,     q3120
>+    pshufd      m4,     m4,     q3120
>+    punpcklqdq  m3,     m4
>+    paddd       m3,     m1
>+    movh        m5,     [r0 + 2 * r1 + %1]
>+    movhps      m5,     [r0 + 2 * r1 + %1 + 2]
>+    pmaddwd     m5,     m0
>+    movh        m4,     [r0 + r4 + %1]
>+    movhps      m4,     [r0 + r4 + %1 + 2]
>+    pmaddwd     m4,     m0
>+    pshufd      m2,     m5,     q2301
>+    paddd       m5,     m2
>+    pshufd      m2,     m4,     q2301
>+    paddd       m4,     m2
>+    pshufd      m5,     m5,     q3120
>+    pshufd      m4,     m4,     q3120
>+    punpcklqdq  m5,     m4
>+    paddd       m5,     m1
>+%ifidn %2, pp
>+    psrad       m3,     6
>+    psrad       m5,     6
>+    packssdw    m3,     m5
>+    CLIPW       m3,     m7,     m6
>+%else
>+    psrad       m3,     2
>+    psrad       m5,     2
>+    packssdw    m3,     m5
>+%endif
>+    movd        [r2 + %1], m3
>+    psrldq      m3,     4
>+    movd        [r2 + r3 + %1], m3
>+    psrldq      m3,     4
>+    movd        [r2 + r3 * 2 + %1], m3
>+    psrldq      m3,     4
>+    movd        [r2 + r5 + %1], m3
>+%endmacro
>+
>+%macro FILTERH_W2_3_sse3 1
>+    movh        m3,     [r0 + %1]
>+    movhps      m3,     [r0 + %1 + 2]
>+    pmaddwd     m3,     m0
>+    movh        m4,     [r0 + r1 + %1]
>+    movhps      m4,     [r0 + r1 + %1 + 2]
>+    pmaddwd     m4,     m0
>+    pshufd      m2,     m3,     q2301
>+    paddd       m3,     m2
>+    pshufd      m2,     m4,     q2301
>+    paddd       m4,     m2
>+    pshufd      m3,     m3,     q3120
>+    pshufd      m4,     m4,     q3120
>+    punpcklqdq  m3,     m4
>+    paddd       m3,     m1
>+
>+    movh        m5,     [r0 + 2 * r1 + %1]
>+    movhps      m5,     [r0 + 2 * r1 + %1 + 2]
>+    pmaddwd     m5,     m0
>+
>+    pshufd      m2,     m5,     q2301
>+    paddd       m5,     m2
>+    pshufd      m5,     m5,     q3120
>+    paddd       m5,     m1
>+
>+    psrad       m3,     2
>+    psrad       m5,     2
>+    packssdw    m3,     m5
>+
>+    movd        [r2 + %1], m3
>+    psrldq      m3,     4
>+    movd        [r2 + r3 + %1], m3
>+    psrldq      m3,     4
>+    movd        [r2 + r3 * 2 + %1], m3
>+%endmacro
>+
>+%macro FILTERH_W4_2_sse3 2
>+    movh        m3,     [r0 + %1]
>+    movhps      m3,     [r0 + %1 + 2]
>+    pmaddwd     m3,     m0
>+    movh        m4,     [r0 + %1 + 4]
>+    movhps      m4,     [r0 + %1 + 6]
>+    pmaddwd     m4,     m0
>+    pshufd      m2,     m3,     q2301
>+    paddd       m3,     m2
>+    pshufd      m2,     m4,     q2301
>+    paddd       m4,     m2
>+    pshufd      m3,     m3,     q3120
>+    pshufd      m4,     m4,     q3120
>+    punpcklqdq  m3,     m4
>+    paddd       m3,     m1
>+
>+    movh        m5,     [r0 + r1 + %1]
>+    movhps      m5,     [r0 + r1 + %1 + 2]
>+    pmaddwd     m5,     m0
>+    movh        m4,     [r0 + r1 + %1 + 4]
>+    movhps      m4,     [r0 + r1 + %1 + 6]
>+    pmaddwd     m4,     m0
>+    pshufd      m2,     m5,     q2301
>+    paddd       m5,     m2
>+    pshufd      m2,     m4,     q2301
>+    paddd       m4,     m2
>+    pshufd      m5,     m5,     q3120
>+    pshufd      m4,     m4,     q3120
>+    punpcklqdq  m5,     m4
>+    paddd       m5,     m1
>+%ifidn %2, pp
>+    psrad       m3,     6
>+    psrad       m5,     6
>+    packssdw    m3,     m5
>+    CLIPW       m3,     m7,     m6
>+%else
>+    psrad       m3,     2
>+    psrad       m5,     2
>+    packssdw    m3,     m5
>+%endif
>+    movh        [r2 + %1], m3
>+    movhps      [r2 + r3 + %1], m3
>+%endmacro
>+
>+%macro FILTERH_W4_1_sse3 1
>+    movh        m3,     [r0 + 2 * r1 + %1]
>+    movhps      m3,     [r0 + 2 * r1 + %1 + 2]
>+    pmaddwd     m3,     m0
>+    movh        m4,     [r0 + 2 * r1 + %1 + 4]
>+    movhps      m4,     [r0 + 2 * r1 + %1 + 6]
>+    pmaddwd     m4,     m0
>+    pshufd      m2,     m3,     q2301
>+    paddd       m3,     m2
>+    pshufd      m2,     m4,     q2301
>+    paddd       m4,     m2
>+    pshufd      m3,     m3,     q3120
>+    pshufd      m4,     m4,     q3120
>+    punpcklqdq  m3,     m4
>+    paddd       m3,     m1
>+
>+    psrad       m3,     2
>+    packssdw    m3,     m3
>+    movh        [r2 + r3 * 2 + %1], m3
>+%endmacro
>+
>+%macro FILTERH_W8_1_sse3 2
>+    movh        m3,     [r0 + %1]
>+    movhps      m3,     [r0 + %1 + 2]
>+    pmaddwd     m3,     m0
>+    movh        m4,     [r0 + %1 + 4]
>+    movhps      m4,     [r0 + %1 + 6]
>+    pmaddwd     m4,     m0
>+    pshufd      m2,     m3,     q2301
>+    paddd       m3,     m2
>+    pshufd      m2,     m4,     q2301
>+    paddd       m4,     m2
>+    pshufd      m3,     m3,     q3120
>+    pshufd      m4,     m4,     q3120
>+    punpcklqdq  m3,     m4
>+    paddd       m3,     m1
>+
>+    movh        m5,     [r0 + %1 + 8]
>+    movhps      m5,     [r0 + %1 + 10]
>+    pmaddwd     m5,     m0
>+    movh        m4,     [r0 + %1 + 12]
>+    movhps      m4,     [r0 + %1 + 14]
>+    pmaddwd     m4,     m0
>+    pshufd      m2,     m5,     q2301
>+    paddd       m5,     m2
>+    pshufd      m2,     m4,     q2301
>+    paddd       m4,     m2
>+    pshufd      m5,     m5,     q3120
>+    pshufd      m4,     m4,     q3120
>+    punpcklqdq  m5,     m4
>+    paddd       m5,     m1
>+%ifidn %2, pp
>+    psrad       m3,     6
>+    psrad       m5,     6
>+    packssdw    m3,     m5
>+    CLIPW       m3,     m7,     m6
>+%else
>+    psrad       m3,     2
>+    psrad       m5,     2
>+    packssdw    m3,     m5
>+%endif
>+    movdqu      [r2 + %1], m3
>+%endmacro
>+
>+;-----------------------------------------------------------------------------
>+; void interp_4tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
>+;-----------------------------------------------------------------------------
>+%macro FILTER_HOR_CHROMA_sse3 3
>+INIT_XMM sse3
>+cglobal interp_4tap_horiz_%3_%1x%2, 4, 7, 8
>+    add         r3,     r3
>+    add         r1,     r1
>+    sub         r0,     2
>+    mov         r4d,    r4m
>+    add         r4d,    r4d
>+
>+%ifdef PIC
>+    lea         r6,     [tab_ChromaCoeff]
>+    movddup     m0,     [r6 + r4 * 4]
>+%else
>+    movddup     m0,     [tab_ChromaCoeff + r4 * 4]
>+%endif
>+
>+%ifidn %3, ps
>+    mova        m1,     [tab_c_n32768]
>+    cmp         r5m,    byte 0
>+%if %1 <= 6
>+    lea         r4,     [r1 * 3]
>+    lea         r5,     [r3 * 3]
>+%endif
>+    je          .skip
>+    sub         r0,     r1
>+%if %1 <= 6
>+%assign y 1
>+%else
>+%assign y 3
>+%endif
>+%assign z 0
>+%rep y
>+%assign x 0
>+%rep %1/8
>+    FILTERH_W8_1_sse3 x, %3
>+%assign x x+16
>+%endrep
>+%if %1 == 4 || (%1 == 6 && z == 0) || (%1 == 12 && z == 0)
>+    FILTERH_W4_2_sse3 x, %3
>+    FILTERH_W4_1_sse3 x
>+%assign x x+8
>+%endif
>+%if %1 == 2 || (%1 == 6 && z == 0)
>+    FILTERH_W2_3_sse3 x
>+%endif
>+%if %1 <= 6
>+    lea         r0,     [r0 + r4]
>+    lea         r2,     [r2 + r5]
>+%else
>+    lea         r0,     [r0 + r1]
>+    lea         r2,     [r2 + r3]
>+%endif
>+%assign z z+1
>+%endrep
>+.skip:
>+%elifidn %3, pp
>+    pxor        m7,     m7
>+    mova        m6,     [pw_pixel_max]
>+    mova        m1,     [tab_c_32]
>+%if %1 == 2 || %1 == 6
>+    lea         r4,     [r1 * 3]
>+    lea         r5,     [r3 * 3]
>+%endif
>+%endif
>+
>+%if %1 == 2
>+%assign y %2/4
>+%elif %1 <= 6
>+%assign y %2/2
>+%else
>+%assign y %2
>+%endif
>+%assign z 0
>+%rep y
>+%assign x 0
>+%rep %1/8
>+    FILTERH_W8_1_sse3 x, %3
>+%assign x x+16
>+%endrep
>+%if %1 == 4 || %1 == 6 || (%1 == 12 && (z % 2) == 0)
>+    FILTERH_W4_2_sse3 x, %3
>+%assign x x+8
>+%endif
>+%if %1 == 2 || (%1 == 6 && (z % 2) == 0)
>+    FILTERH_W2_4_sse3 x, %3
>+%endif
>+%assign z z+1
>+%if z < y
>+%if %1 == 2
>+    lea         r0,     [r0 + 4 * r1]
>+    lea         r2,     [r2 + 4 * r3]
>+%elif %1 <= 6
>+    lea         r0,     [r0 + 2 * r1]
>+    lea         r2,     [r2 + 2 * r3]
>+%else
>+    lea         r0,     [r0 + r1]
>+    lea         r2,     [r2 + r3]
>+%endif
>+%endif ;z < y
>+%endrep
>+
>+RET
>+%endmacro
>+
>+;-----------------------------------------------------------------------------
>+; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
>+;-----------------------------------------------------------------------------
>+
>+FILTER_HOR_CHROMA_sse3 2, 4, pp
>+FILTER_HOR_CHROMA_sse3 2, 8, pp
>+FILTER_HOR_CHROMA_sse3 2, 16, pp
>+FILTER_HOR_CHROMA_sse3 4, 2, pp
>+FILTER_HOR_CHROMA_sse3 4, 4, pp
>+FILTER_HOR_CHROMA_sse3 4, 8, pp
>+FILTER_HOR_CHROMA_sse3 4, 16, pp
>+FILTER_HOR_CHROMA_sse3 4, 32, pp
>+FILTER_HOR_CHROMA_sse3 6, 8, pp
>+FILTER_HOR_CHROMA_sse3 6, 16, pp
>+FILTER_HOR_CHROMA_sse3 8, 2, pp
>+FILTER_HOR_CHROMA_sse3 8, 4, pp
>+FILTER_HOR_CHROMA_sse3 8, 6, pp
>+FILTER_HOR_CHROMA_sse3 8, 8, pp
>+FILTER_HOR_CHROMA_sse3 8, 12, pp
>+FILTER_HOR_CHROMA_sse3 8, 16, pp
>+FILTER_HOR_CHROMA_sse3 8, 32, pp
>+FILTER_HOR_CHROMA_sse3 8, 64, pp
>+FILTER_HOR_CHROMA_sse3 12, 16, pp
>+FILTER_HOR_CHROMA_sse3 12, 32, pp
>+FILTER_HOR_CHROMA_sse3 16, 4, pp
>+FILTER_HOR_CHROMA_sse3 16, 8, pp
>+FILTER_HOR_CHROMA_sse3 16, 12, pp
>+FILTER_HOR_CHROMA_sse3 16, 16, pp
>+FILTER_HOR_CHROMA_sse3 16, 24, pp
>+FILTER_HOR_CHROMA_sse3 16, 32, pp
>+FILTER_HOR_CHROMA_sse3 16, 64, pp
>+FILTER_HOR_CHROMA_sse3 24, 32, pp
>+FILTER_HOR_CHROMA_sse3 24, 64, pp
>+FILTER_HOR_CHROMA_sse3 32, 8, pp
>+FILTER_HOR_CHROMA_sse3 32, 16, pp
>+FILTER_HOR_CHROMA_sse3 32, 24, pp
>+FILTER_HOR_CHROMA_sse3 32, 32, pp
>+FILTER_HOR_CHROMA_sse3 32, 48, pp
>+FILTER_HOR_CHROMA_sse3 32, 64, pp
>+FILTER_HOR_CHROMA_sse3 48, 64, pp
>+FILTER_HOR_CHROMA_sse3 64, 16, pp
>+FILTER_HOR_CHROMA_sse3 64, 32, pp
>+FILTER_HOR_CHROMA_sse3 64, 48, pp
>+FILTER_HOR_CHROMA_sse3 64, 64, pp
>+
>+;-----------------------------------------------------------------------------
>+; void interp_4tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
>+;-----------------------------------------------------------------------------
>+
>+FILTER_HOR_CHROMA_sse3 2, 4, ps
>+FILTER_HOR_CHROMA_sse3 2, 8, ps
>+FILTER_HOR_CHROMA_sse3 2, 16, ps
>+FILTER_HOR_CHROMA_sse3 4, 2, ps
>+FILTER_HOR_CHROMA_sse3 4, 4, ps
>+FILTER_HOR_CHROMA_sse3 4, 8, ps
>+FILTER_HOR_CHROMA_sse3 4, 16, ps
>+FILTER_HOR_CHROMA_sse3 4, 32, ps
>+FILTER_HOR_CHROMA_sse3 6, 8, ps
>+FILTER_HOR_CHROMA_sse3 6, 16, ps
>+FILTER_HOR_CHROMA_sse3 8, 2, ps
>+FILTER_HOR_CHROMA_sse3 8, 4, ps
>+FILTER_HOR_CHROMA_sse3 8, 6, ps
>+FILTER_HOR_CHROMA_sse3 8, 8, ps
>+FILTER_HOR_CHROMA_sse3 8, 12, ps
>+FILTER_HOR_CHROMA_sse3 8, 16, ps
>+FILTER_HOR_CHROMA_sse3 8, 32, ps
>+FILTER_HOR_CHROMA_sse3 8, 64, ps
>+FILTER_HOR_CHROMA_sse3 12, 16, ps
>+FILTER_HOR_CHROMA_sse3 12, 32, ps
>+FILTER_HOR_CHROMA_sse3 16, 4, ps
>+FILTER_HOR_CHROMA_sse3 16, 8, ps
>+FILTER_HOR_CHROMA_sse3 16, 12, ps
>+FILTER_HOR_CHROMA_sse3 16, 16, ps
>+FILTER_HOR_CHROMA_sse3 16, 24, ps
>+FILTER_HOR_CHROMA_sse3 16, 32, ps
>+FILTER_HOR_CHROMA_sse3 16, 64, ps
>+FILTER_HOR_CHROMA_sse3 24, 32, ps
>+FILTER_HOR_CHROMA_sse3 24, 64, ps
>+FILTER_HOR_CHROMA_sse3 32, 8, ps
>+FILTER_HOR_CHROMA_sse3 32, 16, ps
>+FILTER_HOR_CHROMA_sse3 32, 24, ps
>+FILTER_HOR_CHROMA_sse3 32, 32, ps
>+FILTER_HOR_CHROMA_sse3 32, 48, ps
>+FILTER_HOR_CHROMA_sse3 32, 64, ps
>+FILTER_HOR_CHROMA_sse3 48, 64, ps
>+FILTER_HOR_CHROMA_sse3 64, 16, ps
>+FILTER_HOR_CHROMA_sse3 64, 32, ps
>+FILTER_HOR_CHROMA_sse3 64, 48, ps
>+FILTER_HOR_CHROMA_sse3 64, 64, ps
>+
> ;------------------------------------------------------------------------------------------------------------
> ; void interp_8tap_horiz_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
> ;------------------------------------------------------------------------------------------------------------
>_______________________________________________
>x265-devel mailing list
>x265-devel at videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150603/cf6cc359/attachment-0001.html>


More information about the x265-devel mailing list