[x265] [PATCH] asm: interp_8tap_horiz pp and ps sse2

Steve Borho steve at borho.org
Tue Apr 28 18:09:14 CEST 2015


On 04/27, dtyx265 at gmail.com wrote:
> # HG changeset patch
> # User David T Yuen <dtyx265 at gmail.com>
> # Date 1430194906 25200
> # Node ID 027aaded20e76e719bb6b143de41b8739fb68b9e
> # Parent  68a13226d586b335c02cade9311e093f0149c42a
> asm: interp_8tap_horiz pp and ps sse2
> 
> This replaces c code and covers
> 
> 4x4, 4x8, 4x16, 8x4, 8x8, 8x16, 8x32, 12x16, 16x4, 16x8, 16x12, 16x16, 16x32, 16x64,
> 24x32, 32x8, 32x16, 32x24, 32x32, 32x64, 48x64, 64x16, 64x32, 64x48, 64x64

On Mac x64 8bpp, this patch is causing a SIGSEGV in the test bench.
I am dequeuing it.

> 64-bit
> 
> ./test/TestBench --testbench interp | grep luma_h
> luma_hpp[  4x4]		1.86x 	 1852.87  	 3441.42
> luma_hps[  4x4]		1.78x 	 4662.80  	 8315.98
> luma_hpp[  8x8]		2.18x 	 6126.91  	 13358.12
> luma_hps[  8x8]		1.93x 	 11087.93 	 21351.43
> luma_hpp[16x16]		2.54x 	 24023.32 	 61097.57
> luma_hps[16x16]		2.27x 	 33064.12 	 74962.39
> luma_hpp[32x32]		2.52x 	 96782.22 	 244302.42
> luma_hps[32x32]		2.20x 	 111190.70 	 244826.36
> luma_hpp[64x64]		2.52x 	 382848.38 	 965268.81
> luma_hps[64x64]		2.31x 	 404641.47 	 933627.75
> luma_hpp[  8x4]		2.35x 	 3055.54  	 7175.84
> luma_hps[  8x4]		1.89x 	 8227.78  	 15575.28
> luma_hpp[  4x8]		1.87x 	 3690.05  	 6903.09
> luma_hps[  4x8]		1.81x 	 6222.84  	 11235.39
> luma_hpp[ 16x8]		2.56x 	 12006.92 	 30678.69
> luma_hps[ 16x8]		2.26x 	 21606.27 	 48876.84
> luma_hpp[ 8x16]		2.21x 	 12225.59 	 26988.44
> luma_hps[ 8x16]		1.92x 	 16809.11 	 32258.04
> luma_hpp[32x16]		2.50x 	 47966.91 	 119978.61
> luma_hps[32x16]		2.22x 	 65530.77 	 145648.20
> luma_hpp[16x32]		2.53x 	 47943.95 	 121086.88
> luma_hps[16x32]		2.32x 	 55757.34 	 129380.16
> luma_hpp[64x32]		2.51x 	 192540.17 	 482519.84
> luma_hps[64x32]		2.31x 	 223606.27 	 516736.66
> luma_hpp[32x64]		2.52x 	 191115.05 	 481093.03
> luma_hps[32x64]		2.23x 	 202603.52 	 452672.53
> luma_hpp[16x12]		2.52x 	 18149.56 	 45765.54
> luma_hps[16x12]		2.26x 	 27318.41 	 61835.58
> luma_hpp[12x16]		2.29x 	 19368.50 	 44395.32
> luma_hps[12x16]		1.85x 	 25962.37 	 48125.04
> luma_hpp[ 16x4]		2.52x 	 6027.79  	 15214.09
> luma_hps[ 16x4]		2.26x 	 15872.79 	 35917.86
> luma_hpp[ 4x16]		1.99x 	 7470.23  	 14896.98
> luma_hps[ 4x16]		1.83x 	 9342.87  	 17077.48
> luma_hpp[32x24]		2.52x 	 71642.52 	 180756.81
> luma_hps[32x24]		2.20x 	 88241.04 	 194519.33
> luma_hpp[24x32]		2.54x 	 71766.33 	 182282.92
> luma_hps[24x32]		2.24x 	 83465.88 	 187333.91
> luma_hpp[ 32x8]		2.51x 	 23823.40 	 59883.32
> luma_hps[ 32x8]		2.20x 	 42823.66 	 94268.15
> luma_hpp[ 8x32]		2.38x 	 24316.01 	 57792.96
> luma_hps[ 8x32]		1.93x 	 28384.35 	 54849.94
> luma_hpp[64x48]		2.48x 	 287461.72 	 712744.88
> luma_hps[64x48]		2.29x 	 313082.53 	 716684.25
> luma_hpp[48x64]		2.53x 	 287235.50 	 725398.94
> luma_hps[48x64]		2.10x 	 317556.03 	 667405.00
> luma_hpp[64x16]		2.53x 	 95767.20 	 241838.22
> luma_hps[64x16]		2.33x 	 130718.30 	 304524.62
> luma_hpp[16x64]		2.57x 	 95334.80 	 244946.48
> luma_hps[16x64]		2.28x 	 101269.19 	 231212.62
> 
> 32-bit
> 
> ./test/TestBench --testbench interp | grep luma_h
> luma_hpp[  4x4]		2.03x 	 1855.35  	 3763.12
> luma_hps[  4x4]		1.79x 	 4827.67  	 8630.31
> luma_hpp[  8x8]		2.34x 	 6185.32  	 14485.46
> luma_hps[  8x8]		1.93x 	 11082.82 	 21390.96
> luma_hpp[16x16]		2.68x 	 24277.91 	 65107.63
> luma_hps[16x16]		2.33x 	 32964.77 	 76937.46
> luma_hpp[32x32]		2.61x 	 95937.99 	 250347.53
> luma_hps[32x32]		2.24x 	 110881.09 	 248610.78
> luma_hpp[64x64]		2.59x 	 384984.62 	 996606.31
> luma_hps[64x64]		2.20x 	 405764.12 	 893234.75
> luma_hpp[  8x4]		2.28x 	 3180.56  	 7265.61
> luma_hps[  8x4]		1.92x 	 8242.61  	 15790.53
> luma_hpp[  4x8]		2.12x 	 3606.77  	 7652.88
> luma_hps[  4x8]		1.81x 	 6427.69  	 11630.39
> luma_hpp[ 16x8]		2.67x 	 12140.81 	 32355.57
> luma_hps[ 16x8]		2.33x 	 21643.76 	 50358.95
> luma_hpp[ 8x16]		2.36x 	 12278.08 	 29025.54
> luma_hps[ 8x16]		1.94x 	 16762.84 	 32590.21
> luma_hpp[32x16]		2.62x 	 47968.25 	 125563.23
> luma_hps[32x16]		2.25x 	 65772.64 	 147959.81
> luma_hpp[16x32]		2.69x 	 48010.28 	 129074.63
> luma_hps[16x32]		2.32x 	 56048.27 	 130077.62
> luma_hpp[64x32]		2.57x 	 191772.20 	 493535.38
> luma_hps[64x32]		2.22x 	 222292.73 	 493297.94
> luma_hpp[32x64]		2.65x 	 191459.34 	 506724.47
> luma_hps[32x64]		2.24x 	 202199.86 	 452178.41
> luma_hpp[16x12]		2.67x 	 18317.57 	 48935.64
> luma_hps[16x12]		2.33x 	 27407.98 	 63835.12
> luma_hpp[12x16]		2.26x 	 19220.64 	 43485.26
> luma_hps[12x16]		1.92x 	 25738.31 	 49392.66
> luma_hpp[ 16x4]		2.82x 	 6157.95  	 17389.13
> luma_hps[ 16x4]		2.32x 	 15962.58 	 37061.99
> luma_hpp[ 4x16]		2.15x 	 7188.05  	 15453.23
> luma_hps[ 4x16]		1.83x 	 9628.12  	 17630.33
> luma_hpp[32x24]		2.59x 	 72228.04 	 187162.30
> luma_hps[32x24]		2.23x 	 88585.02 	 197808.91
> luma_hpp[24x32]		2.63x 	 72044.62 	 189704.78
> luma_hps[24x32]		2.25x 	 83849.13 	 188660.86
> luma_hpp[ 32x8]		2.60x 	 24121.99 	 62621.62
> luma_hps[ 32x8]		2.22x 	 43095.41 	 95508.37
> luma_hpp[ 8x32]		2.40x 	 24388.66 	 58462.95
> luma_hps[ 8x32]		1.97x 	 28271.80 	 55649.31
> luma_hpp[64x48]		2.64x 	 286337.97 	 756953.81
> luma_hps[64x48]		2.19x 	 313365.66 	 687777.12
> luma_hpp[48x64]		2.38x 	 310695.75 	 740744.94
> luma_hps[48x64]		2.21x 	 302819.50 	 668183.75
> luma_hpp[64x16]		2.58x 	 95900.21 	 247758.72
> luma_hps[64x16]		2.17x 	 131831.64 	 286192.53
> luma_hpp[16x64]		2.70x 	 95992.55 	 258870.31
> luma_hps[16x64]		2.37x 	 101104.82 	 239193.94
> 
> diff -r 68a13226d586 -r 027aaded20e7 source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp	Sat Apr 25 01:39:55 2015 -0500
> +++ b/source/common/x86/asm-primitives.cpp	Mon Apr 27 21:21:46 2015 -0700
> @@ -1340,6 +1340,10 @@
>          CHROMA_420_VSP_FILTERS(_sse2);
>          CHROMA_422_VSP_FILTERS(_sse2);
>          CHROMA_444_VSP_FILTERS(_sse2);
> +        ALL_LUMA_PU(luma_hpp, interp_8tap_horiz_pp, sse2);
> +        p.pu[LUMA_4x4].luma_hpp = x265_interp_8tap_horiz_pp_4x4_sse2;
> +        ALL_LUMA_PU(luma_hps, interp_8tap_horiz_ps, sse2);
> +        p.pu[LUMA_4x4].luma_hps = x265_interp_8tap_horiz_ps_4x4_sse2;
>  
>          //p.frameInitLowres = x265_frame_init_lowres_core_mmx2;
>          p.frameInitLowres = x265_frame_init_lowres_core_sse2;
> diff -r 68a13226d586 -r 027aaded20e7 source/common/x86/ipfilter8.asm
> --- a/source/common/x86/ipfilter8.asm	Sat Apr 25 01:39:55 2015 -0500
> +++ b/source/common/x86/ipfilter8.asm	Mon Apr 27 21:21:46 2015 -0700
> @@ -160,6 +160,11 @@
>                   db  -1, 4, -11, 40,  40, -11, 4, -1
>                   db   0, 1, -5,  17,  58, -10, 4, -1
>  
> +tabw_LumaCoeff:  dw   0, 0,  0,  64,  0,   0,  0,  0
> +                 dw  -1, 4, -10, 58,  17, -5,  1,  0
> +                 dw  -1, 4, -11, 40,  40, -11, 4, -1
> +                 dw   0, 1, -5,  17,  58, -10, 4, -1
> +
>  tab_LumaCoeffV: times 4 dw 0, 0
>                  times 4 dw 0, 64
>                  times 4 dw 0, 0
> @@ -825,6 +830,229 @@
>      IPFILTER_CHROMA_W_sse3 48, 64
>      IPFILTER_CHROMA_W_sse3 64, 16
>  
> +%macro FILTER_H8_W8_sse2 0
> +    movh        m1, [r0 + r5 - 3]
> +    movh        m4, [r0 + r5 - 2]
> +    punpcklbw   m1, m6
> +    punpcklbw   m4, m6
> +    movh        m5, [r0 + r5 - 1]
> +    movh        m0, [r0 + r5]
> +    punpcklbw   m5, m6
> +    punpcklbw   m0, m6
> +    pmaddwd     m1, m3
> +    pmaddwd     m4, m3
> +    pmaddwd     m5, m3
> +    pmaddwd     m0, m3
> +    packssdw    m1, m4
> +    packssdw    m5, m0
> +    pshuflw     m4, m1, q2301
> +    pshufhw     m4, m4, q2301
> +    pshuflw     m0, m5, q2301
> +    pshufhw     m0, m0, q2301
> +    paddw       m1, m4
> +    paddw       m5, m0
> +    psrldq      m1, 2
> +    psrldq      m5, 2
> +    pshufd      m1, m1, q3120
> +    pshufd      m5, m5, q3120
> +    punpcklqdq  m1, m5
> +    movh        m7, [r0 + r5 + 1]
> +    movh        m4, [r0 + r5 + 2]
> +    punpcklbw   m7, m6
> +    punpcklbw   m4, m6
> +    movh        m5, [r0 + r5 + 3]
> +    movh        m0, [r0 + r5 + 4]
> +    punpcklbw   m5, m6
> +    punpcklbw   m0, m6
> +    pmaddwd     m7, m3
> +    pmaddwd     m4, m3
> +    pmaddwd     m5, m3
> +    pmaddwd     m0, m3
> +    packssdw    m7, m4
> +    packssdw    m5, m0
> +    pshuflw     m4, m7, q2301
> +    pshufhw     m4, m4, q2301
> +    pshuflw     m0, m5, q2301
> +    pshufhw     m0, m0, q2301
> +    paddw       m7, m4
> +    paddw       m5, m0
> +    psrldq      m7, 2
> +    psrldq      m5, 2
> +    pshufd      m7, m7, q3120
> +    pshufd      m5, m5, q3120
> +    punpcklqdq  m7, m5
> +    pshuflw     m4, m1, q2301
> +    pshufhw     m4, m4, q2301
> +    pshuflw     m0, m7, q2301
> +    pshufhw     m0, m0, q2301
> +    paddw       m1, m4
> +    paddw       m7, m0
> +    psrldq      m1, 2
> +    psrldq      m7, 2
> +    pshufd      m1, m1, q3120
> +    pshufd      m7, m7, q3120
> +    punpcklqdq  m1, m7
> +%endmacro
> +
> +%macro FILTER_H8_W4_sse2 0
> +    movh        m1, [r0 + r5 - 3]
> +    movh        m0, [r0 + r5 - 2]
> +    punpcklbw   m1, m6
> +    punpcklbw   m0, m6
> +    movh        m4, [r0 + r5 - 1]
> +    movh        m5, [r0 + r5]
> +    punpcklbw   m4, m6
> +    punpcklbw   m5, m6
> +    pmaddwd     m1, m3
> +    pmaddwd     m0, m3
> +    pmaddwd     m4, m3
> +    pmaddwd     m5, m3
> +    packssdw    m1, m0
> +    packssdw    m4, m5
> +    pshuflw     m0, m1, q2301
> +    pshufhw     m0, m0, q2301
> +    pshuflw     m5, m4, q2301
> +    pshufhw     m5, m5, q2301
> +    paddw       m1, m0
> +    paddw       m4, m5
> +    psrldq      m1, 2
> +    psrldq      m4, 2
> +    pshufd      m1, m1, q3120
> +    pshufd      m4, m4, q3120
> +    punpcklqdq  m1, m4
> +    pshuflw     m0, m1, q2301
> +    pshufhw     m0, m0, q2301
> +    paddw       m1, m0
> +    psrldq      m1, 2
> +    pshufd      m1, m1, q3120
> +%endmacro
> +
> +;----------------------------------------------------------------------------------------------------------------------------
> +; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
> +;----------------------------------------------------------------------------------------------------------------------------
> +%macro IPFILTER_LUMA_sse2 3
> +INIT_XMM sse2
> +cglobal interp_8tap_horiz_%3_%1x%2, 4,7,8
> +
> +    mov       r4d, r4m
> +    add       r4d, r4d
> +    pxor      m6, m6
> +%ifdef PIC
> +    lea       r6, [tabw_LumaCoeff]
> +    mova      m3, [r6 + r4 * 8]
> +%else
> +    mova      m3, [tabw_LumaCoeff + r4 * 8]
> +%endif
> +
> +    mov       r4d, %2
> +%ifidn %3, pp
> +    mova      m2, [pw_32]
> +%else
> +    mova      m2, [pw_2000]
> +    add       r3d, r3d
> +    cmp       r5m, byte 0
> +    je        .loopH
> +    lea       r6, [r1 + 2 * r1]
> +    sub       r0d, r6d
> +    add       r4d, 7
> +%endif
> +
> +.loopH:
> +    xor       r5d, r5d
> +%rep %1 / 8
> +    FILTER_H8_W8_sse2
> +  %ifidn %3, pp
> +    paddw     m1, m2
> +    psraw     m1, 6
> +    packuswb  m1, m1
> +    movh      [r2 + r5], m1
> +  %else
> +    psubw     m1, m2
> +    movu      [r2 + 2 * r5], m1
> +  %endif
> +    add       r5d, 8
> +%endrep
> +
> +%rep (%1 % 8) / 4
> +    FILTER_H8_W4_sse2
> +  %ifidn %3, pp
> +    paddw     m1, m2
> +    psraw     m1, 6
> +    packuswb  m1, m1
> +    movd      [r2 + r5], m1
> +  %else
> +    psubw     m1, m2
> +    movh      [r2 + 2 * r5], m1
> +  %endif
> +%endrep
> +
> +    add       r0d, r1d
> +    add       r2d, r3d
> +
> +    dec       r4d
> +    jnz       .loopH
> +    RET
> +
> +%endmacro
> +
> +;--------------------------------------------------------------------------------------------------------------
> +; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
> +;--------------------------------------------------------------------------------------------------------------
> +    IPFILTER_LUMA_sse2 4, 4, pp
> +    IPFILTER_LUMA_sse2 4, 8, pp
> +    IPFILTER_LUMA_sse2 8, 4, pp
> +    IPFILTER_LUMA_sse2 8, 8, pp
> +    IPFILTER_LUMA_sse2 16, 16, pp
> +    IPFILTER_LUMA_sse2 16, 8, pp
> +    IPFILTER_LUMA_sse2 8, 16, pp
> +    IPFILTER_LUMA_sse2 16, 12, pp
> +    IPFILTER_LUMA_sse2 12, 16, pp
> +    IPFILTER_LUMA_sse2 16, 4, pp
> +    IPFILTER_LUMA_sse2 4, 16, pp
> +    IPFILTER_LUMA_sse2 32, 32, pp
> +    IPFILTER_LUMA_sse2 32, 16, pp
> +    IPFILTER_LUMA_sse2 16, 32, pp
> +    IPFILTER_LUMA_sse2 32, 24, pp
> +    IPFILTER_LUMA_sse2 24, 32, pp
> +    IPFILTER_LUMA_sse2 32, 8, pp
> +    IPFILTER_LUMA_sse2 8, 32, pp
> +    IPFILTER_LUMA_sse2 64, 64, pp
> +    IPFILTER_LUMA_sse2 64, 32, pp
> +    IPFILTER_LUMA_sse2 32, 64, pp
> +    IPFILTER_LUMA_sse2 64, 48, pp
> +    IPFILTER_LUMA_sse2 48, 64, pp
> +    IPFILTER_LUMA_sse2 64, 16, pp
> +    IPFILTER_LUMA_sse2 16, 64, pp
> +
> +;----------------------------------------------------------------------------------------------------------------------------
> +; void interp_8tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
> +;----------------------------------------------------------------------------------------------------------------------------
> +    IPFILTER_LUMA_sse2 4, 4, ps
> +    IPFILTER_LUMA_sse2 8, 8, ps
> +    IPFILTER_LUMA_sse2 8, 4, ps
> +    IPFILTER_LUMA_sse2 4, 8, ps
> +    IPFILTER_LUMA_sse2 16, 16, ps
> +    IPFILTER_LUMA_sse2 16, 8, ps
> +    IPFILTER_LUMA_sse2 8, 16, ps
> +    IPFILTER_LUMA_sse2 16, 12, ps
> +    IPFILTER_LUMA_sse2 12, 16, ps
> +    IPFILTER_LUMA_sse2 16, 4, ps
> +    IPFILTER_LUMA_sse2 4, 16, ps
> +    IPFILTER_LUMA_sse2 32, 32, ps
> +    IPFILTER_LUMA_sse2 32, 16, ps
> +    IPFILTER_LUMA_sse2 16, 32, ps
> +    IPFILTER_LUMA_sse2 32, 24, ps
> +    IPFILTER_LUMA_sse2 24, 32, ps
> +    IPFILTER_LUMA_sse2 32, 8, ps
> +    IPFILTER_LUMA_sse2 8, 32, ps
> +    IPFILTER_LUMA_sse2 64, 64, ps
> +    IPFILTER_LUMA_sse2 64, 32, ps
> +    IPFILTER_LUMA_sse2 32, 64, ps
> +    IPFILTER_LUMA_sse2 64, 48, ps
> +    IPFILTER_LUMA_sse2 48, 64, ps
> +    IPFILTER_LUMA_sse2 64, 16, ps
> +    IPFILTER_LUMA_sse2 16, 64, ps
> +
>  ;-----------------------------------------------------------------------------
>  ; void interp_4tap_horiz_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
>  ;-----------------------------------------------------------------------------
> diff -r 68a13226d586 -r 027aaded20e7 source/common/x86/ipfilter8.h
> --- a/source/common/x86/ipfilter8.h	Sat Apr 25 01:39:55 2015 -0500
> +++ b/source/common/x86/ipfilter8.h	Mon Apr 27 21:21:46 2015 -0700
> @@ -846,6 +846,56 @@
>  void x265_interp_4tap_horiz_pp_64x32_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
>  void x265_interp_4tap_horiz_pp_64x48_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
>  void x265_interp_4tap_horiz_pp_64x64_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_4x4_sse2(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_4x8_sse2(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_4x16_sse2(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_8x4_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_8x8_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_8x16_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_8x32_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_12x16_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_16x4_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_16x8_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_16x12_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_16x16_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_16x32_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_16x64_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_24x32_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_32x8_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_32x16_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_32x24_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_32x32_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_32x64_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_48x64_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_64x16_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_64x32_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_64x48_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_64x64_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_ps_4x4_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_4x8_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_4x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_8x4_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_8x8_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_8x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_8x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_12x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_16x4_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_16x8_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_16x12_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_16x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_16x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_16x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_24x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_32x8_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_32x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_32x24_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_32x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_32x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_48x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_64x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_64x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_64x48_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_64x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
>  #undef LUMA_FILTERS
>  #undef LUMA_SP_FILTERS
>  #undef LUMA_SS_FILTERS
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel

-- 
Steve Borho


More information about the x265-devel mailing list