[x265] [PATCH] asm: interp_8tap_horiz pp and ps sse2

Steve Borho steve at borho.org
Fri May 1 21:22:45 CEST 2015


On 05/01, dtyx265 at gmail.com wrote:
> # HG changeset patch
> # User David T Yuen <dtyx265 at gmail.com>
> # Date 1430505170 25200
> # Node ID 705b796531bb7c83c908df396ecac44ed007f642
> # Parent  bca33880585aec616107a8232204dbcb148f6678
> asm: interp_8tap_horiz pp and ps sse2
> 
> This replaces the C code and covers the following block sizes:
> 
> 4x4, 4x8, 4x16, 8x4, 8x8, 8x16, 8x32, 12x16, 16x4, 16x8, 16x12, 16x16, 16x32,
> 16x64, 24x32, 32x8, 32x16, 32x24, 32x32, 32x64, 48x64, 64x16, 64x32, 64x48, 64x64
> 
> 64-bit

OK, this one is queued ahead of the other two. So far so good with the smoke
tests.

> ./test/TestBench --testbench interp | grep luma_h
> luma_hpp[  4x4]		1.93x 	 1785.21  	 3441.77
> luma_hps[  4x4]		1.85x 	 4487.62  	 8315.70
> luma_hpp[  8x8]		2.20x 	 6085.18  	 13358.46
> luma_hps[  8x8]		1.93x 	 10968.99 	 21135.98
> luma_hpp[16x16]		2.55x 	 23908.50 	 60945.62
> luma_hps[16x16]		2.27x 	 32943.21 	 74795.46
> luma_hpp[32x32]		2.55x 	 94576.35 	 240974.75
> luma_hps[32x32]		2.21x 	 110731.82 	 244953.02
> luma_hpp[64x64]		2.52x 	 376616.72 	 950787.38
> luma_hps[64x64]		2.30x 	 401879.16 	 924680.88
> luma_hpp[  8x4]		2.37x 	 3025.64  	 7175.84
> luma_hps[  8x4]		1.92x 	 8122.71  	 15575.27
> luma_hpp[  4x8]		1.94x 	 3562.64  	 6902.68
> luma_hps[  4x8]		1.85x 	 6087.67  	 11235.14
> luma_hpp[ 16x8]		2.56x 	 11901.34 	 30423.46
> luma_hps[ 16x8]		2.27x 	 21591.06 	 49091.93
> luma_hpp[ 8x16]		2.26x 	 12082.94 	 27258.80
> luma_hps[ 8x16]		1.94x 	 16595.28 	 32255.00
> luma_hpp[32x16]		2.53x 	 47407.80 	 120014.57
> luma_hps[32x16]		2.21x 	 65504.00 	 144679.64
> luma_hpp[16x32]		2.55x 	 47504.11 	 121076.39
> luma_hps[16x32]		2.29x 	 55546.89 	 127202.02
> luma_hpp[64x32]		2.54x 	 187763.11 	 476195.81
> luma_hps[64x32]		2.30x 	 221199.25 	 509145.91
> luma_hpp[32x64]		2.56x 	 188414.62 	 481738.12
> luma_hps[32x64]		2.21x 	 201473.38 	 445438.25
> luma_hpp[16x12]		2.54x 	 17977.88 	 45582.48
> luma_hps[16x12]		2.28x 	 27250.61 	 62118.11
> luma_hpp[12x16]		2.34x 	 18927.60 	 44255.13
> luma_hps[12x16]		1.89x 	 25587.42 	 48416.81
> luma_hpp[ 16x4]		2.56x 	 5973.04  	 15269.98
> luma_hps[ 16x4]		2.26x 	 15887.99 	 35915.50
> luma_hpp[ 4x16]		2.07x 	 7217.70  	 14905.61
> luma_hps[ 4x16]		1.87x 	 9127.53  	 17075.28
> luma_hpp[32x24]		2.54x 	 70878.05 	 180289.80
> luma_hps[32x24]		2.21x 	 88167.30 	 194824.70
> luma_hpp[24x32]		2.56x 	 70795.49 	 181563.86
> luma_hps[24x32]		2.23x 	 83265.12 	 185663.55
> luma_hpp[ 32x8]		2.55x 	 23584.57 	 60200.72
> luma_hps[ 32x8]		2.21x 	 42744.65 	 94329.84
> luma_hpp[ 8x32]		2.41x 	 23956.56 	 57703.53
> luma_hps[ 8x32]		1.96x 	 28049.62 	 54929.88
> luma_hpp[64x48]		2.56x 	 281883.84 	 721026.38
> luma_hps[64x48]		2.30x 	 311796.88 	 716619.81
> luma_hpp[48x64]		2.53x 	 281606.78 	 713136.94
> luma_hps[48x64]		2.20x 	 301767.56 	 662826.19
> luma_hpp[64x16]		2.53x 	 94093.50 	 237928.16
> luma_hps[64x16]		2.29x 	 131499.05 	 300629.41
> luma_hpp[16x64]		2.57x 	 94736.50 	 243494.28
> luma_hps[16x64]		2.31x 	 101098.77 	 233989.20
> 
> 32-bit
> 
> ./test/TestBench --testbench interp | grep luma_h
> luma_hpp[  4x4]		1.93x 	 1785.24  	 3441.14
> luma_hps[  4x4]		1.85x 	 4487.93  	 8315.42
> luma_hpp[  8x8]		2.21x 	 6047.94  	 13358.18
> luma_hps[  8x8]		1.93x 	 10942.52 	 21136.20
> luma_hpp[16x16]		2.56x 	 23917.82 	 61177.23
> luma_hps[16x16]		2.28x 	 32885.36 	 75134.07
> luma_hpp[32x32]		2.56x 	 94386.95 	 241838.64
> luma_hps[32x32]		2.21x 	 110783.13 	 245033.38
> luma_hpp[64x64]		2.53x 	 375728.38 	 951385.19
> luma_hps[64x64]		2.30x 	 402418.16 	 925132.25
> luma_hpp[  8x4]		2.37x 	 3025.71  	 7176.25
> luma_hps[  8x4]		1.92x 	 8122.72  	 15575.27
> luma_hpp[  4x8]		1.94x 	 3562.75  	 6903.02
> luma_hps[  4x8]		1.88x 	 5967.75  	 11235.42
> luma_hpp[ 16x8]		2.55x 	 11910.71 	 30372.57
> luma_hps[ 16x8]		2.27x 	 21521.60 	 48876.16
> luma_hpp[ 8x16]		2.21x 	 12188.04 	 26989.18
> luma_hps[ 8x16]		1.96x 	 16621.85 	 32579.98
> luma_hpp[32x16]		2.54x 	 47375.83 	 120352.38
> luma_hps[32x16]		2.21x 	 65398.98 	 144650.23
> luma_hpp[16x32]		2.54x 	 47576.41 	 121038.56
> luma_hps[16x32]		2.29x 	 55665.23 	 127244.27
> luma_hpp[64x32]		2.53x 	 188135.53 	 476050.16
> luma_hps[64x32]		2.29x 	 222327.25 	 508834.03
> luma_hpp[32x64]		2.56x 	 188590.23 	 482111.97
> luma_hps[32x64]		2.21x 	 201558.94 	 446036.34
> luma_hpp[16x12]		2.53x 	 18031.92 	 45533.77
> luma_hps[16x12]		2.27x 	 27192.16 	 61836.39
> luma_hpp[12x16]		2.34x 	 18921.93 	 44205.55
> luma_hps[12x16]		1.87x 	 25719.06 	 48125.95
> luma_hpp[ 16x4]		2.55x 	 5973.22  	 15212.72
> luma_hps[ 16x4]		2.26x 	 15882.92 	 35915.98
> luma_hpp[ 4x16]		2.07x 	 7217.61  	 14905.33
> luma_hps[ 4x16]		1.87x 	 9127.53  	 17075.68
> luma_hpp[32x24]		2.54x 	 70891.50 	 180249.72
> luma_hps[32x24]		2.21x 	 88216.69 	 194942.09
> luma_hpp[24x32]		2.57x 	 70934.99 	 182461.02
> luma_hps[24x32]		2.23x 	 83228.09 	 185491.73
> luma_hpp[ 32x8]		2.56x 	 23528.41 	 60156.14
> luma_hps[ 32x8]		2.20x 	 42846.86 	 94353.64
> luma_hpp[ 8x32]		2.40x 	 24028.00 	 57627.60
> luma_hps[ 8x32]		1.95x 	 28035.82 	 54596.18
> luma_hpp[64x48]		2.52x 	 283397.28 	 714446.75
> luma_hps[64x48]		2.30x 	 312186.34 	 717680.06
> luma_hpp[48x64]		2.53x 	 282245.50 	 713527.88
> luma_hps[48x64]		2.22x 	 302935.97 	 671615.19
> luma_hpp[64x16]		2.52x 	 94195.39 	 237710.97
> luma_hps[64x16]		2.29x 	 131054.00 	 300285.38
> luma_hpp[16x64]		2.57x 	 94785.75 	 243506.12
> luma_hps[16x64]		2.29x 	 100971.30 	 231653.41
> 
> diff -r bca33880585a -r 705b796531bb source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp	Sat Apr 25 00:41:25 2015 -0500
> +++ b/source/common/x86/asm-primitives.cpp	Fri May 01 11:32:50 2015 -0700
> @@ -1343,6 +1343,11 @@
>          CHROMA_422_VSP_FILTERS(_sse2);
>          CHROMA_444_VSP_FILTERS(_sse2);
>  
> +        ALL_LUMA_PU(luma_hpp, interp_8tap_horiz_pp, sse2);
> +        p.pu[LUMA_4x4].luma_hpp = x265_interp_8tap_horiz_pp_4x4_sse2;
> +        ALL_LUMA_PU(luma_hps, interp_8tap_horiz_ps, sse2);
> +        p.pu[LUMA_4x4].luma_hps = x265_interp_8tap_horiz_ps_4x4_sse2;
> +
>          //p.frameInitLowres = x265_frame_init_lowres_core_mmx2;
>          p.frameInitLowres = x265_frame_init_lowres_core_sse2;
>  
> diff -r bca33880585a -r 705b796531bb source/common/x86/ipfilter8.asm
> --- a/source/common/x86/ipfilter8.asm	Sat Apr 25 00:41:25 2015 -0500
> +++ b/source/common/x86/ipfilter8.asm	Fri May 01 11:32:50 2015 -0700
> @@ -151,6 +151,11 @@
>                         db  -1, 4, -11, 40,  40, -11, 4, -1
>                         db   0, 1, -5,  17,  58, -10, 4, -1
>  
> +const tabw_LumaCoeff,  dw   0, 0,  0,  64,  0,   0,  0,  0
> +                       dw  -1, 4, -10, 58,  17, -5,  1,  0
> +                       dw  -1, 4, -11, 40,  40, -11, 4, -1
> +                       dw   0, 1, -5,  17,  58, -10, 4, -1
> +
>  const tab_LumaCoeffV,   times 4 dw 0, 0
>                          times 4 dw 0, 64
>                          times 4 dw 0, 0
> @@ -807,6 +812,233 @@
>      IPFILTER_CHROMA_W_sse3 48, 64
>      IPFILTER_CHROMA_W_sse3 64, 16
>  
> +%macro FILTER_H8_W8_sse2 0
> +    movh        m1, [r0 + x - 3]
> +    movh        m4, [r0 + x - 2]
> +    punpcklbw   m1, m6
> +    punpcklbw   m4, m6
> +    movh        m5, [r0 + x - 1]
> +    movh        m0, [r0 + x]
> +    punpcklbw   m5, m6
> +    punpcklbw   m0, m6
> +    pmaddwd     m1, m3
> +    pmaddwd     m4, m3
> +    pmaddwd     m5, m3
> +    pmaddwd     m0, m3
> +    packssdw    m1, m4
> +    packssdw    m5, m0
> +    pshuflw     m4, m1, q2301
> +    pshufhw     m4, m4, q2301
> +    pshuflw     m0, m5, q2301
> +    pshufhw     m0, m0, q2301
> +    paddw       m1, m4
> +    paddw       m5, m0
> +    psrldq      m1, 2
> +    psrldq      m5, 2
> +    pshufd      m1, m1, q3120
> +    pshufd      m5, m5, q3120
> +    punpcklqdq  m1, m5
> +    movh        m7, [r0 + x + 1]
> +    movh        m4, [r0 + x + 2]
> +    punpcklbw   m7, m6
> +    punpcklbw   m4, m6
> +    movh        m5, [r0 + x + 3]
> +    movh        m0, [r0 + x + 4]
> +    punpcklbw   m5, m6
> +    punpcklbw   m0, m6
> +    pmaddwd     m7, m3
> +    pmaddwd     m4, m3
> +    pmaddwd     m5, m3
> +    pmaddwd     m0, m3
> +    packssdw    m7, m4
> +    packssdw    m5, m0
> +    pshuflw     m4, m7, q2301
> +    pshufhw     m4, m4, q2301
> +    pshuflw     m0, m5, q2301
> +    pshufhw     m0, m0, q2301
> +    paddw       m7, m4
> +    paddw       m5, m0
> +    psrldq      m7, 2
> +    psrldq      m5, 2
> +    pshufd      m7, m7, q3120
> +    pshufd      m5, m5, q3120
> +    punpcklqdq  m7, m5
> +    pshuflw     m4, m1, q2301
> +    pshufhw     m4, m4, q2301
> +    pshuflw     m0, m7, q2301
> +    pshufhw     m0, m0, q2301
> +    paddw       m1, m4
> +    paddw       m7, m0
> +    psrldq      m1, 2
> +    psrldq      m7, 2
> +    pshufd      m1, m1, q3120
> +    pshufd      m7, m7, q3120
> +    punpcklqdq  m1, m7
> +%endmacro
> +
> +%macro FILTER_H8_W4_sse2 0
> +    movh        m1, [r0 + x - 3]
> +    movh        m0, [r0 + x - 2]
> +    punpcklbw   m1, m6
> +    punpcklbw   m0, m6
> +    movh        m4, [r0 + x - 1]
> +    movh        m5, [r0 + x]
> +    punpcklbw   m4, m6
> +    punpcklbw   m5, m6
> +    pmaddwd     m1, m3
> +    pmaddwd     m0, m3
> +    pmaddwd     m4, m3
> +    pmaddwd     m5, m3
> +    packssdw    m1, m0
> +    packssdw    m4, m5
> +    pshuflw     m0, m1, q2301
> +    pshufhw     m0, m0, q2301
> +    pshuflw     m5, m4, q2301
> +    pshufhw     m5, m5, q2301
> +    paddw       m1, m0
> +    paddw       m4, m5
> +    psrldq      m1, 2
> +    psrldq      m4, 2
> +    pshufd      m1, m1, q3120
> +    pshufd      m4, m4, q3120
> +    punpcklqdq  m1, m4
> +    pshuflw     m0, m1, q2301
> +    pshufhw     m0, m0, q2301
> +    paddw       m1, m0
> +    psrldq      m1, 2
> +    pshufd      m1, m1, q3120
> +%endmacro
> +
> +;----------------------------------------------------------------------------------------------------------------------------
> +; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
> +;----------------------------------------------------------------------------------------------------------------------------
> +%macro IPFILTER_LUMA_sse2 3
> +INIT_XMM sse2
> +cglobal interp_8tap_horiz_%3_%1x%2, 4,6,8
> +    mov       r4d, r4m
> +    add       r4d, r4d
> +    pxor      m6, m6
> +
> +%ifidn %3, ps
> +    add       r3d, r3d
> +    cmp       r5m, byte 0
> +%endif
> +
> +%ifdef PIC
> +    lea       r5, [tabw_LumaCoeff]
> +    movu      m3, [r5 + r4 * 8]
> +%else
> +    movu      m3, [tabw_LumaCoeff + r4 * 8]
> +%endif
> +
> +    mov       r4d, %2
> +
> +%ifidn %3, pp
> +    mova      m2, [pw_32]
> +%else
> +    mova      m2, [pw_2000]
> +    je        .loopH
> +    lea       r5, [r1 + 2 * r1]
> +    sub       r0, r5
> +    add       r4d, 7
> +%endif
> +
> +.loopH:
> +%assign x 0
> +%rep %1 / 8
> +    FILTER_H8_W8_sse2
> +  %ifidn %3, pp
> +    paddw     m1, m2
> +    psraw     m1, 6
> +    packuswb  m1, m1
> +    movh      [r2 + x], m1
> +  %else
> +    psubw     m1, m2
> +    movu      [r2 + 2 * x], m1
> +  %endif
> +%assign x x+8
> +%endrep
> +
> +%rep (%1 % 8) / 4
> +    FILTER_H8_W4_sse2
> +  %ifidn %3, pp
> +    paddw     m1, m2
> +    psraw     m1, 6
> +    packuswb  m1, m1
> +    movd      [r2 + x], m1
> +  %else
> +    psubw     m1, m2
> +    movh      [r2 + 2 * x], m1
> +  %endif
> +%endrep
> +
> +    add       r0, r1
> +    add       r2, r3
> +
> +    dec       r4d
> +    jnz       .loopH
> +    RET
> +
> +%endmacro
> +
> +;--------------------------------------------------------------------------------------------------------------
> +; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
> +;--------------------------------------------------------------------------------------------------------------
> +    IPFILTER_LUMA_sse2 4, 4, pp
> +    IPFILTER_LUMA_sse2 4, 8, pp
> +    IPFILTER_LUMA_sse2 8, 4, pp
> +    IPFILTER_LUMA_sse2 8, 8, pp
> +    IPFILTER_LUMA_sse2 16, 16, pp
> +    IPFILTER_LUMA_sse2 16, 8, pp
> +    IPFILTER_LUMA_sse2 8, 16, pp
> +    IPFILTER_LUMA_sse2 16, 12, pp
> +    IPFILTER_LUMA_sse2 12, 16, pp
> +    IPFILTER_LUMA_sse2 16, 4, pp
> +    IPFILTER_LUMA_sse2 4, 16, pp
> +    IPFILTER_LUMA_sse2 32, 32, pp
> +    IPFILTER_LUMA_sse2 32, 16, pp
> +    IPFILTER_LUMA_sse2 16, 32, pp
> +    IPFILTER_LUMA_sse2 32, 24, pp
> +    IPFILTER_LUMA_sse2 24, 32, pp
> +    IPFILTER_LUMA_sse2 32, 8, pp
> +    IPFILTER_LUMA_sse2 8, 32, pp
> +    IPFILTER_LUMA_sse2 64, 64, pp
> +    IPFILTER_LUMA_sse2 64, 32, pp
> +    IPFILTER_LUMA_sse2 32, 64, pp
> +    IPFILTER_LUMA_sse2 64, 48, pp
> +    IPFILTER_LUMA_sse2 48, 64, pp
> +    IPFILTER_LUMA_sse2 64, 16, pp
> +    IPFILTER_LUMA_sse2 16, 64, pp
> +
> +;----------------------------------------------------------------------------------------------------------------------------
> +; void interp_8tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
> +;----------------------------------------------------------------------------------------------------------------------------
> +    IPFILTER_LUMA_sse2 4, 4, ps
> +    IPFILTER_LUMA_sse2 8, 8, ps
> +    IPFILTER_LUMA_sse2 8, 4, ps
> +    IPFILTER_LUMA_sse2 4, 8, ps
> +    IPFILTER_LUMA_sse2 16, 16, ps
> +    IPFILTER_LUMA_sse2 16, 8, ps
> +    IPFILTER_LUMA_sse2 8, 16, ps
> +    IPFILTER_LUMA_sse2 16, 12, ps
> +    IPFILTER_LUMA_sse2 12, 16, ps
> +    IPFILTER_LUMA_sse2 16, 4, ps
> +    IPFILTER_LUMA_sse2 4, 16, ps
> +    IPFILTER_LUMA_sse2 32, 32, ps
> +    IPFILTER_LUMA_sse2 32, 16, ps
> +    IPFILTER_LUMA_sse2 16, 32, ps
> +    IPFILTER_LUMA_sse2 32, 24, ps
> +    IPFILTER_LUMA_sse2 24, 32, ps
> +    IPFILTER_LUMA_sse2 32, 8, ps
> +    IPFILTER_LUMA_sse2 8, 32, ps
> +    IPFILTER_LUMA_sse2 64, 64, ps
> +    IPFILTER_LUMA_sse2 64, 32, ps
> +    IPFILTER_LUMA_sse2 32, 64, ps
> +    IPFILTER_LUMA_sse2 64, 48, ps
> +    IPFILTER_LUMA_sse2 48, 64, ps
> +    IPFILTER_LUMA_sse2 64, 16, ps
> +    IPFILTER_LUMA_sse2 16, 64, ps
> +
>  ;-----------------------------------------------------------------------------
>  ; void interp_4tap_horiz_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
>  ;-----------------------------------------------------------------------------
> diff -r bca33880585a -r 705b796531bb source/common/x86/ipfilter8.h
> --- a/source/common/x86/ipfilter8.h	Sat Apr 25 00:41:25 2015 -0500
> +++ b/source/common/x86/ipfilter8.h	Fri May 01 11:32:50 2015 -0700
> @@ -854,6 +854,56 @@
>  void x265_interp_4tap_horiz_pp_64x32_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
>  void x265_interp_4tap_horiz_pp_64x48_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
>  void x265_interp_4tap_horiz_pp_64x64_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_4x4_sse2(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_4x8_sse2(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_4x16_sse2(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_8x4_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_8x8_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_8x16_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_8x32_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_12x16_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_16x4_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_16x8_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_16x12_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_16x16_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_16x32_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_16x64_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_24x32_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_32x8_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_32x16_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_32x24_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_32x32_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_32x64_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_48x64_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_64x16_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_64x32_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_64x48_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_64x64_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_ps_4x4_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_4x8_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_4x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_8x4_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_8x8_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_8x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_8x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_12x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_16x4_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_16x8_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_16x12_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_16x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_16x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_16x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_24x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_32x8_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_32x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_32x24_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_32x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_32x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_48x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_64x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_64x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_64x48_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_64x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
>  #undef LUMA_FILTERS
>  #undef LUMA_SP_FILTERS
>  #undef LUMA_SS_FILTERS
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel

-- 
Steve Borho


More information about the x265-devel mailing list