[x265] [PATCH] asm: interp_8tap_horiz pp and ps sse2

Deepthi Nandakumar deepthi at multicorewareinc.com
Thu Apr 30 10:07:05 CEST 2015


Still crashes on Windows x64.

On Wed, Apr 29, 2015 at 8:54 PM, <dtyx265 at gmail.com> wrote:

> # HG changeset patch
> # User David T Yuen <dtyx265 at gmail.com>
> # Date 1430321025 25200
> # Node ID 9a1b8b71bc997547044f42992e1eb7f3572f03f1
> # Parent  e9df93f380664932e7d6c7e85b2cae16cd5e1dcd
> asm: interp_8tap_horiz pp and ps sse2
>
> This replaces c code and covers
>
> 4x4, 4x8, 4x16, 8x4, 8x8, 8x16, 8x32, 12x16, 16x4, 16x8, 16x12, 16x16,
> 16x32, 16x64,
> 24x32, 32x8, 32x16, 32x24, 32x32, 32x64, 48x64, 64x16, 64x32, 64x48, 64x64
>
> 64-bit
>
> ./test/TestBench --testbench interp | grep luma_h
> luma_hpp[  4x4]         1.93x    1785.21         3441.77
> luma_hps[  4x4]         1.85x    4487.62         8315.70
> luma_hpp[  8x8]         2.20x    6085.18         13358.46
> luma_hps[  8x8]         1.93x    10968.99        21135.98
> luma_hpp[16x16]         2.55x    23908.50        60945.62
> luma_hps[16x16]         2.27x    32943.21        74795.46
> luma_hpp[32x32]         2.55x    94576.35        240974.75
> luma_hps[32x32]         2.21x    110731.82       244953.02
> luma_hpp[64x64]         2.52x    376616.72       950787.38
> luma_hps[64x64]         2.30x    401879.16       924680.88
> luma_hpp[  8x4]         2.37x    3025.64         7175.84
> luma_hps[  8x4]         1.92x    8122.71         15575.27
> luma_hpp[  4x8]         1.94x    3562.64         6902.68
> luma_hps[  4x8]         1.85x    6087.67         11235.14
> luma_hpp[ 16x8]         2.56x    11901.34        30423.46
> luma_hps[ 16x8]         2.27x    21591.06        49091.93
> luma_hpp[ 8x16]         2.26x    12082.94        27258.80
> luma_hps[ 8x16]         1.94x    16595.28        32255.00
> luma_hpp[32x16]         2.53x    47407.80        120014.57
> luma_hps[32x16]         2.21x    65504.00        144679.64
> luma_hpp[16x32]         2.55x    47504.11        121076.39
> luma_hps[16x32]         2.29x    55546.89        127202.02
> luma_hpp[64x32]         2.54x    187763.11       476195.81
> luma_hps[64x32]         2.30x    221199.25       509145.91
> luma_hpp[32x64]         2.56x    188414.62       481738.12
> luma_hps[32x64]         2.21x    201473.38       445438.25
> luma_hpp[16x12]         2.54x    17977.88        45582.48
> luma_hps[16x12]         2.28x    27250.61        62118.11
> luma_hpp[12x16]         2.34x    18927.60        44255.13
> luma_hps[12x16]         1.89x    25587.42        48416.81
> luma_hpp[ 16x4]         2.56x    5973.04         15269.98
> luma_hps[ 16x4]         2.26x    15887.99        35915.50
> luma_hpp[ 4x16]         2.07x    7217.70         14905.61
> luma_hps[ 4x16]         1.87x    9127.53         17075.28
> luma_hpp[32x24]         2.54x    70878.05        180289.80
> luma_hps[32x24]         2.21x    88167.30        194824.70
> luma_hpp[24x32]         2.56x    70795.49        181563.86
> luma_hps[24x32]         2.23x    83265.12        185663.55
> luma_hpp[ 32x8]         2.55x    23584.57        60200.72
> luma_hps[ 32x8]         2.21x    42744.65        94329.84
> luma_hpp[ 8x32]         2.41x    23956.56        57703.53
> luma_hps[ 8x32]         1.96x    28049.62        54929.88
> luma_hpp[64x48]         2.56x    281883.84       721026.38
> luma_hps[64x48]         2.30x    311796.88       716619.81
> luma_hpp[48x64]         2.53x    281606.78       713136.94
> luma_hps[48x64]         2.20x    301767.56       662826.19
> luma_hpp[64x16]         2.53x    94093.50        237928.16
> luma_hps[64x16]         2.29x    131499.05       300629.41
> luma_hpp[16x64]         2.57x    94736.50        243494.28
> luma_hps[16x64]         2.31x    101098.77       233989.20
>
> 32-bit
>
> ./test/TestBench --testbench interp | grep luma_h
> luma_hpp[  4x4]         1.93x    1785.24         3441.14
> luma_hps[  4x4]         1.85x    4487.93         8315.42
> luma_hpp[  8x8]         2.21x    6047.94         13358.18
> luma_hps[  8x8]         1.93x    10942.52        21136.20
> luma_hpp[16x16]         2.56x    23917.82        61177.23
> luma_hps[16x16]         2.28x    32885.36        75134.07
> luma_hpp[32x32]         2.56x    94386.95        241838.64
> luma_hps[32x32]         2.21x    110783.13       245033.38
> luma_hpp[64x64]         2.53x    375728.38       951385.19
> luma_hps[64x64]         2.30x    402418.16       925132.25
> luma_hpp[  8x4]         2.37x    3025.71         7176.25
> luma_hps[  8x4]         1.92x    8122.72         15575.27
> luma_hpp[  4x8]         1.94x    3562.75         6903.02
> luma_hps[  4x8]         1.88x    5967.75         11235.42
> luma_hpp[ 16x8]         2.55x    11910.71        30372.57
> luma_hps[ 16x8]         2.27x    21521.60        48876.16
> luma_hpp[ 8x16]         2.21x    12188.04        26989.18
> luma_hps[ 8x16]         1.96x    16621.85        32579.98
> luma_hpp[32x16]         2.54x    47375.83        120352.38
> luma_hps[32x16]         2.21x    65398.98        144650.23
> luma_hpp[16x32]         2.54x    47576.41        121038.56
> luma_hps[16x32]         2.29x    55665.23        127244.27
> luma_hpp[64x32]         2.53x    188135.53       476050.16
> luma_hps[64x32]         2.29x    222327.25       508834.03
> luma_hpp[32x64]         2.56x    188590.23       482111.97
> luma_hps[32x64]         2.21x    201558.94       446036.34
> luma_hpp[16x12]         2.53x    18031.92        45533.77
> luma_hps[16x12]         2.27x    27192.16        61836.39
> luma_hpp[12x16]         2.34x    18921.93        44205.55
> luma_hps[12x16]         1.87x    25719.06        48125.95
> luma_hpp[ 16x4]         2.55x    5973.22         15212.72
> luma_hps[ 16x4]         2.26x    15882.92        35915.98
> luma_hpp[ 4x16]         2.07x    7217.61         14905.33
> luma_hps[ 4x16]         1.87x    9127.53         17075.68
> luma_hpp[32x24]         2.54x    70891.50        180249.72
> luma_hps[32x24]         2.21x    88216.69        194942.09
> luma_hpp[24x32]         2.57x    70934.99        182461.02
> luma_hps[24x32]         2.23x    83228.09        185491.73
> luma_hpp[ 32x8]         2.56x    23528.41        60156.14
> luma_hps[ 32x8]         2.20x    42846.86        94353.64
> luma_hpp[ 8x32]         2.40x    24028.00        57627.60
> luma_hps[ 8x32]         1.95x    28035.82        54596.18
> luma_hpp[64x48]         2.52x    283397.28       714446.75
> luma_hps[64x48]         2.30x    312186.34       717680.06
> luma_hpp[48x64]         2.53x    282245.50       713527.88
> luma_hps[48x64]         2.22x    302935.97       671615.19
> luma_hpp[64x16]         2.52x    94195.39        237710.97
> luma_hps[64x16]         2.29x    131054.00       300285.38
> luma_hpp[16x64]         2.57x    94785.75        243506.12
> luma_hps[16x64]         2.29x    100971.30       231653.41
>
> diff -r e9df93f38066 -r 9a1b8b71bc99 source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp      Tue Apr 28 20:24:06 2015
> +0800
> +++ b/source/common/x86/asm-primitives.cpp      Wed Apr 29 08:23:45 2015
> -0700
> @@ -1343,6 +1343,11 @@
>          CHROMA_422_VSP_FILTERS(_sse2);
>          CHROMA_444_VSP_FILTERS(_sse2);
>
> +        // Install the SSE2 8-tap horizontal luma filters (pp and ps variants).
> +        // NOTE(review): LUMA_4x4 is assigned by hand after each ALL_LUMA_PU call,
> +        // presumably because that macro does not cover 4x4 - confirm against its definition.
> +        ALL_LUMA_PU(luma_hpp, interp_8tap_horiz_pp, sse2);
> +        p.pu[LUMA_4x4].luma_hpp = x265_interp_8tap_horiz_pp_4x4_sse2;
> +        ALL_LUMA_PU(luma_hps, interp_8tap_horiz_ps, sse2);
> +        p.pu[LUMA_4x4].luma_hps = x265_interp_8tap_horiz_ps_4x4_sse2;
> +
>          //p.frameInitLowres = x265_frame_init_lowres_core_mmx2;
>          p.frameInitLowres = x265_frame_init_lowres_core_sse2;
>
> diff -r e9df93f38066 -r 9a1b8b71bc99 source/common/x86/ipfilter8.asm
> --- a/source/common/x86/ipfilter8.asm   Tue Apr 28 20:24:06 2015 +0800
> +++ b/source/common/x86/ipfilter8.asm   Wed Apr 29 08:23:45 2015 -0700
> @@ -151,6 +151,11 @@
>                         db  -1, 4, -11, 40,  40, -11, 4, -1
>                         db   0, 1, -5,  17,  58, -10, 4, -1
>
> +; 8-tap luma filter coefficients stored as 16-bit words: four rows of eight
> +; words (16 bytes per row). IPFILTER_LUMA_sse2 loads one whole row into m3,
> +; indexed by coeffIdx, and multiplies it against widened pixels with pmaddwd.
> +const tabw_LumaCoeff,  dw   0, 0,  0,  64,  0,   0,  0,  0
> +                       dw  -1, 4, -10, 58,  17, -5,  1,  0
> +                       dw  -1, 4, -11, 40,  40, -11, 4, -1
> +                       dw   0, 1, -5,  17,  58, -10, 4, -1
> +
>  const tab_LumaCoeffV,   times 4 dw 0, 0
>                          times 4 dw 0, 64
>                          times 4 dw 0, 0
> @@ -807,6 +812,233 @@
>      IPFILTER_CHROMA_W_sse3 48, 64
>      IPFILTER_CHROMA_W_sse3 64, 16
>
> +; FILTER_H8_W8_sse2: 8-tap horizontal filter for eight adjacent output columns.
> +;   In:  r0 = source row pointer, x = assemble-time column offset,
> +;        m3 = eight 16-bit filter coefficients, m6 = all zero.
> +;   Out: m1 = eight 16-bit filter sums for columns x .. x+7, in order,
> +;        with no rounding, shift or offset applied.
> +;   Clobbers m0, m4, m5, m7.
> +%macro FILTER_H8_W8_sse2 0
> +    ; Columns x+0 .. x+3: load the 8-pixel window of each column (starting 3
> +    ; pixels to its left) and zero-extend the bytes to words via m6.
> +    movh        m1, [r0 + x - 3]
> +    movh        m4, [r0 + x - 2]
> +    punpcklbw   m1, m6
> +    punpcklbw   m4, m6
> +    movh        m5, [r0 + x - 1]
> +    movh        m0, [r0 + x]
> +    punpcklbw   m5, m6
> +    punpcklbw   m0, m6
> +    ; Multiply by the coefficients; each register now holds four dword pair-sums.
> +    pmaddwd     m1, m3
> +    pmaddwd     m4, m3
> +    pmaddwd     m5, m3
> +    pmaddwd     m0, m3
> +    ; Narrow the pair-sums to words, two columns per register.
> +    packssdw    m1, m4
> +    packssdw    m5, m0
> +    ; Horizontal add step: swap neighbouring words and add, so every word pair
> +    ; holds the sum of two pair-sums ...
> +    pshuflw     m4, m1, q2301
> +    pshufhw     m4, m4, q2301
> +    pshuflw     m0, m5, q2301
> +    pshufhw     m0, m0, q2301
> +    paddw       m1, m4
> +    paddw       m5, m0
> +    ; ... then drop the duplicates: shift by one word and regroup dwords so the
> +    ; four distinct partial sums sit in the low qword.
> +    psrldq      m1, 2
> +    psrldq      m5, 2
> +    pshufd      m1, m1, q3120
> +    pshufd      m5, m5, q3120
> +    ; m1 = two half-sums for each of columns x+0 .. x+3.
> +    punpcklqdq  m1, m5
> +    ; Columns x+4 .. x+7: same sequence, accumulating into m7.
> +    movh        m7, [r0 + x + 1]
> +    movh        m4, [r0 + x + 2]
> +    punpcklbw   m7, m6
> +    punpcklbw   m4, m6
> +    movh        m5, [r0 + x + 3]
> +    movh        m0, [r0 + x + 4]
> +    punpcklbw   m5, m6
> +    punpcklbw   m0, m6
> +    pmaddwd     m7, m3
> +    pmaddwd     m4, m3
> +    pmaddwd     m5, m3
> +    pmaddwd     m0, m3
> +    packssdw    m7, m4
> +    packssdw    m5, m0
> +    pshuflw     m4, m7, q2301
> +    pshufhw     m4, m4, q2301
> +    pshuflw     m0, m5, q2301
> +    pshufhw     m0, m0, q2301
> +    paddw       m7, m4
> +    paddw       m5, m0
> +    psrldq      m7, 2
> +    psrldq      m5, 2
> +    pshufd      m7, m7, q3120
> +    pshufd      m5, m5, q3120
> +    ; m7 = two half-sums for each of columns x+4 .. x+7.
> +    punpcklqdq  m7, m5
> +    ; Final horizontal add step: combine the two half-sums of every column,
> +    ; using the same swap / add / shift / regroup sequence as above.
> +    pshuflw     m4, m1, q2301
> +    pshufhw     m4, m4, q2301
> +    pshuflw     m0, m7, q2301
> +    pshufhw     m0, m0, q2301
> +    paddw       m1, m4
> +    paddw       m7, m0
> +    psrldq      m1, 2
> +    psrldq      m7, 2
> +    pshufd      m1, m1, q3120
> +    pshufd      m7, m7, q3120
> +    ; m1 = complete sums for columns x+0 .. x+7.
> +    punpcklqdq  m1, m7
> +%endmacro
> +
> +; FILTER_H8_W4_sse2: 8-tap horizontal filter for four adjacent output columns.
> +;   In:  r0 = source row pointer, x = assemble-time column offset,
> +;        m3 = eight 16-bit filter coefficients, m6 = all zero.
> +;   Out: low four words of m1 = 16-bit filter sums for columns x .. x+3, in
> +;        order, with no rounding, shift or offset applied.
> +;   Clobbers m0, m4, m5.
> +%macro FILTER_H8_W4_sse2 0
> +    ; Load the 8-pixel window of each column (starting 3 pixels to its left)
> +    ; and zero-extend the bytes to words via m6.
> +    movh        m1, [r0 + x - 3]
> +    movh        m0, [r0 + x - 2]
> +    punpcklbw   m1, m6
> +    punpcklbw   m0, m6
> +    movh        m4, [r0 + x - 1]
> +    movh        m5, [r0 + x]
> +    punpcklbw   m4, m6
> +    punpcklbw   m5, m6
> +    ; Multiply by the coefficients; each register now holds four dword pair-sums.
> +    pmaddwd     m1, m3
> +    pmaddwd     m0, m3
> +    pmaddwd     m4, m3
> +    pmaddwd     m5, m3
> +    ; Narrow the pair-sums to words, two columns per register.
> +    packssdw    m1, m0
> +    packssdw    m4, m5
> +    ; First horizontal add step: swap neighbouring words, add, then shift by
> +    ; one word and regroup dwords to discard the duplicated sums.
> +    pshuflw     m0, m1, q2301
> +    pshufhw     m0, m0, q2301
> +    pshuflw     m5, m4, q2301
> +    pshufhw     m5, m5, q2301
> +    paddw       m1, m0
> +    paddw       m4, m5
> +    psrldq      m1, 2
> +    psrldq      m4, 2
> +    pshufd      m1, m1, q3120
> +    pshufd      m4, m4, q3120
> +    ; m1 = two half-sums for each of the four columns.
> +    punpcklqdq  m1, m4
> +    ; Second horizontal add step: combine the two half-sums of every column.
> +    pshuflw     m0, m1, q2301
> +    pshufhw     m0, m0, q2301
> +    paddw       m1, m0
> +    psrldq      m1, 2
> +    pshufd      m1, m1, q3120
> +%endmacro
> +
>
> +;----------------------------------------------------------------------------------------------------------------------------
> +; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
> +;
> +; Macro arguments: %1 = block width, %2 = block height, %3 = pp or ps.
> +;   pp: dst is pixel*, the result is rounded, shifted right by 6 and packed to bytes; there is no isRowExt argument.
> +;   ps: dst is int16_t*, pw_2000 is subtracted from the raw sum; when isRowExt is non-zero the
> +;       filter starts 3 rows above src and produces 7 extra rows.
> +;----------------------------------------------------------------------------------------------------------------------------
> +%macro IPFILTER_LUMA_sse2 3
> +INIT_XMM sse2
> +cglobal interp_8tap_horiz_%3_%1x%2, 4,6,8
> +    mov       r4d, r4m                      ; r4 = coeffIdx
> +    add       r4d, r4d                      ; r4 = 2 * coeffIdx, so r4 * 8 below selects a 16-byte table row
> +    pxor      m6, m6                        ; zero register used by the FILTER_H8_* macros to widen bytes
> +
> +%ifidn %3, ps
> +    ; dst holds 16-bit elements: convert the stride to bytes. dstStride is an
> +    ; intptr_t, so use the full-width register rather than its low 32 bits.
> +    add       r3, r3
> +    ; Test isRowExt now; the flags are consumed by the "je .loopH" below.
> +    ; None of the instructions in between (lea, movu, mov, mova) modify flags.
> +    cmp       r5m, byte 0
> +%endif
> +
> +    ; m3 = the eight 16-bit coefficients for this coeffIdx.
> +%ifdef PIC
> +    lea       r5, [tabw_LumaCoeff]
> +    movu      m3, [r5 + r4 * 8]
> +%else
> +    movu      m3, [tabw_LumaCoeff + r4 * 8]
> +%endif
> +
> +    mov       r4d, %2                       ; r4 = number of rows to process
> +
> +%ifidn %3, pp
> +    mova      m2, [pw_32]                   ; added before the >> 6 (rounding term)
> +%else
> +    mova      m2, [pw_2000]                 ; offset subtracted from every ps result
> +    je        .loopH                        ; isRowExt == 0: process exactly %2 rows
> +    lea       r5, [r1 + 2 * r1]             ; r5 = 3 * srcStride
> +    ; src is a pointer: subtract at full register width. The 32-bit form
> +    ; (sub r0d, r5d) zero-extends its result on x86-64 and so discards the
> +    ; upper half of the address.
> +    sub       r0, r5                        ; start 3 rows above src
> +    add       r4d, 7                        ; and produce 7 additional rows
> +%endif
> +
> +.loopH:
> +%assign x 0
> +    ; Full groups of eight columns.
> +%rep %1 / 8
> +    FILTER_H8_W8_sse2
> +  %ifidn %3, pp
> +    paddw     m1, m2
> +    psraw     m1, 6
> +    packuswb  m1, m1                        ; saturate to unsigned bytes
> +    movh      [r2 + x], m1                  ; store 8 pixels
> +  %else
> +    psubw     m1, m2
> +    movu      [r2 + 2 * x], m1              ; store 8 int16_t values
> +  %endif
> +%assign x x+8
> +%endrep
> +
> +    ; Remaining group of four columns (widths 4 and 12); runs at most once,
> +    ; so x does not need to advance afterwards.
> +%rep (%1 % 8) / 4
> +    FILTER_H8_W4_sse2
> +  %ifidn %3, pp
> +    paddw     m1, m2
> +    psraw     m1, 6
> +    packuswb  m1, m1                        ; saturate to unsigned bytes
> +    movd      [r2 + x], m1                  ; store 4 pixels
> +  %else
> +    psubw     m1, m2
> +    movh      [r2 + 2 * x], m1              ; store 4 int16_t values
> +  %endif
> +%endrep
> +
> +    ; Advance to the next source and destination row.
> +    add       r0, r1
> +    add       r2, r3
> +
> +    dec       r4d
> +    jnz       .loopH
> +    RET
> +
> +%endmacro
> +
>
> +;--------------------------------------------------------------------------------------------------------------
> +; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
>
> +;--------------------------------------------------------------------------------------------------------------
> +    IPFILTER_LUMA_sse2 4, 4, pp
> +    IPFILTER_LUMA_sse2 4, 8, pp
> +    IPFILTER_LUMA_sse2 8, 4, pp
> +    IPFILTER_LUMA_sse2 8, 8, pp
> +    IPFILTER_LUMA_sse2 16, 16, pp
> +    IPFILTER_LUMA_sse2 16, 8, pp
> +    IPFILTER_LUMA_sse2 8, 16, pp
> +    IPFILTER_LUMA_sse2 16, 12, pp
> +    IPFILTER_LUMA_sse2 12, 16, pp
> +    IPFILTER_LUMA_sse2 16, 4, pp
> +    IPFILTER_LUMA_sse2 4, 16, pp
> +    IPFILTER_LUMA_sse2 32, 32, pp
> +    IPFILTER_LUMA_sse2 32, 16, pp
> +    IPFILTER_LUMA_sse2 16, 32, pp
> +    IPFILTER_LUMA_sse2 32, 24, pp
> +    IPFILTER_LUMA_sse2 24, 32, pp
> +    IPFILTER_LUMA_sse2 32, 8, pp
> +    IPFILTER_LUMA_sse2 8, 32, pp
> +    IPFILTER_LUMA_sse2 64, 64, pp
> +    IPFILTER_LUMA_sse2 64, 32, pp
> +    IPFILTER_LUMA_sse2 32, 64, pp
> +    IPFILTER_LUMA_sse2 64, 48, pp
> +    IPFILTER_LUMA_sse2 48, 64, pp
> +    IPFILTER_LUMA_sse2 64, 16, pp
> +    IPFILTER_LUMA_sse2 16, 64, pp
> +
>
> +;----------------------------------------------------------------------------------------------------------------------------
> +; void interp_8tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
>
> +;----------------------------------------------------------------------------------------------------------------------------
> +    IPFILTER_LUMA_sse2 4, 4, ps
> +    IPFILTER_LUMA_sse2 8, 8, ps
> +    IPFILTER_LUMA_sse2 8, 4, ps
> +    IPFILTER_LUMA_sse2 4, 8, ps
> +    IPFILTER_LUMA_sse2 16, 16, ps
> +    IPFILTER_LUMA_sse2 16, 8, ps
> +    IPFILTER_LUMA_sse2 8, 16, ps
> +    IPFILTER_LUMA_sse2 16, 12, ps
> +    IPFILTER_LUMA_sse2 12, 16, ps
> +    IPFILTER_LUMA_sse2 16, 4, ps
> +    IPFILTER_LUMA_sse2 4, 16, ps
> +    IPFILTER_LUMA_sse2 32, 32, ps
> +    IPFILTER_LUMA_sse2 32, 16, ps
> +    IPFILTER_LUMA_sse2 16, 32, ps
> +    IPFILTER_LUMA_sse2 32, 24, ps
> +    IPFILTER_LUMA_sse2 24, 32, ps
> +    IPFILTER_LUMA_sse2 32, 8, ps
> +    IPFILTER_LUMA_sse2 8, 32, ps
> +    IPFILTER_LUMA_sse2 64, 64, ps
> +    IPFILTER_LUMA_sse2 64, 32, ps
> +    IPFILTER_LUMA_sse2 32, 64, ps
> +    IPFILTER_LUMA_sse2 64, 48, ps
> +    IPFILTER_LUMA_sse2 48, 64, ps
> +    IPFILTER_LUMA_sse2 64, 16, ps
> +    IPFILTER_LUMA_sse2 16, 64, ps
> +
>
>  ;-----------------------------------------------------------------------------
>  ; void interp_4tap_horiz_pp_2x4(pixel *src, intptr_t srcStride, pixel
> *dst, intptr_t dstStride, int coeffIdx)
>
>  ;-----------------------------------------------------------------------------
> diff -r e9df93f38066 -r 9a1b8b71bc99 source/common/x86/ipfilter8.h
> --- a/source/common/x86/ipfilter8.h     Tue Apr 28 20:24:06 2015 +0800
> +++ b/source/common/x86/ipfilter8.h     Wed Apr 29 08:23:45 2015 -0700
> @@ -850,6 +850,56 @@
>  void x265_interp_4tap_horiz_pp_64x32_sse3(const pixel *src, intptr_t
> srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
>  void x265_interp_4tap_horiz_pp_64x48_sse3(const pixel *src, intptr_t
> srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
>  void x265_interp_4tap_horiz_pp_64x64_sse3(const pixel *src, intptr_t
> srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_4x4_sse2(const pixel *src, intptr_t
> srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_4x8_sse2(const pixel *src, intptr_t
> srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_4x16_sse2(const pixel *src, intptr_t
> srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_8x4_sse2(const pixel* src, intptr_t
> srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_8x8_sse2(const pixel* src, intptr_t
> srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_8x16_sse2(const pixel* src, intptr_t
> srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_8x32_sse2(const pixel* src, intptr_t
> srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_12x16_sse2(const pixel* src, intptr_t
> srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_16x4_sse2(const pixel* src, intptr_t
> srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_16x8_sse2(const pixel* src, intptr_t
> srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_16x12_sse2(const pixel* src, intptr_t
> srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_16x16_sse2(const pixel* src, intptr_t
> srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_16x32_sse2(const pixel* src, intptr_t
> srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_16x64_sse2(const pixel* src, intptr_t
> srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_24x32_sse2(const pixel* src, intptr_t
> srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_32x8_sse2(const pixel* src, intptr_t
> srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_32x16_sse2(const pixel* src, intptr_t
> srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_32x24_sse2(const pixel* src, intptr_t
> srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_32x32_sse2(const pixel* src, intptr_t
> srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_32x64_sse2(const pixel* src, intptr_t
> srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_48x64_sse2(const pixel* src, intptr_t
> srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_64x16_sse2(const pixel* src, intptr_t
> srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_64x32_sse2(const pixel* src, intptr_t
> srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_64x48_sse2(const pixel* src, intptr_t
> srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_pp_64x64_sse2(const pixel* src, intptr_t
> srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> +void x265_interp_8tap_horiz_ps_4x4_sse2(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_4x8_sse2(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_4x16_sse2(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_8x4_sse2(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_8x8_sse2(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_8x16_sse2(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_8x32_sse2(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_12x16_sse2(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_16x4_sse2(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_16x8_sse2(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_16x12_sse2(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_16x16_sse2(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_16x32_sse2(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_16x64_sse2(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_24x32_sse2(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_32x8_sse2(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_32x16_sse2(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_32x24_sse2(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_32x32_sse2(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_32x64_sse2(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_48x64_sse2(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_64x16_sse2(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_64x32_sse2(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_64x48_sse2(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_64x64_sse2(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
>  #undef LUMA_FILTERS
>  #undef LUMA_SP_FILTERS
>  #undef LUMA_SS_FILTERS
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150430/cc997ceb/attachment-0001.html>


More information about the x265-devel mailing list