[x265] [PATCH] asm: interp_8tap_horiz pp and ps sse2

dtyx265 at gmail.com dtyx265 at gmail.com
Wed Apr 29 04:15:23 CEST 2015


# HG changeset patch
# User David T Yuen <dtyx265 at gmail.com>
# Date 1430273620 25200
# Node ID 9b0181193b6a2c64dad26d1749fb6a0e6cf87240
# Parent  e9df93f380664932e7d6c7e85b2cae16cd5e1dcd
asm: interp_8tap_horiz pp and ps sse2

This replaces the C code and covers the following block sizes:

4x4, 4x8, 4x16, 8x4, 8x8, 8x16, 8x32, 12x16, 16x4, 16x8, 16x12, 16x16, 16x32, 16x64,
24x32, 32x8, 32x16, 32x24, 32x32, 32x64, 48x64, 64x16, 64x32, 64x48, 64x64

64-bit

./test/TestBench --testbench interp | grep luma_h
luma_hpp[  4x4]		1.93x 	 1785.23  	 3440.44
luma_hps[  4x4]		1.78x 	 4668.16  	 8315.97
luma_hpp[  8x8]		2.20x 	 6085.12  	 13358.18
luma_hps[  8x8]		1.93x 	 10966.76 	 21135.28
luma_hpp[16x16]		2.56x 	 23906.23 	 61087.05
luma_hps[16x16]		2.30x 	 32890.60 	 75619.12
luma_hpp[32x32]		2.59x 	 94246.86 	 244558.19
luma_hps[32x32]		2.21x 	 110899.23 	 244946.17
luma_hpp[64x64]		2.57x 	 376029.31 	 968213.06
luma_hps[64x64]		2.17x 	 426490.66 	 925922.69
luma_hpp[  8x4]		2.37x 	 3027.70  	 7175.56
luma_hps[  8x4]		1.92x 	 8132.61  	 15575.28
luma_hpp[  4x8]		1.93x 	 3569.09  	 6902.47
luma_hps[  4x8]		1.80x 	 6227.77  	 11237.01
luma_hpp[ 16x8]		2.57x 	 11910.59 	 30562.16
luma_hps[ 16x8]		2.26x 	 21583.07 	 48880.50
luma_hpp[ 8x16]		2.25x 	 12017.56 	 26988.45
luma_hps[ 8x16]		1.95x 	 16666.63 	 32456.38
luma_hpp[32x16]		2.54x 	 47397.47 	 120203.77
luma_hps[32x16]		2.20x 	 65652.99 	 144470.98
luma_hpp[16x32]		2.55x 	 47660.34 	 121469.73
luma_hps[16x32]		2.28x 	 55609.64 	 126999.02
luma_hpp[64x32]		2.55x 	 188588.95 	 481072.16
luma_hps[64x32]		2.27x 	 224091.56 	 508803.62
luma_hpp[32x64]		2.61x 	 188297.67 	 491043.78
luma_hps[32x64]		2.21x 	 202214.22 	 447216.84
luma_hpp[16x12]		2.54x 	 17978.03 	 45745.44
luma_hps[16x12]		2.30x 	 27224.89 	 62582.70
luma_hpp[12x16]		2.35x 	 18915.09 	 44403.39
luma_hps[12x16]		1.88x 	 25643.44 	 48312.90
luma_hpp[ 16x4]		2.55x 	 5968.94  	 15212.99
luma_hps[ 16x4]		2.27x 	 15878.00 	 36119.47
luma_hpp[ 4x16]		2.07x 	 7217.63  	 14905.95
luma_hps[ 4x16]		1.83x 	 9347.61  	 17077.63
luma_hpp[32x24]		2.55x 	 70702.66 	 179962.50
luma_hps[32x24]		2.21x 	 88203.63 	 194572.66
luma_hpp[24x32]		2.56x 	 70772.55 	 181361.48
luma_hps[24x32]		2.22x 	 83448.71 	 185463.17
luma_hpp[ 32x8]		2.55x 	 23670.56 	 60342.42
luma_hps[ 32x8]		2.20x 	 42789.34 	 94308.16
luma_hpp[ 8x32]		2.43x 	 23943.85 	 58233.07
luma_hps[ 8x32]		1.95x 	 27989.48 	 54596.67
luma_hpp[64x48]		2.53x 	 282044.56 	 712753.81
luma_hps[64x48]		2.29x 	 312788.00 	 717477.88
luma_hpp[48x64]		2.52x 	 282292.97 	 712709.81
luma_hps[48x64]		2.19x 	 302271.06 	 662558.75
luma_hpp[64x16]		2.52x 	 94289.28 	 237360.70
luma_hps[64x16]		2.29x 	 130756.29 	 299937.75
luma_hpp[16x64]		2.56x 	 94725.96 	 242905.03
luma_hps[16x64]		2.93x 	 100985.45 	 295985.50

32-bit

./test/TestBench --testbench interp | grep luma_h
luma_hpp[  4x4]		2.03x 	 1857.68  	 3765.88
luma_hps[  4x4]		1.86x 	 4635.35  	 8630.80
luma_hpp[  8x8]		2.35x 	 6162.62  	 14481.97
luma_hps[  8x8]		1.94x 	 11010.38 	 21393.63
luma_hpp[16x16]		2.71x 	 23932.59 	 64745.16
luma_hps[16x16]		2.32x 	 33022.04 	 76565.82
luma_hpp[32x32]		2.67x 	 94757.98 	 253195.48
luma_hps[32x32]		2.24x 	 110836.16 	 248253.83
luma_hpp[64x64]		2.68x 	 377921.97 	 1011001.75
luma_hps[64x64]		2.19x 	 404019.22 	 884795.44
luma_hpp[  8x4]		2.34x 	 3110.22  	 7265.61
luma_hps[  8x4]		1.93x 	 8190.16  	 15790.72
luma_hpp[  4x8]		2.10x 	 3637.87  	 7653.13
luma_hps[  4x8]		1.89x 	 6155.16  	 11629.97
luma_hpp[ 16x8]		2.73x 	 11997.50 	 32709.68
luma_hps[ 16x8]		2.32x 	 21585.27 	 50085.42
luma_hpp[ 8x16]		2.41x 	 12100.89 	 29205.80
luma_hps[ 8x16]		1.98x 	 16651.91 	 32906.20
luma_hpp[32x16]		2.67x 	 47288.30 	 126280.91
luma_hps[32x16]		2.23x 	 65470.68 	 146288.86
luma_hpp[16x32]		2.70x 	 47712.78 	 128709.99
luma_hps[16x32]		2.32x 	 56009.05 	 129910.73
luma_hpp[64x32]		2.59x 	 189383.70 	 491267.19
luma_hps[64x32]		2.20x 	 221908.06 	 487628.59
luma_hpp[32x64]		2.63x 	 189890.94 	 498590.69
luma_hps[32x64]		2.25x 	 202185.36 	 454678.72
luma_hpp[16x12]		2.68x 	 18059.54 	 48459.52
luma_hps[16x12]		2.33x 	 27225.24 	 63327.61
luma_hpp[12x16]		2.29x 	 18988.50 	 43505.35
luma_hps[12x16]		1.91x 	 25577.61 	 48899.88
luma_hpp[ 16x4]		2.87x 	 6060.81  	 17385.48
luma_hps[ 16x4]		2.31x 	 15945.44 	 36845.93
luma_hpp[ 4x16]		2.12x 	 7302.50  	 15455.59
luma_hps[ 4x16]		1.96x 	 8990.00  	 17630.94
luma_hpp[32x24]		2.64x 	 70776.92 	 186816.78
luma_hps[32x24]		2.23x 	 88215.06 	 197043.84
luma_hpp[24x32]		2.70x 	 71163.41 	 192404.27
luma_hps[24x32]		2.26x 	 83641.63 	 188625.94
luma_hpp[ 32x8]		2.66x 	 23679.80 	 63083.98
luma_hps[ 32x8]		2.23x 	 42923.36 	 95715.66
luma_hpp[ 8x32]		2.43x 	 24117.07 	 58612.02
luma_hps[ 8x32]		1.97x 	 28059.13 	 55385.55
luma_hpp[64x48]		2.60x 	 284101.56 	 739073.31
luma_hps[64x48]		2.20x 	 312840.66 	 688271.56
luma_hpp[48x64]		2.62x 	 283742.75 	 743053.19
luma_hps[48x64]		2.21x 	 304183.97 	 671124.94
luma_hpp[64x16]		2.59x 	 94719.38 	 245008.95
luma_hps[64x16]		2.18x 	 131164.03 	 285906.84
luma_hpp[16x64]		2.73x 	 95453.28 	 260661.61
luma_hps[16x64]		2.34x 	 101129.92 	 236882.66

diff -r e9df93f38066 -r 9b0181193b6a source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Apr 28 20:24:06 2015 +0800
+++ b/source/common/x86/asm-primitives.cpp	Tue Apr 28 19:13:40 2015 -0700
@@ -1343,6 +1343,11 @@
         CHROMA_422_VSP_FILTERS(_sse2);
         CHROMA_444_VSP_FILTERS(_sse2);
 
+        ALL_LUMA_PU(luma_hpp, interp_8tap_horiz_pp, sse2);
+        p.pu[LUMA_4x4].luma_hpp = x265_interp_8tap_horiz_pp_4x4_sse2;
+        ALL_LUMA_PU(luma_hps, interp_8tap_horiz_ps, sse2);
+        p.pu[LUMA_4x4].luma_hps = x265_interp_8tap_horiz_ps_4x4_sse2;
+
         //p.frameInitLowres = x265_frame_init_lowres_core_mmx2;
         p.frameInitLowres = x265_frame_init_lowres_core_sse2;
 
diff -r e9df93f38066 -r 9b0181193b6a source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Tue Apr 28 20:24:06 2015 +0800
+++ b/source/common/x86/ipfilter8.asm	Tue Apr 28 19:13:40 2015 -0700
@@ -151,6 +151,12 @@
                        db  -1, 4, -11, 40,  40, -11, 4, -1
                        db   0, 1, -5,  17,  58, -10, 4, -1
 
+const tabw_LumaCoeff,  dw   0, 0,  0,  64,  0,   0,  0,  0    ; coeffIdx 0; taps are stored as words so one 16-byte row can be fed straight to pmaddwd
+                       dw  -1, 4, -10, 58,  17, -5,  1,  0    ; coeffIdx 1
+                       dw  -1, 4, -11, 40,  40, -11, 4, -1    ; coeffIdx 2
+                       dw   0, 1, -5,  17,  58, -10, 4, -1    ; coeffIdx 3
+
+
 const tab_LumaCoeffV,   times 4 dw 0, 0
                         times 4 dw 0, 64
                         times 4 dw 0, 0
@@ -807,6 +813,233 @@
     IPFILTER_CHROMA_W_sse3 48, 64
     IPFILTER_CHROMA_W_sse3 64, 16
 
+%macro FILTER_H8_W8_sse2 0 ; in: r0 = source row, x = assembly-time column offset, m3 = eight word taps, m6 = zero; out: m1 = eight word sums; clobbers m0, m4, m5, m7
+    movh        m1, [r0 + x - 3]        ; the 8 source bytes that feed output sample 0
+    movh        m4, [r0 + x - 2]        ; same window, one byte further on: output sample 1
+    punpcklbw   m1, m6                  ; widen bytes to words (m6 supplies the zero high bytes)
+    punpcklbw   m4, m6
+    movh        m5, [r0 + x - 1]        ; window for output sample 2
+    movh        m0, [r0 + x]            ; window for output sample 3
+    punpcklbw   m5, m6
+    punpcklbw   m0, m6
+    pmaddwd     m1, m3                  ; multiply by the taps: four dword pair-sums per output sample
+    pmaddwd     m4, m3
+    pmaddwd     m5, m3
+    pmaddwd     m0, m3
+    packssdw    m1, m4                  ; back to words: m1 = partial sums of samples 0 and 1
+    packssdw    m5, m0                  ; m5 = partial sums of samples 2 and 3
+    pshuflw     m4, m1, q2301           ; copy of m1 with every adjacent word pair swapped
+    pshufhw     m4, m4, q2301
+    pshuflw     m0, m5, q2301
+    pshufhw     m0, m0, q2301
+    paddw       m1, m4                  ; both words of each pair now hold that pair's sum
+    paddw       m5, m0
+    psrldq      m1, 2                   ; byte shift plus the q3120 shuffle below collect the four distinct sums in the low qword
+    psrldq      m5, 2
+    pshufd      m1, m1, q3120
+    pshufd      m5, m5, q3120
+    punpcklqdq  m1, m5                  ; m1 = two half-sums for each of samples 0..3
+    movh        m7, [r0 + x + 1]        ; repeat the same sequence for output samples 4..7
+    movh        m4, [r0 + x + 2]
+    punpcklbw   m7, m6
+    punpcklbw   m4, m6
+    movh        m5, [r0 + x + 3]
+    movh        m0, [r0 + x + 4]
+    punpcklbw   m5, m6
+    punpcklbw   m0, m6
+    pmaddwd     m7, m3
+    pmaddwd     m4, m3
+    pmaddwd     m5, m3
+    pmaddwd     m0, m3
+    packssdw    m7, m4
+    packssdw    m5, m0
+    pshuflw     m4, m7, q2301
+    pshufhw     m4, m4, q2301
+    pshuflw     m0, m5, q2301
+    pshufhw     m0, m0, q2301
+    paddw       m7, m4
+    paddw       m5, m0
+    psrldq      m7, 2
+    psrldq      m5, 2
+    pshufd      m7, m7, q3120
+    pshufd      m5, m5, q3120
+    punpcklqdq  m7, m5                  ; m7 = two half-sums for each of samples 4..7
+    pshuflw     m4, m1, q2301           ; second reduction round: add the two half-sums of every sample
+    pshufhw     m4, m4, q2301
+    pshuflw     m0, m7, q2301
+    pshufhw     m0, m0, q2301
+    paddw       m1, m4
+    paddw       m7, m0
+    psrldq      m1, 2
+    psrldq      m7, 2
+    pshufd      m1, m1, q3120
+    pshufd      m7, m7, q3120
+    punpcklqdq  m1, m7                  ; m1 = finished 8-tap sums for samples 0..7, in order
+%endmacro
+
+%macro FILTER_H8_W4_sse2 0 ; in: r0 = source row, x = assembly-time column offset, m3 = eight word taps, m6 = zero; out: low four words of m1 = sums; clobbers m0, m4, m5
+    movh        m1, [r0 + x - 3]        ; the 8 source bytes that feed output sample 0
+    movh        m0, [r0 + x - 2]        ; window for output sample 1
+    punpcklbw   m1, m6                  ; widen bytes to words (m6 supplies the zero high bytes)
+    punpcklbw   m0, m6
+    movh        m4, [r0 + x - 1]        ; window for output sample 2
+    movh        m5, [r0 + x]            ; window for output sample 3
+    punpcklbw   m4, m6
+    punpcklbw   m5, m6
+    pmaddwd     m1, m3                  ; multiply by the taps: four dword pair-sums per output sample
+    pmaddwd     m0, m3
+    pmaddwd     m4, m3
+    pmaddwd     m5, m3
+    packssdw    m1, m0                  ; back to words: m1 = partial sums of samples 0 and 1
+    packssdw    m4, m5                  ; m4 = partial sums of samples 2 and 3
+    pshuflw     m0, m1, q2301           ; copy of m1 with every adjacent word pair swapped
+    pshufhw     m0, m0, q2301
+    pshuflw     m5, m4, q2301
+    pshufhw     m5, m5, q2301
+    paddw       m1, m0                  ; both words of each pair now hold that pair's sum
+    paddw       m4, m5
+    psrldq      m1, 2                   ; byte shift plus the q3120 shuffle below collect the four distinct sums in the low qword
+    psrldq      m4, 2
+    pshufd      m1, m1, q3120
+    pshufd      m4, m4, q3120
+    punpcklqdq  m1, m4                  ; m1 = two half-sums for each of samples 0..3
+    pshuflw     m0, m1, q2301           ; second reduction round: add the two half-sums of every sample
+    pshufhw     m0, m0, q2301
+    paddw       m1, m0
+    psrldq      m1, 2
+    pshufd      m1, m1, q3120           ; low qword of m1 = finished sums for samples 0..3; the high qword is leftover data
+%endmacro
+
+;----------------------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+;----------------------------------------------------------------------------------------------------------------------------
+%macro IPFILTER_LUMA_sse2 3 ; arguments: block width, block height, output kind (pp or ps)
+INIT_XMM sse2
+cglobal interp_8tap_horiz_%3_%1x%2, 4,6,8
+    mov       r4d, r4m                      ; r4 = coeffIdx
+    add       r4d, r4d                      ; doubled so that [table + r4 * 8] steps one 16-byte coefficient row per index
+    pxor      m6, m6                        ; m6 = 0, used by the filter macros to widen bytes to words
+
+%ifidn %3, ps
+    add       r3, r3                        ; ps stores 16-bit samples: convert dstStride to bytes (full width keeps the sign of the stride)
+    cmp       r5m, byte 0                   ; isRowExt == 0 ?  nothing before the je below modifies the flags
+%endif
+
+%ifdef PIC
+    lea       r5, [tabw_LumaCoeff]
+    movu      m3, [r5 + r4 * 8]             ; m3 = the eight word taps selected by coeffIdx
+%else
+    movu      m3, [tabw_LumaCoeff + r4 * 8] ; m3 = the eight word taps selected by coeffIdx
+%endif
+
+    mov       r4d, %2                       ; r4 = row counter, starts at the block height
+
+%ifidn %3, pp
+    mova      m2, [pw_32]                   ; rounding term added before the arithmetic shift by 6
+%else
+    mova      m2, [pw_2000]                 ; offset subtracted from every ps sample
+    je        .loopH                        ; isRowExt == 0: filter only the rows of the block itself
+    lea       r5, [r1 + 2 * r1]             ; r5 = 3 * srcStride
+    sub       r0, r5                        ; start three rows higher; pointer math must be full width, a 32-bit destination would zero the upper address bits on x86-64
+    add       r4d, 7                        ; and filter seven additional rows
+%endif
+
+.loopH:
+%assign x 0
+%rep %1 / 8
+    FILTER_H8_W8_sse2
+  %ifidn %3, pp
+    paddw     m1, m2
+    psraw     m1, 6
+    packuswb  m1, m1                        ; saturate the words to unsigned bytes
+    movh      [r2 + x], m1                  ; store 8 pixels
+  %else
+    psubw     m1, m2
+    movu      [r2 + 2 * x], m1              ; store 8 sixteen-bit samples
+  %endif
+%assign x x+8
+%endrep
+
+%rep (%1 % 8) / 4
+    FILTER_H8_W4_sse2
+  %ifidn %3, pp
+    paddw     m1, m2
+    psraw     m1, 6
+    packuswb  m1, m1                        ; saturate the words to unsigned bytes
+    movd      [r2 + x], m1                  ; store the remaining 4 pixels
+  %else
+    psubw     m1, m2
+    movh      [r2 + 2 * x], m1              ; store the remaining 4 sixteen-bit samples
+  %endif
+%endrep
+
+    add       r0, r1                        ; next source row (full-width add preserves the whole pointer)
+    add       r2, r3                        ; next destination row
+
+    dec       r4d
+    jnz       .loopH
+    RET
+
+%endmacro
+
+;--------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;--------------------------------------------------------------------------------------------------------------
+    IPFILTER_LUMA_sse2 4, 4, pp     ; arguments are width, height, kind; each line emits one interp_8tap_horiz_pp_WxH routine
+    IPFILTER_LUMA_sse2 4, 8, pp
+    IPFILTER_LUMA_sse2 8, 4, pp
+    IPFILTER_LUMA_sse2 8, 8, pp
+    IPFILTER_LUMA_sse2 16, 16, pp
+    IPFILTER_LUMA_sse2 16, 8, pp
+    IPFILTER_LUMA_sse2 8, 16, pp
+    IPFILTER_LUMA_sse2 16, 12, pp
+    IPFILTER_LUMA_sse2 12, 16, pp
+    IPFILTER_LUMA_sse2 16, 4, pp
+    IPFILTER_LUMA_sse2 4, 16, pp
+    IPFILTER_LUMA_sse2 32, 32, pp
+    IPFILTER_LUMA_sse2 32, 16, pp
+    IPFILTER_LUMA_sse2 16, 32, pp
+    IPFILTER_LUMA_sse2 32, 24, pp
+    IPFILTER_LUMA_sse2 24, 32, pp
+    IPFILTER_LUMA_sse2 32, 8, pp
+    IPFILTER_LUMA_sse2 8, 32, pp
+    IPFILTER_LUMA_sse2 64, 64, pp
+    IPFILTER_LUMA_sse2 64, 32, pp
+    IPFILTER_LUMA_sse2 32, 64, pp
+    IPFILTER_LUMA_sse2 64, 48, pp
+    IPFILTER_LUMA_sse2 48, 64, pp
+    IPFILTER_LUMA_sse2 64, 16, pp
+    IPFILTER_LUMA_sse2 16, 64, pp
+
+;----------------------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+;----------------------------------------------------------------------------------------------------------------------------
+    IPFILTER_LUMA_sse2 4, 4, ps     ; arguments are width, height, kind; each line emits one interp_8tap_horiz_ps_WxH routine
+    IPFILTER_LUMA_sse2 8, 8, ps
+    IPFILTER_LUMA_sse2 8, 4, ps
+    IPFILTER_LUMA_sse2 4, 8, ps
+    IPFILTER_LUMA_sse2 16, 16, ps
+    IPFILTER_LUMA_sse2 16, 8, ps
+    IPFILTER_LUMA_sse2 8, 16, ps
+    IPFILTER_LUMA_sse2 16, 12, ps
+    IPFILTER_LUMA_sse2 12, 16, ps
+    IPFILTER_LUMA_sse2 16, 4, ps
+    IPFILTER_LUMA_sse2 4, 16, ps
+    IPFILTER_LUMA_sse2 32, 32, ps
+    IPFILTER_LUMA_sse2 32, 16, ps
+    IPFILTER_LUMA_sse2 16, 32, ps
+    IPFILTER_LUMA_sse2 32, 24, ps
+    IPFILTER_LUMA_sse2 24, 32, ps
+    IPFILTER_LUMA_sse2 32, 8, ps
+    IPFILTER_LUMA_sse2 8, 32, ps
+    IPFILTER_LUMA_sse2 64, 64, ps
+    IPFILTER_LUMA_sse2 64, 32, ps
+    IPFILTER_LUMA_sse2 32, 64, ps
+    IPFILTER_LUMA_sse2 64, 48, ps
+    IPFILTER_LUMA_sse2 48, 64, ps
+    IPFILTER_LUMA_sse2 64, 16, ps
+    IPFILTER_LUMA_sse2 16, 64, ps
+
 ;-----------------------------------------------------------------------------
 ; void interp_4tap_horiz_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-----------------------------------------------------------------------------
diff -r e9df93f38066 -r 9b0181193b6a source/common/x86/ipfilter8.h
--- a/source/common/x86/ipfilter8.h	Tue Apr 28 20:24:06 2015 +0800
+++ b/source/common/x86/ipfilter8.h	Tue Apr 28 19:13:40 2015 -0700
@@ -850,6 +850,56 @@
 void x265_interp_4tap_horiz_pp_64x32_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
 void x265_interp_4tap_horiz_pp_64x48_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
 void x265_interp_4tap_horiz_pp_64x64_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_horiz_pp_4x4_sse2(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_horiz_pp_4x8_sse2(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_horiz_pp_4x16_sse2(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_horiz_pp_8x4_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_horiz_pp_8x8_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_horiz_pp_8x16_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_horiz_pp_8x32_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_horiz_pp_12x16_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_horiz_pp_16x4_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_horiz_pp_16x8_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_horiz_pp_16x12_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_horiz_pp_16x16_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_horiz_pp_16x32_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_horiz_pp_16x64_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_horiz_pp_24x32_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_horiz_pp_32x8_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_horiz_pp_32x16_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_horiz_pp_32x24_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_horiz_pp_32x32_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_horiz_pp_32x64_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_horiz_pp_48x64_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_horiz_pp_64x16_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_horiz_pp_64x32_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_horiz_pp_64x48_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_horiz_pp_64x64_sse2(const pixel* src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_8tap_horiz_ps_4x4_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_4x8_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_4x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_8x4_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_8x8_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_8x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_8x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_12x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_16x4_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_16x8_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_16x12_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_16x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_16x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_16x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_24x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_32x8_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_32x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_32x24_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_32x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_32x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_48x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_64x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_64x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_64x48_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_64x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
 #undef LUMA_FILTERS
 #undef LUMA_SP_FILTERS
 #undef LUMA_SS_FILTERS


More information about the x265-devel mailing list