[x265] [PATCH] asm: interp_4tap_horiz_ps sse3

dtyx265 at gmail.com dtyx265 at gmail.com
Fri May 22 04:25:07 CEST 2015


# HG changeset patch
# User David T Yuen <dtyx265 at gmail.com>
# Date 1432261447 25200
# Node ID 4330ef5ddfcb64b1a621149fca0a4550c2a2f36f
# Parent  234bc93bd51698801fad77cc861177ed019f5113
asm: interp_4tap_horiz_ps sse3

This replaces the C code for all interp_4tap_horiz_ps block sizes with SSE3 assembly.

64-bit

./test/TestBench --testbench interp | grep chroma_hps
chroma_hps[  4x4]	1.79x 	 1572.53  	 2815.09
chroma_hps[  8x8]	1.79x 	 4885.74  	 8750.62
chroma_hps[16x16]	1.78x 	 15859.17 	 28211.22
chroma_hps[32x32]	4.36x 	 57531.00 	 250840.08
chroma_hps[  4x2]	1.77x 	 1148.16  	 2035.62
chroma_hps[  2x4]	1.67x 	 950.38   	 1585.96
chroma_hps[  8x4]	1.86x 	 2972.69  	 5517.80
chroma_hps[  4x8]	1.68x 	 2695.82  	 4522.98
chroma_hps[ 16x8]	1.76x 	 9335.05  	 16452.60
chroma_hps[ 8x16]	1.82x 	 8200.17  	 14912.77
chroma_hps[32x16]	4.34x 	 31368.87 	 136237.14
chroma_hps[16x32]	1.79x 	 29065.31 	 52099.11
chroma_hps[  8x6]	1.85x 	 3842.61  	 7092.53
chroma_hps[  6x8]	1.77x 	 3760.28  	 6662.62
chroma_hps[  8x2]	1.85x 	 2145.25  	 3978.48
chroma_hps[  2x8]	1.47x 	 1730.01  	 2545.01
chroma_hps[16x12]	1.77x 	 12595.11 	 22332.64
chroma_hps[12x16]	1.77x 	 12095.12 	 21372.96
chroma_hps[ 16x4]	1.80x 	 5800.24  	 10418.40
chroma_hps[ 4x16]	1.74x 	 4400.00  	 7643.22
chroma_hps[32x24]	4.37x 	 44308.78 	 193781.08
chroma_hps[24x32]	3.77x 	 43365.97 	 163699.78
chroma_hps[ 32x8]	4.33x 	 18220.22 	 78915.17
chroma_hps[ 8x32]	1.84x 	 14875.27 	 27300.11
chroma_hps[  4x8]	1.68x 	 2700.06  	 4527.49
chroma_hps[ 8x16]	1.82x 	 8200.08  	 14910.00
chroma_hps[16x32]	1.79x 	 29000.16 	 51820.20
chroma_hps[32x64]	4.38x 	 109560.25 	 479846.81
chroma_hps[  4x4]	1.79x 	 1572.82  	 2815.21
chroma_hps[  2x8]	1.47x 	 1730.10  	 2549.97
chroma_hps[  8x8]	1.79x 	 4890.03  	 8749.99
chroma_hps[ 4x16]	1.74x 	 4399.99  	 7647.49
chroma_hps[16x16]	1.78x 	 15855.35 	 28211.13
chroma_hps[ 8x32]	1.84x 	 14865.23 	 27300.26
chroma_hps[32x32]	4.37x 	 57474.61 	 251064.52
chroma_hps[16x64]	1.80x 	 55211.66 	 99111.59
chroma_hps[ 8x12]	1.81x 	 6550.03  	 11827.51
chroma_hps[ 6x16]	1.81x 	 6240.21  	 11299.97
chroma_hps[  8x4]	1.86x 	 2972.75  	 5515.43
chroma_hps[ 2x16]	1.59x 	 2661.93  	 4229.99
chroma_hps[16x24]	1.79x 	 22375.11 	 39970.46
chroma_hps[12x32]	1.78x 	 22055.13 	 39222.56
chroma_hps[ 16x8]	1.76x 	 9334.98  	 16452.48
chroma_hps[ 4x32]	1.76x 	 7910.12  	 13933.02
chroma_hps[32x48]	4.38x 	 83471.63 	 365354.94
chroma_hps[24x64]	3.79x 	 82642.20 	 312817.06
chroma_hps[32x16]	4.37x 	 31278.96 	 136734.84
chroma_hps[ 8x64]	1.86x 	 28032.05 	 52264.57
chroma_hps[  4x4]	1.79x 	 1572.48  	 2810.34
chroma_hps[  8x8]	1.79x 	 4890.07  	 8752.48
chroma_hps[16x16]	1.78x 	 15856.51 	 28212.94
chroma_hps[32x32]	4.36x 	 57629.55 	 251098.31
chroma_hps[64x64]	4.48x 	 218639.95 	 980187.25
chroma_hps[  8x4]	1.86x 	 2972.61  	 5514.96
chroma_hps[  4x8]	1.68x 	 2694.99  	 4527.69
chroma_hps[ 16x8]	1.76x 	 9330.05  	 16449.97
chroma_hps[ 8x16]	1.82x 	 8195.16  	 14910.27
chroma_hps[32x16]	4.37x 	 31269.81 	 136657.97
chroma_hps[16x32]	1.80x 	 28982.21 	 52138.08
chroma_hps[64x32]	4.43x 	 114447.45 	 506502.94
chroma_hps[32x64]	4.38x 	 109630.23 	 479973.81
chroma_hps[16x12]	1.77x 	 12590.04 	 22332.41
chroma_hps[12x16]	1.77x 	 12090.26 	 21373.04
chroma_hps[ 16x4]	1.80x 	 5795.19  	 10418.04
chroma_hps[ 4x16]	1.74x 	 4395.00  	 7642.60
chroma_hps[32x24]	4.38x 	 44240.22 	 193966.91
chroma_hps[24x32]	3.77x 	 43525.50 	 163990.25
chroma_hps[ 32x8]	4.35x 	 18219.50 	 79262.66
chroma_hps[ 8x32]	1.85x 	 14875.31 	 27582.50
chroma_hps[64x48]	4.51x 	 166509.48 	 750447.31
chroma_hps[48x64]	4.37x 	 164116.52 	 717985.25
chroma_hps[64x16]	4.42x 	 62320.70 	 275730.50
chroma_hps[16x64]	1.79x 	 55302.99 	 99178.89

32-bit

./test/TestBench --testbench interp | grep chroma_hps
chroma_hps[  4x4]	1.84x 	 1635.25  	 3002.56
chroma_hps[  8x8]	1.81x 	 4950.23  	 8950.84
chroma_hps[16x16]	1.82x 	 15833.09 	 28756.21
chroma_hps[32x32]	2.85x 	 58060.22 	 165384.47
chroma_hps[  4x2]	1.84x 	 1210.67  	 2223.03
chroma_hps[  2x4]	1.64x 	 1060.56  	 1734.07
chroma_hps[  8x4]	1.89x 	 3030.20  	 5722.98
chroma_hps[  4x8]	1.69x 	 2769.99  	 4680.27
chroma_hps[ 16x8]	1.78x 	 9352.70  	 16650.36
chroma_hps[ 8x16]	1.84x 	 8260.39  	 15190.88
chroma_hps[32x16]	2.88x 	 31450.75 	 90464.86
chroma_hps[16x32]	1.81x 	 28938.98 	 52250.69
chroma_hps[  8x6]	1.87x 	 3912.69  	 7312.47
chroma_hps[  6x8]	1.73x 	 3837.58  	 6620.48
chroma_hps[  8x2]	1.88x 	 2210.28  	 4163.42
chroma_hps[  2x8]	1.51x 	 1785.43  	 2690.27
chroma_hps[16x12]	1.79x 	 12595.43 	 22570.54
chroma_hps[12x16]	1.81x 	 12033.56 	 21830.21
chroma_hps[ 16x4]	1.82x 	 5835.04  	 10620.58
chroma_hps[ 4x16]	1.74x 	 4471.59  	 7800.57
chroma_hps[32x24]	2.87x 	 44425.31 	 127532.01
chroma_hps[24x32]	2.91x 	 43241.68 	 125888.24
chroma_hps[ 32x8]	2.85x 	 18332.81 	 52325.82
chroma_hps[ 8x32]	1.86x 	 14912.98 	 27770.14
chroma_hps[  4x8]	1.69x 	 2772.78  	 4680.42
chroma_hps[ 8x16]	1.84x 	 8260.74  	 15192.68
chroma_hps[16x32]	1.82x 	 29000.77 	 52737.20
chroma_hps[32x64]	2.87x 	 110187.34 	 315945.28
chroma_hps[  4x4]	1.83x 	 1640.56  	 3003.12
chroma_hps[  2x8]	1.51x 	 1785.26  	 2689.93
chroma_hps[  8x8]	1.81x 	 4950.69  	 8949.97
chroma_hps[ 4x16]	1.74x 	 4470.17  	 7799.97
chroma_hps[16x16]	1.80x 	 15835.98 	 28490.66
chroma_hps[ 8x32]	1.86x 	 14912.46 	 27772.34
chroma_hps[32x32]	2.87x 	 57640.21 	 165431.17
chroma_hps[16x64]	1.81x 	 55077.57 	 99806.91
chroma_hps[ 8x12]	1.83x 	 6610.44  	 12070.25
chroma_hps[ 6x16]	1.77x 	 6318.38  	 11179.97
chroma_hps[  8x4]	1.89x 	 3030.13  	 5720.32
chroma_hps[ 2x16]	1.60x 	 2729.99  	 4370.72
chroma_hps[16x24]	1.81x 	 22377.56 	 40517.34
chroma_hps[12x32]	1.83x 	 21904.77 	 40024.01
chroma_hps[ 16x8]	1.78x 	 9353.93  	 16651.38
chroma_hps[ 4x32]	1.77x 	 7972.56  	 14099.94
chroma_hps[32x48]	2.93x 	 83770.07 	 245542.20
chroma_hps[24x64]	3.01x 	 82919.81 	 249421.81
chroma_hps[32x16]	2.89x 	 31399.82 	 90718.93
chroma_hps[ 8x64]	1.89x 	 28091.46 	 53014.97
chroma_hps[  4x4]	1.84x 	 1635.45  	 3003.62
chroma_hps[  8x8]	1.81x 	 4953.01  	 8950.02
chroma_hps[16x16]	1.82x 	 15833.00 	 28808.78
chroma_hps[32x32]	2.87x 	 57625.32 	 165363.50
chroma_hps[64x64]	2.84x 	 220439.47 	 625157.88
chroma_hps[  8x4]	1.89x 	 3030.37  	 5722.53
chroma_hps[  4x8]	1.69x 	 2770.17  	 4681.64
chroma_hps[ 16x8]	1.78x 	 9355.00  	 16651.06
chroma_hps[ 8x16]	1.84x 	 8260.25  	 15190.74
chroma_hps[32x16]	2.86x 	 31415.09 	 89991.48
chroma_hps[16x32]	1.83x 	 28951.90 	 52869.09
chroma_hps[64x32]	2.84x 	 114485.48 	 324925.31
chroma_hps[32x64]	2.84x 	 111272.12 	 316104.81
chroma_hps[16x12]	1.79x 	 12594.20 	 22572.86
chroma_hps[12x16]	1.81x 	 12033.10 	 21829.97
chroma_hps[ 16x4]	1.82x 	 5831.06  	 10621.60
chroma_hps[ 4x16]	1.74x 	 4472.70  	 7800.57
chroma_hps[32x24]	2.87x 	 44476.68 	 127812.44
chroma_hps[24x32]	2.91x 	 43258.67 	 126067.69
chroma_hps[ 32x8]	2.84x 	 18384.90 	 52140.39
chroma_hps[ 8x32]	1.86x 	 14912.74 	 27770.40
chroma_hps[64x48]	2.83x 	 168026.05 	 474962.94
chroma_hps[48x64]	2.85x 	 165469.59 	 471339.94
chroma_hps[64x16]	2.84x 	 62722.09 	 177881.62
chroma_hps[16x64]	1.81x 	 55289.16 	 100343.34

diff -r 234bc93bd516 -r 4330ef5ddfcb source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu May 21 16:34:48 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu May 21 19:24:07 2015 -0700
@@ -1614,6 +1614,9 @@
         ALL_CHROMA_420_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
         ALL_CHROMA_422_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
         ALL_CHROMA_444_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
+        ALL_CHROMA_420_PU(filter_hps, interp_4tap_horiz_ps, sse3);
+        ALL_CHROMA_422_PU(filter_hps, interp_4tap_horiz_ps, sse3);
+        ALL_CHROMA_444_PU(filter_hps, interp_4tap_horiz_ps, sse3);
     }
     if (cpuMask & X265_CPU_SSSE3)
     {
diff -r 234bc93bd516 -r 4330ef5ddfcb source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Thu May 21 16:34:48 2015 +0530
+++ b/source/common/x86/ipfilter8.asm	Thu May 21 19:24:07 2015 -0700
@@ -641,6 +641,123 @@
     IPFILTER_CHROMA_sse3 48, 64
     IPFILTER_CHROMA_sse3 64, 16
 
+%macro FILTER_2 2 ; args: src byte offset, dst byte offset - filters 2 pixels of one row, clobbers m3-m4
+    movd        m3,     [srcq + %1]         ; 4 source bytes s0..s3: taps for output pixel 0
+    movd        m4,     [srcq + 1 + %1]     ; 4 source bytes s1..s4: taps for output pixel 1
+    punpckldq   m3,     m4                  ; low 8 bytes = s0..s3, s1..s4
+    punpcklbw   m3,     m0                  ; interleave with m0 (zeroed by the calling macro) to widen bytes to words
+    pmaddwd     m3,     m1                  ; multiply by the words in m1 and add adjacent pairs: 2 partial dword sums per pixel
+    packssdw    m3,     m3                  ; pack the partial sums to words with signed saturation
+    pshuflw     m4,     m3, q2301           ; swap the two words of each pair
+    paddw       m3,     m4                  ; words = p0 p0 p1 p1 (complete 4-tap sums)
+    psrldq      m3,     2                   ; shift right by one word: words 0 and 1 = p0 p1
+    psubw       m3,     m2                  ; subtract the word constant held in m2
+    movd        [dstq + %2], m3             ; store 2 int16 results (4 bytes)
+%endmacro
+
+%macro FILTER_4 2 ; args: src byte offset, dst byte offset - filters 4 pixels of one row, clobbers m3-m5
+    movd        m3,     [srcq + %1]         ; s0..s3: taps for output pixel 0
+    movd        m4,     [srcq + 1 + %1]     ; s1..s4: taps for output pixel 1
+    punpckldq   m3,     m4
+    punpcklbw   m3,     m0                  ; widen bytes to words (m0 is zeroed by the calling macro)
+    pmaddwd     m3,     m1                  ; two partial dword sums for each of pixels 0 and 1
+    movd        m4,     [srcq + 2 + %1]     ; s2..s5: taps for output pixel 2
+    movd        m5,     [srcq + 3 + %1]     ; s3..s6: taps for output pixel 3
+    punpckldq   m4,     m5
+    punpcklbw   m4,     m0
+    pmaddwd     m4,     m1                  ; two partial dword sums for each of pixels 2 and 3
+    packssdw    m3,     m4                  ; 8 words: one pair of partial sums per pixel, pixels 0..3 in order
+    pshuflw     m4,     m3, q2301           ; swap the words of each pair in the low half ...
+    pshufhw     m4,     m4, q2301           ; ... and in the high half
+    paddw       m3,     m4                  ; words = p0 p0 p1 p1 p2 p2 p3 p3 (complete 4-tap sums)
+    psrldq      m3,     2                   ; words = p0 p1 p1 p2 p2 p3 p3 0
+    pshufd      m3,     m3,     q3120       ; swap the middle dwords so the low qword is p0 p1 p2 p3
+    psubw       m3,     m2                  ; subtract the word constant held in m2
+    movh        [dstq + %2], m3             ; store 4 int16 results (low 8 bytes)
+%endmacro
+
+%macro FILTER_4TAP_HPS_sse3 2 ; args: block width, block height - emits one interp_4tap_horiz_ps function
+INIT_XMM sse3
+cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 6, src, srcstride, dst, dststride
+    mov         r4d,    r4m                 ; r4d = 5th argument (coeffIdx in the C prototype)
+    add         dststrided, dststrided      ; dst is int16_t: double the stride to get bytes. NOTE(review): 32-bit add, assumes dstStride >= 0 on x86-64 - confirm
+    mova        m2,     [pw_2000]           ; word constant subtracted from every result
+    pxor        m0,     m0                  ; zero register used to widen bytes to words
+
+%ifdef PIC
+    lea         r6,     [tabw_ChromaCoeff]
+    movddup     m1,     [r6 + r4 * 8]       ; 8-byte table entry number r4, duplicated into both qwords (presumably 4 word coefficients - confirm)
+%else
+    movddup     m1,     [tabw_ChromaCoeff + r4 * 8]
+%endif
+
+    mov        r4d,     %2                  ; r4d is reused as the row counter, starting at the block height
+    cmp        r5m,     byte 0              ; 6th argument (isRowExt in the C prototype)
+    je         .loopH
+    sub        srcq,    srcstrideq          ; isRowExt != 0: start one row above the block ...
+    add        r4d,     3                   ; ... and filter 3 extra rows in total
+
+.loopH:
+%assign x -1                                ; src byte offset: the first load starts one byte before srcq
+%assign y 0                                 ; dst byte offset
+%rep %1/4
+    FILTER_4 x,y                            ; 4 pixels per step, fully unrolled across the row
+%assign x x+4
+%assign y y+8
+%endrep
+%rep (%1 % 4)/2
+    FILTER_2 x,y                            ; 2-pixel tail, emitted for the 2- and 6-wide blocks
+%endrep
+    add         srcq,   srcstrideq
+    add         dstq,   dststrideq
+
+    dec         r4d
+    jnz         .loopH
+    RET
+
+%endmacro
+
+    FILTER_4TAP_HPS_sse3 2, 4 ; each line in this list emits interp_4tap_horiz_ps_WxH for the given width, height
+    FILTER_4TAP_HPS_sse3 2, 8
+    FILTER_4TAP_HPS_sse3 2, 16
+    FILTER_4TAP_HPS_sse3 4, 2
+    FILTER_4TAP_HPS_sse3 4, 4
+    FILTER_4TAP_HPS_sse3 4, 8
+    FILTER_4TAP_HPS_sse3 4, 16
+    FILTER_4TAP_HPS_sse3 4, 32
+    FILTER_4TAP_HPS_sse3 6, 8
+    FILTER_4TAP_HPS_sse3 6, 16
+    FILTER_4TAP_HPS_sse3 8, 2
+    FILTER_4TAP_HPS_sse3 8, 4
+    FILTER_4TAP_HPS_sse3 8, 6
+    FILTER_4TAP_HPS_sse3 8, 8
+    FILTER_4TAP_HPS_sse3 8, 12
+    FILTER_4TAP_HPS_sse3 8, 16
+    FILTER_4TAP_HPS_sse3 8, 32
+    FILTER_4TAP_HPS_sse3 8, 64
+    FILTER_4TAP_HPS_sse3 12, 16
+    FILTER_4TAP_HPS_sse3 12, 32
+    FILTER_4TAP_HPS_sse3 16, 4
+    FILTER_4TAP_HPS_sse3 16, 8
+    FILTER_4TAP_HPS_sse3 16, 12
+    FILTER_4TAP_HPS_sse3 16, 16
+    FILTER_4TAP_HPS_sse3 16, 24
+    FILTER_4TAP_HPS_sse3 16, 32
+    FILTER_4TAP_HPS_sse3 16, 64
+    FILTER_4TAP_HPS_sse3 24, 32
+    FILTER_4TAP_HPS_sse3 24, 64
+    FILTER_4TAP_HPS_sse3 32,  8
+    FILTER_4TAP_HPS_sse3 32, 16
+    FILTER_4TAP_HPS_sse3 32, 24
+    FILTER_4TAP_HPS_sse3 32, 32
+    FILTER_4TAP_HPS_sse3 32, 48
+    FILTER_4TAP_HPS_sse3 32, 64
+    FILTER_4TAP_HPS_sse3 48, 64
+    FILTER_4TAP_HPS_sse3 64, 16
+    FILTER_4TAP_HPS_sse3 64, 32
+    FILTER_4TAP_HPS_sse3 64, 48
+    FILTER_4TAP_HPS_sse3 64, 64
+
 %macro FILTER_H8_W8_sse2 0
     movh        m1, [r0 + x - 3]
     movh        m4, [r0 + x - 2]
diff -r 234bc93bd516 -r 4330ef5ddfcb source/common/x86/ipfilter8.h
--- a/source/common/x86/ipfilter8.h	Thu May 21 16:34:48 2015 +0530
+++ b/source/common/x86/ipfilter8.h	Thu May 21 19:24:07 2015 -0700
@@ -854,6 +854,47 @@
 void x265_interp_4tap_horiz_pp_64x32_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
 void x265_interp_4tap_horiz_pp_64x48_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
 void x265_interp_4tap_horiz_pp_64x64_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_ps_2x4_sse3(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_4tap_horiz_ps_2x8_sse3(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_4tap_horiz_ps_2x16_sse3(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_4tap_horiz_ps_4x2_sse3(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_4tap_horiz_ps_4x4_sse3(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_4tap_horiz_ps_4x8_sse3(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_4tap_horiz_ps_4x16_sse3(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_4tap_horiz_ps_4x32_sse3(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_4tap_horiz_ps_6x8_sse3(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_4tap_horiz_ps_6x16_sse3(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_4tap_horiz_ps_8x2_sse3(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_4tap_horiz_ps_8x4_sse3(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_4tap_horiz_ps_8x6_sse3(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_4tap_horiz_ps_8x8_sse3(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_4tap_horiz_ps_8x12_sse3(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_4tap_horiz_ps_8x16_sse3(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_4tap_horiz_ps_8x32_sse3(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_4tap_horiz_ps_8x64_sse3(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_4tap_horiz_ps_12x16_sse3(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_4tap_horiz_ps_12x16_sse3(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_4tap_horiz_ps_12x32_sse3(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_4tap_horiz_ps_16x4_sse3(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_4tap_horiz_ps_16x8_sse3(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_4tap_horiz_ps_16x12_sse3(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_4tap_horiz_ps_16x16_sse3(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_4tap_horiz_ps_16x24_sse3(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_4tap_horiz_ps_16x32_sse3(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_4tap_horiz_ps_16x64_sse3(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_4tap_horiz_ps_24x32_sse3(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_4tap_horiz_ps_24x64_sse3(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_4tap_horiz_ps_32x8_sse3(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_4tap_horiz_ps_32x16_sse3(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_4tap_horiz_ps_32x24_sse3(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_4tap_horiz_ps_32x32_sse3(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_4tap_horiz_ps_32x48_sse3(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_4tap_horiz_ps_32x64_sse3(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_4tap_horiz_ps_64x16_sse3(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_4tap_horiz_ps_64x32_sse3(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_4tap_horiz_ps_64x48_sse3(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_4tap_horiz_ps_64x64_sse3(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_4tap_horiz_ps_48x64_sse3(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
 void x265_interp_8tap_horiz_pp_4x4_sse2(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
 void x265_interp_8tap_horiz_pp_4x8_sse2(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
 void x265_interp_8tap_horiz_pp_4x16_sse2(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);


More information about the x265-devel mailing list