[x265] [PATCH] asm: interp_4tap_horiz_pp sse3

dtyx265 at gmail.com dtyx265 at gmail.com
Wed Apr 22 03:13:47 CEST 2015


# HG changeset patch
# User David T Yuen <dtyx265 at gmail.com>
# Date 1429665160 25200
# Node ID defd1cf26749f3395750ef9128c9a90bfa2caf78
# Parent  c135c117ffb083a00d4353279ea669e8f3f7a8ee
asm: interp_4tap_horiz_pp sse3

This replaces the C code for 6x8, 6x16, 8x2, 8x4, 8x6, 8x8, 8x12, 8x16, 8x32, 8x64, 12x16, 12x32, 16x4, 16x8,
16x12, 16x16, 16x24, 16x32, 16x64, 24x32, 24x64, 32x8, 32x16, 32x24, 32x32, 32x48, 32x64, 48x64, 64x16,
64x32, 64x48, 64x64

The ALL_CHROMA_420_PU, ALL_CHROMA_422_PU and ALL_CHROMA_444_PU macros are used to register the primitives in asm-primitives.cpp

64-bit

./test/TestBench --testbench interp | grep hpp
chroma_hpp[  8x8]	3.01x 	 3090.03  	 9315.59
chroma_hpp[16x16]	3.12x 	 11813.55 	 36843.73
chroma_hpp[32x32]	3.48x 	 46795.53 	 162656.64
chroma_hpp[  8x4]	2.94x 	 1570.01  	 4616.06
chroma_hpp[ 16x8]	3.12x 	 5935.00  	 18531.73
chroma_hpp[ 8x16]	3.02x 	 6132.50  	 18522.87
chroma_hpp[32x16]	3.45x 	 23332.54 	 80401.67
chroma_hpp[16x32]	3.08x 	 23632.81 	 72831.01
chroma_hpp[  8x6]	2.93x 	 2340.00  	 6865.84
chroma_hpp[  6x8]	2.52x 	 2810.02  	 7078.24
chroma_hpp[  8x2]	2.26x 	 812.50   	 1832.25
chroma_hpp[16x12]	3.10x 	 8874.99  	 27547.49
chroma_hpp[12x16]	2.88x 	 9537.65  	 27476.47
chroma_hpp[ 16x4]	3.05x 	 2995.00  	 9139.82
chroma_hpp[32x24]	3.44x 	 35116.93 	 120804.25
chroma_hpp[24x32]	3.49x 	 35175.14 	 122591.48
chroma_hpp[ 32x8]	3.46x 	 11692.55 	 40400.35
chroma_hpp[ 8x32]	2.99x 	 12239.74 	 36603.12
chroma_hpp[ 8x16]	3.02x 	 6132.29  	 18520.52
chroma_hpp[16x32]	3.08x 	 23706.51 	 73120.24
chroma_hpp[32x64]	3.44x 	 93688.69 	 322076.91
chroma_hpp[  8x8]	3.01x 	 3092.83  	 9313.43
chroma_hpp[16x16]	3.11x 	 11812.58 	 36774.01
chroma_hpp[ 8x32]	3.00x 	 12211.42 	 36602.97
chroma_hpp[32x32]	3.45x 	 46773.89 	 161223.02
chroma_hpp[16x64]	3.10x 	 47342.81 	 146912.95
chroma_hpp[ 8x12]	3.04x 	 4612.09  	 14000.78
chroma_hpp[ 6x16]	2.49x 	 5572.52  	 13871.37
chroma_hpp[  8x4]	2.93x 	 1572.50  	 4612.88
chroma_hpp[16x24]	3.08x 	 17693.42 	 54546.35
chroma_hpp[12x32]	2.89x 	 19018.60 	 54936.23
chroma_hpp[ 16x8]	3.10x 	 5935.00  	 18377.62
chroma_hpp[32x48]	3.43x 	 70290.28 	 241380.27
chroma_hpp[24x64]	3.45x 	 70691.30 	 244043.80
chroma_hpp[32x16]	3.45x 	 23336.01 	 80519.96
chroma_hpp[ 8x64]	3.00x 	 24447.52 	 73434.10
chroma_hpp[  8x8]	3.01x 	 3090.03  	 9312.50
chroma_hpp[16x16]	3.12x 	 11815.11 	 36889.00
chroma_hpp[32x32]	3.45x 	 46777.54 	 161214.95
chroma_hpp[64x64]	3.26x 	 195004.23 	 635334.94
chroma_hpp[  8x4]	2.94x 	 1569.99  	 4612.84
chroma_hpp[ 16x8]	3.10x 	 5935.25  	 18378.44
chroma_hpp[ 8x16]	3.02x 	 6132.50  	 18520.00
chroma_hpp[32x16]	3.45x 	 23333.03 	 80500.84
chroma_hpp[16x32]	3.11x 	 23575.12 	 73354.38
chroma_hpp[64x32]	3.44x 	 93016.68 	 319740.38
chroma_hpp[32x64]	3.43x 	 93765.36 	 321706.41
chroma_hpp[16x12]	3.10x 	 8875.39  	 27545.59
chroma_hpp[12x16]	2.88x 	 9545.72  	 27476.17
chroma_hpp[ 16x4]	3.05x 	 2995.00  	 9139.99
chroma_hpp[32x24]	3.44x 	 35117.75 	 120899.98
chroma_hpp[24x32]	3.47x 	 35270.21 	 122474.86
chroma_hpp[ 32x8]	3.48x 	 11695.03 	 40736.42
chroma_hpp[ 8x32]	3.00x 	 12211.35 	 36602.55
chroma_hpp[64x48]	3.40x 	 140230.58 	 477478.03
chroma_hpp[48x64]	3.36x 	 142474.17 	 478603.38
chroma_hpp[64x16]	2.17x 	 73541.03 	 159600.50
chroma_hpp[16x64]	3.18x 	 47272.91 	 150339.42

32-bit

./test/TestBench --testbench interp | grep hpp
chroma_hpp[  8x8]	2.96x 	 3164.99  	 9352.63
chroma_hpp[16x16]	3.09x 	 11885.01 	 36676.23
chroma_hpp[32x32]	3.47x 	 46802.81 	 162473.30
chroma_hpp[  8x4]	2.86x 	 1645.03  	 4704.06
chroma_hpp[ 16x8]	3.06x 	 6005.04  	 18378.57
chroma_hpp[ 8x16]	2.97x 	 6212.50  	 18430.57
chroma_hpp[32x16]	3.47x 	 23405.02 	 81117.30
chroma_hpp[16x32]	3.09x 	 23645.19 	 73064.45
chroma_hpp[  8x6]	2.89x 	 2405.00  	 6942.87
chroma_hpp[  6x8]	2.46x 	 2905.00  	 7155.64
chroma_hpp[  8x2]	2.69x 	 885.00   	 2379.88
chroma_hpp[16x12]	3.07x 	 8945.03  	 27458.76
chroma_hpp[12x16]	2.89x 	 9607.83  	 27761.21
chroma_hpp[ 16x4]	3.01x 	 3065.00  	 9231.61
chroma_hpp[32x24]	3.45x 	 35195.53 	 121283.95
chroma_hpp[24x32]	3.54x 	 35269.86 	 124809.67
chroma_hpp[ 32x8]	3.50x 	 11765.07 	 41123.75
chroma_hpp[ 8x32]	2.99x 	 12285.04 	 36677.12
chroma_hpp[ 8x16]	2.97x 	 6212.50  	 18430.08
chroma_hpp[16x32]	3.08x 	 23714.90 	 73078.69
chroma_hpp[32x64]	3.48x 	 93567.27 	 325623.00
chroma_hpp[  8x8]	2.95x 	 3165.46  	 9352.40
chroma_hpp[16x16]	3.09x 	 11885.09 	 36737.30
chroma_hpp[ 8x32]	2.96x 	 12285.27 	 36415.00
chroma_hpp[32x32]	3.49x 	 46867.79 	 163765.89
chroma_hpp[16x64]	3.08x 	 47237.70 	 145644.56
chroma_hpp[ 8x12]	2.98x 	 4685.07  	 13965.77
chroma_hpp[ 6x16]	2.47x 	 5665.01  	 13973.91
chroma_hpp[  8x4]	2.86x 	 1645.01  	 4703.00
chroma_hpp[16x24]	3.06x 	 17765.03 	 54399.01
chroma_hpp[12x32]	2.88x 	 19078.06 	 54941.30
chroma_hpp[ 16x8]	3.05x 	 6006.28  	 18330.82
chroma_hpp[32x48]	3.47x 	 70182.23 	 243806.36
chroma_hpp[24x64]	3.52x 	 70337.70 	 247270.98
chroma_hpp[32x16]	3.47x 	 23405.88 	 81119.58
chroma_hpp[ 8x64]	2.97x 	 24510.83 	 72856.95
chroma_hpp[  8x8]	2.95x 	 3165.02  	 9352.55
chroma_hpp[16x16]	3.06x 	 11885.03 	 36391.55
chroma_hpp[32x32]	3.48x 	 46826.54 	 162810.08
chroma_hpp[64x64]	3.27x 	 194595.14 	 636533.44
chroma_hpp[  8x4]	2.86x 	 1644.97  	 4703.04
chroma_hpp[ 16x8]	3.05x 	 6004.99  	 18331.19
chroma_hpp[ 8x16]	2.97x 	 6212.49  	 18430.20
chroma_hpp[32x16]	3.47x 	 23405.07 	 81204.01
chroma_hpp[16x32]	3.08x 	 23645.21 	 72742.43
chroma_hpp[64x32]	3.45x 	 93687.98 	 323369.06
chroma_hpp[32x64]	3.47x 	 94019.62 	 325988.94
chroma_hpp[16x12]	3.06x 	 8945.09  	 27410.79
chroma_hpp[12x16]	2.86x 	 9605.17  	 27448.21
chroma_hpp[ 16x4]	3.01x 	 3065.00  	 9231.73
chroma_hpp[32x24]	3.46x 	 35197.44 	 121608.38
chroma_hpp[24x32]	3.75x 	 35264.05 	 132129.23
chroma_hpp[ 32x8]	3.47x 	 11765.57 	 40848.62
chroma_hpp[ 8x32]	2.96x 	 12285.88 	 36418.82
chroma_hpp[64x48]	3.44x 	 140957.80 	 484415.38
chroma_hpp[48x64]	3.45x 	 139828.09 	 482536.94
chroma_hpp[64x16]	3.45x 	 46615.57 	 160827.50
chroma_hpp[16x64]	3.07x 	 47526.13 	 145913.83

diff -r c135c117ffb0 -r defd1cf26749 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Apr 21 13:42:36 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp	Tue Apr 21 18:12:40 2015 -0700
@@ -1407,18 +1407,9 @@
     }
     if (cpuMask & X265_CPU_SSE3)
     {
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].filter_hpp = x265_interp_4tap_horiz_pp_2x4_sse3;
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].filter_hpp = x265_interp_4tap_horiz_pp_2x8_sse3;
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].filter_hpp = x265_interp_4tap_horiz_pp_4x2_sse3;
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].filter_hpp = x265_interp_4tap_horiz_pp_4x4_sse3;
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].filter_hpp = x265_interp_4tap_horiz_pp_4x8_sse3;
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].filter_hpp = x265_interp_4tap_horiz_pp_4x16_sse3;
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].filter_hpp = x265_interp_4tap_horiz_pp_2x8_sse3;
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].filter_hpp = x265_interp_4tap_horiz_pp_2x16_sse3;
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].filter_hpp = x265_interp_4tap_horiz_pp_4x4_sse3;
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].filter_hpp = x265_interp_4tap_horiz_pp_4x8_sse3;
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].filter_hpp = x265_interp_4tap_horiz_pp_4x16_sse3;
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].filter_hpp = x265_interp_4tap_horiz_pp_4x32_sse3;
+        ALL_CHROMA_420_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
+        ALL_CHROMA_422_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
+        ALL_CHROMA_444_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
     }
     if (cpuMask & X265_CPU_SSSE3)
     {
diff -r c135c117ffb0 -r defd1cf26749 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Tue Apr 21 13:42:36 2015 -0500
+++ b/source/common/x86/ipfilter8.asm	Tue Apr 21 18:12:40 2015 -0700
@@ -578,6 +578,285 @@
 
     RET
 
+%macro FILTER_H4_w6_sse2 0 ; filter one 6-pixel row: reads srcq, writes 6 bytes at dstq; uses m6 (coeffs), m7 (rounding)
+    pxor        m4, m4                  ; m4 = 0, used below to zero-extend bytes to words
+    movh        m0, [srcq - 1]          ; 8 source bytes starting at src-1
+    movh        m5, [srcq]              ; 8 source bytes starting at src
+    punpckldq   m0, m5                  ; dwords = 4-byte tap windows for output pixels 0, 1, 4, 5
+    movhlps     m2, m0                  ; m2 low half = windows for pixels 4, 5
+    punpcklbw   m0, m4                  ; windows for pixels 0, 1 widened to words
+    punpcklbw   m2, m4                  ; windows for pixels 4, 5 widened to words
+    movd        m1, [srcq + 1]          ; 4-byte window for pixel 2
+    movd        m5, [srcq + 2]          ; 4-byte window for pixel 3
+    punpckldq   m1, m5                  ; m1 low half = windows for pixels 2, 3
+    punpcklbw   m1, m4                  ; widened to words
+    pmaddwd     m0, m6                  ; multiply by coefficients: two partial dword sums per pixel
+    pmaddwd     m1, m6
+    pmaddwd     m2, m6
+    packssdw    m0, m1                  ; partial sums of pixels 0..3 as words
+    packssdw    m2, m2                  ; partial sums of pixels 4, 5 as words (copied into both halves)
+    pshuflw     m1, m0, q2301           ; swap adjacent words so each partial sum meets its partner
+    pshufhw     m1, m1, q2301
+    pshuflw     m3, m2, q2301           ; only the low half of m2 is needed (pixels 4, 5)
+    paddw       m0, m1                  ; both words of each dword now hold the full 4-tap sum
+    paddw       m2, m3
+    psrld       m0, 16                  ; keep one copy of the sum per dword
+    psrld       m2, 16
+    packssdw    m0, m2                  ; words 0..5 = pixels 0..5; words 6, 7 are not stored
+    paddw       m0, m7                  ; add the rounding constant loaded from pw_32
+    psraw       m0, 6                   ; arithmetic shift right by 6
+    packuswb    m0, m0                  ; saturate words to unsigned bytes
+    movd        [dstq], m0              ; store pixels 0..3
+    pextrw      r4d, m0, 2              ; pixels 4, 5; r4 is free because the callers load m6 from it before expanding this macro
+    mov         [dstq + 4], r4w         ; store pixels 4, 5
+%endmacro
+
+%macro FILH4W8_sse2 1 ; %1 = byte offset within the row; needs m4 = 0, m6 = coeffs, m7 = rounding; leaves 8 filtered words in m0
+    movh        m0, [srcq - 1 + %1]     ; 8 source bytes starting one byte left of the first output pixel
+    movh        m5, [srcq + %1]         ; 8 source bytes starting at the first output pixel
+    punpckldq   m0, m5                  ; dwords = 4-byte tap windows for output pixels 0, 1, 4, 5
+    movhlps     m2, m0                  ; m2 low half = windows for pixels 4, 5
+    punpcklbw   m0, m4                  ; windows for pixels 0, 1 widened to words
+    punpcklbw   m2, m4                  ; windows for pixels 4, 5 widened to words
+    movh        m1, [srcq + 1 + %1]
+    movh        m5, [srcq + 2 + %1]
+    punpckldq   m1, m5                  ; dwords = tap windows for output pixels 2, 3, 6, 7
+    movhlps     m3, m1                  ; m3 low half = windows for pixels 6, 7
+    punpcklbw   m1, m4                  ; windows for pixels 2, 3 widened to words
+    punpcklbw   m3, m4                  ; windows for pixels 6, 7 widened to words
+    pmaddwd     m0, m6                  ; multiply by coefficients: two partial dword sums per pixel
+    pmaddwd     m1, m6
+    pmaddwd     m2, m6
+    pmaddwd     m3, m6
+    packssdw    m0, m1                  ; partial sums of pixels 0..3 as words
+    packssdw    m2, m3                  ; partial sums of pixels 4..7 as words
+    pshuflw     m1, m0, q2301           ; swap adjacent words so each partial sum meets its partner
+    pshufhw     m1, m1, q2301
+    pshuflw     m3, m2, q2301
+    pshufhw     m3, m3, q2301
+    paddw       m0, m1                  ; both words of each dword now hold the full 4-tap sum
+    paddw       m2, m3
+    psrld       m0, 16                  ; keep one copy of the sum per dword
+    psrld       m2, 16
+    packssdw    m0, m2                  ; m0 words = sums of pixels 0..7, in order
+    paddw       m0, m7                  ; add the rounding constant loaded from pw_32
+    psraw       m0, 6                   ; arithmetic shift right by 6; packing to bytes is left to the caller
+%endmacro
+
+%macro FILTER_H4_w8_sse2 0 ; filter one 8-pixel row from srcq to dstq
+    FILH4W8_sse2 0                      ; m0 = 8 filtered words for pixels 0..7
+    packuswb    m0, m0                  ; saturate words to unsigned bytes
+    movh        [dstq], m0              ; store 8 pixels
+%endmacro
+
+%macro FILTER_H4_w12_sse2 0 ; filter one 12-pixel row: 8 pixels via FILH4W8_sse2, then 4 more inline
+    FILH4W8_sse2 0                      ; m0 = 8 filtered words for pixels 0..7
+    movd        m1, [srcq - 1 + 8]      ; 4-byte tap window for pixel 8
+    movd        m3, [srcq + 8]          ; 4-byte tap window for pixel 9
+    punpckldq   m1, m3                  ; m1 low half = windows for pixels 8, 9
+    punpcklbw   m1, m4                  ; widened to words (m4 = 0)
+    movd        m2, [srcq + 1 + 8]      ; 4-byte tap window for pixel 10
+    movd        m3, [srcq + 2 + 8]      ; 4-byte tap window for pixel 11
+    punpckldq   m2, m3                  ; m2 low half = windows for pixels 10, 11
+    punpcklbw   m2, m4                  ; widened to words
+    pmaddwd     m1, m6                  ; multiply by coefficients: two partial dword sums per pixel
+    pmaddwd     m2, m6
+    packssdw    m1, m2                  ; partial sums of pixels 8..11 as words
+    pshuflw     m2, m1, q2301           ; swap adjacent words so each partial sum meets its partner
+    pshufhw     m2, m2, q2301
+    paddw       m1, m2                  ; both words of each dword now hold the full 4-tap sum
+    psrld       m1, 16                  ; keep one copy of the sum per dword
+    packssdw    m1, m1                  ; sums of pixels 8..11 as words (copied into both halves)
+    paddw       m1, m7                  ; add the rounding constant loaded from pw_32
+    psraw       m1, 6                   ; arithmetic shift right by 6
+    packuswb    m0, m1                  ; bytes 0..7 = pixels 0..7, bytes 8..11 = pixels 8..11
+    movh        [dstq], m0              ; store pixels 0..7
+    psrldq      m0, 8                   ; bring pixels 8..11 down to the low dword
+    movd        [dstq + 8], m0          ; store pixels 8..11
+%endmacro
+
+%macro FILTER_H4_w16_sse2 0 ; filter one 16-pixel row as two 8-pixel groups
+    FILH4W8_sse2 0                      ; m0 = filtered words for pixels 0..7
+    packuswb    m0, m0                  ; saturate words to unsigned bytes
+    movh        [dstq], m0              ; store pixels 0..7
+    FILH4W8_sse2 8                      ; pixels 8..15
+    packuswb    m0, m0
+    movh        [dstq + 8], m0          ; store pixels 8..15
+%endmacro
+
+%macro FILTER_H4_w24_sse2 0 ; filter one 24-pixel row as three 8-pixel groups
+    FILH4W8_sse2 0                      ; m0 = filtered words for pixels 0..7
+    packuswb    m0, m0                  ; saturate words to unsigned bytes
+    movh        [dstq], m0              ; store pixels 0..7
+    FILH4W8_sse2 8                      ; pixels 8..15
+    packuswb    m0, m0
+    movh        [dstq + 8], m0
+    FILH4W8_sse2 16                     ; pixels 16..23
+    packuswb    m0, m0
+    movh        [dstq + 16], m0
+%endmacro
+
+%macro FILTER_H4_w32_sse2 0 ; filter one 32-pixel row as four 8-pixel groups
+    FILH4W8_sse2 0                      ; m0 = filtered words for pixels 0..7
+    packuswb    m0, m0                  ; saturate words to unsigned bytes
+    movh        [dstq], m0              ; store pixels 0..7
+    FILH4W8_sse2 8                      ; pixels 8..15
+    packuswb    m0, m0
+    movh        [dstq + 8], m0
+    FILH4W8_sse2 16                     ; pixels 16..23
+    packuswb    m0, m0
+    movh        [dstq + 16], m0
+    FILH4W8_sse2 24                     ; pixels 24..31
+    packuswb    m0, m0
+    movh        [dstq + 24], m0
+%endmacro
+
+%macro FILTER_H4_w48_sse2 0 ; filter one 48-pixel row as six 8-pixel groups
+    FILH4W8_sse2 0                      ; m0 = filtered words for pixels 0..7
+    packuswb    m0, m0                  ; saturate words to unsigned bytes
+    movh        [dstq], m0              ; store pixels 0..7
+    FILH4W8_sse2 8                      ; pixels 8..15
+    packuswb    m0, m0
+    movh        [dstq + 8], m0
+    FILH4W8_sse2 16                     ; pixels 16..23
+    packuswb    m0, m0
+    movh        [dstq + 16], m0
+    FILH4W8_sse2 24                     ; pixels 24..31
+    packuswb    m0, m0
+    movh        [dstq + 24], m0
+    FILH4W8_sse2 32                     ; pixels 32..39
+    packuswb    m0, m0
+    movh        [dstq + 32], m0
+    FILH4W8_sse2 40                     ; pixels 40..47
+    packuswb    m0, m0
+    movh        [dstq + 40], m0
+%endmacro
+
+%macro FILTER_H4_w64_sse2 0 ; filter one 64-pixel row as eight 8-pixel groups
+    FILH4W8_sse2 0                      ; m0 = filtered words for pixels 0..7
+    packuswb    m0, m0                  ; saturate words to unsigned bytes
+    movh        [dstq], m0              ; store pixels 0..7
+    FILH4W8_sse2 8                      ; pixels 8..15
+    packuswb    m0, m0
+    movh        [dstq + 8], m0
+    FILH4W8_sse2 16                     ; pixels 16..23
+    packuswb    m0, m0
+    movh        [dstq + 16], m0
+    FILH4W8_sse2 24                     ; pixels 24..31
+    packuswb    m0, m0
+    movh        [dstq + 24], m0
+    FILH4W8_sse2 32                     ; pixels 32..39
+    packuswb    m0, m0
+    movh        [dstq + 32], m0
+    FILH4W8_sse2 40                     ; pixels 40..47
+    packuswb    m0, m0
+    movh        [dstq + 40], m0
+    FILH4W8_sse2 48                     ; pixels 48..55
+    packuswb    m0, m0
+    movh        [dstq + 48], m0
+    FILH4W8_sse2 56                     ; pixels 56..63
+    packuswb    m0, m0
+    movh        [dstq + 56], m0
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_horiz_pp_%1x%2(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro IPFILTER_CHROMA_sse3 2 ; %1 = block width (selects FILTER_H4_w%1_sse2), %2 = block height (rows, fully unrolled)
+INIT_XMM sse3
+cglobal interp_4tap_horiz_pp_%1x%2, 4, 6, 8, src, srcstride, dst, dststride ; x86inc cglobal: 4 args in registers, 6 GPRs, 8 XMM registers
+    mov         r4d,        r4m         ; fifth argument: coeffIdx
+    mova        m7,         [pw_32]     ; rounding constant added before the >> 6 (presumably words of 32 - pw_32 is defined outside this hunk)
+    pxor        m4,         m4          ; m4 = 0, used by the row macros to zero-extend bytes
+
+%ifdef PIC
+    lea         r5,          [tabw_ChromaCoeff] ; PIC build: take the table address first
+    movddup     m6,       [r5 + r4 * 8] ; 8-byte table entry for coeffIdx, duplicated into both halves of m6
+%else
+    movddup     m6,       [tabw_ChromaCoeff + r4 * 8] ; 8-byte table entry for coeffIdx, duplicated into both halves of m6
+%endif
+
+%assign x 1
+%rep %2
+    FILTER_H4_w%1_sse2                  ; filter one row; expanded once per row by the enclosing %rep
+%if x < %2
+    add         srcq,        srcstrideq ; advance to the next row (skipped after the last row)
+    add         dstq,        dststrideq
+%endif
+%assign x x+1
+%endrep
+
+    RET
+
+%endmacro
+
+    IPFILTER_CHROMA_sse3 6,   8         ; arguments are width, height
+    IPFILTER_CHROMA_sse3 8,   2
+    IPFILTER_CHROMA_sse3 8,   4
+    IPFILTER_CHROMA_sse3 8,   6
+    IPFILTER_CHROMA_sse3 8,   8
+    IPFILTER_CHROMA_sse3 8,  16
+    IPFILTER_CHROMA_sse3 8,  32
+    IPFILTER_CHROMA_sse3 12, 16
+
+    IPFILTER_CHROMA_sse3 6,  16
+    IPFILTER_CHROMA_sse3 8,  12
+    IPFILTER_CHROMA_sse3 8,  64
+    IPFILTER_CHROMA_sse3 12, 32
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_horiz_pp_%1x%2(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro IPFILTER_CHROMA_W_sse3 2 ; %1 = block width, %2 = block height; body matches IPFILTER_CHROMA_sse3, used here for widths 16..64
+INIT_XMM sse3
+cglobal interp_4tap_horiz_pp_%1x%2, 4, 6, 8, src, srcstride, dst, dststride ; x86inc cglobal: 4 args in registers, 6 GPRs, 8 XMM registers
+    mov         r4d,         r4m        ; fifth argument: coeffIdx
+    mova        m7,         [pw_32]     ; rounding constant added before the >> 6 (presumably words of 32 - pw_32 is defined outside this hunk)
+    pxor        m4,         m4          ; m4 = 0, used by the row macros to zero-extend bytes
+%ifdef PIC
+    lea         r5,          [tabw_ChromaCoeff] ; PIC build: take the table address first
+    movddup     m6,       [r5 + r4 * 8] ; 8-byte table entry for coeffIdx, duplicated into both halves of m6
+%else
+    movddup     m6,       [tabw_ChromaCoeff + r4 * 8] ; 8-byte table entry for coeffIdx, duplicated into both halves of m6
+%endif
+
+%assign x 1
+%rep %2
+    FILTER_H4_w%1_sse2                  ; filter one row; expanded once per row by the enclosing %rep
+%if x < %2
+    add         srcq,        srcstrideq ; advance to the next row (skipped after the last row)
+    add         dstq,        dststrideq
+%endif
+%assign x x+1
+%endrep
+
+    RET
+
+%endmacro
+
+    IPFILTER_CHROMA_W_sse3 16,  4       ; arguments are width, height
+    IPFILTER_CHROMA_W_sse3 16,  8
+    IPFILTER_CHROMA_W_sse3 16, 12
+    IPFILTER_CHROMA_W_sse3 16, 16
+    IPFILTER_CHROMA_W_sse3 16, 32
+    IPFILTER_CHROMA_W_sse3 32,  8
+    IPFILTER_CHROMA_W_sse3 32, 16
+    IPFILTER_CHROMA_W_sse3 32, 24
+    IPFILTER_CHROMA_W_sse3 24, 32
+    IPFILTER_CHROMA_W_sse3 32, 32
+
+    IPFILTER_CHROMA_W_sse3 16, 24
+    IPFILTER_CHROMA_W_sse3 16, 64
+    IPFILTER_CHROMA_W_sse3 32, 48
+    IPFILTER_CHROMA_W_sse3 24, 64
+    IPFILTER_CHROMA_W_sse3 32, 64
+
+    IPFILTER_CHROMA_W_sse3 64, 64
+    IPFILTER_CHROMA_W_sse3 64, 32
+    IPFILTER_CHROMA_W_sse3 64, 48
+    IPFILTER_CHROMA_W_sse3 48, 64
+    IPFILTER_CHROMA_W_sse3 64, 16
+
 %macro FILTER_H4_w2_2 3
     movh        %2, [srcq - 1]
     pshufb      %2, %2, Tm0
diff -r c135c117ffb0 -r defd1cf26749 source/common/x86/ipfilter8.h
--- a/source/common/x86/ipfilter8.h	Tue Apr 21 13:42:36 2015 -0500
+++ b/source/common/x86/ipfilter8.h	Tue Apr 21 18:12:40 2015 -0700
@@ -814,6 +814,38 @@
 void x265_interp_4tap_horiz_pp_4x8_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
 void x265_interp_4tap_horiz_pp_4x16_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
 void x265_interp_4tap_horiz_pp_4x32_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_6x8_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_6x16_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_8x2_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_8x4_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_8x6_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_8x8_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_8x12_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_8x16_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_8x32_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_8x64_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_12x16_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_12x32_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_16x4_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_16x8_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_16x12_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_16x16_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_16x24_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_16x32_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_16x64_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_24x32_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_24x64_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_32x8_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_32x16_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_32x24_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_32x32_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_32x48_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_32x64_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_48x64_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_64x16_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_64x32_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_64x48_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_64x64_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
 #undef LUMA_FILTERS
 #undef LUMA_SP_FILTERS
 #undef LUMA_SS_FILTERS


More information about the x265-devel mailing list