[x265] [PATCH] asm: interp_4tap_horiz_pp sse3

dtyx265 at gmail.com dtyx265 at gmail.com
Wed Apr 22 05:38:12 CEST 2015


# HG changeset patch
# User David T Yuen <dtyx265 at gmail.com>
# Date 1429673867 25200
# Node ID 829814365241f61737f3f39a400f55cc49702679
# Parent  c135c117ffb083a00d4353279ea669e8f3f7a8ee
asm: interp_4tap_horiz_pp sse3

This replaces C code for 6x8, 6x16, 8x2, 8x4, 8x6, 8x8, 8x12, 8x16, 8x32, 8x64, 12x16, 12x32, 16x4, 16x8,
16x12, 16x16, 16x24, 16x32, 16x64, 24x32, 24x64, 32x8, 32x16, 32x24, 32x32, 32x48, 32x64, 48x64, 64x16,
64x32, 64x48, 64x64

Macros are used to add the primitives to asm-primitives.cpp

64-bit

./test/TestBench --testbench interp | grep hpp
chroma_hpp[  8x8]	3.02x 	 3087.49  	 9315.21
chroma_hpp[16x16]	3.09x 	 11813.11 	 36504.26
chroma_hpp[32x32]	3.45x 	 46862.27 	 161615.81
chroma_hpp[  8x4]	2.94x 	 1567.50  	 4614.73
chroma_hpp[ 16x8]	3.10x 	 5930.00  	 18377.70
chroma_hpp[ 8x16]	3.02x 	 6130.00  	 18520.00
chroma_hpp[32x16]	3.46x 	 23330.07 	 80829.76
chroma_hpp[16x32]	3.16x 	 23572.66 	 74452.23
chroma_hpp[  8x6]	2.93x 	 2339.99  	 6863.20
chroma_hpp[  6x8]	2.52x 	 2812.50  	 7075.69
chroma_hpp[  8x2]	2.25x 	 812.50   	 1830.00
chroma_hpp[16x12]	3.10x 	 8875.07  	 27545.60
chroma_hpp[12x16]	2.80x 	 9810.20  	 27476.36
chroma_hpp[ 16x4]	3.05x 	 2995.20  	 9144.59
chroma_hpp[32x24]	3.45x 	 34970.45 	 120594.13
chroma_hpp[24x32]	3.49x 	 35116.79 	 122662.94
chroma_hpp[ 32x8]	3.45x 	 11699.82 	 40402.34
chroma_hpp[ 8x32]	3.00x 	 12210.00 	 36603.46
chroma_hpp[ 8x16]	3.02x 	 6130.00  	 18520.00
chroma_hpp[16x32]	3.09x 	 23573.10 	 72827.95
chroma_hpp[32x64]	3.58x 	 93938.63 	 335978.50
chroma_hpp[  8x8]	3.02x 	 3087.49  	 9314.74
chroma_hpp[16x16]	3.09x 	 11815.00 	 36545.97
chroma_hpp[ 8x32]	3.02x 	 12212.27 	 36870.14
chroma_hpp[32x32]	3.45x 	 46748.56 	 161259.67
chroma_hpp[16x64]	3.18x 	 47185.50 	 150017.53
chroma_hpp[ 8x12]	3.04x 	 4607.50  	 14000.63
chroma_hpp[ 6x16]	2.49x 	 5570.10  	 13870.01
chroma_hpp[  8x4]	2.94x 	 1570.00  	 4613.64
chroma_hpp[16x24]	3.08x 	 17690.69 	 54547.18
chroma_hpp[12x32]	2.80x 	 19618.33 	 54833.57
chroma_hpp[ 16x8]	3.10x 	 5932.57  	 18377.34
chroma_hpp[32x48]	3.45x 	 70041.92 	 241370.78
chroma_hpp[24x64]	3.53x 	 70596.84 	 249020.33
chroma_hpp[32x16]	3.44x 	 23374.66 	 80340.53
chroma_hpp[ 8x64]	3.00x 	 24422.17 	 73313.97
chroma_hpp[  8x8]	3.01x 	 3090.00  	 9314.26
chroma_hpp[16x16]	3.11x 	 11810.00 	 36736.14
chroma_hpp[32x32]	3.47x 	 46771.40 	 162154.16
chroma_hpp[64x64]	3.25x 	 195843.97 	 636910.44
chroma_hpp[  8x4]	2.94x 	 1570.00  	 4613.35
chroma_hpp[ 16x8]	3.10x 	 5933.42  	 18381.31
chroma_hpp[ 8x16]	3.02x 	 6131.43  	 18520.17
chroma_hpp[32x16]	3.42x 	 23450.76 	 80160.37
chroma_hpp[16x32]	3.09x 	 23619.58 	 73027.41
chroma_hpp[64x32]	3.42x 	 92894.85 	 318107.38
chroma_hpp[32x64]	3.48x 	 93646.98 	 325950.78
chroma_hpp[16x12]	3.10x 	 8874.99  	 27503.11
chroma_hpp[12x16]	2.83x 	 9809.99  	 27769.48
chroma_hpp[ 16x4]	3.05x 	 2994.99  	 9138.53
chroma_hpp[32x24]	3.42x 	 35123.29 	 120115.27
chroma_hpp[24x32]	3.53x 	 35143.41 	 124032.27
chroma_hpp[ 32x8]	3.46x 	 11692.58 	 40400.25
chroma_hpp[ 8x32]	3.02x 	 12212.50 	 36843.57
chroma_hpp[64x48]	3.36x 	 140979.36 	 473912.28
chroma_hpp[48x64]	3.43x 	 140712.88 	 482047.69
chroma_hpp[64x16]	3.39x 	 46530.16 	 157859.31
chroma_hpp[16x64]	3.08x 	 47197.85 	 145477.02

32-bit

./test/TestBench --testbench interp | grep hpp
chroma_hpp[  8x8]	2.96x 	 3164.98  	 9354.15
chroma_hpp[16x16]	3.07x 	 11885.01 	 36438.13
chroma_hpp[32x32]	3.48x 	 46818.91 	 162929.45
chroma_hpp[  8x4]	2.86x 	 1645.00  	 4703.57
chroma_hpp[ 16x8]	3.06x 	 6005.10  	 18378.64
chroma_hpp[ 8x16]	2.97x 	 6205.00  	 18429.90
chroma_hpp[32x16]	3.46x 	 23463.52 	 81110.52
chroma_hpp[16x32]	3.10x 	 23700.07 	 73429.12
chroma_hpp[  8x6]	2.89x 	 2404.99  	 6942.73
chroma_hpp[  6x8]	2.46x 	 2905.00  	 7155.45
chroma_hpp[  8x2]	2.69x 	 885.00   	 2379.96
chroma_hpp[16x12]	3.07x 	 8945.04  	 27458.99
chroma_hpp[12x16]	2.81x 	 9862.55  	 27753.80
chroma_hpp[ 16x4]	3.01x 	 3065.00  	 9231.22
chroma_hpp[32x24]	3.45x 	 35140.03 	 121204.09
chroma_hpp[24x32]	3.51x 	 35262.80 	 123779.88
chroma_hpp[ 32x8]	3.47x 	 11765.00 	 40847.72
chroma_hpp[ 8x32]	2.98x 	 12285.00 	 36623.77
chroma_hpp[ 8x16]	2.97x 	 6205.00  	 18429.95
chroma_hpp[16x32]	3.08x 	 23691.43 	 72971.20
chroma_hpp[32x64]	3.47x 	 93595.39 	 324758.03
chroma_hpp[  8x8]	2.95x 	 3165.39  	 9353.01
chroma_hpp[16x16]	3.07x 	 11885.00 	 36438.18
chroma_hpp[ 8x32]	2.98x 	 12285.21 	 36614.84
chroma_hpp[32x32]	3.48x 	 46794.59 	 162647.84
chroma_hpp[16x64]	3.08x 	 47299.79 	 145605.62
chroma_hpp[ 8x12]	2.98x 	 4685.06  	 13949.95
chroma_hpp[ 6x16]	2.46x 	 5672.50  	 13972.76
chroma_hpp[  8x4]	2.86x 	 1645.00  	 4702.53
chroma_hpp[16x24]	3.06x 	 17765.06 	 54398.70
chroma_hpp[12x32]	2.79x 	 19676.93 	 54843.11
chroma_hpp[ 16x8]	3.06x 	 6005.12  	 18377.65
chroma_hpp[32x48]	3.46x 	 70176.74 	 243033.73
chroma_hpp[24x64]	3.51x 	 70367.40 	 246988.72
chroma_hpp[32x16]	3.47x 	 23405.43 	 81235.64
chroma_hpp[ 8x64]	2.97x 	 24490.71 	 72757.92
chroma_hpp[  8x8]	2.95x 	 3165.00  	 9352.45
chroma_hpp[16x16]	3.07x 	 11885.00 	 36437.35
chroma_hpp[32x32]	3.48x 	 46781.39 	 162731.84
chroma_hpp[64x64]	3.28x 	 193972.66 	 635870.62
chroma_hpp[  8x4]	2.86x 	 1645.00  	 4702.79
chroma_hpp[ 16x8]	3.06x 	 6005.00  	 18377.74
chroma_hpp[ 8x16]	2.97x 	 6205.04  	 18430.28
chroma_hpp[32x16]	3.46x 	 23452.05 	 81121.86
chroma_hpp[16x32]	3.07x 	 23695.18 	 72740.23
chroma_hpp[64x32]	3.42x 	 92974.16 	 317723.12
chroma_hpp[32x64]	3.47x 	 93467.95 	 324431.16
chroma_hpp[16x12]	3.07x 	 8945.09  	 27457.70
chroma_hpp[12x16]	2.79x 	 9862.54  	 27477.89
chroma_hpp[ 16x4]	3.01x 	 3065.02  	 9231.55
chroma_hpp[32x24]	3.45x 	 35161.96 	 121188.20
chroma_hpp[24x32]	3.51x 	 35275.57 	 123776.31
chroma_hpp[ 32x8]	3.47x 	 11765.00 	 40847.59
chroma_hpp[ 8x32]	2.98x 	 12285.06 	 36637.80
chroma_hpp[64x48]	3.41x 	 139693.42 	 476274.88
chroma_hpp[48x64]	3.44x 	 139707.61 	 480515.22
chroma_hpp[64x16]	3.41x 	 46575.90 	 158769.59
chroma_hpp[16x64]	3.08x 	 47262.82 	 145408.81

diff -r c135c117ffb0 -r 829814365241 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Apr 21 13:42:36 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp	Tue Apr 21 20:37:47 2015 -0700
@@ -1407,18 +1407,9 @@
     }
     if (cpuMask & X265_CPU_SSE3)
     {
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].filter_hpp = x265_interp_4tap_horiz_pp_2x4_sse3;
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].filter_hpp = x265_interp_4tap_horiz_pp_2x8_sse3;
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].filter_hpp = x265_interp_4tap_horiz_pp_4x2_sse3;
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].filter_hpp = x265_interp_4tap_horiz_pp_4x4_sse3;
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].filter_hpp = x265_interp_4tap_horiz_pp_4x8_sse3;
-        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].filter_hpp = x265_interp_4tap_horiz_pp_4x16_sse3;
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].filter_hpp = x265_interp_4tap_horiz_pp_2x8_sse3;
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].filter_hpp = x265_interp_4tap_horiz_pp_2x16_sse3;
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].filter_hpp = x265_interp_4tap_horiz_pp_4x4_sse3;
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].filter_hpp = x265_interp_4tap_horiz_pp_4x8_sse3;
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].filter_hpp = x265_interp_4tap_horiz_pp_4x16_sse3;
-        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].filter_hpp = x265_interp_4tap_horiz_pp_4x32_sse3;
+        ALL_CHROMA_420_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
+        ALL_CHROMA_422_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
+        ALL_CHROMA_444_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
     }
     if (cpuMask & X265_CPU_SSSE3)
     {
diff -r c135c117ffb0 -r 829814365241 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Tue Apr 21 13:42:36 2015 -0500
+++ b/source/common/x86/ipfilter8.asm	Tue Apr 21 20:37:47 2015 -0700
@@ -594,6 +594,237 @@
     mov         [dstq + dststrideq], r4w
 %endmacro
 
+%macro FILTER_H4_w6_sse2 0 ; filter one row, 6 output pixels; expects m6 and m7 preloaded by the enclosing function
+    pxor        m4, m4                 ; m4 = 0, used below to widen bytes to words
+    movh        m0, [srcq - 1]         ; 8 source bytes starting at src[-1]
+    movh        m5, [srcq]             ; 8 source bytes starting at src[0]
+    punpckldq   m0, m5                 ; m0 = src[-1..2] | src[0..3] | src[3..6] | src[4..7]
+    movhlps     m2, m0                 ; low half of m2 = src[3..6] | src[4..7] (inputs of outputs 4 and 5)
+    punpcklbw   m0, m4                 ; inputs of outputs 0 and 1 as words
+    punpcklbw   m2, m4                 ; inputs of outputs 4 and 5 as words
+    movd        m1, [srcq + 1]         ; 4 source bytes starting at src[1]
+    movd        m5, [srcq + 2]         ; 4 source bytes starting at src[2]
+    punpckldq   m1, m5                 ; low half of m1 = src[1..4] | src[2..5] (inputs of outputs 2 and 3)
+    punpcklbw   m1, m4                 ; inputs of outputs 2 and 3 as words
+    pmaddwd     m0, m6                 ; multiply words by m6, add adjacent pairs: two partial sums per output
+    pmaddwd     m1, m6                 ; same for outputs 2 and 3
+    pmaddwd     m2, m6                 ; same for outputs 4 and 5
+    packssdw    m0, m1                 ; partial sums of outputs 0..3 as words
+    packssdw    m2, m2                 ; partial sums of outputs 4..5 as words (repeated in the high half)
+    pshuflw     m1, m0, q2301          ; swap the two partial sums of each output (low four words)
+    pshufhw     m1, m1, q2301          ; same swap for the high four words
+    pshuflw     m3, m2, q2301          ; same swap for outputs 4 and 5
+    paddw       m0, m1                 ; every word = sum of both partial sums of its output
+    paddw       m2, m3                 ; low four words = sums of outputs 4 and 5
+    psrld       m0, 16                 ; keep one copy of the sum per dword
+    psrld       m2, 16                 ; same for outputs 4 and 5
+    packssdw    m0, m2                 ; words 0..5 = sums of outputs 0..5; words 6..7 are not stored
+    paddw       m0, m7                 ; add m7 (the caller loads it from pw_32)
+    psraw       m0, 6                  ; arithmetic shift right by 6
+    packuswb    m0, m0                 ; saturate words to unsigned bytes
+    movd        [dstq], m0             ; store output bytes 0..3
+    pextrw      r4d, m0, 2             ; word 2 = output bytes 4 and 5; overwrites r4
+    mov         [dstq + 4], r4w        ; store output bytes 4..5
+%endmacro
+
+%macro FILH4W8_sse2 1 ; filter 8 output pixels at byte offset arg 1; expects m4 = 0 and m6, m7 preloaded by the caller
+    movh        m0, [srcq - 1 + %1]    ; 8 source bytes starting one byte before the group
+    movh        m5, [srcq + %1]        ; 8 source bytes starting at the group
+    punpckldq   m0, m5                 ; m0 = s[-1..2] | s[0..3] | s[3..6] | s[4..7] (s = src + offset)
+    movhlps     m2, m0                 ; low half of m2 = s[3..6] | s[4..7] (inputs of outputs 4 and 5)
+    punpcklbw   m0, m4                 ; inputs of outputs 0 and 1 as words
+    punpcklbw   m2, m4                 ; inputs of outputs 4 and 5 as words
+    movh        m1, [srcq + 1 + %1]    ; 8 source bytes starting at s[1]
+    movh        m5, [srcq + 2 + %1]    ; 8 source bytes starting at s[2]
+    punpckldq   m1, m5                 ; m1 = s[1..4] | s[2..5] | s[5..8] | s[6..9]
+    movhlps     m3, m1                 ; low half of m3 = s[5..8] | s[6..9] (inputs of outputs 6 and 7)
+    punpcklbw   m1, m4                 ; inputs of outputs 2 and 3 as words
+    punpcklbw   m3, m4                 ; inputs of outputs 6 and 7 as words
+    pmaddwd     m0, m6                 ; multiply words by m6, add adjacent pairs: two partial sums per output
+    pmaddwd     m1, m6                 ; same for outputs 2 and 3
+    pmaddwd     m2, m6                 ; same for outputs 4 and 5
+    pmaddwd     m3, m6                 ; same for outputs 6 and 7
+    packssdw    m0, m1                 ; partial sums of outputs 0..3 as words
+    packssdw    m2, m3                 ; partial sums of outputs 4..7 as words
+    pshuflw     m1, m0, q2301          ; swap the two partial sums of each output (low four words)
+    pshufhw     m1, m1, q2301          ; same swap for the high four words
+    pshuflw     m3, m2, q2301          ; same swap for outputs 4..7, low words
+    pshufhw     m3, m3, q2301          ; same swap for outputs 4..7, high words
+    paddw       m0, m1                 ; every word = sum of both partial sums of its output (0..3)
+    paddw       m2, m3                 ; same for outputs 4..7
+    psrld       m0, 16                 ; keep one copy of the sum per dword
+    psrld       m2, 16                 ; same for outputs 4..7
+    packssdw    m0, m2                 ; words 0..7 = sums of outputs 0..7
+    paddw       m0, m7                 ; add m7 (the caller loads it from pw_32)
+    psraw       m0, 6                  ; arithmetic shift right by 6
+    packuswb    m0, m0                 ; saturate words to unsigned bytes
+    movh        [dstq + %1], m0        ; store the 8 output bytes of this group
+%endmacro
+
+%macro FILTER_H4_w8_sse2 0 ; filter one row, 8 output pixels
+    FILH4W8_sse2 0                     ; outputs 0..7
+%endmacro
+
+%macro FILTER_H4_w12_sse2 0 ; filter one row, 12 output pixels; expects m4 = 0 and m6, m7 preloaded by the caller
+    FILH4W8_sse2 0                     ; outputs 0..7
+    movd        m1, [srcq - 1 + 8]     ; 4 source bytes src[7..10]
+    movd        m3, [srcq + 8]         ; 4 source bytes src[8..11]
+    punpckldq   m1, m3                 ; low half of m1 = src[7..10] | src[8..11] (inputs of outputs 8 and 9)
+    punpcklbw   m1, m4                 ; widen to words
+    movd        m2, [srcq + 1 + 8]     ; 4 source bytes src[9..12]
+    movd        m3, [srcq + 2 + 8]     ; 4 source bytes src[10..13]
+    punpckldq   m2, m3                 ; low half of m2 = src[9..12] | src[10..13] (inputs of outputs 10 and 11)
+    punpcklbw   m2, m4                 ; widen to words
+    pmaddwd     m1, m6                 ; multiply words by m6, add adjacent pairs: two partial sums per output
+    pmaddwd     m2, m6                 ; same for outputs 10 and 11
+    packssdw    m1, m2                 ; partial sums of outputs 8..11 as words
+    pshuflw     m2, m1, q2301          ; swap the two partial sums of each output (low four words)
+    pshufhw     m2, m2, q2301          ; same swap for the high four words
+    paddw       m1, m2                 ; every word = sum of both partial sums of its output
+    psrld       m1, 16                 ; keep one copy of the sum per dword
+    packssdw    m1, m1                 ; words 0..3 = sums of outputs 8..11
+    paddw       m1, m7                 ; add m7 (the caller loads it from pw_32)
+    psraw       m1, 6                  ; arithmetic shift right by 6
+    packuswb    m1, m1                 ; saturate words to unsigned bytes
+    movd        [dstq + 8], m1         ; store output bytes 8..11
+%endmacro
+
+%macro FILTER_H4_w16_sse2 0 ; filter one row, 16 output pixels as two 8-pixel groups
+    FILH4W8_sse2 0                     ; outputs 0..7
+    FILH4W8_sse2 8                     ; outputs 8..15
+%endmacro
+
+%macro FILTER_H4_w24_sse2 0 ; filter one row, 24 output pixels as three 8-pixel groups
+    FILH4W8_sse2 0                     ; outputs 0..7
+    FILH4W8_sse2 8                     ; outputs 8..15
+    FILH4W8_sse2 16                    ; outputs 16..23
+%endmacro
+
+%macro FILTER_H4_w32_sse2 0 ; filter one row, 32 output pixels as four 8-pixel groups
+    FILH4W8_sse2 0                     ; outputs 0..7
+    FILH4W8_sse2 8                     ; outputs 8..15
+    FILH4W8_sse2 16                    ; outputs 16..23
+    FILH4W8_sse2 24                    ; outputs 24..31
+%endmacro
+
+%macro FILTER_H4_w48_sse2 0 ; filter one row, 48 output pixels as six 8-pixel groups
+    FILH4W8_sse2 0                     ; outputs 0..7
+    FILH4W8_sse2 8                     ; outputs 8..15
+    FILH4W8_sse2 16                    ; outputs 16..23
+    FILH4W8_sse2 24                    ; outputs 24..31
+    FILH4W8_sse2 32                    ; outputs 32..39
+    FILH4W8_sse2 40                    ; outputs 40..47
+%endmacro
+
+%macro FILTER_H4_w64_sse2 0 ; filter one row, 64 output pixels as eight 8-pixel groups
+    FILH4W8_sse2 0                     ; outputs 0..7
+    FILH4W8_sse2 8                     ; outputs 8..15
+    FILH4W8_sse2 16                    ; outputs 16..23
+    FILH4W8_sse2 24                    ; outputs 24..31
+    FILH4W8_sse2 32                    ; outputs 32..39
+    FILH4W8_sse2 40                    ; outputs 40..47
+    FILH4W8_sse2 48                    ; outputs 48..55
+    FILH4W8_sse2 56                    ; outputs 56..63
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro IPFILTER_CHROMA_sse3 2 ; arg 1 = block width (selects the FILTER_H4_w..._sse2 row macro), arg 2 = number of rows
+INIT_XMM sse3
+cglobal interp_4tap_horiz_pp_%1x%2, 4, 6, 8, src, srcstride, dst, dststride
+    mov         r4d,        r4m        ; fifth argument (coeffIdx in the prototype comment above)
+    mova        m7,         [pw_32]    ; constant added to each sum before the shift by 6 in the row macros
+    pxor        m4,         m4         ; m4 = 0, the row macros use it to widen bytes to words
+
+%ifdef PIC
+    lea         r5,          [tabw_ChromaCoeff]
+    movddup     m6,       [r5 + r4 * 8] ; 8 bytes at table + coeffIdx * 8, copied into both halves of m6
+%else
+    movddup     m6,       [tabw_ChromaCoeff + r4 * 8] ; same load with an absolute address; movddup is an SSE3 instruction
+%endif
+
+%assign x 1
+%rep %2
+    FILTER_H4_w%1_sse2                 ; one fully unrolled copy of the row filter per row
+%if x < %2
+    add         srcq,        srcstrideq ; advance to the next source row (skipped after the last row)
+    add         dstq,        dststrideq ; advance to the next destination row
+%endif
+%assign x x+1
+%endrep
+
+    RET
+
+%endmacro
+
+    IPFILTER_CHROMA_sse3 6,   8
+    IPFILTER_CHROMA_sse3 8,   2
+    IPFILTER_CHROMA_sse3 8,   4
+    IPFILTER_CHROMA_sse3 8,   6
+    IPFILTER_CHROMA_sse3 8,   8
+    IPFILTER_CHROMA_sse3 8,  16
+    IPFILTER_CHROMA_sse3 8,  32
+    IPFILTER_CHROMA_sse3 12, 16
+
+    IPFILTER_CHROMA_sse3 6,  16
+    IPFILTER_CHROMA_sse3 8,  12
+    IPFILTER_CHROMA_sse3 8,  64
+    IPFILTER_CHROMA_sse3 12, 32
+
+;-----------------------------------------------------------------------------
+; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro IPFILTER_CHROMA_W_sse3 2 ; arg 1 = block width, arg 2 = number of rows; NOTE(review): body matches IPFILTER_CHROMA_sse3, the two could share one macro
+INIT_XMM sse3
+cglobal interp_4tap_horiz_pp_%1x%2, 4, 6, 8, src, srcstride, dst, dststride
+    mov         r4d,         r4m       ; fifth argument (coeffIdx in the prototype comment above)
+    mova        m7,         [pw_32]    ; constant added to each sum before the shift by 6 in the row macros
+    pxor        m4,         m4         ; m4 = 0, the row macros use it to widen bytes to words
+%ifdef PIC
+    lea         r5,          [tabw_ChromaCoeff]
+    movddup     m6,       [r5 + r4 * 8] ; 8 bytes at table + coeffIdx * 8, copied into both halves of m6
+%else
+    movddup     m6,       [tabw_ChromaCoeff + r4 * 8] ; same load with an absolute address; movddup is an SSE3 instruction
+%endif
+
+%assign x 1
+%rep %2
+    FILTER_H4_w%1_sse2                 ; one fully unrolled copy of the row filter per row
+%if x < %2
+    add         srcq,        srcstrideq ; advance to the next source row (skipped after the last row)
+    add         dstq,        dststrideq ; advance to the next destination row
+%endif
+%assign x x+1
+%endrep
+
+    RET
+
+%endmacro
+
+    IPFILTER_CHROMA_W_sse3 16,  4
+    IPFILTER_CHROMA_W_sse3 16,  8
+    IPFILTER_CHROMA_W_sse3 16, 12
+    IPFILTER_CHROMA_W_sse3 16, 16
+    IPFILTER_CHROMA_W_sse3 16, 32
+    IPFILTER_CHROMA_W_sse3 32,  8
+    IPFILTER_CHROMA_W_sse3 32, 16
+    IPFILTER_CHROMA_W_sse3 32, 24
+    IPFILTER_CHROMA_W_sse3 24, 32
+    IPFILTER_CHROMA_W_sse3 32, 32
+
+    IPFILTER_CHROMA_W_sse3 16, 24
+    IPFILTER_CHROMA_W_sse3 16, 64
+    IPFILTER_CHROMA_W_sse3 32, 48
+    IPFILTER_CHROMA_W_sse3 24, 64
+    IPFILTER_CHROMA_W_sse3 32, 64
+
+    IPFILTER_CHROMA_W_sse3 64, 64
+    IPFILTER_CHROMA_W_sse3 64, 32
+    IPFILTER_CHROMA_W_sse3 64, 48
+    IPFILTER_CHROMA_W_sse3 48, 64
+    IPFILTER_CHROMA_W_sse3 64, 16
+
 ;-----------------------------------------------------------------------------
 ; void interp_4tap_horiz_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-----------------------------------------------------------------------------
diff -r c135c117ffb0 -r 829814365241 source/common/x86/ipfilter8.h
--- a/source/common/x86/ipfilter8.h	Tue Apr 21 13:42:36 2015 -0500
+++ b/source/common/x86/ipfilter8.h	Tue Apr 21 20:37:47 2015 -0700
@@ -814,6 +814,38 @@
 void x265_interp_4tap_horiz_pp_4x8_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
 void x265_interp_4tap_horiz_pp_4x16_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
 void x265_interp_4tap_horiz_pp_4x32_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_6x8_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_6x16_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_8x2_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_8x4_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_8x6_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_8x8_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_8x12_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_8x16_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_8x32_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_8x64_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_12x16_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_12x32_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_16x4_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_16x8_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_16x12_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_16x16_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_16x24_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_16x32_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_16x64_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_24x32_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_24x64_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_32x8_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_32x16_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_32x24_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_32x32_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_32x48_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_32x64_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_48x64_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_64x16_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_64x32_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_64x48_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_horiz_pp_64x64_sse3(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
 #undef LUMA_FILTERS
 #undef LUMA_SP_FILTERS
 #undef LUMA_SS_FILTERS


More information about the x265-devel mailing list