<div style="line-height:1.7;color:#000000;font-size:14px;font-family:arial"><pre><br>At 2015-04-22 09:13:47,dtyx265@gmail.com wrote:
># HG changeset patch
># User David T Yuen &lt;<a href="mailto:dtyx265@gmail.com">dtyx265@gmail.com</a>&gt;
># Date 1429665160 25200
># Node ID defd1cf26749f3395750ef9128c9a90bfa2caf78
># Parent c135c117ffb083a00d4353279ea669e8f3f7a8ee
>asm: interp_4tap_horiz_pp sse3
>
>This replaces c code for 6x8, 6x16, 8x2, 8x4, 8x6, 8x8, 8x12, 8x16, 8x32, 8x64, 12x16, 12x32, 16x8, 16x12,
>16x16, 16x24, 16x32, 16x64, 24x32, 24x64, 32x8, 32x16, 32x24, 32x32, 32x48, 32x64, 48x64, 64x16, 64x32,
>64x48, 64x64
>
>Macros are used to add the primitives to asm-primitives.cpp
>
>64-bit
>
>diff -r c135c117ffb0 -r defd1cf26749 source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp Tue Apr 21 13:42:36 2015 -0500
>+++ b/source/common/x86/asm-primitives.cpp Tue Apr 21 18:12:40 2015 -0700
>@@ -1407,18 +1407,9 @@
> }
> if (cpuMask & X265_CPU_SSE3)
> {
>- p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].filter_hpp = x265_interp_4tap_horiz_pp_2x4_sse3;
>- p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].filter_hpp = x265_interp_4tap_horiz_pp_2x8_sse3;
>- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].filter_hpp = x265_interp_4tap_horiz_pp_4x2_sse3;
>- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].filter_hpp = x265_interp_4tap_horiz_pp_4x4_sse3;
>- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].filter_hpp = x265_interp_4tap_horiz_pp_4x8_sse3;
>- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].filter_hpp = x265_interp_4tap_horiz_pp_4x16_sse3;
>- p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].filter_hpp = x265_interp_4tap_horiz_pp_2x8_sse3;
>- p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].filter_hpp = x265_interp_4tap_horiz_pp_2x16_sse3;
>- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].filter_hpp = x265_interp_4tap_horiz_pp_4x4_sse3;
>- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].filter_hpp = x265_interp_4tap_horiz_pp_4x8_sse3;
>- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].filter_hpp = x265_interp_4tap_horiz_pp_4x16_sse3;
>- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].filter_hpp = x265_interp_4tap_horiz_pp_4x32_sse3;
>+ ALL_CHROMA_420_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
>+ ALL_CHROMA_422_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
>+ ALL_CHROMA_444_PU(filter_hpp, interp_4tap_horiz_pp, sse3);
> }
> if (cpuMask & X265_CPU_SSSE3)
> {
>diff -r c135c117ffb0 -r defd1cf26749 source/common/x86/ipfilter8.asm
>--- a/source/common/x86/ipfilter8.asm Tue Apr 21 13:42:36 2015 -0500
>+++ b/source/common/x86/ipfilter8.asm Tue Apr 21 18:12:40 2015 -0700
>@@ -578,6 +578,285 @@
>
> RET
>
>+%macro FILTER_H4_w6_sse2 0
>+ pxor m4, m4
>+ movh m0, [srcq - 1]
>+ movh m5, [srcq]
>+ punpckldq m0, m5
>+ movhlps m2, m0
>+ punpcklbw m0, m4
>+ punpcklbw m2, m4
>+ movd m1, [srcq + 1]
>+ movd m5, [srcq + 2]
>+ punpckldq m1, m5
>+ punpcklbw m1, m4
>+ pmaddwd m0, m6
>+ pmaddwd m1, m6
>+ pmaddwd m2, m6
>+ packssdw m0, m1
>+ packssdw m2, m2
>+ pshuflw m1, m0, q2301
>+ pshufhw m1, m1, q2301
>+ pshuflw m3, m2, q2301
>+ paddw m0, m1
>+ paddw m2, m3
>+ psrld m0, 16
>+ psrld m2, 16
>+ packssdw m0, m2
>+ paddw m0, m7
>+ psraw m0, 6
>+ packuswb m0, m0
>+ movd [dstq], m0
>+ pextrw r4d, m0, 2
>+ mov [dstq + 4], r4w
>+%endmacro
>+
>+%macro FILH4W8_sse2 1
>+ movh m0, [srcq - 1 + %1]
>+ movh m5, [srcq + %1]
>+ punpckldq m0, m5
>+ movhlps m2, m0
>+ punpcklbw m0, m4
>+ punpcklbw m2, m4
>+ movh m1, [srcq + 1 + %1]
>+ movh m5, [srcq + 2 + %1]
>+ punpckldq m1, m5
>+ movhlps m3, m1
>+ punpcklbw m1, m4
>+ punpcklbw m3, m4
>+ pmaddwd m0, m6
>+ pmaddwd m1, m6
>+ pmaddwd m2, m6
>+ pmaddwd m3, m6
>+ packssdw m0, m1
>+ packssdw m2, m3
>+ pshuflw m1, m0, q2301
>+ pshufhw m1, m1, q2301
>+ pshuflw m3, m2, q2301
>+ pshufhw m3, m3, q2301
>+ paddw m0, m1
>+ paddw m2, m3
>+ psrld m0, 16
>+ psrld m2, 16
>+ packssdw m0, m2
>+ paddw m0, m7
>+ psraw m0, 6
>+%endmacro
>+
>+%macro FILTER_H4_w8_sse2 0
>+ FILH4W8_sse2 0
>+ packuswb m0, m0
>+ movh [dstq], m0
>+%endmacro
>+
>+%macro FILTER_H4_w12_sse2 0
>+ FILH4W8_sse2 0
>+ movd m1, [srcq - 1 + 8]
>+ movd m3, [srcq + 8]
>+ punpckldq m1, m3
>+ punpcklbw m1, m4
>+ movd m2, [srcq + 1 + 8]
>+ movd m3, [srcq + 2 + 8]
>+ punpckldq m2, m3
>+ punpcklbw m2, m4
>+ pmaddwd m1, m6
>+ pmaddwd m2, m6
>+ packssdw m1, m2
>+ pshuflw m2, m1, q2301
>+ pshufhw m2, m2, q2301
>+ paddw m1, m2
>+ psrld m1, 16
>+ packssdw m1, m1
>+ paddw m1, m7
>+ psraw m1, 6
>+ packuswb m0, m1
>+ movh [dstq], m0
>+ psrldq m0, 8
>+ movd [dstq + 8], m0
>+%endmacro
>+
>+%macro FILTER_H4_w16_sse2 0
>+ FILH4W8_sse2 0<br>
>+ packuswb m0, m0
>+ movh [dstq], m0<br>This template appears many times; could we merge it into FILH4W8?<br>
</pre></div>