[x265] [PATCH 1 of 3] asm: interp_4tap_vert_pp sse2
dave
dtyx265 at gmail.com
Thu May 7 01:51:30 CEST 2015
On 05/06/2015 01:29 PM, chen wrote:
> At 2015-05-07 03:45:35,dtyx265 at gmail.com wrote:
> ># HG changeset patch
> ># User David T Yuen <dtyx265 at gmail.com>
> ># Date 1430940440 25200
> ># Node ID 4690c9aa24caa1adb665355803d4c308a124ec96
> ># Parent 87d6724649df0157786c4210f0caebf961b31341
> >asm: interp_4tap_vert_pp sse2
> >
> >This replaces c code for 2x4, 2x8 and 2x16
> >
> >64-bit
> >
> >./test/TestBench --testbench interp | grep vpp
> >chroma_vpp[ 2x4] 1.76x 659.96 1159.98
> >chroma_vpp[ 2x8] 1.68x 1232.42 2067.47
> >chroma_vpp[ 2x8] 1.69x 1226.56 2067.48
> >chroma_vpp[ 2x16] 1.92x 2352.47 4509.99
> >
> >32-bit
> >
> >./test/TestBench --testbench interp | grep vpp
> >chroma_vpp[ 2x4] 2.00x 809.98 1617.42
> >chroma_vpp[ 2x8] 2.13x 1324.95 2817.42
> >chroma_vpp[ 2x8] 2.13x 1324.99 2817.45
> >chroma_vpp[ 2x16] 2.61x 2439.97 6358.08
> >
> >diff -r 87d6724649df -r 4690c9aa24ca source/common/x86/asm-primitives.cpp
> >--- a/source/common/x86/asm-primitives.cpp Tue May 05 17:05:22 2015 +0530
> >+++ b/source/common/x86/asm-primitives.cpp Wed May 06 12:27:20 2015 -0700
> >@@ -1356,6 +1356,10 @@
> > ALL_LUMA_PU(luma_hps, interp_8tap_horiz_ps, sse2);
> > p.pu[LUMA_4x4].luma_hps = x265_interp_8tap_horiz_ps_4x4_sse2;
> > p.pu[LUMA_8x8].luma_hvpp = x265_interp_8tap_hv_pp_8x8_sse3;
> >+ p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].filter_vpp = x265_interp_4tap_vert_pp_2x4_sse2;
> >+ p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].filter_vpp = x265_interp_4tap_vert_pp_2x8_sse2;
> >+ p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].filter_vpp = x265_interp_4tap_vert_pp_2x8_sse2;
> >+ p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].filter_vpp = x265_interp_4tap_vert_pp_2x16_sse2;
> >
> > //p.frameInitLowres = x265_frame_init_lowres_core_mmx2;
> > p.frameInitLowres = x265_frame_init_lowres_core_sse2;
> >diff -r 87d6724649df -r 4690c9aa24ca source/common/x86/ipfilter8.asm
> >--- a/source/common/x86/ipfilter8.asm Tue May 05 17:05:22 2015 +0530
> >+++ b/source/common/x86/ipfilter8.asm Wed May 06 12:27:20 2015 -0700
> >@@ -74,6 +74,15 @@
> > dw -2, 16, 54, -4
> > dw -2, 10, 58, -2
> >
> >+const tabw_ChromaCoeffV, times 2 dw 0, 0, 64, 0
> If you modify the instructions, you can share this table with tab_ChromaCoeff (rename it to pw_*)
> >+ times 2 dw -2, 10, 58, -2
> >+ times 2 dw -4, 16, 54, -2
> >+ times 2 dw -6, 28, 46, -4
> >+ times 2 dw -4, 36, 36, -4
> >+ times 2 dw -4, 46, 28, -6
> >+ times 2 dw -2, 54, 16, -4
> >+ times 2 dw -2, 58, 10, -2
> >+
> > const tab_ChromaCoeff_V, times 8 db 0, 64
> > times 8 db 0, 0
> >
> >@@ -296,6 +305,7 @@
> >
> > SECTION .text
> >
> >+cextern pb_0
> > cextern pb_128
> > cextern pw_1
> > cextern pw_32
> >@@ -1043,6 +1053,129 @@
> > IPFILTER_LUMA_sse2 16, 64, ps
> >
> > ;-----------------------------------------------------------------------------
> >+; void interp_4tap_vert_pp_2xn(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
> >+;-----------------------------------------------------------------------------
> >+%macro FILTER_V4_W2_H4_sse2 1
> >+INIT_XMM sse2
> >+%if ARCH_X86_64
> >+cglobal interp_4tap_vert_pp_2x%1, 4, 6, 9
> >+%define PB_0 m8
> >+ pxor m8, m8
> >+%else
> >+cglobal interp_4tap_vert_pp_2x%1, 4, 6, 8
> >+%define PB_0 [pb_0]
> Maybe this can be removed with some instruction magic, shown below
> >+%endif
> >+
> >+ mov r4d, r4m
> >+ sub r0, r1
> >+ add r4d, r4d
> >+
> >+%ifdef PIC
> >+ lea r5, [tabw_ChromaCoeffV]
> >+ mova m0, [r5 + r4 * 8]
> >+%else
> >+ mova m0, [tabw_ChromaCoeffV + r4 * 8]
> >+%endif
> >+
> >+ mova m1, [pw_32]
> >+ lea r5, [3 * r1]
> >+
> >+%assign x 1
> >+%rep %1/4
> >+ movd m2, [r0]
> >+ movd m3, [r0 + r1]
> >+ movd m4, [r0 + 2 * r1]
> >+ movd m5, [r0 + r5]
> >+
> >+ punpcklbw m2, m3
> >+ punpcklbw m6, m4, m5
> >+ punpcklbw m2, m6
> If you use punpcklwd, you can share the table above
If you are referring to something like the avx2 version of this
primitive, that version also uses pshufb, which is ssse3. Adapting this to
<ssse3 is going to be slower.
By the way, the sse4 version could be improved by using a byte version
of the table I added.
> >+
> >+ punpcklbw m2, PB_0
> This converts bytes to words; it can be replaced by PUNPCKLBW+PSRLW
This performs significantly slower in the testbench.
>
> >+ pmaddwd m2, m0
> >+
> >+ lea r0, [r0 + 4 * r1]
> >+ movd m6, [r0]
> >+
> >+ punpcklbw m3, m4
> >+ punpcklbw m7, m5, m6
> >+ punpcklbw m3, m7
> >+
> >+ punpcklbw m3, PB_0
> >+ pmaddwd m3, m0
> >+
> >+ packssdw m2, m3
> >+ pshuflw m3, m2, q2301
> >+ pshufhw m3, m3, q2301
> >+ paddw m2, m3
> >+ psrld m2, 16
> >+
> >+ movd m7, [r0 + r1]
> >+
> >+ punpcklbw m4, m5
> >+ punpcklbw m3, m6, m7
> >+ punpcklbw m4, m3
> >+
> >+ punpcklbw m4, PB_0
> >+ pmaddwd m4, m0
> >+
> >+ movd m3, [r0 + 2 * r1]
> >+
> >+ punpcklbw m5, m6
> >+ punpcklbw m7, m3
> >+ punpcklbw m5, m7
> >+
> >+ punpcklbw m5, PB_0
> >+ pmaddwd m5, m0
> >+
> >+ packssdw m4, m5
> >+ pshuflw m5, m4, q2301
> >+ pshufhw m5, m5, q2301
> >+ paddw m4, m5
> >+ psrld m4, 16
> >+
> >+ packssdw m2, m4
> >+ paddw m2, m1
> >+ psraw m2, 6
> >+ packuswb m2, m2
> >+
> >+%if ARCH_X86_64
> >+ movq r4, m2
> >+ mov [r2], r4w
> >+ shr r4, 16
> >+ mov [r2 + r3], r4w
> >+ lea r2, [r2 + 2 * r3]
> >+ shr r4, 16
> >+ mov [r2], r4w
> >+ shr r4, 16
> >+ mov [r2 + r3], r4w
> >+%else
> >+ movd r4, m2
> >+ mov [r2], r4w
> >+ shr r4, 16
> >+ mov [r2 + r3], r4w
> >+ lea r2, [r2 + 2 * r3]
> >+ psrldq m2, 4
> >+ movd r4, m2
> >+ mov [r2], r4w
> >+ shr r4, 16
> >+ mov [r2 + r3], r4w
> >+%endif
> >+
> >+%if x < %1/4
> >+ lea r2, [r2 + 2 * r3]
> >+%endif
> >+%assign x x+1
> >+%endrep
> >+ RET
> >+
> >+%endmacro
> >+
> >+ FILTER_V4_W2_H4_sse2 4
> >+ FILTER_V4_W2_H4_sse2 8
> >+ FILTER_V4_W2_H4_sse2 16
> >+
> >+;-----------------------------------------------------------------------------
> > ; void interp_4tap_horiz_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
> > ;-----------------------------------------------------------------------------
> > INIT_XMM sse4
> >diff -r 87d6724649df -r 4690c9aa24ca source/common/x86/ipfilter8.h
> >--- a/source/common/x86/ipfilter8.h Tue May 05 17:05:22 2015 +0530
> >+++ b/source/common/x86/ipfilter8.h Wed May 06 12:27:20 2015 -0700
> >@@ -905,6 +905,9 @@
> > void x265_interp_8tap_horiz_ps_64x48_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> > void x265_interp_8tap_horiz_ps_64x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> > void x265_interp_8tap_hv_pp_8x8_sse3(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY);
> >+void x265_interp_4tap_vert_pp_2x4_sse2(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> >+void x265_interp_4tap_vert_pp_2x8_sse2(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> >+void x265_interp_4tap_vert_pp_2x16_sse2(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> > #undef LUMA_FILTERS
> > #undef LUMA_SP_FILTERS
> > #undef LUMA_SS_FILTERS
> >_______________________________________________
> >x265-devel mailing list
> >x265-devel at videolan.org
> >https://mailman.videolan.org/listinfo/x265-devel
>
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150506/3be75a36/attachment.html>
More information about the x265-devel
mailing list