[x265] [PATCH 1 of 3] asm: interp_4tap_vert_pp sse2

dave dtyx265 at gmail.com
Thu May 7 01:51:30 CEST 2015


On 05/06/2015 01:29 PM, chen wrote:
> At 2015-05-07 03:45:35,dtyx265 at gmail.com wrote:
> ># HG changeset patch
> ># User David T Yuen <dtyx265 at gmail.com>
> ># Date 1430940440 25200
> ># Node ID 4690c9aa24caa1adb665355803d4c308a124ec96
> ># Parent  87d6724649df0157786c4210f0caebf961b31341
> >asm: interp_4tap_vert_pp sse2
> >
> >This replaces c code for 2x4, 2x8 and 2x16
> >
> >64-bit
> >
> >./test/TestBench --testbench interp | grep vpp
> >chroma_vpp[  2x4]	1.76x 	 659.96   	 1159.98
> >chroma_vpp[  2x8]	1.68x 	 1232.42  	 2067.47
> >chroma_vpp[  2x8]	1.69x 	 1226.56  	 2067.48
> >chroma_vpp[ 2x16]	1.92x 	 2352.47  	 4509.99
> >
> >32-bit
> >
> >./test/TestBench --testbench interp | grep vpp
> >chroma_vpp[  2x4]	2.00x 	 809.98   	 1617.42
> >chroma_vpp[  2x8]	2.13x 	 1324.95  	 2817.42
> >chroma_vpp[  2x8]	2.13x 	 1324.99  	 2817.45
> >chroma_vpp[ 2x16]	2.61x 	 2439.97  	 6358.08
> >
> >diff -r 87d6724649df -r 4690c9aa24ca source/common/x86/asm-primitives.cpp
> >--- a/source/common/x86/asm-primitives.cpp	Tue May 05 17:05:22 2015 +0530
> >+++ b/source/common/x86/asm-primitives.cpp	Wed May 06 12:27:20 2015 -0700
> >@@ -1356,6 +1356,10 @@
> >         ALL_LUMA_PU(luma_hps, interp_8tap_horiz_ps, sse2);
> >         p.pu[LUMA_4x4].luma_hps = x265_interp_8tap_horiz_ps_4x4_sse2;
> >         p.pu[LUMA_8x8].luma_hvpp = x265_interp_8tap_hv_pp_8x8_sse3;
> >+        p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].filter_vpp = x265_interp_4tap_vert_pp_2x4_sse2;
> >+        p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].filter_vpp = x265_interp_4tap_vert_pp_2x8_sse2;
> >+        p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].filter_vpp = x265_interp_4tap_vert_pp_2x8_sse2;
> >+        p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].filter_vpp = x265_interp_4tap_vert_pp_2x16_sse2;
> >
> >         //p.frameInitLowres = x265_frame_init_lowres_core_mmx2;
> >         p.frameInitLowres = x265_frame_init_lowres_core_sse2;
> >diff -r 87d6724649df -r 4690c9aa24ca source/common/x86/ipfilter8.asm
> >--- a/source/common/x86/ipfilter8.asm	Tue May 05 17:05:22 2015 +0530
> >+++ b/source/common/x86/ipfilter8.asm	Wed May 06 12:27:20 2015 -0700
> >@@ -74,6 +74,15 @@
> >                         dw -2, 16, 54, -4
> >                         dw -2, 10, 58, -2
> >
> >+const tabw_ChromaCoeffV, times 2 dw  0,  0, 64,  0
> If you modify the instructions, you can share this table with tab_ChromaCoeff (renamed to pw_*).
> >+                         times 2 dw -2, 10, 58, -2
> >+                         times 2 dw -4, 16, 54, -2
> >+                         times 2 dw -6, 28, 46, -4
> >+                         times 2 dw -4, 36, 36, -4
> >+                         times 2 dw -4, 46, 28, -6
> >+                         times 2 dw -2, 54, 16, -4
> >+                         times 2 dw -2, 58, 10, -2
> >+
> > const tab_ChromaCoeff_V, times 8 db 0, 64
> >                          times 8 db 0,  0
> >
> >@@ -296,6 +305,7 @@
> >
> > SECTION .text
> >
> >+cextern pb_0
> > cextern pb_128
> > cextern pw_1
> > cextern pw_32
> >@@ -1043,6 +1053,129 @@
> >     IPFILTER_LUMA_sse2 16, 64, ps
> >
> > ;-----------------------------------------------------------------------------
> >+; void interp_4tap_vert_pp_2xn(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
> >+;-----------------------------------------------------------------------------
> >+%macro FILTER_V4_W2_H4_sse2 1
> >+INIT_XMM sse2
> >+%if ARCH_X86_64
> >+cglobal interp_4tap_vert_pp_2x%1, 4, 6, 9
> >+%define PB_0 m8
> >+    pxor        m8,        m8
> >+%else
> >+cglobal interp_4tap_vert_pp_2x%1, 4, 6, 8
> >+%define PB_0 [pb_0]
> Maybe this can be removed by using the instruction trick shown below.
> >+%endif
> >+
> >+    mov         r4d,       r4m
> >+    sub         r0,        r1
> >+    add         r4d,       r4d
> >+
> >+%ifdef PIC
> >+    lea         r5,        [tabw_ChromaCoeffV]
> >+    mova        m0,        [r5 + r4 * 8]
> >+%else
> >+    mova        m0,        [tabw_ChromaCoeffV + r4 * 8]
> >+%endif
> >+
> >+    mova        m1,        [pw_32]
> >+    lea         r5,        [3 * r1]
> >+
> >+%assign x 1
> >+%rep %1/4
> >+    movd        m2,        [r0]
> >+    movd        m3,        [r0 + r1]
> >+    movd        m4,        [r0 + 2 * r1]
> >+    movd        m5,        [r0 + r5]
> >+
> >+    punpcklbw   m2,        m3
> >+    punpcklbw   m6,        m4,        m5
> >+    punpcklbw   m2,        m6
> If you use punpcklwd, you can share the table above.
If you are referring to something like the avx2 version of this 
primitive, note that it also uses pshufb, which is an ssse3 instruction. 
Adapting that approach to pre-ssse3 targets is going to be slower.

By the way, the sse4 version could be improved by using a byte version 
of the table I added.
> >+
> >+    punpcklbw   m2,        PB_0
> This converts bytes to words; it can be replaced by PUNPCKLBW + PSRLW.
This performs significantly slower in the testbench.
>   
> >+    pmaddwd     m2,        m0
> >+
> >+    lea         r0,        [r0 + 4 * r1]
> >+    movd        m6,        [r0]
> >+
> >+    punpcklbw   m3,        m4
> >+    punpcklbw   m7,        m5,        m6
> >+    punpcklbw   m3,        m7
> >+
> >+    punpcklbw   m3,        PB_0
> >+    pmaddwd     m3,        m0
> >+
> >+    packssdw    m2,        m3
> >+    pshuflw     m3,        m2,          q2301
> >+    pshufhw     m3,        m3,          q2301
> >+    paddw       m2,        m3
> >+    psrld       m2,        16
> >+
> >+    movd        m7,        [r0 + r1]
> >+
> >+    punpcklbw   m4,        m5
> >+    punpcklbw   m3,        m6,        m7
> >+    punpcklbw   m4,        m3
> >+
> >+    punpcklbw   m4,        PB_0
> >+    pmaddwd     m4,        m0
> >+
> >+    movd        m3,        [r0 + 2 * r1]
> >+
> >+    punpcklbw   m5,        m6
> >+    punpcklbw   m7,        m3
> >+    punpcklbw   m5,        m7
> >+
> >+    punpcklbw   m5,        PB_0
> >+    pmaddwd     m5,        m0
> >+
> >+    packssdw    m4,        m5
> >+    pshuflw     m5,        m4,          q2301
> >+    pshufhw     m5,        m5,          q2301
> >+    paddw       m4,        m5
> >+    psrld       m4,        16
> >+
> >+    packssdw    m2,        m4
> >+    paddw       m2,        m1
> >+    psraw       m2,        6
> >+    packuswb    m2,        m2
> >+
> >+%if ARCH_X86_64
> >+    movq        r4,        m2
> >+    mov         [r2],      r4w
> >+    shr         r4,        16
> >+    mov         [r2 + r3], r4w
> >+    lea         r2,        [r2 + 2 * r3]
> >+    shr         r4,        16
> >+    mov         [r2],      r4w
> >+    shr         r4,        16
> >+    mov         [r2 + r3], r4w
> >+%else
> >+    movd        r4,        m2
> >+    mov         [r2],      r4w
> >+    shr         r4,        16
> >+    mov         [r2 + r3], r4w
> >+    lea         r2,        [r2 + 2 * r3]
> >+    psrldq      m2,        4
> >+    movd        r4,        m2
> >+    mov         [r2],      r4w
> >+    shr         r4,        16
> >+    mov         [r2 + r3], r4w
> >+%endif
> >+
> >+%if x < %1/4
> >+    lea         r2,        [r2 + 2 * r3]
> >+%endif
> >+%assign x x+1
> >+%endrep
> >+    RET
> >+
> >+%endmacro
> >+
> >+    FILTER_V4_W2_H4_sse2 4
> >+    FILTER_V4_W2_H4_sse2 8
> >+    FILTER_V4_W2_H4_sse2 16
> >+
> >+;-----------------------------------------------------------------------------
> > ; void interp_4tap_horiz_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
> > ;-----------------------------------------------------------------------------
> > INIT_XMM sse4
> >diff -r 87d6724649df -r 4690c9aa24ca source/common/x86/ipfilter8.h
> >--- a/source/common/x86/ipfilter8.h	Tue May 05 17:05:22 2015 +0530
> >+++ b/source/common/x86/ipfilter8.h	Wed May 06 12:27:20 2015 -0700
> >@@ -905,6 +905,9 @@
> > void x265_interp_8tap_horiz_ps_64x48_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> > void x265_interp_8tap_horiz_ps_64x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> > void x265_interp_8tap_hv_pp_8x8_sse3(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY);
> >+void x265_interp_4tap_vert_pp_2x4_sse2(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> >+void x265_interp_4tap_vert_pp_2x8_sse2(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> >+void x265_interp_4tap_vert_pp_2x16_sse2(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
> > #undef LUMA_FILTERS
> > #undef LUMA_SP_FILTERS
> > #undef LUMA_SS_FILTERS
> >_______________________________________________
> >x265-devel mailing list
> >x265-devel at videolan.org
> >https://mailman.videolan.org/listinfo/x265-devel
>
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150506/3be75a36/attachment.html>


More information about the x265-devel mailing list