[x265] [PATCH] asm: interp_8tap_horiz pp and ps sse2

dave dtyx265 at gmail.com
Tue Apr 28 06:13:54 CEST 2015


On 04/27/2015 08:42 PM, chen wrote:
>
> At 2015-04-28 09:05:06,dtyx265 at gmail.com wrote:
> ># HG changeset patch
> ># User David T Yuen <dtyx265 at gmail.com>
> ># Date 1430182995 25200
> ># Node ID 31b76bd430a47411f7b2ebaa7cfbb44e25c5ff60
> ># Parent  68a13226d586b335c02cade9311e093f0149c42a
> >asm: interp_8tap_horiz pp and ps sse2
> >
> >This replaces c code and covers
> >
>   <mailto:dtyx265 at gmail.com%3E%3E#%A0Date%A01430182995%A025200%3E#%A0Node%A0ID%A031b76bd430a47411f7b2ebaa7cfbb44e25c5ff60%3E#%A0Parent%A0%A068a13226d586b335c02cade9311e093f0149c42a%3Easm:%A0interp_8tap_horiz%A0pp%A0and%A0ps%A0sse2%3E%3EThis%A0replaces%A0c%A0code%A0and%A0covers%3E>
> >diff -r 68a13226d586 -r 31b76bd430a4 source/common/x86/asm-primitives.cpp
> >--- a/source/common/x86/asm-primitives.cpp	Sat Apr 25 01:39:55 2015 -0500
> >+++ b/source/common/x86/asm-primitives.cpp	Mon Apr 27 18:03:15 2015 -0700
> >@@ -1340,6 +1340,10 @@
> >         CHROMA_420_VSP_FILTERS(_sse2);
> >         CHROMA_422_VSP_FILTERS(_sse2);
> >         CHROMA_444_VSP_FILTERS(_sse2);
> >+        ALL_LUMA_PU(luma_hpp, interp_8tap_horiz_pp, sse2);
> >+        p.pu[LUMA_4x4].luma_hpp = x265_interp_8tap_horiz_pp_4x4_sse2;
> >+        ALL_LUMA_PU(luma_hps, interp_8tap_horiz_ps, sse2);
> >+        p.pu[LUMA_4x4].luma_hps = x265_interp_8tap_horiz_ps_4x4_sse2;
> >
> >         //p.frameInitLowres = x265_frame_init_lowres_core_mmx2;
> >         p.frameInitLowres = x265_frame_init_lowres_core_sse2;
> >diff -r 68a13226d586 -r 31b76bd430a4 source/common/x86/ipfilter8.asm
> >--- a/source/common/x86/ipfilter8.asm	Sat Apr 25 01:39:55 2015 -0500
> >+++ b/source/common/x86/ipfilter8.asm	Mon Apr 27 18:03:15 2015 -0700
> >@@ -160,6 +160,11 @@
> >                  db  -1, 4, -11, 40,  40, -11, 4, -1
> >                  db   0, 1, -5,  17,  58, -10, 4, -1
> >
> >+tabw_LumaCoeff:  dw   0, 0,  0,  64,  0,   0,  0,  0
> >+                 dw  -1, 4, -10, 58,  17, -5,  1,  0
> >+                 dw  -1, 4, -11, 40,  40, -11, 4, -1
> >+                 dw   0, 1, -5,  17,  58, -10, 4, -1
> >+
> > tab_LumaCoeffV: times 4 dw 0, 0
> >                 times 4 dw 0, 64
> >                 times 4 dw 0, 0
> >@@ -825,6 +830,230 @@
> >     IPFILTER_CHROMA_W_sse3 48, 64
> >     IPFILTER_CHROMA_W_sse3 64, 16
> >
> >+%macro FILTER_H8_W8_sse2 0
> >+    movh        m1, [r0 + r5 - 3]
> The offset field (-3) can be avoided by preparing r0 beforehand.
I could also use r5 - 3, r5 - 2, r5 - 1... instead of r5 - 3 + n.
> >+    movh        m4, [r0 + r5 - 3 + 1]
> >+    punpcklbw   m1, m6
> >+    punpcklbw   m4, m6
> >+    movh        m5, [r0 + r5 - 3 + 2]
> >+    movh        m0, [r0 + r5 - 3 + 3]
> >+    punpcklbw   m5, m6
> >+    punpcklbw   m0, m6
> >+    pmaddwd     m1, m3
> >+    pmaddwd     m4, m3
> >+    pmaddwd     m5, m3
> >+    pmaddwd     m0, m3
> >+    packssdw    m1, m4
> >+    packssdw    m5, m0
> >+    pshuflw     m4, m1, q2301
> >+    pshufhw     m4, m4, q2301
> >+    pshuflw     m0, m5, q2301
> >+    pshufhw     m0, m0, q2301
> >+    paddw       m1, m4
> >+    paddw       m5, m0
> >+    psrldq      m1, 2
> >+    psrldq      m5, 2
> >+    pshufd      m1, m1, q3120
> >+    pshufd      m5, m5, q3120
> >+    punpcklqdq  m1, m5
> m1 is valid here
> >+    movh        m7, [r0 + r5 - 3 + 4]
> >+    movh        m4, [r0 + r5 - 3 + 5]
> >+    punpcklbw   m7, m6
> >+    punpcklbw   m4, m6
> >+    movh        m5, [r0 + r5 - 3 + 6]
> >+    movh        m0, [r0 + r5 - 3 + 7]
> >+    punpcklbw   m5, m6
> >+    punpcklbw   m0, m6
> >+    pmaddwd     m7, m3
> >+    pmaddwd     m4, m3
> >+    pmaddwd     m5, m3
> >+    pmaddwd     m0, m3
> >+    packssdw    m7, m4
> >+    packssdw    m5, m0
> >+    pshuflw     m4, m7, q2301
> >+    pshufhw     m4, m4, q2301
> >+    pshuflw     m0, m5, q2301
> >+    pshufhw     m0, m0, q2301
> >+    paddw       m7, m4
> >+    paddw       m5, m0
> >+    psrldq      m7, 2
> >+    psrldq      m5, 2
> >+    pshufd      m7, m7, q3120
> >+    pshufd      m5, m5, q3120
> >+    punpcklqdq  m7, m5
> >+    pshuflw     m4, m1, q2301
> m1 is used here, a long distance away
I am not sure what is wrong with this.
> >+    pshufhw     m4, m4, q2301
> >+    pshuflw     m0, m7, q2301
> >+    pshufhw     m0, m0, q2301
> >+    paddw       m1, m4
> >+    paddw       m7, m0
> >+    psrldq      m1, 2
> >+    psrldq      m7, 2
> >+    pshufd      m1, m1, q3120
> >+    pshufd      m7, m7, q3120
> >+    punpcklqdq  m1, m7
> >+%endmacro
> >+
>
> >+;----------------------------------------------------------------------------------------------------------------------------
> >+; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
> >+;----------------------------------------------------------------------------------------------------------------------------
> >+%macro IPFILTER_LUMA_sse2 3
> >+INIT_XMM sse2
> >+cglobal interp_8tap_horiz_%3_%1x%2, 4,7,8
> >+
> >+    mov       r4d, r4m
> >+    add       r4d, r4d
> >+    pxor      m6, m6
> >+%ifdef PIC
> >+    lea       r6, [tabw_LumaCoeff]
> >+    mova      m3, [r6 + r4 * 8]
> >+%else
> >+    mova      m3, [tabw_LumaCoeff + r4 * 8]
> >+%endif
> >+
> >+    mov       r4d, %2
> >+%ifidn %3, pp
> >+    mova      m2, [pw_32]
> >+%else
> >+    mova      m2, [pw_2000]
> >+    add       r3d, r3d
> >+    cmp       r5m, byte 0
> >+    je        .loopH
> >+    lea       r6, [r1 + 2 * r1]
> >+    sub       r0d, r6d
> >+    add       r4d, 7
> >+%endif
> >+
> >+.loopH:
> >+    xor       r5d, r5d
> >+%rep %1 / 8
> You use %rep here, so why do you need the physical register r5?
Each iteration of the loop completes a row.  r5 is the index across the 
row and must be reset at the beginning of each loop iteration.
> >+    FILTER_H8_W8_sse2
> >+  %ifidn %3, pp
> >+    paddw     m1, m2
> >+    psraw     m1, 6
> >+    packuswb  m1, m1
> >+    movh      [r2 + r5], m1
> >+  %else
> >+    psubw     m1, m2
> >+    movu      [r2 + 2 * r5], m1
> >+  %endif
> >+    add       r5d, 8
> >+%endrep
> >+
> >+%rep (%1 % 8) / 4
> >+    FILTER_H8_W4_sse2
> >+  %ifidn %3, pp
> >+    paddw     m1, m2
> >+    psraw     m1, 6
> >+    packuswb  m1, m1
> >+    movd      [r2 + r5], m1
> >+  %else
> >+    psubw     m1, m2
> >+    movh      [r2 + 2 * r5], m1
> >+  %endif
> >+%endrep
> >+
> >+    add       r0d, r1d
> >+    add       r2d, r3d
> >+
> >+    dec       r4d
> >+    jnz       .loopH
> >+
> no extra blank line here
I will remove it.
> >+    RET
> >+
> >+%endmacro
>
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150427/910c5de9/attachment-0001.html>


More information about the x265-devel mailing list