[x265] [PATCH Review Only] asm: chroma_hpp[32x32] for colorspace i420 in avx2 improve 6189c->4104c

Aasaipriya Chandran aasaipriya at multicorewareinc.com
Wed Dec 10 05:07:35 CET 2014


Hello Chen,

Thanks for your comments.
I will work on the changes you mentioned.

Thanks,
Aasaipriya

On Wed, Dec 10, 2014 at 1:16 AM, chen <chenm003 at 163.com> wrote:

>
>
>
> At 2014-12-09 17:58:14,aasaipriya at multicorewareinc.com wrote:
> ># HG changeset patch
> ># User Aasaipriya Chandran <aasaipriya at multicorewareinc.com>
> ># Date 1418119075 -19800
> >#      Tue Dec 09 15:27:55 2014 +0530
> ># Node ID 896b7cb19a57387bccb16150a197ee8c6911c416
> ># Parent  82622d89fe00d58da4e61740ad80f5cd43b4a607
> >asm: chroma_hpp[32x32] for colorspace i420 in avx2 improve 6189c->4104c
> >diff -r 82622d89fe00 -r 896b7cb19a57 source/common/x86/ipfilter8.asm
> >--- a/source/common/x86/ipfilter8.asm	Tue Dec 09 15:15:14 2014 +0530
> >+++ b/source/common/x86/ipfilter8.asm	Tue Dec 09 15:27:55 2014 +0530
> >@@ -1515,6 +1515,61 @@
> >     jnz               .loop
> >     RET
> >
> >+INIT_YMM avx2
> >+cglobal interp_4tap_horiz_pp_32x32, 4,6,6
> >+    mov             r4d, r4m
> >+
> >+%ifdef PIC
> >+    lea               r5,           [tab_ChromaCoeff]
> >+    vpbroadcastd      m0,           [r5 + r4 * 4]
> >+%else
> >+    vpbroadcastd      m0,           [tab_ChromaCoeff + r4 * 4]
> >+%endif
> >+
> >+    movu              m1,           [tab_Tm]
>
> the constant is aligned, so an aligned load (mova) can be used here instead of movu
>
>
>
> >+    vpbroadcastd      m2,           [pw_1]
> >+
> >+    ; register map
> >+    ; m0 - interpolate coeff
> >+    ; m1 - shuffle order table
> >+    ; m2 - constant word 1
> >+
> >+    sub               r0,           1
> >+    mov               r4d,          32
> >+
> >+.loop:
> >+    ; Row 0
> >+    vbroadcasti128    m3,           [r0]                        ; [x x x x x A 9 8 7 6 5 4 3 2 1 0]
> >+    pshufb            m3,           m1
> >+    pmaddubsw         m3,           m0
> >+    pmaddwd           m3,           m2
> >+    vbroadcasti128    m4,           [r0 + 8]
> >+    pshufb            m4,           m1
> >+    pmaddubsw         m4,           m0
> >+    pmaddwd           m4,           m2
> >+    packssdw          m3,           m4
> >+    pmulhrsw          m3,           [pw_512]
> >+    vbroadcasti128    m4,           [r0 + 16]
> >+    pshufb            m4,           m1
> >+    pmaddubsw         m4,           m0
> >+    pmaddwd           m4,           m2
> >+    vbroadcasti128    m5,           [r0 + 24]
> >+    pshufb            m5,           m1
> >+    pmaddubsw         m5,           m0
> >+    pmaddwd           m5,           m2
> >+    packssdw          m4,           m5
> >+    pmulhrsw          m4,           [pw_512]
>
> we have more free registers, so the constant can be buffered in one of them
>
>
>
> >+    packuswb          m3,           m4
> >+    mova              m5,           [interp_4tap_8x8_horiz_shuf]
> >+    vpermd            m3,           m5,      m3
>
> >+    vextracti128      xm4,          m3,      1
>
> we can reorder the above instructions/algorithm to replace MOVA and PERMD with PERMQ.
>
>
>
> >+    movu              [r2],         xm3
> >+    movu              [r2 + 16],    xm4
>
> why do you extract the high 128 bits and then write them to contiguous memory?
>
> >+    lea               r2,           [r2 + r3]
> >+    lea               r0,           [r0 + r1]
> >+    dec               r4d
> >+    jnz               .loop
> >+    RET
>
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20141210/106cfdbe/attachment.html>


More information about the x265-devel mailing list