[x265] [PATCH] Luma_hpp[16x16] avx2 asm code : improved 2307c->1695c

Aasaipriya Chandran aasaipriya at multicorewareinc.com
Thu Nov 13 06:13:26 CET 2014


Hello Chen,

Thanks for the comments. I will make the necessary changes and send the
patch again.


Regards,

Aasaipriya C

On Wed, Nov 12, 2014 at 10:16 PM, chen <chenm003 at 163.com> wrote:

>
>
>
>
> At 2014-11-12 18:56:45,aasaipriya at multicorewareinc.com wrote:
> ># HG changeset patch
> ># User Aasaipriya
> ># Date 1415788047 -19800
> >#      Wed Nov 12 15:57:27 2014 +0530
> ># Node ID f0a17f2b4c22ff8aa05afb40e3360e5ac03590a6
> ># Parent  98fb658f3229ab10e808204c265a12e18d71638e
> >Luma_hpp[16x16] avx2 asm code : improved 2307c->1695c
> >
> >diff -r 98fb658f3229 -r f0a17f2b4c22 source/common/x86/asm-primitives.cpp
> >--- a/source/common/x86/asm-primitives.cpp	Tue Nov 11 19:30:19 2014 +0900
> >+++ b/source/common/x86/asm-primitives.cpp	Wed Nov 12 15:57:27 2014 +0530
> >@@ -1799,6 +1799,7 @@
> >         p.transpose[BLOCK_64x64] = x265_transpose64_avx2;
> > #endif
> >         p.luma_hpp[LUMA_4x4] = x265_interp_8tap_horiz_pp_4x4_avx2;
> >+        p.luma_hpp[LUMA_16x16] = x265_interp_8tap_horiz_pp_16x16_avx2;
> >         p.luma_vpp[LUMA_4x4] = x265_interp_8tap_vert_pp_4x4_avx2;
> >     }
> > #endif // if HIGH_BIT_DEPTH
> >diff -r 98fb658f3229 -r f0a17f2b4c22 source/common/x86/ipfilter8.asm
> >--- a/source/common/x86/ipfilter8.asm	Tue Nov 11 19:30:19 2014 +0900
> >+++ b/source/common/x86/ipfilter8.asm	Wed Nov 12 15:57:27 2014 +0530
> >@@ -854,6 +854,58 @@
> >     RET
> >
> >
> >+INIT_YMM avx2
> >+cglobal interp_8tap_horiz_pp_16x16, 4,6,8
> >+    sub             r0, 3
> >+    mov             r4d, r4m
> >+%ifdef PIC
> >+    lea             r5, [tab_LumaCoeff]
> >+    vpbroadcastd    m0, [r5 + r4 * 8]
> >+    vpbroadcastd    m1, [r5 + r4 * 8 + 4]
> >+%else
> >+    vpbroadcastd    m0, [tab_LumaCoeff + r4 * 8]
> >+    vpbroadcastd    m1, [tab_LumaCoeff + r4 * 8 + 4]
> >+%endif
> >+    mova            m2, [tab_Tm]
> >+    movu            m3, [tab_Tm + 16]
> >+    vpbroadcastd    m7, [pw_1]
> >+
> >+    ; register map
> >+    ; m0 , m1 interpolate coeff
> >+    ; m2 , m3  shuffle order table
> >+    ; m7 - pw_1
> >+
> >+    mov             r4d,  16
> >+.loop:
> >+    ; Row 0
> >+    vbroadcasti128  m4, [r0]                        ; [x E D C B A 9 8 7 6 5 4 3 2 1 0]
> >+    pshufb          m5, m4, m3
> >+    pshufb          m4, m2
> >+    pmaddubsw       m4, m0
> >+    pmaddubsw       m5, m1
> >+    paddw           m4, m5
> >+    pmaddwd         m4, m7
> >+    vbroadcasti128  m5, [r0 + 8]                    ; second 8 elements in Row0
> >+    pshufb          m6, m5, m3
> >+    pshufb          m5, m2
> >+    pmaddubsw       m5, m0
> >+    pmaddubsw       m6, m1
> >+    paddw           m5, m6
> >+    pmaddwd         m5, m7
> >+    packssdw        m4, m5                          ; [17 16 15 14 07 06 05 04 13 12 11 10 03 02 01 00]
> >+    pmulhrsw        m4, [pw_512]
>
> >+    packuswb        m4, m4
> >+    vpermq          m4, m4, 11011000b
> >+    pshufd          xm4, xm4, 11011000b
> >+    movq            [r2], xm4
> >+    movhps          [r2 + 8], xm4
>
> Why split this into two write instructions?
>
>
>
> >+    lea             r0, [r0 + r1]
> >+    lea             r2, [r2 + r3]
>
> add r0, r1
>
> >+    dec             r4d
> >+    jnz             .loop
> >+    RET
>
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20141113/bba1c1ac/attachment.html>


More information about the x265-devel mailing list