[x265] [PATCH] asm: avx2 code for add_ps[8x8] for 10bpp -- 24.9x

chen chenm003 at 163.com
Mon Mar 2 18:45:27 CET 2015


The patch is right, but we can improve it.

At 2015-03-02 16:47:23,sumalatha at multicorewareinc.com wrote:
># HG changeset patch
># User Sumalatha Polureddy<sumalatha at multicorewareinc.com>
># Date 1425286035 -19800
># Node ID 1be088c8bc675752ebfebc4fda3bad41659269a4
># Parent  a9ad4d8202796dfb78e9d180f5fdb7cc0996ea66
>asm: avx2 code for add_ps[8x8] for 10bpp -- 24.9x
>
>add_ps[  8x8]  24.97x   275.68          6882.88
>
>diff -r a9ad4d820279 -r 1be088c8bc67 source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp	Mon Mar 02 14:10:07 2015 +0530
>+++ b/source/common/x86/asm-primitives.cpp	Mon Mar 02 14:17:15 2015 +0530
>@@ -1069,6 +1069,8 @@
>     }
>     if (cpuMask & X265_CPU_AVX2)
>     {
>+        p.cu[BLOCK_8x8].add_ps = x265_pixel_add_ps_8x8_avx2;
>+
>         p.cu[BLOCK_32x32].ssd_s = x265_pixel_ssd_s_32_avx2;
>         p.cu[BLOCK_16x16].sse_ss = x265_pixel_ssd_ss_16x16_avx2;
> 
>diff -r a9ad4d820279 -r 1be088c8bc67 source/common/x86/pixeladd8.asm
>--- a/source/common/x86/pixeladd8.asm	Mon Mar 02 14:10:07 2015 +0530
>+++ b/source/common/x86/pixeladd8.asm	Mon Mar 02 14:17:15 2015 +0530
>@@ -229,6 +229,53 @@
> 
>     jnz     .loop
>     RET
>+
>+INIT_YMM avx2
>+cglobal pixel_add_ps_8x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1
>+    mova    m5,     [pw_pixel_max]
>+    pxor    m4,     m4
>+    mov     r6d,    %2/4
>+    add     r4,     r4
>+    add     r5,     r5
>+    add     r1,     r1
>+.loop:
>+    movu        xm0,        [r2]        ; row 0 of src0
>+    movu        xm1,        [r2 + r4]   ; row 1 of src0
>+    vinserti128 m0, m0, xm1, 1
>+
>+    movu    xm1,     [r3]               ; row 0 of src1
>+    movu    xm2,     [r3 + r5]          ; row 1 of src1
>+    vinserti128 m1, m1, xm2, 1
>+    lea     r2,     [r2 + r4 * 2]
>+    lea     r3,     [r3 + r5 * 2]
>+
>+    paddw   m0,     m1
Here we could replace vinsert+vinsert+padd with padd+padd+vinsert; vinsert uses port 5, which is a bottleneck on Haswell.

>+    CLIPW  m0, m4, m5
>+    movu        [r0],       xm0         ; row 0 of dst
>+    vextracti128 xm3, m0, 1
>+    movu        [r0 + r1],  xm3         ; row 1 of dst
>+    lea     r0,     [r0 + r1 * 2]
>+
>+    movu        xm0,        [r2]        ; row 2 of src0
>+    movu        xm1,        [r2 + r4]   ; row 3 of src0
>+    vinserti128 m0, m0, xm1, 1
>+
>+    movu    xm1,     [r3]               ; row 2 of src1
>+    movu    xm2,     [r3 + r5]          ; row 3 of src1
>+    vinserti128 m1, m1, xm2, 1
>+    lea     r2,     [r2 + r4 * 2]
>+    lea     r3,     [r3 + r5 * 2]
>+
>+    paddw   m0,     m1
>+    CLIPW  m0, m4, m5
>+    movu        [r0],       xm0         ; row 2 of dst
>+    vextracti128 xm3, m0, 1
>+    movu        [r0 + r1],  xm3         ; row 3 of dst
>+    lea     r0,     [r0 + r1 * 2]
>+
>+    dec     r6d
>+    jnz     .loop
>+    RET
> %else
> INIT_XMM sse4
> cglobal pixel_add_ps_8x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
>_______________________________________________
>x265-devel mailing list
>x265-devel at videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150303/f36c677d/attachment.html>


More information about the x265-devel mailing list