[x265] [PATCH] asm: avx2 10bit code for planecopy_cp(10660.20 -> 5685.80)

Fri Jul 17 02:19:35 CEST 2015

Thanks, Min. Do we know what exactly caused this on some Macs - unaligned
access? The test farm Darwin machine has reported no smoke test failures.

On Fri, Jul 17, 2015 at 2:51 AM, Steve Borho <steve at borho.org> wrote:

> On 06/26, rajesh at multicorewareinc.com wrote:
> > # HG changeset patch
> > # User Rajesh Paulraj<rajesh at multicorewareinc.com>
> > # Date 1435311677 -19800
> > #      Fri Jun 26 15:11:17 2015 +0530
> > # Node ID 818b70b015513a01993af0c48e4714cf4fd8dc84
> > # Parent  956401f1a679f1e71181b704d64e4acdb6f1a93f
> > asm: avx2 10bit code for planecopy_cp(10660.20 -> 5685.80)
> >
> > avx2:
> > planecopy_cp  19.36x   5685.80         110052.08
> >
> > sse4:
> > planecopy_cp  9.65x    10660.20        102850.27
>
> FYI: this primitive has introduced a SIGBUS on some of the main10 smoke
> tests on Mac.  Min is working on a rewrite of the primitive which should
> hopefully resolve this problem
>
> > diff -r 956401f1a679 -r 818b70b01551 source/common/x86/asm-primitives.cpp
> > --- a/source/common/x86/asm-primitives.cpp    Fri Jun 26 15:01:16 2015 +0530
> > +++ b/source/common/x86/asm-primitives.cpp    Fri Jun 26 15:11:17 2015 +0530
> > @@ -1522,6 +1522,7 @@
> >          p.scale2D_64to32 = PFX(scale2D_64to32_avx2);
> >          p.weight_pp = PFX(weight_pp_avx2);
> >          p.sign = PFX(calSign_avx2);
> > +        p.planecopy_cp = PFX(upShift_8_avx2);
> >
> >          p.cu[BLOCK_16x16].calcresidual = PFX(getResidual16_avx2);
> >          p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_avx2);
> > diff -r 956401f1a679 -r 818b70b01551 source/common/x86/pixel-a.asm
> > --- a/source/common/x86/pixel-a.asm   Fri Jun 26 15:01:16 2015 +0530
> > +++ b/source/common/x86/pixel-a.asm   Fri Jun 26 15:11:17 2015 +0530
> > @@ -7388,6 +7388,96 @@
> >  .end:
> >      RET
> >
> > +;---------------------------------------------------------------------------------------------------------------------
> > +;void planecopy_cp(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift)
> > +;---------------------------------------------------------------------------------------------------------------------
> > +%if ARCH_X86_64
> > +INIT_YMM avx2
> > +cglobal upShift_8, 7,8,3
> > +    movd        xm2, r6d
> > +    add         r3, r3
> > +
> > +.loopH:
> > +    xor         r7, r7
> > +    mov         r6d, r4d
> > +.loopW:
> > +    pmovzxbw    m0,[r0 + r7]
> > +    pmovzxbw    m1,[r0 + r7 + 16]
> > +    psllw       m0, xm2
> > +    psllw       m1, xm2
> > +    movu        [r2 + r7 * 2], m0
> > +    movu        [r2 + r7 * 2 + 32], m1
> > +
> > +    add         r7d, 32
> > +    sub         r6d, 32
> > +    jg          .loopW
> > +
> > +    ; move to next row
> > +    add         r0, r1
> > +    add         r2, r3
> > +    dec         r5d
> > +    jnz         .loopH
> > +
> > +;processing last row of every frame [To handle width which not a multiple of 16]
> > +
> > +.loop16:
> > +    pmovzxbw    m0,[r0]
> > +    psllw       m0, xm2
> > +    movu        [r2], m0
> > +
> > +    add         r0, mmsize
> > +    add         r2, 2 * mmsize
> > +    sub         r4d, 16
> > +    jg          .loop16
> > +    jz          .end
> > +
> > +    cmp         r4d, 8
> > +    jl          .process4
> > +    pmovzxbw    m0,[r0]
> > +    psllw       m0, xm2
> > +    movu        [r2], m0
> > +
> > +    add         r0, 8
> > +    add         r2, mmsize
> > +    sub         r4d, 8
> > +    jz          .end
> > +
> > +.process4:
> > +    cmp         r4d, 4
> > +    jl          .process2
> > +    movq        xm0,[r0]
> > +    pmovzxbw    m0,xm0
> > +    psllw       xm0, xm2
> > +    movq        [r2], xm0
> > +
> > +    add         r0, 4
> > +    add         r2, 8
> > +    sub         r4d, 4
> > +    jz          .end
> > +
> > +.process2:
> > +    cmp         r4d, 2
> > +    jl          .process1
> > +    movzx       r3d, byte [r0]
> > +    shl         r3d, 2
> > +    mov         [r2], r3w
> > +    movzx       r3d, byte [r0 + 1]
> > +    shl         r3d, 2
> > +    mov         [r2 + 2], r3w
> > +
> > +    add         r0, 2
> > +    add         r2, 4
> > +    sub         r4d, 2
> > +    jz          .end
> > +
> > +.process1:
> > +    movzx       r3d, byte [r0]
> > +    shl         r3d, 2
> > +    mov         [r2], r3w
> > +.end:
> > +    RET
> > +%endif
> > +
> >  %macro ABSD2 6 ; dst1, dst2, src1, src2, tmp, tmp
> >  %if cpuflag(ssse3)
> >      pabsd   %1, %3
> > diff -r 956401f1a679 -r 818b70b01551 source/common/x86/pixel.h
> > --- a/source/common/x86/pixel.h       Fri Jun 26 15:01:16 2015 +0530
> > +++ b/source/common/x86/pixel.h       Fri Jun 26 15:11:17 2015 +0530
> > @@ -31,6 +31,7 @@
> >  void PFX(downShift_16_sse2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
> >  void PFX(downShift_16_avx2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
> >  void PFX(upShift_8_sse4)(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
> > +void PFX(upShift_8_avx2)(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
> >
> >  #define DECL_PIXELS(cpu) \
> >      FUNCDEF_PU(int, pixel_ssd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
> > _______________________________________________
> > x265-devel mailing list
> > x265-devel at videolan.org
> > https://mailman.videolan.org/listinfo/x265-devel
>
> --
> Steve Borho
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150717/cc0333f7/attachment.html>