[x265] [PATCH] asm: avx2 10bit code for planecopy_cp(10660.20 -> 5685.80)

Deepthi Nandakumar deepthi at multicorewareinc.com
Fri Jun 26 11:06:02 CEST 2015


This patch cannot be applied. Can you please update it to the current tip?

On Thu, Jun 25, 2015 at 2:20 PM, <rajesh at multicorewareinc.com> wrote:

> # HG changeset patch
> # User Rajesh Paulraj<rajesh at multicorewareinc.com>
> # Date 1435220155 -19800
> #      Thu Jun 25 13:45:55 2015 +0530
> # Node ID 26e8eff8eb5abc1c2fa5dd94f59f620c6040caf9
> # Parent  430625004ef81ba9e9e398d4cf12a68a1cd4b664
> asm: avx2 10bit code for planecopy_cp(10660.20 -> 5685.80)
>
> avx2:
> planecopy_cp  19.36x   5685.80         110052.08
>
> sse4:
> planecopy_cp  9.65x    10660.20        102850.27
>
> diff -r 430625004ef8 -r 26e8eff8eb5a source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp      Thu Jun 25 13:34:17 2015
> +0530
> +++ b/source/common/x86/asm-primitives.cpp      Thu Jun 25 13:45:55 2015
> +0530
> @@ -1497,6 +1497,7 @@
>          p.scale2D_64to32 = PFX(scale2D_64to32_avx2);
>          p.weight_pp = PFX(weight_pp_avx2);
>          p.sign = x265_calculateSign_avx2;
> +        p.planecopy_cp = PFX(upShift_8_avx2);
>
>          p.cu[BLOCK_16x16].calcresidual = PFX(getResidual16_avx2);
>          p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_avx2);
> diff -r 430625004ef8 -r 26e8eff8eb5a source/common/x86/pixel-a.asm
> --- a/source/common/x86/pixel-a.asm     Thu Jun 25 13:34:17 2015 +0530
> +++ b/source/common/x86/pixel-a.asm     Thu Jun 25 13:45:55 2015 +0530
> @@ -7388,6 +7388,96 @@
>  .end:
>      RET
>
>
> +;---------------------------------------------------------------------------------------------------------------------
> +;void planecopy_cp(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t
> dstStride, int width, int height, int shift)
>
> +;---------------------------------------------------------------------------------------------------------------------
> +%if ARCH_X86_64
> +INIT_YMM avx2
> +cglobal upShift_8, 7,8,3
> +    movd        xm2, r6d
> +    add         r3, r3
> +
> +.loopH:
> +    xor         r7, r7
> +    mov         r6d, r4d
> +.loopW:
> +    pmovzxbw    m0,[r0 + r7]
> +    pmovzxbw    m1,[r0 + r7 + 16]
> +    psllw       m0, xm2
> +    psllw       m1, xm2
> +    movu        [r2 + r7 * 2], m0
> +    movu        [r2 + r7 * 2 + 32], m1
> +
> +    add         r7d, 32
> +    sub         r6d, 32
> +    jg          .loopW
> +
> +    ; move to next row
> +    add         r0, r1
> +    add         r2, r3
> +    dec         r5d
> +    jnz         .loopH
> +
> +;processing last row of every frame [To handle width which is not a multiple
> of 16]
> +
> +.loop16:
> +    pmovzxbw    m0,[r0]
> +    psllw       m0, xm2
> +    movu        [r2], m0
> +
> +    add         r0, mmsize
> +    add         r2, 2 * mmsize
> +    sub         r4d, 16
> +    jg          .loop16
> +    jz          .end
> +
> +    cmp         r4d, 8
> +    jl          .process4
> +    pmovzxbw    m0,[r0]
> +    psllw       m0, xm2
> +    movu        [r2], m0
> +
> +    add         r0, 8
> +    add         r2, mmsize
> +    sub         r4d, 8
> +    jz          .end
> +
> +.process4:
> +    cmp         r4d, 4
> +    jl          .process2
> +    movq        xm0,[r0]
> +    pmovzxbw    m0,xm0
> +    psllw       xm0, xm2
> +    movq        [r2], xm0
> +
> +    add         r0, 4
> +    add         r2, 8
> +    sub         r4d, 4
> +    jz          .end
> +
> +.process2:
> +    cmp         r4d, 2
> +    jl          .process1
> +    movzx       r3d, byte [r0]
> +    shl         r3d, 2
> +    mov         [r2], r3w
> +    movzx       r3d, byte [r0 + 1]
> +    shl         r3d, 2
> +    mov         [r2 + 2], r3w
> +
> +    add         r0, 2
> +    add         r2, 4
> +    sub         r4d, 2
> +    jz          .end
> +
> +.process1:
> +    movzx       r3d, byte [r0]
> +    shl         r3d, 2
> +    mov         [r2], r3w
> +.end:
> +    RET
> +%endif
> +
>  %macro ABSD2 6 ; dst1, dst2, src1, src2, tmp, tmp
>  %if cpuflag(ssse3)
>      pabsd   %1, %3
> diff -r 430625004ef8 -r 26e8eff8eb5a source/common/x86/pixel.h
> --- a/source/common/x86/pixel.h Thu Jun 25 13:34:17 2015 +0530
> +++ b/source/common/x86/pixel.h Thu Jun 25 13:45:55 2015 +0530
> @@ -31,6 +31,7 @@
>  void PFX(downShift_16_sse2)(const uint16_t* src, intptr_t srcStride,
> pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t
> mask);
>  void PFX(downShift_16_avx2)(const uint16_t* src, intptr_t srcStride,
> pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t
> mask);
>  void PFX(upShift_8_sse4)(const uint8_t* src, intptr_t srcStride, pixel*
> dst, intptr_t dstStride, int width, int height, int shift);
> +void PFX(upShift_8_avx2)(const uint8_t* src, intptr_t srcStride, pixel*
> dst, intptr_t dstStride, int width, int height, int shift);
>
>  #define DECL_PIXELS(cpu) \
>      FUNCDEF_PU(int, pixel_ssd, cpu, const pixel*, intptr_t, const pixel*,
> intptr_t); \
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150626/f92e94d2/attachment.html>


More information about the x265-devel mailing list