[x265] [PATCH] arm: Implement count_nonzero ARM NEON

Radhakrishnan Venugopal Rajaganesan radhakrishnan at multicorewareinc.com
Wed Mar 16 05:47:47 CET 2016


Please put this patch on hold. The smoke test has not yet been set up on
our new ARM board. I will reply on this thread as soon as I finish running
the smoke test for this patch.

On Tue, Mar 15, 2016 at 5:42 PM, <radhakrishnan at multicorewareinc.com> wrote:

> # HG changeset patch
> # User Radhakrishnan VR <radhakrishnan at multicorewareinc.com>
> # Date 1458043815 -19800
> #      Tue Mar 15 17:40:15 2016 +0530
> # Node ID e5859c0bbdd9a5b12ce3a523b3857641bda457ea
> # Parent  4a2f94a592511afabd434fc6cf02a469b6d65091
> arm: Implement count_nonzero ARM NEON
>
> diff -r 4a2f94a59251 -r e5859c0bbdd9 source/common/arm/asm-primitives.cpp
> --- a/source/common/arm/asm-primitives.cpp      Wed Mar 09 14:34:06 2016
> +0530
> +++ b/source/common/arm/asm-primitives.cpp      Tue Mar 15 17:40:15 2016
> +0530
> @@ -43,6 +43,12 @@
>  {
>      if (cpuMask & X265_CPU_NEON)
>      {
> +        // count nonzero
> +        p.cu[BLOCK_4x4].count_nonzero     = PFX(count_nonzero_4_neon);
> +        p.cu[BLOCK_8x8].count_nonzero     = PFX(count_nonzero_8_neon);
> +        p.cu[BLOCK_16x16].count_nonzero   = PFX(count_nonzero_16_neon);
> +        p.cu[BLOCK_32x32].count_nonzero   = PFX(count_nonzero_32_neon);
> +
>          //scale2D_64to32
>          p.scale2D_64to32  = PFX(scale2D_64to32_neon);
>
> diff -r 4a2f94a59251 -r e5859c0bbdd9 source/common/arm/blockcopy8.S
> --- a/source/common/arm/blockcopy8.S    Wed Mar 09 14:34:06 2016 +0530
> +++ b/source/common/arm/blockcopy8.S    Tue Mar 15 17:40:15 2016 +0530
> @@ -457,3 +457,92 @@
>      rsb             r0, r12, #1024
>      bx              lr
>  endfunc
> +
> +// int  count_nonzero_c(const int16_t* quantCoeff)
> +function x265_count_nonzero_4_neon
> +    veor            d4, d4
> +.rept 2
> +    vld1.s16        {d0}, [r0]!
> +    vld1.s16        {d1}, [r0]!
> +    vclz.i16        d2, d0
> +    vclz.i16        d3, d1
> +    vshr.u16        q1, #4
> +    vadd.u16        d2, d3
> +    vadd.u16        d4, d2
> +.endr
> +    vpadd.u16       d4, d4
> +    vpadd.u16       d4, d4
> +    vmov.u16        r12, d4[0]
> +    rsb             r0, r12, #16
> +    bx              lr
> +endfunc
> +
> +function x265_count_nonzero_8_neon
> +    veor            q8, q8
> +.rept 4
> +    vld1.s16        {q0}, [r0]!
> +    vld1.s16        {q1}, [r0]!
> +    vclz.i16        q2, q0
> +    vclz.i16        q3, q1
> +    vshr.u16        q2, #4
> +    vshr.u16        q3, #4
> +    vadd.u16        q2, q3
> +    vadd.u16        q8, q2
> +.endr
> +    vadd.u16        d16, d17
> +    vpadd.u16       d16, d16
> +    vpadd.u16       d16, d16
> +    vmov.u16        r12, d16[0]
> +    rsb             r0, r12, #64
> +    bx              lr
> +endfunc
> +
> +function x265_count_nonzero_16_neon
> +    veor            q2, q2
> +.rept 16
> +    vld1.s16        {q0, q1}, [r0]!
> +    vclz.i16        q8, q0
> +    vclz.i16        q9, q1
> +    vshr.u16        q8, #4
> +    vshr.u16        q9, #4
> +    vadd.u16        q8, q9
> +    vadd.u16        q2, q8
> +.endr
> +    vadd.u16        d4, d5
> +    vpadd.u16       d4, d4
> +    vpadd.u16       d4, d4
> +
> +    vmov.u16        r12, d4[0]
> +    rsb             r0, r12, #256
> +    bx              lr
> +endfunc
> +
> +function x265_count_nonzero_32_neon
> +    veor            q12, q12
> +.rept 32
> +    vld1.s16        {q0, q1}, [r0]!
> +    vld1.s16        {q2, q3}, [r0]!
> +
> +    vclz.i16        q8, q0
> +    vclz.i16        q9, q1
> +    vclz.i16        q10, q2
> +    vclz.i16        q11, q3
> +
> +    vshr.u16        q8, #4
> +    vshr.u16        q9, #4
> +    vshr.u16        q10, #4
> +    vshr.u16        q11, #4
> +
> +    vadd.u16        q8, q9
> +    vadd.u16        q10, q11
> +    vadd.u16        q8, q10
> +    vadd.u16        q12, q8
> +.endr
> +    vadd.u16        d24, d25
> +    vpadd.u16       d24, d24
> +    vpadd.u16       d24, d24
> +
> +    vmov.u16        r12, d24[0]
> +    rsb             r0, r12, #1024
> +    bx              lr
> +endfunc
> diff -r 4a2f94a59251 -r e5859c0bbdd9 source/common/arm/blockcopy8.h
> --- a/source/common/arm/blockcopy8.h    Wed Mar 09 14:34:06 2016 +0530
> +++ b/source/common/arm/blockcopy8.h    Tue Mar 15 17:40:15 2016 +0530
> @@ -84,4 +84,9 @@
>  uint32_t x265_copy_cnt_8_neon(int16_t* coeff, const int16_t* residual,
> intptr_t resiStride);
>  uint32_t x265_copy_cnt_16_neon(int16_t* coeff, const int16_t* residual,
> intptr_t resiStride);
>  uint32_t x265_copy_cnt_32_neon(int16_t* coeff, const int16_t* residual,
> intptr_t resiStride);
> +
> +int x265_count_nonzero_4_neon(const int16_t* quantCoeff);
> +int x265_count_nonzero_8_neon(const int16_t* quantCoeff);
> +int x265_count_nonzero_16_neon(const int16_t* quantCoeff);
> +int x265_count_nonzero_32_neon(const int16_t* quantCoeff);
>  #endif // ifndef X265_I386_PIXEL_ARM_H
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20160316/728fec21/attachment.html>


More information about the x265-devel mailing list