[x265] [PATCH] arm: Implement count_nonzero ARM NEON
Radhakrishnan Venugopal Rajaganesan
radhakrishnan at multicorewareinc.com
Wed Mar 16 05:47:47 CET 2016
Please put this patch on hold. The smoke test has not yet been set up on our
new ARM board. I will reply on this thread as soon as I have finished running
the smoke test for this patch.
On Tue, Mar 15, 2016 at 5:42 PM, <radhakrishnan at multicorewareinc.com> wrote:
> # HG changeset patch
> # User Radhakrishnan VR <radhakrishnan at multicorewareinc.com>
> # Date 1458043815 -19800
> # Tue Mar 15 17:40:15 2016 +0530
> # Node ID e5859c0bbdd9a5b12ce3a523b3857641bda457ea
> # Parent 4a2f94a592511afabd434fc6cf02a469b6d65091
> arm: Implement count_nonzero ARM NEON
>
> diff -r 4a2f94a59251 -r e5859c0bbdd9 source/common/arm/asm-primitives.cpp
> --- a/source/common/arm/asm-primitives.cpp Wed Mar 09 14:34:06 2016 +0530
> +++ b/source/common/arm/asm-primitives.cpp Tue Mar 15 17:40:15 2016 +0530
> @@ -43,6 +43,12 @@
> {
> if (cpuMask & X265_CPU_NEON)
> {
> + // count nonzero
> + p.cu[BLOCK_4x4].count_nonzero = PFX(count_nonzero_4_neon);
> + p.cu[BLOCK_8x8].count_nonzero = PFX(count_nonzero_8_neon);
> + p.cu[BLOCK_16x16].count_nonzero = PFX(count_nonzero_16_neon);
> + p.cu[BLOCK_32x32].count_nonzero = PFX(count_nonzero_32_neon);
> +
> //scale2D_64to32
> p.scale2D_64to32 = PFX(scale2D_64to32_neon);
>
> diff -r 4a2f94a59251 -r e5859c0bbdd9 source/common/arm/blockcopy8.S
> --- a/source/common/arm/blockcopy8.S Wed Mar 09 14:34:06 2016 +0530
> +++ b/source/common/arm/blockcopy8.S Tue Mar 15 17:40:15 2016 +0530
> @@ -457,3 +457,92 @@
> rsb r0, r12, #1024
> bx lr
> endfunc
> +
> +// int count_nonzero_c(const int16_t* quantCoeff)
> +function x265_count_nonzero_4_neon
> + veor d4, d4
> +.rept 2
> + vld1.s16 {d0}, [r0]!
> + vld1.s16 {d1}, [r0]!
> + vclz.i16 d2, d0
> + vclz.i16 d3, d1
> + vshr.u16 q1, #4
> + vadd.u16 d2, d3
> + vadd.u16 d4, d2
> +.endr
> + vpadd.u16 d4, d4
> + vpadd.u16 d4, d4
> + vmov.u16 r12, d4[0]
> + rsb r0, r12, #16
> + bx lr
> +endfunc
> +
> +function x265_count_nonzero_8_neon
> + veor q8, q8
> +.rept 4
> + vld1.s16 {q0}, [r0]!
> + vld1.s16 {q1}, [r0]!
> + vclz.i16 q2, q0
> + vclz.i16 q3, q1
> + vshr.u16 q2, #4
> + vshr.u16 q3, #4
> + vadd.u16 q2, q3
> + vadd.u16 q8, q2
> +.endr
> + vadd.u16 d16, d17
> + vpadd.u16 d16, d16
> + vpadd.u16 d16, d16
> + vmov.u16 r12, d16[0]
> + rsb r0, r12, #64
> + bx lr
> +endfunc
> +
> +function x265_count_nonzero_16_neon
> + veor q2, q2
> +.rept 16
> + vld1.s16 {q0, q1}, [r0]!
> + vclz.i16 q8, q0
> + vclz.i16 q9, q1
> + vshr.u16 q8, #4
> + vshr.u16 q9, #4
> + vadd.u16 q8, q9
> + vadd.u16 q2, q8
> +.endr
> + vadd.u16 d4, d5
> + vpadd.u16 d4, d4
> + vpadd.u16 d4, d4
> +
> + vmov.u16 r12, d4[0]
> + rsb r0, r12, #256
> + bx lr
> +endfunc
> +
> +function x265_count_nonzero_32_neon
> + veor q12, q12
> +.rept 32
> + vld1.s16 {q0, q1}, [r0]!
> + vld1.s16 {q2, q3}, [r0]!
> +
> + vclz.i16 q8, q0
> + vclz.i16 q9, q1
> + vclz.i16 q10, q2
> + vclz.i16 q11, q3
> +
> + vshr.u16 q8, #4
> + vshr.u16 q9, #4
> + vshr.u16 q10, #4
> + vshr.u16 q11, #4
> +
> + vadd.u16 q8, q9
> + vadd.u16 q10, q11
> + vadd.u16 q8, q10
> + vadd.u16 q12, q8
> +.endr
> + vadd.u16 d24, d25
> + vpadd.u16 d24, d24
> + vpadd.u16 d24, d24
> +
> + vmov.u16 r12, d24[0]
> + rsb r0, r12, #1024
> + bx lr
> +endfunc
> diff -r 4a2f94a59251 -r e5859c0bbdd9 source/common/arm/blockcopy8.h
> --- a/source/common/arm/blockcopy8.h Wed Mar 09 14:34:06 2016 +0530
> +++ b/source/common/arm/blockcopy8.h Tue Mar 15 17:40:15 2016 +0530
> @@ -84,4 +84,9 @@
> uint32_t x265_copy_cnt_8_neon(int16_t* coeff, const int16_t* residual, intptr_t resiStride);
> uint32_t x265_copy_cnt_16_neon(int16_t* coeff, const int16_t* residual, intptr_t resiStride);
> uint32_t x265_copy_cnt_32_neon(int16_t* coeff, const int16_t* residual, intptr_t resiStride);
> +
> +int x265_count_nonzero_4_neon(const int16_t* quantCoeff);
> +int x265_count_nonzero_8_neon(const int16_t* quantCoeff);
> +int x265_count_nonzero_16_neon(const int16_t* quantCoeff);
> +int x265_count_nonzero_32_neon(const int16_t* quantCoeff);
> #endif // ifndef X265_I386_PIXEL_ARM_H
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20160316/728fec21/attachment.html>
More information about the x265-devel
mailing list