[x265] [PATCH] arm: Implement count_nonzero ARM NEON
Radhakrishnan Venugopal Rajaganesan
radhakrishnan at multicorewareinc.com
Wed Mar 16 11:54:22 CET 2016
The smoke-test setup on the new ARM board is complete, and the smoke test
now passes, so you can push this patch.
> # HG changeset patch
>> # User Radhakrishnan VR <radhakrishnan at multicorewareinc.com>
>> # Date 1458043815 -19800
>> # Tue Mar 15 17:40:15 2016 +0530
>> # Node ID e5859c0bbdd9a5b12ce3a523b3857641bda457ea
>> # Parent 4a2f94a592511afabd434fc6cf02a469b6d65091
>> arm: Implement count_nonzero ARM NEON
>>
>> diff -r 4a2f94a59251 -r e5859c0bbdd9 source/common/arm/asm-primitives.cpp
>> --- a/source/common/arm/asm-primitives.cpp Wed Mar 09 14:34:06 2016
>> +0530
>> +++ b/source/common/arm/asm-primitives.cpp Tue Mar 15 17:40:15 2016
>> +0530
>> @@ -43,6 +43,12 @@
>> {
>> if (cpuMask & X265_CPU_NEON)
>> {
>> + // count nonzero
>> + p.cu[BLOCK_4x4].count_nonzero = PFX(count_nonzero_4_neon);
>> + p.cu[BLOCK_8x8].count_nonzero = PFX(count_nonzero_8_neon);
>> + p.cu[BLOCK_16x16].count_nonzero = PFX(count_nonzero_16_neon);
>> + p.cu[BLOCK_32x32].count_nonzero = PFX(count_nonzero_32_neon);
>> +
>> //scale2D_64to32
>> p.scale2D_64to32 = PFX(scale2D_64to32_neon);
>>
>> diff -r 4a2f94a59251 -r e5859c0bbdd9 source/common/arm/blockcopy8.S
>> --- a/source/common/arm/blockcopy8.S Wed Mar 09 14:34:06 2016 +0530
>> +++ b/source/common/arm/blockcopy8.S Tue Mar 15 17:40:15 2016 +0530
>> @@ -457,3 +457,92 @@
>> rsb r0, r12, #1024
>> bx lr
>> endfunc
>> +
>> +// int count_nonzero_c(const int16_t* quantCoeff)
>> +function x265_count_nonzero_4_neon // r0 = quantCoeff; returns 16 minus the number of zero int16 lanes read
>> + veor d4, d4 // d4 = per-lane running count of zero coefficients
>> +.rept 2 // each pass reads 8 coefficients; 2 passes = 16
>> + vld1.s16 {d0}, [r0]! // load 4 coefficients, post-increment r0
>> + vld1.s16 {d1}, [r0]! // load the next 4
>> + vclz.i16 d2, d0 // leading-zero count per lane: 16 only when the lane is 0
>> + vclz.i16 d3, d1
>> + vshr.u16 q1, #4 // q1 = d2:d3; 16 >> 4 = 1, any smaller count >> 4 = 0
>> + vadd.u16 d2, d3
>> + vadd.u16 d4, d2 // accumulate the zero flags
>> +.endr
>> + vpadd.u16 d4, d4 // two pairwise adds fold the 4 lanes into lane 0
>> + vpadd.u16 d4, d4
>> + vmov.u16 r12, d4[0] // r12 = total zero count
>> + rsb r0, r12, #16 // r0 = 16 - zeros = nonzero count
>> + bx lr
>> +endfunc
>> +
>> +function x265_count_nonzero_8_neon // r0 = quantCoeff; returns 64 minus the number of zero int16 lanes read
>> + veor q8, q8 // q8 = per-lane running count of zero coefficients
>> +.rept 4 // each pass reads 16 coefficients; 4 passes = 64
>> + vld1.s16 {q0}, [r0]! // load 8 coefficients, post-increment r0
>> + vld1.s16 {q1}, [r0]! // load the next 8
>> + vclz.i16 q2, q0 // leading-zero count per lane: 16 only when the lane is 0
>> + vclz.i16 q3, q1
>> + vshr.u16 q2, #4 // 16 >> 4 = 1, any smaller count >> 4 = 0
>> + vshr.u16 q3, #4
>> + vadd.u16 q2, q3
>> + vadd.u16 q8, q2 // accumulate the zero flags
>> +.endr
>> + vadd.u16 d16, d17 // fold the high half of q8 into the low half
>> + vpadd.u16 d16, d16 // two pairwise adds fold the 4 lanes into lane 0
>> + vpadd.u16 d16, d16
>> + vmov.u16 r12, d16[0] // r12 = total zero count
>> + rsb r0, r12, #64 // r0 = 64 - zeros = nonzero count
>> + bx lr
>> +endfunc
>> +
>> +function x265_count_nonzero_16_neon // r0 = quantCoeff; returns 256 minus the number of zero int16 lanes read
>> + veor q2, q2 // q2 = per-lane running count of zero coefficients
>> +.rept 16 // each pass reads 16 coefficients; 16 passes = 256
>> + vld1.s16 {q0, q1}, [r0]! // load 16 coefficients, post-increment r0
>> + vclz.i16 q8, q0 // leading-zero count per lane: 16 only when the lane is 0
>> + vclz.i16 q9, q1
>> + vshr.u16 q8, #4 // 16 >> 4 = 1, any smaller count >> 4 = 0
>> + vshr.u16 q9, #4
>> + vadd.u16 q8, q9
>> + vadd.u16 q2, q8 // accumulate the zero flags
>> +.endr
>> + vadd.u16 d4, d5 // fold the high half of q2 into the low half
>> + vpadd.u16 d4, d4 // two pairwise adds fold the 4 lanes into lane 0
>> + vpadd.u16 d4, d4
>> +
>> + vmov.u16 r12, d4[0] // r12 = total zero count
>> + rsb r0, r12, #256 // r0 = 256 - zeros = nonzero count
>> + bx lr
>> +endfunc
>> +
>> +function x265_count_nonzero_32_neon // r0 = quantCoeff; returns 1024 minus the number of zero int16 lanes read
>> + veor q12, q12 // q12 = per-lane running count of zero coefficients
>> +.rept 32 // each pass reads 32 coefficients; 32 passes = 1024
>> + vld1.s16 {q0, q1}, [r0]! // load 16 coefficients, post-increment r0
>> + vld1.s16 {q2, q3}, [r0]! // load the next 16
>> +
>> + vclz.i16 q8, q0 // leading-zero count per lane: 16 only when the lane is 0
>> + vclz.i16 q9, q1
>> + vclz.i16 q10, q2
>> + vclz.i16 q11, q3
>> +
>> + vshr.u16 q8, #4 // 16 >> 4 = 1, any smaller count >> 4 = 0
>> + vshr.u16 q9, #4
>> + vshr.u16 q10, #4
>> + vshr.u16 q11, #4
>> +
>> + vadd.u16 q8, q9 // combine the four flag vectors before accumulating
>> + vadd.u16 q10, q11
>> + vadd.u16 q8, q10
>> + vadd.u16 q12, q8 // accumulate the zero flags
>> +.endr
>> + vadd.u16 d24, d25 // fold the high half of q12 into the low half
>> + vpadd.u16 d24, d24 // two pairwise adds fold the 4 lanes into lane 0
>> + vpadd.u16 d24, d24
>> +
>> + vmov.u16 r12, d24[0] // r12 = total zero count
>> + rsb r0, r12, #1024 // r0 = 1024 - zeros = nonzero count
>> + bx lr
>> +endfunc
>> diff -r 4a2f94a59251 -r e5859c0bbdd9 source/common/arm/blockcopy8.h
>> --- a/source/common/arm/blockcopy8.h Wed Mar 09 14:34:06 2016 +0530
>> +++ b/source/common/arm/blockcopy8.h Tue Mar 15 17:40:15 2016 +0530
>> @@ -84,4 +84,9 @@
>> uint32_t x265_copy_cnt_8_neon(int16_t* coeff, const int16_t* residual,
>> intptr_t resiStride);
>> uint32_t x265_copy_cnt_16_neon(int16_t* coeff, const int16_t* residual,
>> intptr_t resiStride);
>> uint32_t x265_copy_cnt_32_neon(int16_t* coeff, const int16_t* residual,
>> intptr_t resiStride);
>> +
>> +int x265_count_nonzero_4_neon(const int16_t* quantCoeff);
>> +int x265_count_nonzero_8_neon(const int16_t* quantCoeff);
>> +int x265_count_nonzero_16_neon(const int16_t* quantCoeff);
>> +int x265_count_nonzero_32_neon(const int16_t* quantCoeff);
>> #endif // ifndef X265_I386_PIXEL_ARM_H
>>
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20160316/903c0945/attachment-0001.html>
More information about the x265-devel
mailing list