[x265] [PATCH] arm: Implement count_nonzero ARM NEON

Radhakrishnan Venugopal Rajaganesan radhakrishnan at multicorewareinc.com
Wed Mar 16 11:54:22 CET 2016


The smoke test setup is now done on the new ARM board. The smoke test has
passed, so you can push this patch now.


> # HG changeset patch
>> # User Radhakrishnan VR <radhakrishnan at multicorewareinc.com>
>> # Date 1458043815 -19800
>> #      Tue Mar 15 17:40:15 2016 +0530
>> # Node ID e5859c0bbdd9a5b12ce3a523b3857641bda457ea
>> # Parent  4a2f94a592511afabd434fc6cf02a469b6d65091
>> arm: Implement count_nonzero ARM NEON
>>
>> diff -r 4a2f94a59251 -r e5859c0bbdd9 source/common/arm/asm-primitives.cpp
>> --- a/source/common/arm/asm-primitives.cpp      Wed Mar 09 14:34:06 2016
>> +0530
>> +++ b/source/common/arm/asm-primitives.cpp      Tue Mar 15 17:40:15 2016
>> +0530
>> @@ -43,6 +43,12 @@
>>  {
>>      if (cpuMask & X265_CPU_NEON)
>>      {
>> +        // count nonzero
>> +        p.cu[BLOCK_4x4].count_nonzero     = PFX(count_nonzero_4_neon);
>> +        p.cu[BLOCK_8x8].count_nonzero     = PFX(count_nonzero_8_neon);
>> +        p.cu[BLOCK_16x16].count_nonzero   = PFX(count_nonzero_16_neon);
>> +        p.cu[BLOCK_32x32].count_nonzero   = PFX(count_nonzero_32_neon);
>> +
>>          //scale2D_64to32
>>          p.scale2D_64to32  = PFX(scale2D_64to32_neon);
>>
>> diff -r 4a2f94a59251 -r e5859c0bbdd9 source/common/arm/blockcopy8.S
>> --- a/source/common/arm/blockcopy8.S    Wed Mar 09 14:34:06 2016 +0530
>> +++ b/source/common/arm/blockcopy8.S    Tue Mar 15 17:40:15 2016 +0530
>> @@ -457,3 +457,92 @@
>>      rsb             r0, r12, #1024
>>      bx              lr
>>  endfunc
>> +
>> +// int  count_nonzero_c(const int16_t* quantCoeff) -- r0 = quantCoeff; this variant reads 16 coefficients and returns the nonzero count in r0
>> +function x265_count_nonzero_4_neon
>> +    veor            d4, d4                  // d4 = 0: per-lane running count of ZERO coefficients
>> +.rept 2                                     // 2 passes x 8 coefficients = 16 coefficients read
>> +    vld1.s16        {d0}, [r0]!             // load 4 coefficients, post-increment r0
>> +    vld1.s16        {d1}, [r0]!             // load the next 4 coefficients
>> +    vclz.i16        d2, d0                  // leading-zero count per 16-bit lane: 16 only when the lane is 0
>> +    vclz.i16        d3, d1
>> +    vshr.u16        q1, #4                  // q1 = d2:d3; (clz >> 4) is 1 for a zero lane, 0 otherwise
>> +    vadd.u16        d2, d3                  // merge the two zero-flag vectors
>> +    vadd.u16        d4, d2                  // accumulate into the running count
>> +.endr
>> +    vpadd.u16       d4, d4                  // pairwise add: 4 lanes -> 2 partial sums
>> +    vpadd.u16       d4, d4                  // pairwise add again: lane 0 = total number of zeros
>> +    vmov.u16        r12, d4[0]              // r12 = total number of zero coefficients
>> +    rsb             r0, r12, #16            // r0 = 16 - zeros = number of nonzero coefficients
>> +    bx              lr                      // return with the result in r0
>> +endfunc
>> +
>> +function x265_count_nonzero_8_neon          // r0 = pointer to 64 int16 coefficients; returns the nonzero count in r0
>> +    veor            q8, q8                  // q8 = 0: per-lane running count of ZERO coefficients
>> +.rept 4                                     // 4 passes x 16 coefficients = 64 coefficients read
>> +    vld1.s16        {q0}, [r0]!             // load 8 coefficients, post-increment r0
>> +    vld1.s16        {q1}, [r0]!             // load the next 8 coefficients
>> +    vclz.i16        q2, q0                  // leading-zero count per 16-bit lane: 16 only when the lane is 0
>> +    vclz.i16        q3, q1
>> +    vshr.u16        q2, #4                  // (clz >> 4) is 1 for a zero lane, 0 otherwise
>> +    vshr.u16        q3, #4
>> +    vadd.u16        q2, q3                  // merge the two zero-flag vectors
>> +    vadd.u16        q8, q2                  // accumulate into the running count
>> +.endr
>> +    vadd.u16        d16, d17                // fold the high half of q8 into the low half
>> +    vpadd.u16       d16, d16                // pairwise add: 4 lanes -> 2 partial sums
>> +    vpadd.u16       d16, d16                // pairwise add again: lane 0 = total number of zeros
>> +    vmov.u16        r12, d16[0]             // r12 = total number of zero coefficients
>> +    rsb             r0, r12, #64            // r0 = 64 - zeros = number of nonzero coefficients
>> +    bx              lr                      // return with the result in r0
>> +endfunc
>> +
>> +function x265_count_nonzero_16_neon         // r0 = pointer to 256 int16 coefficients; returns the nonzero count in r0
>> +    veor            q2, q2                  // q2 = 0: per-lane running count of ZERO coefficients
>> +.rept 16                                    // 16 passes x 16 coefficients = 256 coefficients read
>> +    vld1.s16        {q0, q1}, [r0]!         // load 16 coefficients, post-increment r0
>> +    vclz.i16        q8, q0                  // leading-zero count per 16-bit lane: 16 only when the lane is 0
>> +    vclz.i16        q9, q1
>> +    vshr.u16        q8, #4                  // (clz >> 4) is 1 for a zero lane, 0 otherwise
>> +    vshr.u16        q9, #4
>> +    vadd.u16        q8, q9                  // merge the two zero-flag vectors
>> +    vadd.u16        q2, q8                  // accumulate; each lane grows by at most 2 per pass (max 32 total)
>> +.endr
>> +    vadd.u16        d4, d5                  // fold the high half of q2 into the low half
>> +    vpadd.u16       d4, d4                  // pairwise add: 4 lanes -> 2 partial sums
>> +    vpadd.u16       d4, d4                  // pairwise add again: lane 0 = total number of zeros
>> +
>> +    vmov.u16        r12, d4[0]              // r12 = total number of zero coefficients
>> +    rsb             r0, r12, #256           // r0 = 256 - zeros = number of nonzero coefficients
>> +    bx              lr                      // return with the result in r0
>> +endfunc
>> +
>> +function x265_count_nonzero_32_neon         // r0 = pointer to 1024 int16 coefficients; returns the nonzero count in r0
>> +    veor            q12, q12                // q12 = 0: per-lane running count of ZERO coefficients
>> +.rept 32                                    // 32 passes x 32 coefficients = 1024 coefficients read
>> +    vld1.s16        {q0, q1}, [r0]!         // load 16 coefficients, post-increment r0
>> +    vld1.s16        {q2, q3}, [r0]!         // load the next 16 coefficients
>> +
>> +    vclz.i16        q8, q0                  // leading-zero count per 16-bit lane: 16 only when the lane is 0
>> +    vclz.i16        q9, q1
>> +    vclz.i16        q10, q2
>> +    vclz.i16        q11, q3
>> +
>> +    vshr.u16        q8, #4                  // (clz >> 4) is 1 for a zero lane, 0 otherwise
>> +    vshr.u16        q9, #4
>> +    vshr.u16        q10, #4
>> +    vshr.u16        q11, #4
>> +
>> +    vadd.u16        q8, q9                  // reduce the four zero-flag vectors to one
>> +    vadd.u16        q10, q11
>> +    vadd.u16        q8, q10
>> +    vadd.u16        q12, q8                 // accumulate; each lane grows by at most 4 per pass (max 128 total)
>> +.endr
>> +    vadd.u16        d24, d25                // fold the high half of q12 into the low half
>> +    vpadd.u16       d24, d24                // pairwise add: 4 lanes -> 2 partial sums
>> +    vpadd.u16       d24, d24                // pairwise add again: lane 0 = total number of zeros
>> +
>> +    vmov.u16        r12, d24[0]             // r12 = total number of zero coefficients
>> +    rsb             r0, r12, #1024          // r0 = 1024 - zeros = number of nonzero coefficients
>> +    bx              lr                      // return with the result in r0
>> +endfunc
>> diff -r 4a2f94a59251 -r e5859c0bbdd9 source/common/arm/blockcopy8.h
>> --- a/source/common/arm/blockcopy8.h    Wed Mar 09 14:34:06 2016 +0530
>> +++ b/source/common/arm/blockcopy8.h    Tue Mar 15 17:40:15 2016 +0530
>> @@ -84,4 +84,9 @@
>>  uint32_t x265_copy_cnt_8_neon(int16_t* coeff, const int16_t* residual,
>> intptr_t resiStride);
>>  uint32_t x265_copy_cnt_16_neon(int16_t* coeff, const int16_t* residual,
>> intptr_t resiStride);
>>  uint32_t x265_copy_cnt_32_neon(int16_t* coeff, const int16_t* residual,
>> intptr_t resiStride);
>> +
>> +int x265_count_nonzero_4_neon(const int16_t* quantCoeff);
>> +int x265_count_nonzero_8_neon(const int16_t* quantCoeff);
>> +int x265_count_nonzero_16_neon(const int16_t* quantCoeff);
>> +int x265_count_nonzero_32_neon(const int16_t* quantCoeff);
>>  #endif // ifndef X265_I386_PIXEL_ARM_H
>>
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20160316/903c0945/attachment-0001.html>


More information about the x265-devel mailing list