<div dir="ltr">Please put this patch on hold. The smoke test has yet to be set up on our new ARM board. I will reply on this thread as soon as I finish running the smoke test for this patch. </div><div class="gmail_extra"><br><div class="gmail_quote">On Tue, Mar 15, 2016 at 5:42 PM,  <span dir="ltr"><<a href="mailto:radhakrishnan@multicorewareinc.com" target="_blank">radhakrishnan@multicorewareinc.com</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"># HG changeset patch<br>
# User Radhakrishnan VR <<a href="mailto:radhakrishnan@multicorewareinc.com">radhakrishnan@multicorewareinc.com</a>><br>
# Date 1458043815 -19800<br>
#      Tue Mar 15 17:40:15 2016 +0530<br>
# Node ID e5859c0bbdd9a5b12ce3a523b3857641bda457ea<br>
# Parent  4a2f94a592511afabd434fc6cf02a469b6d65091<br>
arm: Implement count_nonzero ARM NEON<br>
<br>
diff -r 4a2f94a59251 -r e5859c0bbdd9 source/common/arm/asm-primitives.cpp<br>
--- a/source/common/arm/asm-primitives.cpp      Wed Mar 09 14:34:06 2016 +0530<br>
+++ b/source/common/arm/asm-primitives.cpp      Tue Mar 15 17:40:15 2016 +0530<br>
@@ -43,6 +43,12 @@<br>
 {<br>
     if (cpuMask & X265_CPU_NEON)<br>
     {<br>
+        // count nonzero<br>
+        <a href="http://p.cu" rel="noreferrer" target="_blank">p.cu</a>[BLOCK_4x4].count_nonzero     = PFX(count_nonzero_4_neon);<br>
+        <a href="http://p.cu" rel="noreferrer" target="_blank">p.cu</a>[BLOCK_8x8].count_nonzero     = PFX(count_nonzero_8_neon);<br>
+        <a href="http://p.cu" rel="noreferrer" target="_blank">p.cu</a>[BLOCK_16x16].count_nonzero   = PFX(count_nonzero_16_neon);<br>
+        <a href="http://p.cu" rel="noreferrer" target="_blank">p.cu</a>[BLOCK_32x32].count_nonzero   = PFX(count_nonzero_32_neon);<br>
+<br>
         //scale2D_64to32<br>
         p.scale2D_64to32  = PFX(scale2D_64to32_neon);<br>
<br>
diff -r 4a2f94a59251 -r e5859c0bbdd9 source/common/arm/blockcopy8.S<br>
--- a/source/common/arm/blockcopy8.S    Wed Mar 09 14:34:06 2016 +0530<br>
+++ b/source/common/arm/blockcopy8.S    Tue Mar 15 17:40:15 2016 +0530<br>
@@ -457,3 +457,92 @@<br>
     rsb             r0, r12, #1024<br>
     bx              lr<br>
 endfunc<br>
+<br>
+// int  count_nonzero_c(const int16_t* quantCoeff)<br>
+function x265_count_nonzero_4_neon<br>
+    veor            d4, d4<br>
+.rept 2<br>
+    vld1.s16        {d0}, [r0]!<br>
+    vld1.s16        {d1}, [r0]!<br>
+    vclz.i16        d2, d0<br>
+    vclz.i16        d3, d1<br>
+    vshr.u16        q1, #4<br>
+    vadd.u16        d2, d3<br>
+    vadd.u16        d4, d2<br>
+.endr<br>
+    vpadd.u16       d4, d4<br>
+    vpadd.u16       d4, d4<br>
+    vmov.u16        r12, d4[0]<br>
+    rsb             r0, r12, #16<br>
+    bx              lr<br>
+endfunc<br>
+<br>
+function x265_count_nonzero_8_neon<br>
+    veor            q8, q8<br>
+.rept 4<br>
+    vld1.s16        {q0}, [r0]!<br>
+    vld1.s16        {q1}, [r0]!<br>
+    vclz.i16        q2, q0<br>
+    vclz.i16        q3, q1<br>
+    vshr.u16        q2, #4<br>
+    vshr.u16        q3, #4<br>
+    vadd.u16        q2, q3<br>
+    vadd.u16        q8, q2<br>
+.endr<br>
+    vadd.u16        d16, d17<br>
+    vpadd.u16       d16, d16<br>
+    vpadd.u16       d16, d16<br>
+    vmov.u16        r12, d16[0]<br>
+    rsb             r0, r12, #64<br>
+    bx              lr<br>
+endfunc<br>
+<br>
+function x265_count_nonzero_16_neon<br>
+    veor            q2, q2<br>
+.rept 16<br>
+    vld1.s16        {q0, q1}, [r0]!<br>
+    vclz.i16        q8, q0<br>
+    vclz.i16        q9, q1<br>
+    vshr.u16        q8, #4<br>
+    vshr.u16        q9, #4<br>
+    vadd.u16        q8, q9<br>
+    vadd.u16        q2, q8<br>
+.endr<br>
+    vadd.u16        d4, d5<br>
+    vpadd.u16       d4, d4<br>
+    vpadd.u16       d4, d4<br>
+<br>
+    vmov.u16        r12, d4[0]<br>
+    rsb             r0, r12, #256<br>
+    bx              lr<br>
+endfunc<br>
+<br>
+function x265_count_nonzero_32_neon<br>
+    veor            q12, q12<br>
+.rept 32<br>
+    vld1.s16        {q0, q1}, [r0]!<br>
+    vld1.s16        {q2, q3}, [r0]!<br>
+<br>
+    vclz.i16        q8, q0<br>
+    vclz.i16        q9, q1<br>
+    vclz.i16        q10, q2<br>
+    vclz.i16        q11, q3<br>
+<br>
+    vshr.u16        q8, #4<br>
+    vshr.u16        q9, #4<br>
+    vshr.u16        q10, #4<br>
+    vshr.u16        q11, #4<br>
+<br>
+    vadd.u16        q8, q9<br>
+    vadd.u16        q10, q11<br>
+    vadd.u16        q8, q10<br>
+    vadd.u16        q12, q8<br>
+.endr<br>
+    vadd.u16        d24, d25<br>
+    vpadd.u16       d24, d24<br>
+    vpadd.u16       d24, d24<br>
+<br>
+    vmov.u16        r12, d24[0]<br>
+    rsb             r0, r12, #1024<br>
+    bx              lr<br>
+endfunc<br>
diff -r 4a2f94a59251 -r e5859c0bbdd9 source/common/arm/blockcopy8.h<br>
--- a/source/common/arm/blockcopy8.h    Wed Mar 09 14:34:06 2016 +0530<br>
+++ b/source/common/arm/blockcopy8.h    Tue Mar 15 17:40:15 2016 +0530<br>
@@ -84,4 +84,9 @@<br>
 uint32_t x265_copy_cnt_8_neon(int16_t* coeff, const int16_t* residual, intptr_t resiStride);<br>
 uint32_t x265_copy_cnt_16_neon(int16_t* coeff, const int16_t* residual, intptr_t resiStride);<br>
 uint32_t x265_copy_cnt_32_neon(int16_t* coeff, const int16_t* residual, intptr_t resiStride);<br>
+<br>
+int x265_count_nonzero_4_neon(const int16_t* quantCoeff);<br>
+int x265_count_nonzero_8_neon(const int16_t* quantCoeff);<br>
+int x265_count_nonzero_16_neon(const int16_t* quantCoeff);<br>
+int x265_count_nonzero_32_neon(const int16_t* quantCoeff);<br>
 #endif // ifndef X265_I386_PIXEL_ARM_H<br>
</blockquote></div><br></div>