<div dir="ltr">The smoke test setup is now complete on the new ARM board. The smoke test passes, so you can push this patch.<div class="gmail_extra"><br><div class="gmail_quote"><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><div class="HOEnZb"><div class="h5"><div class="gmail_extra"><div class="gmail_quote"><br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"># HG changeset patch<br>
# User Radhakrishnan VR <<a href="mailto:radhakrishnan@multicorewareinc.com" target="_blank">radhakrishnan@multicorewareinc.com</a>><br>
# Date 1458043815 -19800<br>
# Tue Mar 15 17:40:15 2016 +0530<br>
# Node ID e5859c0bbdd9a5b12ce3a523b3857641bda457ea<br>
# Parent 4a2f94a592511afabd434fc6cf02a469b6d65091<br>
arm: Implement count_nonzero ARM NEON<br>
<br>
diff -r 4a2f94a59251 -r e5859c0bbdd9 source/common/arm/asm-primitives.cpp<br>
--- a/source/common/arm/asm-primitives.cpp Wed Mar 09 14:34:06 2016 +0530<br>
+++ b/source/common/arm/asm-primitives.cpp Tue Mar 15 17:40:15 2016 +0530<br>
@@ -43,6 +43,12 @@<br>
{<br>
if (cpuMask & X265_CPU_NEON)<br>
{<br>
+ // count nonzero<br>
+ p.cu[BLOCK_4x4].count_nonzero = PFX(count_nonzero_4_neon);<br>
+ p.cu[BLOCK_8x8].count_nonzero = PFX(count_nonzero_8_neon);<br>
+ p.cu[BLOCK_16x16].count_nonzero = PFX(count_nonzero_16_neon);<br>
+ p.cu[BLOCK_32x32].count_nonzero = PFX(count_nonzero_32_neon);<br>
+<br>
//scale2D_64to32<br>
p.scale2D_64to32 = PFX(scale2D_64to32_neon);<br>
<br>
diff -r 4a2f94a59251 -r e5859c0bbdd9 source/common/arm/blockcopy8.S<br>
--- a/source/common/arm/blockcopy8.S Wed Mar 09 14:34:06 2016 +0530<br>
+++ b/source/common/arm/blockcopy8.S Tue Mar 15 17:40:15 2016 +0530<br>
@@ -457,3 +457,92 @@<br>
rsb r0, r12, #1024<br>
bx lr<br>
endfunc<br>
+<br>
+// int count_nonzero_c(const int16_t* quantCoeff)<br>
+function x265_count_nonzero_4_neon<br>
+ veor d4, d4<br>
+.rept 2<br>
+ vld1.s16 {d0}, [r0]!<br>
+ vld1.s16 {d1}, [r0]!<br>
+ vclz.i16 d2, d0<br>
+ vclz.i16 d3, d1<br>
+ vshr.u16 q1, #4<br>
+ vadd.u16 d2, d3<br>
+ vadd.u16 d4, d2<br>
+.endr<br>
+ vpadd.u16 d4, d4<br>
+ vpadd.u16 d4, d4<br>
+ vmov.u16 r12, d4[0]<br>
+ rsb r0, r12, #16<br>
+ bx lr<br>
+endfunc<br>
+<br>
+function x265_count_nonzero_8_neon<br>
+ veor q8, q8<br>
+.rept 4<br>
+ vld1.s16 {q0}, [r0]!<br>
+ vld1.s16 {q1}, [r0]!<br>
+ vclz.i16 q2, q0<br>
+ vclz.i16 q3, q1<br>
+ vshr.u16 q2, #4<br>
+ vshr.u16 q3, #4<br>
+ vadd.u16 q2, q3<br>
+ vadd.u16 q8, q2<br>
+.endr<br>
+ vadd.u16 d16, d17<br>
+ vpadd.u16 d16, d16<br>
+ vpadd.u16 d16, d16<br>
+ vmov.u16 r12, d16[0]<br>
+ rsb r0, r12, #64<br>
+ bx lr<br>
+endfunc<br>
+<br>
+function x265_count_nonzero_16_neon<br>
+ veor q2, q2<br>
+.rept 16<br>
+ vld1.s16 {q0, q1}, [r0]!<br>
+ vclz.i16 q8, q0<br>
+ vclz.i16 q9, q1<br>
+ vshr.u16 q8, #4<br>
+ vshr.u16 q9, #4<br>
+ vadd.u16 q8, q9<br>
+ vadd.u16 q2, q8<br>
+.endr<br>
+ vadd.u16 d4, d5<br>
+ vpadd.u16 d4, d4<br>
+ vpadd.u16 d4, d4<br>
+<br>
+ vmov.u16 r12, d4[0]<br>
+ rsb r0, r12, #256<br>
+ bx lr<br>
+endfunc<br>
+<br>
+function x265_count_nonzero_32_neon<br>
+ veor q12, q12<br>
+.rept 32<br>
+ vld1.s16 {q0, q1}, [r0]!<br>
+ vld1.s16 {q2, q3}, [r0]!<br>
+<br>
+ vclz.i16 q8, q0<br>
+ vclz.i16 q9, q1<br>
+ vclz.i16 q10, q2<br>
+ vclz.i16 q11, q3<br>
+<br>
+ vshr.u16 q8, #4<br>
+ vshr.u16 q9, #4<br>
+ vshr.u16 q10, #4<br>
+ vshr.u16 q11, #4<br>
+<br>
+ vadd.u16 q8, q9<br>
+ vadd.u16 q10, q11<br>
+ vadd.u16 q8, q10<br>
+ vadd.u16 q12, q8<br>
+.endr<br>
+ vadd.u16 d24, d25<br>
+ vpadd.u16 d24, d24<br>
+ vpadd.u16 d24, d24<br>
+<br>
+ vmov.u16 r12, d24[0]<br>
+ rsb r0, r12, #1024<br>
+ bx lr<br>
+endfunc<br>
diff -r 4a2f94a59251 -r e5859c0bbdd9 source/common/arm/blockcopy8.h<br>
--- a/source/common/arm/blockcopy8.h Wed Mar 09 14:34:06 2016 +0530<br>
+++ b/source/common/arm/blockcopy8.h Tue Mar 15 17:40:15 2016 +0530<br>
@@ -84,4 +84,9 @@<br>
uint32_t x265_copy_cnt_8_neon(int16_t* coeff, const int16_t* residual, intptr_t resiStride);<br>
uint32_t x265_copy_cnt_16_neon(int16_t* coeff, const int16_t* residual, intptr_t resiStride);<br>
uint32_t x265_copy_cnt_32_neon(int16_t* coeff, const int16_t* residual, intptr_t resiStride);<br>
+<br>
+int x265_count_nonzero_4_neon(const int16_t* quantCoeff);<br>
+int x265_count_nonzero_8_neon(const int16_t* quantCoeff);<br>
+int x265_count_nonzero_16_neon(const int16_t* quantCoeff);<br>
+int x265_count_nonzero_32_neon(const int16_t* quantCoeff);<br>
#endif // ifndef X265_I386_PIXEL_ARM_H<br>
</blockquote></div><br></div>
</div></div></blockquote></div><br></div></div>