[x265] [PATCH] arm: Implement count_nonzero ARM NEON
radhakrishnan at multicorewareinc.com
radhakrishnan at multicorewareinc.com
Tue Mar 15 13:12:04 CET 2016
# HG changeset patch
# User Radhakrishnan VR <radhakrishnan at multicorewareinc.com>
# Date 1458043815 -19800
# Tue Mar 15 17:40:15 2016 +0530
# Node ID e5859c0bbdd9a5b12ce3a523b3857641bda457ea
# Parent 4a2f94a592511afabd434fc6cf02a469b6d65091
arm: Implement count_nonzero ARM NEON
diff -r 4a2f94a59251 -r e5859c0bbdd9 source/common/arm/asm-primitives.cpp
--- a/source/common/arm/asm-primitives.cpp Wed Mar 09 14:34:06 2016 +0530
+++ b/source/common/arm/asm-primitives.cpp Tue Mar 15 17:40:15 2016 +0530
@@ -43,6 +43,12 @@
{
if (cpuMask & X265_CPU_NEON)
{
+ // count nonzero
+ p.cu[BLOCK_4x4].count_nonzero = PFX(count_nonzero_4_neon);
+ p.cu[BLOCK_8x8].count_nonzero = PFX(count_nonzero_8_neon);
+ p.cu[BLOCK_16x16].count_nonzero = PFX(count_nonzero_16_neon);
+ p.cu[BLOCK_32x32].count_nonzero = PFX(count_nonzero_32_neon);
+
//scale2D_64to32
p.scale2D_64to32 = PFX(scale2D_64to32_neon);
diff -r 4a2f94a59251 -r e5859c0bbdd9 source/common/arm/blockcopy8.S
--- a/source/common/arm/blockcopy8.S Wed Mar 09 14:34:06 2016 +0530
+++ b/source/common/arm/blockcopy8.S Tue Mar 15 17:40:15 2016 +0530
@@ -457,3 +457,92 @@
rsb r0, r12, #1024
bx lr
endfunc
+
+// C prototype: int x265_count_nonzero_4_neon(const int16_t* quantCoeff)
+// In: r0 = quantCoeff (16 coefficients read).  Out: r0 = number of nonzero coefficients.
+// Zero lanes are tallied first; the result is 16 minus that tally.
+function x265_count_nonzero_4_neon
+    vmov.i16        d2, #0                  // d2 = per-lane tally of zero coefficients
+.rept 2
+    vld1.16         {d0, d1}, [r0]!         // fetch 8 coefficients, advance r0 by 16 bytes
+    vclz.i16        q0, q0                  // clz is 16 only for a lane equal to 0
+    vshr.u16        q0, q0, #4              // 16 >> 4 = 1 (zero lane), 0..15 >> 4 = 0
+    vadd.i16        d0, d0, d1
+    vadd.i16        d2, d2, d0
+.endr
+    vpadd.i16       d2, d2, d2              // pairwise fold: 4 lanes -> 2 partial sums
+    vpadd.i16       d2, d2, d2              // lane 0 = total zero count
+    vmov.u16        r1, d2[0]
+    rsb             r0, r1, #16             // nonzero = 16 - zeros
+    bx              lr
+endfunc
+
+// int x265_count_nonzero_8_neon(const int16_t* quantCoeff): r0 = pointer in, nonzero count of 64 coefficients out.
+function x265_count_nonzero_8_neon
+    vmov.i16        q2, #0                  // q2 = per-lane tally of zero coefficients
+.rept 4
+    vld1.16         {q0, q1}, [r0]!         // fetch 16 coefficients, advance r0 by 32 bytes
+    vclz.i16        q0, q0                  // clz is 16 only for a lane equal to 0
+    vclz.i16        q1, q1
+    vshr.u16        q0, q0, #4              // 16 >> 4 = 1 (zero lane), 0..15 >> 4 = 0
+    vshr.u16        q1, q1, #4
+    vadd.i16        q0, q0, q1
+    vadd.i16        q2, q2, q0
+.endr
+    vadd.i16        d4, d4, d5              // fold the two halves of q2
+    vpadd.i16       d4, d4, d4
+    vpadd.i16       d4, d4, d4              // lane 0 = total zero count
+    vmov.u16        r1, d4[0]
+    rsb             r0, r1, #64             // nonzero = 64 - zeros
+    bx              lr
+endfunc
+
+// int x265_count_nonzero_16_neon(const int16_t* quantCoeff): r0 = pointer in, nonzero count of 256 coefficients out.
+function x265_count_nonzero_16_neon
+    vmov.i16        q3, #0                  // q3 = per-lane tally of zero coefficients
+.rept 16
+    vld1.16         {q0, q1}, [r0]!         // fetch 16 coefficients, advance r0 by 32 bytes
+    vclz.i16        q0, q0                  // clz is 16 only for a lane equal to 0
+    vclz.i16        q1, q1
+    vshr.u16        q0, q0, #4              // 16 >> 4 = 1 (zero lane), 0..15 >> 4 = 0
+    vshr.u16        q1, q1, #4
+    vadd.i16        q0, q0, q1
+    vadd.i16        q3, q3, q0              // each lane grows by at most 2 per pass (max 32 total)
+.endr
+    vadd.i16        d6, d6, d7              // fold the two halves of q3
+    vpadd.i16       d6, d6, d6
+    vpadd.i16       d6, d6, d6              // lane 0 = total zero count
+    vmov.u16        r1, d6[0]
+    rsb             r0, r1, #256            // nonzero = 256 - zeros
+    bx              lr
+endfunc
+
+// C prototype: int x265_count_nonzero_32_neon(const int16_t* quantCoeff)
+// In: r0 = quantCoeff (1024 coefficients read).  Out: r0 = number of nonzero coefficients.
+// Each u16 lane of q8 gains at most 4 per pass, so 32 passes cannot overflow it.
+function x265_count_nonzero_32_neon
+    vmov.i16        q8, #0                  // q8 = per-lane tally of zero coefficients
+.rept 32
+    vld1.16         {q0, q1}, [r0]!         // 32 coefficients per pass, loaded as
+    vld1.16         {q2, q3}, [r0]!         // two 32-byte chunks
+    vclz.i16        q0, q0                  // clz is 16 only for a lane equal to 0
+    vclz.i16        q1, q1
+    vclz.i16        q2, q2
+    vclz.i16        q3, q3
+    vshr.u16        q0, q0, #4              // 16 >> 4 = 1 (zero lane), 0..15 >> 4 = 0
+    vshr.u16        q1, q1, #4
+    vshr.u16        q2, q2, #4
+    vshr.u16        q3, q3, #4
+    vadd.i16        q0, q0, q1
+    vadd.i16        q2, q2, q3
+    vadd.i16        q0, q0, q2
+    vadd.i16        q8, q8, q0
+.endr
+    vadd.i16        d16, d16, d17           // fold the two halves of q8
+    vpadd.i16       d16, d16, d16
+    vpadd.i16       d16, d16, d16           // lane 0 = total zero count
+
+    vmov.u16        r1, d16[0]
+    rsb             r0, r1, #1024           // nonzero = 1024 - zeros
+    bx              lr
+endfunc
diff -r 4a2f94a59251 -r e5859c0bbdd9 source/common/arm/blockcopy8.h
--- a/source/common/arm/blockcopy8.h Wed Mar 09 14:34:06 2016 +0530
+++ b/source/common/arm/blockcopy8.h Tue Mar 15 17:40:15 2016 +0530
@@ -84,4 +84,9 @@
uint32_t x265_copy_cnt_8_neon(int16_t* coeff, const int16_t* residual, intptr_t resiStride);
uint32_t x265_copy_cnt_16_neon(int16_t* coeff, const int16_t* residual, intptr_t resiStride);
uint32_t x265_copy_cnt_32_neon(int16_t* coeff, const int16_t* residual, intptr_t resiStride);
+
+int x265_count_nonzero_4_neon(const int16_t* quantCoeff); // 4x4 block: returns how many of the 16 coefficients are nonzero
+int x265_count_nonzero_8_neon(const int16_t* quantCoeff); // 8x8 block: returns how many of the 64 coefficients are nonzero
+int x265_count_nonzero_16_neon(const int16_t* quantCoeff); // 16x16 block: returns how many of the 256 coefficients are nonzero
+int x265_count_nonzero_32_neon(const int16_t* quantCoeff); // 32x32 block: returns how many of the 1024 coefficients are nonzero
#endif // ifndef X265_I386_PIXEL_ARM_H
More information about the x265-devel
mailing list