[x265] [PATCH] arm: Implement blockfill_s_neon ARM NEON
radhakrishnan at multicorewareinc.com
radhakrishnan at multicorewareinc.com
Tue Mar 1 09:50:49 CET 2016
# HG changeset patch
# User Radhakrishnan VR <radhakrishnan at multicorewareinc.com>
# Date 1456814898 -19800
# Tue Mar 01 12:18:18 2016 +0530
# Node ID 79c00b9bc2b81afef2e41526fc3c390528f3174c
# Parent 291beccb67606494a9a144ca2cc4411ab3e21e50
arm: Implement blockfill_s_neon ARM NEON
diff -r 291beccb6760 -r 79c00b9bc2b8 source/common/arm/asm-primitives.cpp
--- a/source/common/arm/asm-primitives.cpp Fri Feb 26 16:23:56 2016 +0530
+++ b/source/common/arm/asm-primitives.cpp Tue Mar 01 12:18:18 2016 +0530
@@ -42,6 +42,12 @@
{
if (cpuMask & X265_CPU_NEON)
{
+ // Block_fill
+ p.cu[BLOCK_4x4].blockfill_s = PFX(blockfill_s_4x4_neon);
+ p.cu[BLOCK_8x8].blockfill_s = PFX(blockfill_s_8x8_neon);
+ p.cu[BLOCK_16x16].blockfill_s = PFX(blockfill_s_16x16_neon);
+ p.cu[BLOCK_32x32].blockfill_s = PFX(blockfill_s_32x32_neon);
+
// Blockcopy_ss
p.cu[BLOCK_4x4].copy_ss = PFX(blockcopy_ss_4x4_neon);
p.cu[BLOCK_8x8].copy_ss = PFX(blockcopy_ss_8x8_neon);
diff -r 291beccb6760 -r 79c00b9bc2b8 source/common/arm/blockcopy8.S
--- a/source/common/arm/blockcopy8.S Fri Feb 26 16:23:56 2016 +0530
+++ b/source/common/arm/blockcopy8.S Tue Mar 01 12:18:18 2016 +0530
@@ -311,3 +311,52 @@
bne loop_css64
bx lr
endfunc
+
+// void x265_blockfill_s_NxN_neon(int16_t* dst, intptr_t dstride, int16_t val), NxN = 4x4, 8x8, 16x16, 32x32
+function x265_blockfill_s_4x4_neon
+    // In: r0 = dst, r1 = dstride (int16_t elements), r2 = val.
+    // Writes 4 rows of 4 halfwords each; clobbers d0, r0 and r1.
+    vdup.u16        d0, r2              // splat val once; every row stores the same vector
+    lsl             r1, #1              // stride: int16_t elements -> bytes
+.rept 4
+    vst1.16         {d0}, [r0], r1      // store one 8-byte row, then dst += stride
+.endr
+    bx              lr
+endfunc
+
+function x265_blockfill_s_8x8_neon
+    // In: r0 = dst, r1 = dstride (int16_t elements), r2 = val.
+    // Writes 8 rows of 8 halfwords each; clobbers q0, r0 and r1.
+    vdup.u16        q0, r2              // splat val once; every row stores the same vector
+    lsl             r1, #1              // stride: int16_t elements -> bytes
+.rept 8
+    vst1.16         {q0}, [r0], r1      // store one 16-byte row, then dst += stride
+.endr
+    bx              lr
+endfunc
+
+function x265_blockfill_s_16x16_neon
+    vdup.u16        q0, r2              // q0:q1 = one 16-halfword row of val (r2), built once; r0 = dst
+    vdup.u16        q1, r2
+    lsl             r1, #1              // r1 = dstride: int16_t elements -> bytes
+.rept 16                                // 16 rows of 32 bytes each
+    vst1.16         {q0, q1}, [r0], r1  // store one row, then dst += stride
+.endr
+    bx              lr
+endfunc
+
+function x265_blockfill_s_32x32_neon
+    vdup.u16        q0, r2              // q0:q1 = 16 halfwords of val (r2), built once before the loop; r0 = dst
+    vdup.u16        q1, r2
+    lsl             r1, #1              // r1 = dstride: int16_t elements -> bytes
+    sub             r1, #32             // the first half-row store already advances dst by 32 bytes
+    mov             r12, #4             // 4 passes x 8 unrolled rows = 32 rows
+loop_fill32:
+    subs            r12, #1             // flags stay valid until bne: the NEON stores below do not write them
+.rept 8
+    vst1.16         {q0, q1}, [r0]!     // left 16 halfwords of the row, dst += 32
+    vst1.16         {q0, q1}, [r0], r1  // right 16 halfwords, dst += stride - 32
+.endr
+    bne             loop_fill32
+    bx              lr
+endfunc
diff -r 291beccb6760 -r 79c00b9bc2b8 source/common/arm/blockcopy8.h
--- a/source/common/arm/blockcopy8.h Fri Feb 26 16:23:56 2016 +0530
+++ b/source/common/arm/blockcopy8.h Tue Mar 01 12:18:18 2016 +0530
@@ -74,4 +74,9 @@
void x265_blockcopy_ss_16x16_neon(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
void x265_blockcopy_ss_32x32_neon(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
void x265_blockcopy_ss_64x64_neon(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+
+void x265_blockfill_s_4x4_neon(int16_t* dst, intptr_t dstride, int16_t val);   // fill a 4x4 block with val; dstride counted in int16_t elements
+void x265_blockfill_s_8x8_neon(int16_t* dst, intptr_t dstride, int16_t val);   // fill an 8x8 block with val; dstride counted in int16_t elements
+void x265_blockfill_s_16x16_neon(int16_t* dst, intptr_t dstride, int16_t val); // fill a 16x16 block with val; dstride counted in int16_t elements
+void x265_blockfill_s_32x32_neon(int16_t* dst, intptr_t dstride, int16_t val); // fill a 32x32 block with val; dstride counted in int16_t elements
#endif // ifndef X265_I386_PIXEL_ARM_H
More information about the x265-devel
mailing list