[x265] [PATCH] arm: Implement blockcopy_ss_neon ARM NEON
radhakrishnan at multicorewareinc.com
radhakrishnan at multicorewareinc.com
Mon Feb 29 10:15:36 CET 2016
# HG changeset patch
# User Radhakrishnan VR <radhakrishnan at multicorewareinc.com>
# Date 1456484036 -19800
# Fri Feb 26 16:23:56 2016 +0530
# Node ID 3363b15c06e563cb119085facfd190757cae3945
# Parent cec0be1e488fdae48c37e49b9d7cb3e52944917b
arm: Implement blockcopy_ss_neon ARM NEON
diff -r cec0be1e488f -r 3363b15c06e5 source/common/arm/asm-primitives.cpp
--- a/source/common/arm/asm-primitives.cpp Fri Feb 26 15:30:16 2016 +0530
+++ b/source/common/arm/asm-primitives.cpp Fri Feb 26 16:23:56 2016 +0530
@@ -42,12 +42,12 @@
{
if (cpuMask & X265_CPU_NEON)
{
- // Blockcopy_sp
- p.cu[BLOCK_4x4].copy_sp = PFX(blockcopy_sp_4x4_neon);
- p.cu[BLOCK_8x8].copy_sp = PFX(blockcopy_sp_8x8_neon);
- p.cu[BLOCK_16x16].copy_sp = PFX(blockcopy_sp_16x16_neon);
- p.cu[BLOCK_32x32].copy_sp = PFX(blockcopy_sp_32x32_neon);
- p.cu[BLOCK_64x64].copy_sp = PFX(blockcopy_sp_64x64_neon);
+ // Blockcopy_ss
+ p.cu[BLOCK_4x4].copy_ss = PFX(blockcopy_ss_4x4_neon);
+ p.cu[BLOCK_8x8].copy_ss = PFX(blockcopy_ss_8x8_neon);
+ p.cu[BLOCK_16x16].copy_ss = PFX(blockcopy_ss_16x16_neon);
+ p.cu[BLOCK_32x32].copy_ss = PFX(blockcopy_ss_32x32_neon);
+ p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_neon);
// Blockcopy_ps
p.cu[BLOCK_4x4].copy_ps = PFX(blockcopy_ps_4x4_neon);
@@ -56,6 +56,13 @@
p.cu[BLOCK_32x32].copy_ps = PFX(blockcopy_ps_32x32_neon);
p.cu[BLOCK_64x64].copy_ps = PFX(blockcopy_ps_64x64_neon);
+ // Blockcopy_sp
+ p.cu[BLOCK_4x4].copy_sp = PFX(blockcopy_sp_4x4_neon);
+ p.cu[BLOCK_8x8].copy_sp = PFX(blockcopy_sp_8x8_neon);
+ p.cu[BLOCK_16x16].copy_sp = PFX(blockcopy_sp_16x16_neon);
+ p.cu[BLOCK_32x32].copy_sp = PFX(blockcopy_sp_32x32_neon);
+ p.cu[BLOCK_64x64].copy_sp = PFX(blockcopy_sp_64x64_neon);
+
// pixel_add_ps
p.cu[BLOCK_4x4].add_ps = PFX(pixel_add_ps_4x4_neon);
p.cu[BLOCK_8x8].add_ps = PFX(pixel_add_ps_8x8_neon);
diff -r cec0be1e488f -r 3363b15c06e5 source/common/arm/blockcopy8.S
--- a/source/common/arm/blockcopy8.S Fri Feb 26 15:30:16 2016 +0530
+++ b/source/common/arm/blockcopy8.S Fri Feb 26 16:23:56 2016 +0530
@@ -233,3 +233,81 @@
bne loop_cps64
bx lr
endfunc
+
+// void x265_blockcopy_ss(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
+function x265_blockcopy_ss_4x4_neon // copy a 4x4 block of 16-bit samples: r0 = a (dst), r1 = stridea, r2 = b (src), r3 = strideb
+ lsl r1, #1 // stridea *= 2; presumably converts an int16_t element count to bytes - TODO confirm stride units
+ lsl r3, #1 // strideb *= 2; same conversion for the source stride
+.rept 2 // 2 repeats x 2 rows each = 4 rows, fully unrolled (no loop counter)
+ vld1.u16 {d0}, [r2], r3 // load one row (d0 = 8 bytes = 4 x 16-bit), then r2 += r3
+ vld1.u16 {d1}, [r2], r3 // load the following row into d1
+ vst1.u16 {d0}, [r0], r1 // store the first row, then r0 += r1
+ vst1.u16 {d1}, [r0], r1 // store the second row
+.endr
+ bx lr // return; nothing was saved, only d0/d1 were written
+endfunc
+
+function x265_blockcopy_ss_8x8_neon // copy an 8x8 block of 16-bit samples: r0 = a (dst), r1 = stridea, r2 = b (src), r3 = strideb
+ lsl r1, #1 // stridea *= 2; presumably converts an int16_t element count to bytes - TODO confirm stride units
+ lsl r3, #1 // strideb *= 2; same conversion for the source stride
+.rept 4 // 4 repeats x 2 rows each = 8 rows, fully unrolled
+ vld1.u16 {q0}, [r2], r3 // load one row (q0 = 16 bytes = 8 x 16-bit), then r2 += r3
+ vld1.u16 {q1}, [r2], r3 // load the following row into q1
+ vst1.u16 {q0}, [r0], r1 // store the first row, then r0 += r1
+ vst1.u16 {q1}, [r0], r1 // store the second row
+.endr
+ bx lr // return; only q0/q1 were written
+endfunc
+
+function x265_blockcopy_ss_16x16_neon // copy a 16x16 block of 16-bit samples: r0 = a (dst), r1 = stridea, r2 = b (src), r3 = strideb
+ lsl r1, #1 // stridea *= 2; presumably converts an int16_t element count to bytes - TODO confirm stride units
+ lsl r3, #1 // strideb *= 2; same conversion for the source stride
+.rept 8 // 8 repeats x 2 rows each = 16 rows, fully unrolled
+ vld1.u16 {q0, q1}, [r2], r3 // load one row (q0:q1 = 32 bytes = 16 x 16-bit), then r2 += r3
+ vld1.u16 {q2, q3}, [r2], r3 // load the following row into q2:q3
+ vst1.u16 {q0, q1}, [r0], r1 // store the first row, then r0 += r1
+ vst1.u16 {q2, q3}, [r0], r1 // store the second row
+.endr
+ bx lr // return; only q0-q3 were written
+endfunc
+
+function x265_blockcopy_ss_32x32_neon // copy a 32x32 block of 16-bit samples: r0 = a (dst), r1 = stridea, r2 = b (src), r3 = strideb
+ lsl r1, #1 // stridea *= 2; presumably converts an int16_t element count to bytes - TODO confirm stride units
+ lsl r3, #1 // strideb *= 2; same conversion for the source stride
+ mov r12, #4 // r12 = outer loop count: 4 iterations x 8 unrolled rows = 32 rows
+ sub r1, #32 // the first half-row store advances r0 by 32 bytes via "!", so r1 only has to cover the rest
+ sub r3, #32 // same adjustment for the source pointer r2
+loop_css32:
+ subs r12, #1 // decrement here; the NEON loads/stores below leave the flags alone for the bne
+.rept 8 // each repeat copies one 64-byte row (32 x 16-bit)
+ vld1.u16 {q0, q1}, [r2]! // first 32 bytes of the row, r2 += 32
+ vld1.u16 {q2, q3}, [r2], r3 // last 32 bytes, then r2 += r3 to reach the next row
+ vst1.u16 {q0, q1}, [r0]! // first 32 bytes of the row, r0 += 32
+ vst1.u16 {q2, q3}, [r0], r1 // last 32 bytes, then r0 += r1 to reach the next row
+.endr
+ bne loop_css32 // repeat until r12 reaches zero (flags come from the subs above)
+ bx lr // return; only r12 and q0-q3 were written besides the argument registers
+endfunc
+
+function x265_blockcopy_ss_64x64_neon // copy a 64x64 block of 16-bit samples: r0 = a (dst), r1 = stridea, r2 = b (src), r3 = strideb
+ lsl r1, #1 // stridea *= 2; presumably converts an int16_t element count to bytes - TODO confirm stride units
+ lsl r3, #1 // strideb *= 2; same conversion for the source stride
+ mov r12, #8 // r12 = outer loop count: 8 iterations x 8 unrolled rows = 64 rows
+ sub r1, #96 // three "!" stores advance r0 by 3 x 32 = 96 bytes per row, so r1 only has to cover the rest
+ sub r3, #96 // same adjustment for the source pointer r2
+loop_css64:
+ subs r12, #1 // decrement here; the NEON loads/stores below leave the flags alone for the bne
+.rept 8 // each repeat copies one 128-byte row (64 x 16-bit)
+ vld1.u16 {q0, q1}, [r2]! // bytes 0-31 of the row, r2 += 32
+ vld1.u16 {q2, q3}, [r2]! // bytes 32-63, r2 += 32
+ vld1.u16 {q8, q9}, [r2]! // bytes 64-95, r2 += 32; q4-q7 are skipped, presumably because d8-d15 are callee-saved under AAPCS
+ vld1.u16 {q10, q11}, [r2], r3 // bytes 96-127, then r2 += r3 to reach the next row
+
+ vst1.u16 {q0, q1}, [r0]! // bytes 0-31 of the row, r0 += 32
+ vst1.u16 {q2, q3}, [r0]! // bytes 32-63, r0 += 32
+ vst1.u16 {q8, q9}, [r0]! // bytes 64-95, r0 += 32
+ vst1.u16 {q10, q11}, [r0], r1 // bytes 96-127, then r0 += r1 to reach the next row
+.endr
+ bne loop_css64 // repeat until r12 reaches zero (flags come from the subs above)
+ bx lr // return; only r12, q0-q3 and q8-q11 were written besides the argument registers
+endfunc
diff -r cec0be1e488f -r 3363b15c06e5 source/common/arm/blockcopy8.h
--- a/source/common/arm/blockcopy8.h Fri Feb 26 15:30:16 2016 +0530
+++ b/source/common/arm/blockcopy8.h Fri Feb 26 16:23:56 2016 +0530
@@ -68,4 +68,10 @@
void x265_blockcopy_ps_16x16_neon(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
void x265_blockcopy_ps_32x32_neon(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
void x265_blockcopy_ps_64x64_neon(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+
+void x265_blockcopy_ss_4x4_neon(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+void x265_blockcopy_ss_8x8_neon(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+void x265_blockcopy_ss_16x16_neon(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+void x265_blockcopy_ss_32x32_neon(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+void x265_blockcopy_ss_64x64_neon(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
#endif // ifndef X265_I386_PIXEL_ARM_H
More information about the x265-devel
mailing list