[x265] [PATCH] arm: Implement blockcopy_ps_neon ARM NEON
radhakrishnan at multicorewareinc.com
radhakrishnan at multicorewareinc.com
Mon Feb 29 10:10:40 CET 2016
# HG changeset patch
# User Radhakrishnan VR <radhakrishnan at multicorewareinc.com>
# Date 1456480816 -19800
# Fri Feb 26 15:30:16 2016 +0530
# Node ID cec0be1e488fdae48c37e49b9d7cb3e52944917b
# Parent e7fe951785981cfe16b85d96e0f179acd946eaa6
arm: Implement blockcopy_ps_neon ARM NEON
diff -r e7fe95178598 -r cec0be1e488f source/common/arm/asm-primitives.cpp
--- a/source/common/arm/asm-primitives.cpp Fri Feb 26 14:56:46 2016 +0530
+++ b/source/common/arm/asm-primitives.cpp Fri Feb 26 15:30:16 2016 +0530
@@ -49,6 +49,13 @@
p.cu[BLOCK_32x32].copy_sp = PFX(blockcopy_sp_32x32_neon);
p.cu[BLOCK_64x64].copy_sp = PFX(blockcopy_sp_64x64_neon);
+ // Blockcopy_ps
+ p.cu[BLOCK_4x4].copy_ps = PFX(blockcopy_ps_4x4_neon);
+ p.cu[BLOCK_8x8].copy_ps = PFX(blockcopy_ps_8x8_neon);
+ p.cu[BLOCK_16x16].copy_ps = PFX(blockcopy_ps_16x16_neon);
+ p.cu[BLOCK_32x32].copy_ps = PFX(blockcopy_ps_32x32_neon);
+ p.cu[BLOCK_64x64].copy_ps = PFX(blockcopy_ps_64x64_neon);
+
// pixel_add_ps
p.cu[BLOCK_4x4].add_ps = PFX(pixel_add_ps_4x4_neon);
p.cu[BLOCK_8x8].add_ps = PFX(pixel_add_ps_8x8_neon);
diff -r e7fe95178598 -r cec0be1e488f source/common/arm/blockcopy8.S
--- a/source/common/arm/blockcopy8.S Fri Feb 26 14:56:46 2016 +0530
+++ b/source/common/arm/blockcopy8.S Fri Feb 26 15:30:16 2016 +0530
@@ -134,3 +134,102 @@
bne loop_csp64
bx lr
endfunc
+
+// void blockcopy_ps(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb)
+function x265_blockcopy_ps_4x4_neon             // r0 = a, r1 = stridea, r2 = b, r3 = strideb
+    lsl             r1, #1                      // dst stride to bytes: the stores below write 16-bit elements
+.rept 2                                         // each repetition copies two rows, four rows in total
+    // Lane loads read exactly the 4 bytes of a row (a whole-d load reads 8); NOTE(review): vld1.32 assumes unaligned NEON access is permitted - confirm for target
+    vld1.32         {d0[0]}, [r2], r3           // d0 low word  = row n, then src += strideb
+    vld1.32         {d0[1]}, [r2], r3           // d0 high word = row n+1
+    vmovl.u8        q1, d0                      // zero-extend both rows at once: d2 = row n, d3 = row n+1
+    vst1.u16        {d2}, [r0], r1              // four 16-bit values to dst row n, then dst += stride in bytes
+    vst1.u16        {d3}, [r0], r1              // dst row n+1
+.endr
+    bx              lr
+endfunc
+
+function x265_blockcopy_ps_8x8_neon             // r0 = a (16-bit dst), r1 = stridea, r2 = b (byte src), r3 = strideb
+    lsl             r1, r1, #1                  // dst stride in bytes: two bytes per stored element
+.rept 4                                         // two rows per repetition, eight rows in total
+    vld1.8          {d0}, [r2], r3              // row n: 8 source bytes, then src += strideb
+    vld1.8          {d1}, [r2], r3              // row n+1
+    vmovl.u8        q1, d0                      // zero-extend row n to 8 x 16-bit
+    vmovl.u8        q2, d1                      // zero-extend row n+1
+    vst1.16         {d2-d3}, [r0], r1           // q1 to dst row n, then dst += stride in bytes
+    vst1.16         {d4-d5}, [r0], r1           // q2 to dst row n+1
+.endr
+    bx              lr
+endfunc
+
+function x265_blockcopy_ps_16x16_neon           // r0 = a (16-bit dst), r1 = stridea, r2 = b (byte src), r3 = strideb
+    lsl             r1, r1, #1                  // dst stride in bytes: two bytes per stored element
+.rept 8                                         // two rows per repetition, sixteen rows in total
+    vld1.8          {d0-d1}, [r2], r3           // q0 = row n (16 bytes), then src += strideb
+    vld1.8          {d2-d3}, [r2], r3           // q1 = row n+1
+    vmovl.u8        q8, d0                      // row n, bytes 0-7 zero-extended to 16-bit
+    vmovl.u8        q9, d1                      // row n, bytes 8-15
+    vmovl.u8        q10, d2                     // row n+1, bytes 0-7
+    vmovl.u8        q11, d3                     // row n+1, bytes 8-15
+    vst1.16         {d16-d19}, [r0], r1         // q8,q9 to dst row n (32 bytes), then dst += stride in bytes
+    vst1.16         {d20-d23}, [r0], r1         // q10,q11 to dst row n+1
+.endr
+    bx              lr
+endfunc
+
+function x265_blockcopy_ps_32x32_neon           // r0 = a (16-bit dst), r1 = stridea, r2 = b (byte src), r3 = strideb
+    lsl             r1, r1, #1                  // dst stride in bytes: two bytes per stored element
+    sub             r1, r1, #32                 // first half-row store advances r0 by 32 via writeback
+    mov             r12, #4                     // 4 passes x 8 rows per pass = 32 rows
+loop_cps32:
+    subs            r12, r12, #1                // flags are consumed by the bne after the unrolled body
+.rept 4                                         // two rows per repetition
+    vld1.8          {d0-d3}, [r2], r3           // row n: 32 bytes into q0,q1, then src += strideb
+    vld1.8          {d4-d7}, [r2], r3           // row n+1: 32 bytes into q2,q3
+    vmovl.u8        q8, d0                      // row n zero-extended into q8-q11
+    vmovl.u8        q9, d1
+    vmovl.u8        q10, d2
+    vmovl.u8        q11, d3
+    // row n+1 zero-extended into q12-q15
+    vmovl.u8        q12, d4
+    vmovl.u8        q13, d5
+    vmovl.u8        q14, d6
+    vmovl.u8        q15, d7
+    // each 64-byte dst row is written as two 32-byte halves
+    vst1.16         {d16-d19}, [r0]!            // row n, first half, r0 += 32
+    vst1.16         {d20-d23}, [r0], r1         // row n, second half, r0 += stride bytes - 32
+    vst1.16         {d24-d27}, [r0]!            // row n+1, first half
+    vst1.16         {d28-d31}, [r0], r1         // row n+1, second half
+.endr
+    bne             loop_cps32                  // repeat until the pass counter reaches zero
+    bx              lr
+endfunc
+
+function x265_blockcopy_ps_64x64_neon           // r0 = a (16-bit dst), r1 = stridea, r2 = b (byte src), r3 = strideb
+    lsl             r1, r1, #1                  // dst stride in bytes: two bytes per stored element
+    sub             r1, r1, #96                 // three writeback stores advance r0 by 3 x 32 per row
+    sub             r3, r3, #32                 // first half-row load advances r2 by 32 via writeback
+    mov             r12, #16                    // 16 passes x 4 rows per pass = 64 rows
+loop_cps64:
+    subs            r12, r12, #1                // flags are consumed by the bne after the unrolled body
+.rept 4                                         // one row per repetition
+    vld1.8          {d0-d3}, [r2]!              // bytes 0-31 of the row into q0,q1, r2 += 32
+    vld1.8          {d4-d7}, [r2], r3           // bytes 32-63 into q2,q3, r2 += strideb - 32
+    vmovl.u8        q8, d0                      // bytes 0-31 zero-extended into q8-q11
+    vmovl.u8        q9, d1
+    vmovl.u8        q10, d2
+    vmovl.u8        q11, d3
+    // bytes 32-63 zero-extended into q12-q15
+    vmovl.u8        q12, d4
+    vmovl.u8        q13, d5
+    vmovl.u8        q14, d6
+    vmovl.u8        q15, d7
+    // the 128-byte dst row is written as four 32-byte quarters
+    vst1.16         {d16-d19}, [r0]!
+    vst1.16         {d20-d23}, [r0]!
+    vst1.16         {d24-d27}, [r0]!
+    vst1.16         {d28-d31}, [r0], r1         // last quarter, r0 += stride bytes - 96
+.endr
+    bne             loop_cps64                  // repeat until the pass counter reaches zero
+    bx              lr
+endfunc
diff -r e7fe95178598 -r cec0be1e488f source/common/arm/blockcopy8.h
--- a/source/common/arm/blockcopy8.h Fri Feb 26 14:56:46 2016 +0530
+++ b/source/common/arm/blockcopy8.h Fri Feb 26 15:30:16 2016 +0530
@@ -62,4 +62,10 @@
void x265_blockcopy_sp_16x16_neon(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
void x265_blockcopy_sp_32x32_neon(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
void x265_blockcopy_sp_64x64_neon(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb);
+
+void x265_blockcopy_ps_4x4_neon(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb); // copy_ps NEON kernels (bodies in blockcopy8.S): zero-extend pixels read from b into 16-bit values at a
+void x265_blockcopy_ps_8x8_neon(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+void x265_blockcopy_ps_16x16_neon(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+void x265_blockcopy_ps_32x32_neon(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
+void x265_blockcopy_ps_64x64_neon(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb);
#endif // ifndef X265_I386_PIXEL_ARM_H
More information about the x265-devel
mailing list