[x264-devel] [PATCH 1/3] arm: Implement x264_plane_copy_neon
Martin Storsjö
martin at martin.st
Thu Aug 27 23:15:01 CEST 2015
checkasm timing Cortex-A7 A8 A9
plane_copy_c 13124 10925 9106
plane_copy_neon 7349 5103 8945
---
Use bic instead of and, use lr instead of r5, return using
pop {..,pc}. Settled on using two separate ldr instructions instead of
ldrd, both since that's required when loading r4+lr (in ARM mode ldrd
needs a consecutive even/odd register pair), and since it
seemed much faster on A8.
---
common/arm/mc-a.S | 32 ++++++++++++++++++++++++++++++++
common/arm/mc-c.c | 3 +++
2 files changed, 35 insertions(+)
diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S
index 36ce86f..5e0c117 100644
--- a/common/arm/mc-a.S
+++ b/common/arm/mc-a.S
@@ -6,6 +6,7 @@
* Authors: David Conrad <lessen42 at gmail.com>
* Mans Rullgard <mans at mansr.com>
* Stefan Groenroos <stefan.gronroos at gmail.com>
+ * Janne Grunau <janne-x264 at jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -1461,6 +1462,37 @@ function x264_load_deinterleave_chroma_fenc_neon
bx lr
endfunc
+function x264_plane_copy_neon // copy h rows from src to dst, w (treated as a byte count) rounded up to a multiple of 16 per row; r0=dst, r1=i_dst, r2=src, r3=i_src, w and h on the stack (per the C prototype, assuming AAPCS)
+ push {r4,lr} // save r4 and lr; both are reused as scratch below
+ ldr r4, [sp, #8] // r4 = w: first stack argument, now 8 bytes up because of the push
+ ldr lr, [sp, #12] // lr = h: second stack argument, used as the row counter
+ add r12, r4, #15
+ bic r4, r12, #15 // r4 = (w + 15) & ~15, i.e. bytes actually copied per row (up to 15 more than w)
+ sub r1, r1, r4 // r1 = i_dst - bytes copied per row: what is left to reach the next dst row
+ sub r3, r3, r4 // r3 = i_src - bytes copied per row: what is left to reach the next src row
+1: // start of one row
+ mov r12, r4 // r12 = bytes remaining in this row
+16: // label is not referenced by any branch in this function
+ tst r12, #16 // is the row length an odd multiple of 16?
+ beq 32f // no: the row consists of 32-byte chunks only
+ subs r12, r12, #16 // account for the single 16-byte chunk; sets Z if that was the whole row
+ vld1.8 {q0}, [r2]! // load 16 bytes from src, post-increment src
+ vst1.8 {q0}, [r0]! // store 16 bytes to dst, post-increment dst
+ beq 0f // flags are still those of the subs above (vld1/vst1 do not set flags): row finished
+32: // 32-byte chunk loop; NOTE(review): entered with r12 == 0 when w == 0, which still copies 32 bytes - presumably callers never pass w == 0
+ subs r12, r12, #32 // account for the chunk copied below
+ vld1.8 {q0, q1}, [r2]! // load 32 bytes from src, post-increment src
+ vst1.8 {q0, q1}, [r0]! // store 32 bytes to dst, post-increment dst
+ bgt 32b // flags from the subs above: loop while bytes remain
+0: // end of row
+ subs lr, lr, #1 // one row done; flags feed the bgt three instructions below
+ add r2, r2, r3 // src -> start of next row (add without the s suffix leaves flags untouched)
+ add r0, r0, r1 // dst -> start of next row
+ bgt 1b // more rows left; NOTE(review): the test is at the bottom, so one row is copied even if h <= 0
+
+ pop {r4,pc} // restore r4 and return by popping the saved lr straight into pc
+endfunc
+
function x264_plane_copy_deinterleave_neon
push {r4-r7, lr}
ldrd r6, r7, [sp, #28]
diff --git a/common/arm/mc-c.c b/common/arm/mc-c.c
index 3fa18ec..dd86fb2 100644
--- a/common/arm/mc-c.c
+++ b/common/arm/mc-c.c
@@ -47,6 +47,8 @@ void x264_pixel_avg2_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t
void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
+void x264_plane_copy_neon( pixel *dst, intptr_t i_dst,
+ pixel *src, intptr_t i_src, int w, int h );
void x264_plane_copy_deinterleave_neon( pixel *dstu, intptr_t i_dstu,
pixel *dstv, intptr_t i_dstv,
pixel *src, intptr_t i_src, int w, int h );
@@ -244,6 +246,7 @@ void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf )
pf->copy[PIXEL_8x8] = x264_mc_copy_w8_neon;
pf->copy[PIXEL_4x4] = x264_mc_copy_w4_neon;
+ pf->plane_copy = x264_plane_copy_neon;
pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_neon;
pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon;
pf->plane_copy_interleave = x264_plane_copy_interleave_neon;
--
1.7.10.4
More information about the x264-devel
mailing list