[x264-devel] aarch64: {plane_copy, memcpy_aligned, memzero_aligned}_neon
Janne Grunau
git at videolan.org
Sat Dec 20 21:10:48 CET 2014
x264 | branch: master | Janne Grunau <janne-x264 at jannau.net> | Fri Oct 31 14:49:04 2014 +0100| [f13573e490d9f18bbcb10409fb09ec25e477035e] | committer: Anton Mitrofanov
aarch64: {plane_copy,memcpy_aligned,memzero_aligned}_neon
2-3 times faster than C.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=f13573e490d9f18bbcb10409fb09ec25e477035e
---
common/aarch64/mc-a.S | 66 +++++++++++++++++++++++++++++++++++++++++++++++++
common/aarch64/mc-c.c | 6 +++++
2 files changed, 72 insertions(+)
diff --git a/common/aarch64/mc-a.S b/common/aarch64/mc-a.S
index 8407451..324ef16 100644
--- a/common/aarch64/mc-a.S
+++ b/common/aarch64/mc-a.S
@@ -1253,6 +1253,34 @@ load_deinterleave_chroma:
ret
endfunc
+function x264_plane_copy_neon, export=1 // row-by-row plane copy; per the C prototype (dst, i_dst, src, i_src, w, h) the args presumably arrive in x0, x1, x2, x3, w4, w5
+ add x8, x4, #15 // NOTE(review): w is declared int in the prototype but is read through 64-bit x4 here; confirm the upper half is well defined
+ and x4, x8, #~15 // x4 = w rounded up to a multiple of 16; the loops below copy this many bytes per row, i.e. up to 15 bytes beyond w
+ sub x1, x1, x4 // i_dst minus the rounded width: what is left to skip once a row has been post-incremented through
+ sub x3, x3, x4 // same adjustment for i_src
+1: // start of one row
+ mov w8, w4 // w8 = bytes still to copy in this row
+16: // (no branch in this function targets this label)
+ tst w8, #16 // is bit 4 set, i.e. is the count an odd multiple of 16?
+ b.eq 32f // no: go straight to the 32-byte loop
+ subs w8, w8, #16 // account for one 16-byte chunk; Z is set if that was the whole row
+ ldr q0, [x2], #16 // load 16 bytes from src, post-increment
+ str q0, [x0], #16 // store 16 bytes to dst, post-increment
+ b.eq 0f // flags still come from the subs above: row finished
+32: // 32 bytes per iteration
+ subs w8, w8, #32 // count is decremented before the copy, so this body runs at least once whenever it is reached
+ ldp q0, q1, [x2], #32 // load 32 bytes from src, post-increment
+ stp q0, q1, [x0], #32 // store 32 bytes to dst, post-increment
+ b.gt 32b // repeat while bytes remain in the row
+0: // end of row
+ subs w5, w5, #1 // one fewer row to go
+ add x2, x2, x3 // step src to the next row (add leaves the flags alone)
+ add x0, x0, x1 // step dst to the next row
+ b.gt 1b // loop while rows remain
+
+ ret
+endfunc
+
function x264_plane_copy_deinterleave_neon, export=1
add w9, w6, #15
and w9, w9, #0xfffffff0
@@ -1601,3 +1629,41 @@ function x264_mbtree_propagate_list_internal_neon, export=1
b.ge 8b
ret
endfunc
+
+function x264_memcpy_aligned_neon, export=1 // copy x2 bytes from [x1] to [x0] in 16/32/64-byte pieces
+ tst x2, #16 // bit 4 of the size set?
+ b.eq 32f // no: skip the single 16-byte copy
+ sub x2, x2, #16
+ ldr q0, [x1], #16 // copy one 16-byte chunk, post-incrementing both pointers
+ str q0, [x0], #16
+32:
+ tst x2, #32 // bit 5 of the size set?
+ b.eq 640f // no: skip the single 32-byte copy
+ sub x2, x2, #32
+ ldp q0, q1, [x1], #32 // copy one 32-byte chunk, post-incrementing both pointers
+ stp q0, q1, [x0], #32
+640:
+ cbz x2, 1f // nothing left: return. NOTE(review): bits 0-3 of the size are never consumed, so a size that is not a multiple of 16 falls into the loop and copies a full 64 bytes; presumably callers guarantee a multiple of 16 - confirm
+64: // 64 bytes per iteration
+ subs x2, x2, #64 // decrement first; the flags are consumed by b.gt after the copies
+ ldp q0, q1, [x1, #32] // upper 32 bytes of the 64-byte block (no pointer update)
+ ldp q2, q3, [x1], #64 // lower 32 bytes, then advance src by 64
+ stp q0, q1, [x0, #32] // store upper 32 bytes (no pointer update)
+ stp q2, q3, [x0], #64 // store lower 32 bytes, then advance dst by 64
+ b.gt 64b // loop while bytes remain
+1:
+ ret
+endfunc
+
+function x264_memzero_aligned_neon, export=1 // zero-fill memory at [x0]; x1 is the byte count
+ movi v0.16b, #0 // q0 = 0
+ movi v1.16b, #0 // q1 = 0
+1: // 128 bytes per iteration
+ subs x1, x1, #128 // decrement before storing, so at least one 128-byte block is always written. NOTE(review): presumably callers pass a positive multiple of 128 - confirm
+ stp q0, q1, [x0, #96] // zero bytes 96..127
+ stp q0, q1, [x0, #64] // zero bytes 64..95
+ stp q0, q1, [x0, #32] // zero bytes 32..63
+ stp q0, q1, [x0], 128 // zero bytes 0..31, then advance the pointer by 128
+ b.gt 1b // loop while the remaining count is positive
+ ret
+endfunc
diff --git a/common/aarch64/mc-c.c b/common/aarch64/mc-c.c
index 96582d4..25ebea4 100644
--- a/common/aarch64/mc-c.c
+++ b/common/aarch64/mc-c.c
@@ -49,6 +49,8 @@ void x264_pixel_avg2_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t
void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
+void x264_plane_copy_neon( pixel *dst, intptr_t i_dst,
+ pixel *src, intptr_t i_src, int w, int h );
void x264_plane_copy_deinterleave_neon( pixel *dstu, intptr_t i_dstu,
pixel *dstv, intptr_t i_dstv,
pixel *src, intptr_t i_src, int w, int h );
@@ -304,6 +306,7 @@ void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf )
pf->copy[PIXEL_8x8] = x264_mc_copy_w8_neon;
pf->copy[PIXEL_4x4] = x264_mc_copy_w4_neon;
+ pf->plane_copy = x264_plane_copy_neon;
pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_neon;
pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon;
pf->plane_copy_interleave = x264_plane_copy_interleave_neon;
@@ -340,5 +343,8 @@ void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf )
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon;
pf->mbtree_propagate_list = x264_mbtree_propagate_list_neon;
+
+ pf->memcpy_aligned = x264_memcpy_aligned_neon;
+ pf->memzero_aligned = x264_memzero_aligned_neon;
#endif // !HIGH_BIT_DEPTH
}
More information about the x264-devel
mailing list