[x264-devel] [PATCH 2/2] arm: Add asm for mbtree fixed point conversion
Janne Grunau
janne-x264 at jannau.net
Sun Apr 24 14:38:56 CEST 2016
7-8 times faster on a Cortex-A53 than the C versions built with gcc-5.3.
mbtree_fix8_pack_c: 44114
mbtree_fix8_pack_neon: 5805
mbtree_fix8_unpack_c: 38924
mbtree_fix8_unpack_neon: 4870
---
common/arm/mc-a.S | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
common/arm/mc-c.c | 5 +++++
2 files changed, 62 insertions(+)
diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S
index 1dbd498..76295cd 100644
--- a/common/arm/mc-a.S
+++ b/common/arm/mc-a.S
@@ -1880,3 +1880,60 @@ function x264_mbtree_propagate_list_internal_neon
bge 8b
bx lr
endfunc
+
+@ void mbtree_fix8_pack( int16_t *dst, float *src, int count ) -- r0 = dst, r1 = src, r2 = count; converts floats to byte-swapped 16-bit fixed point with 8 fractional bits
+function x264_mbtree_fix8_pack_neon, export=1
+ subs r3, r2, #8 @ r3 = count - 8
+ blt 2f @ fewer than 8 elements: skip the vector loop
+1: @ vector loop: 8 elements per iteration
+ subs r3, r3, #8 @ update the counter early; the flags are consumed by the bge below
+ vld1.32 {q0,q1}, [r1,:128]! @ load 8 floats from src (address marked 128-bit aligned), post-increment r1
+ vcvt.s32.f32 q0, q0, #8 @ float -> signed 32-bit fixed point with 8 fractional bits
+ vcvt.s32.f32 q1, q1, #8 @ same for the upper four values
+ vqmovn.s32 d4, q0 @ saturating narrow to 16 bits (low half of q2)
+ vqmovn.s32 d5, q1 @ saturating narrow to 16 bits (high half of q2)
+ vrev16.8 q3, q2 @ swap the two bytes of every 16-bit value
+ vst1.16 {q3}, [r0,:128]! @ store 8 halfwords to dst (address marked 128-bit aligned), post-increment r0
+ bge 1b @ loop while at least 8 more elements remain
+2:
+ adds r3, r3, #8 @ r3 = number of leftover elements (0..7 for a non-negative count)
+ bxeq lr @ nothing left: return
+3: @ scalar tail: one element per iteration
+ subs r3, r3, #1 @ flags are consumed by the bgt below
+ vld1.32 {d0[0]}, [r1]! @ load one float into the low lane of d0 (s0), post-increment r1
+ vcvt.s32.f32 s0, s0, #8 @ same fixed-point conversion on the single value
+ vrev16.8 d0, d0 @ swap the bytes within each halfword
+ vst1.16 {d0[0]}, [r0]! @ store the low halfword; NOTE(review): no saturating narrow here, unlike the vector loop
+ bgt 3b @ loop while elements remain
+
+ bx lr
+endfunc
+
+@ void mbtree_fix8_unpack( float *dst, int16_t *src, int count ) -- r0 = dst, r1 = src, r2 = count; converts byte-swapped 16-bit fixed point with 8 fractional bits back to floats
+function x264_mbtree_fix8_unpack_neon, export=1
+ subs r3, r2, #8 @ r3 = count - 8
+ blt 2f @ fewer than 8 elements: skip the vector loop
+1: @ vector loop: 8 elements per iteration
+ subs r3, r3, #8 @ update the counter early; the flags are consumed by the bge below
+ vld1.16 {q0}, [r1,:128]! @ load 8 halfwords from src (address marked 128-bit aligned), post-increment r1
+ vrev16.8 q1, q0 @ swap the two bytes of every 16-bit value
+ vmovl.s16 q0, d2 @ sign-extend the low four halfwords to 32 bits
+ vmovl.s16 q1, d3 @ sign-extend the high four halfwords to 32 bits
+ vcvt.f32.s32 q0, q0, #8 @ signed fixed point with 8 fractional bits -> float
+ vcvt.f32.s32 q1, q1, #8 @ same for the upper four values
+ vst1.32 {q0,q1}, [r0,:128]! @ store 8 floats to dst (address marked 128-bit aligned), post-increment r0
+ bge 1b @ loop while at least 8 more elements remain
+2:
+ adds r3, r3, #8 @ r3 = number of leftover elements (0..7 for a non-negative count)
+ bxeq lr @ nothing left: return
+3: @ scalar tail: one element per iteration
+ subs r3, r3, #1 @ flags are consumed by the bgt below
+ vld1.16 {d0[0]}, [r1]! @ load one halfword into lane 0 of d0, post-increment r1
+ vrev16.8 d0, d0 @ swap the bytes within each halfword
+ vmovl.s16 q0, d0 @ sign-extend to 32 bits; the loaded value ends up in lane 0 of d0
+ vcvt.f32.s32 d0, d0, #8 @ fixed point -> float; only lane 0 is used
+ vst1.32 {d0[0]}, [r0]! @ store one float, post-increment r0
+ bgt 3b @ loop while elements remain
+
+ bx lr
+endfunc
diff --git a/common/arm/mc-c.c b/common/arm/mc-c.c
index a2ab9a3..d330bc3 100644
--- a/common/arm/mc-c.c
+++ b/common/arm/mc-c.c
@@ -109,6 +109,9 @@ void integral_init8v_neon( uint16_t *, intptr_t );
void x264_mbtree_propagate_cost_neon( int16_t *, uint16_t *, uint16_t *, uint16_t *, uint16_t *, float *, int );
+void x264_mbtree_fix8_pack_neon( uint16_t *dst, float *src, int count ); // NEON float -> 16-bit fixed-point pack, defined in common/arm/mc-a.S
+void x264_mbtree_fix8_unpack_neon( float *dst, uint16_t *src, int count ); // NEON 16-bit fixed-point -> float unpack, defined in common/arm/mc-a.S
+
#if !HIGH_BIT_DEPTH
static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w )
{
@@ -291,6 +294,8 @@ void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf )
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon;
pf->mbtree_propagate_list = x264_mbtree_propagate_list_neon;
+ pf->mbtree_fix8_pack = x264_mbtree_fix8_pack_neon;
+ pf->mbtree_fix8_unpack = x264_mbtree_fix8_unpack_neon;
#endif // !HIGH_BIT_DEPTH
// Apple's gcc stupidly cannot align stack variables, and ALIGNED_ARRAY can't work on structs
--
2.8.1
More information about the x264-devel
mailing list