[x264-devel] [PATCH 1/2] arm64: Add asm for mbtree fixed point conversion
Janne Grunau
janne-x264 at jannau.net
Sun Apr 24 14:38:55 CEST 2016
Pack is ~7 times faster and unpack is ~9 times faster on a Cortex-A53
compared to the C implementations compiled with gcc-5.3.
mbtree_fix8_pack_c: 41534
mbtree_fix8_pack_neon: 5766
mbtree_fix8_unpack_c: 44102
mbtree_fix8_unpack_neon: 4868
---
common/aarch64/mc-a.S | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++
common/aarch64/mc-c.c | 5 +++++
2 files changed, 62 insertions(+)
diff --git a/common/aarch64/mc-a.S b/common/aarch64/mc-a.S
index 232ed3f..915d8c0 100644
--- a/common/aarch64/mc-a.S
+++ b/common/aarch64/mc-a.S
@@ -1667,3 +1667,60 @@ function x264_memzero_aligned_neon, export=1
b.gt 1b
ret
endfunc
+
+// void mbtree_fix8_pack( int16_t *dst, float *src, int count ) -- converts count floats to byte-swapped 16-bit fixed point with 8 fractional bits
+function x264_mbtree_fix8_pack_neon, export=1
+ subs w3, w2, #8 // w3 = count - 8; negative means fewer than 8 elements in total
+ b.lt 2f // skip the 8-wide vector loop when count < 8
+1: // vector loop: 8 elements per iteration
+ subs w3, w3, #8 // pre-decrement for the next iteration; these flags are tested by b.ge at the loop end
+ ld1 {v0.4s,v1.4s}, [x1], #32 // load 8 floats from src, post-increment src by 32 bytes
+ fcvtzs v0.4s, v0.4s, #8 // float -> signed 32-bit fixed point with 8 fractional bits (round toward zero)
+ fcvtzs v1.4s, v1.4s, #8 // same conversion for the second group of 4 floats
+ sqxtn v2.4h, v0.4s // saturating narrow 32 -> 16 bit into the lower half of v2
+ sqxtn2 v2.8h, v1.4s // saturating narrow 32 -> 16 bit into the upper half of v2
+ rev16 v3.16b, v2.16b // swap the two bytes of every 16-bit element
+ st1 {v3.8h}, [x0], #16 // store 8 halfwords to dst, post-increment dst by 16 bytes
+ b.ge 1b // repeat while at least 8 more elements remain
+2: // scalar tail setup
+ adds w3, w3, #8 // undo the -8 bias: w3 = number of leftover elements
+ b.eq 4f // nothing left to convert
+3: // scalar tail: one element per iteration
+ subs w3, w3, #1 // count down; flags are tested by b.gt at the loop end
+ ldr s0, [x1], #4 // load one float from src, post-increment src by 4 bytes
+ fcvtzs w4, s0, #8 // float -> signed fixed point with 8 fractional bits in a 32-bit register (no 16-bit saturating narrow here, unlike sqxtn above)
+ rev16 w5, w4 // swap the bytes within each halfword of w4
+ strh w5, [x0], #2 // store the low 16 bits to dst, post-increment dst by 2 bytes
+ b.gt 3b // repeat while leftover elements remain
+4:
+ ret
+endfunc
+
+// void mbtree_fix8_unpack( float *dst, int16_t *src, int count ) -- converts count byte-swapped 16-bit fixed-point values (8 fractional bits) to floats
+function x264_mbtree_fix8_unpack_neon, export=1
+ subs w3, w2, #8 // w3 = count - 8; negative means fewer than 8 elements in total
+ b.lt 2f // skip the 8-wide vector loop when count < 8
+1: // vector loop: 8 elements per iteration
+ subs w3, w3, #8 // pre-decrement for the next iteration; these flags are tested by b.ge at the loop end
+ ld1 {v0.8h}, [x1], #16 // load 8 halfwords from src, post-increment src by 16 bytes
+ rev16 v1.16b, v0.16b // swap the two bytes of every 16-bit element
+ sxtl v2.4s, v1.4h // sign-extend the lower 4 halfwords to 32 bits
+ sxtl2 v3.4s, v1.8h // sign-extend the upper 4 halfwords to 32 bits
+ scvtf v4.4s, v2.4s, #8 // signed fixed point with 8 fractional bits -> float
+ scvtf v5.4s, v3.4s, #8 // same conversion for the second group of 4 values
+ st1 {v4.4s,v5.4s}, [x0], #32 // store 8 floats to dst, post-increment dst by 32 bytes
+ b.ge 1b // repeat while at least 8 more elements remain
+2: // scalar tail setup
+ adds w3, w3, #8 // undo the -8 bias: w3 = number of leftover elements
+ b.eq 4f // nothing left to convert
+3: // scalar tail: one element per iteration
+ subs w3, w3, #1 // count down; flags are tested by b.gt at the loop end
+ ldrh w4, [x1], #2 // load one halfword (zero-extended) from src, post-increment src by 2 bytes
+ rev16 w5, w4 // swap the bytes within each halfword of w4
+ sxth w6, w5 // sign-extend the low 16 bits to a 32-bit signed value
+ scvtf s0, w6, #8 // signed fixed point with 8 fractional bits -> float
+ str s0, [x0], #4 // store one float to dst, post-increment dst by 4 bytes
+ b.gt 3b // repeat while leftover elements remain
+4:
+ ret
+endfunc
diff --git a/common/aarch64/mc-c.c b/common/aarch64/mc-c.c
index 144404c..717820f 100644
--- a/common/aarch64/mc-c.c
+++ b/common/aarch64/mc-c.c
@@ -100,6 +100,9 @@ void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t
void x264_mbtree_propagate_cost_neon( int16_t *, uint16_t *, uint16_t *, uint16_t *, uint16_t *, float *, int );
+void x264_mbtree_fix8_pack_neon( uint16_t *dst, float *src, int count );
+void x264_mbtree_fix8_unpack_neon( float *dst, uint16_t *src, int count );
+
#if !HIGH_BIT_DEPTH
static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w )
{
@@ -262,6 +265,8 @@ void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf )
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon;
pf->mbtree_propagate_list = x264_mbtree_propagate_list_neon;
+ pf->mbtree_fix8_pack = x264_mbtree_fix8_pack_neon;
+ pf->mbtree_fix8_unpack = x264_mbtree_fix8_unpack_neon;
pf->memcpy_aligned = x264_memcpy_aligned_neon;
pf->memzero_aligned = x264_memzero_aligned_neon;
--
2.8.1
More information about the x264-devel
mailing list