[x264-devel] [PATCH 1/2] arm64: Add asm for mbtree fixed point conversion

Sun Apr 24 14:38:55 CEST 2016

pack is ~7 times faster and unpack is ~9 times faster on a Cortex-A53
compared to the C versions compiled with gcc-5.3.

mbtree_fix8_pack_c: 41534
mbtree_fix8_pack_neon: 5766
mbtree_fix8_unpack_c: 44102
mbtree_fix8_unpack_neon: 4868
---
 common/aarch64/mc-a.S | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++
 common/aarch64/mc-c.c |  5 +++++
 2 files changed, 62 insertions(+)

diff --git a/common/aarch64/mc-a.S b/common/aarch64/mc-a.S
index 232ed3f..915d8c0 100644
--- a/common/aarch64/mc-a.S
+++ b/common/aarch64/mc-a.S
@@ -1667,3 +1667,60 @@ function x264_memzero_aligned_neon, export=1
     b.gt        1b
     ret
 endfunc
+
+// void mbtree_fix8_pack( uint16_t *dst, float *src, int count )
+function x264_mbtree_fix8_pack_neon, export=1   // float -> byte-swapped 16-bit fixed point with 8 fractional bits
+    subs        w3,  w2,  #8                    // w3 = count - 8 (x0 = dst, x1 = src, w2 = count)
+    b.lt        2f                              // fewer than 8 elements: skip the vector loop
+1:                                              // vector loop: 8 elements per iteration
+    subs        w3,  w3,  #8                    // pre-decrement for the next iteration; flags are tested by b.ge below
+    ld1         {v0.4s,v1.4s}, [x1], #32        // load 8 floats, src += 32 bytes
+    fcvtzs      v0.4s,  v0.4s,  #8              // float -> signed 32-bit fixed point, 8 fractional bits, round toward zero
+    fcvtzs      v1.4s,  v1.4s,  #8              // same for the upper four elements
+    sqxtn       v2.4h,  v0.4s                   // saturating narrow 32 -> 16 bits (low half of v2)
+    sqxtn2      v2.8h,  v1.4s                   // saturating narrow 32 -> 16 bits (high half of v2)
+    rev16       v3.16b, v2.16b                  // swap the two bytes inside every 16-bit element
+    st1         {v3.8h},  [x0], #16             // store 8 halfwords, dst += 16 bytes
+    b.ge        1b                              // repeat while at least 8 more elements remain
+2:                                              // scalar tail setup
+    adds        w3,  w3,  #8                    // undo the bias: w3 = number of leftover elements
+    b.eq        4f                              // nothing left over
+3:                                              // scalar tail: one element per iteration
+    subs        w3,  w3,  #1                    // flags are tested by b.gt below
+    ldr         s0, [x1], #4                    // load one float, src += 4 bytes
+    fcvtzs      w4,  s0,  #8                    // float -> signed 32-bit fixed point, 8 fractional bits
+    rev16       w5,  w4                         // swap the bytes inside each halfword of w4
+    strh        w5, [x0], #2                    // store the low halfword only (no 16-bit saturation here, unlike sqxtn above), dst += 2
+    b.gt        3b                              // repeat until no elements remain
+4:
+    ret
+endfunc
+
+// void mbtree_fix8_unpack( float *dst, uint16_t *src, int count )
+function x264_mbtree_fix8_unpack_neon, export=1 // byte-swapped 16-bit fixed point with 8 fractional bits -> float
+    subs        w3,  w2,  #8                    // w3 = count - 8 (x0 = dst, x1 = src, w2 = count)
+    b.lt        2f                              // fewer than 8 elements: skip the vector loop
+1:                                              // vector loop: 8 elements per iteration
+    subs        w3,  w3,  #8                    // pre-decrement for the next iteration; flags are tested by b.ge below
+    ld1         {v0.8h}, [x1], #16              // load 8 halfwords, src += 16 bytes
+    rev16       v1.16b, v0.16b                  // swap the two bytes inside every 16-bit element
+    sxtl        v2.4s,  v1.4h                   // sign-extend the low four halfwords to 32 bits
+    sxtl2       v3.4s,  v1.8h                   // sign-extend the high four halfwords to 32 bits
+    scvtf       v4.4s,  v2.4s,  #8              // signed fixed point with 8 fractional bits -> float
+    scvtf       v5.4s,  v3.4s,  #8              // same for the upper four elements
+    st1         {v4.4s,v5.4s}, [x0], #32        // store 8 floats, dst += 32 bytes
+    b.ge        1b                              // repeat while at least 8 more elements remain
+2:                                              // scalar tail setup
+    adds        w3,  w3,  #8                    // undo the bias: w3 = number of leftover elements
+    b.eq        4f                              // nothing left over
+3:                                              // scalar tail: one element per iteration
+    subs        w3,  w3,  #1                    // flags are tested by b.gt below
+    ldrh        w4, [x1], #2                    // load one halfword (zero-extended), src += 2 bytes
+    rev16       w5,  w4                         // swap the two bytes of the loaded halfword
+    sxth        w6,  w5                         // sign-extend the 16-bit value to 32 bits
+    scvtf       s0,  w6,  #8                    // signed fixed point with 8 fractional bits -> float
+    str         s0, [x0], #4                    // store one float, dst += 4 bytes
+    b.gt        3b                              // repeat until no elements remain
+4:
+    ret
+endfunc
diff --git a/common/aarch64/mc-c.c b/common/aarch64/mc-c.c
index 144404c..717820f 100644
--- a/common/aarch64/mc-c.c
+++ b/common/aarch64/mc-c.c
@@ -100,6 +100,9 @@ void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t
 
 void x264_mbtree_propagate_cost_neon( int16_t *, uint16_t *, uint16_t *, uint16_t *, uint16_t *, float *, int );
 
+void x264_mbtree_fix8_pack_neon( uint16_t *dst, float *src, int count );
+void x264_mbtree_fix8_unpack_neon( float *dst, uint16_t *src, int count );
+
 #if !HIGH_BIT_DEPTH
 static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w )
 {
@@ -262,6 +265,8 @@ void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf )
 
     pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon;
     pf->mbtree_propagate_list = x264_mbtree_propagate_list_neon;
+    pf->mbtree_fix8_pack      = x264_mbtree_fix8_pack_neon;
+    pf->mbtree_fix8_unpack    = x264_mbtree_fix8_unpack_neon;
 
     pf->memcpy_aligned  = x264_memcpy_aligned_neon;
     pf->memzero_aligned = x264_memzero_aligned_neon;
-- 
2.8.1