[x264-devel] [PATCH] x264_mbtree_propagate_cost_neon

Thu Feb 9 01:08:31 CET 2012

---
 common/arm/mc-a.S |   49 +++++++++++++++++++++++++++++++++++++++++++++++++
 common/arm/mc-c.c |    2 ++
 2 files changed, 51 insertions(+), 0 deletions(-)

diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S
index 07d6436..ea93b78 100644
--- a/common/arm/mc-a.S
+++ b/common/arm/mc-a.S
@@ -1393,6 +1393,55 @@ function x264_load_deinterleave_chroma_\stride\()_neon
 LOAD_DEINTERLEAVE_CHROMA fenc
 LOAD_DEINTERLEAVE_CHROMA fdec
 
+//static void mbtree_propagate_cost(
+            //int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+            //uint16_t *inter_costs, uint16_t *inv_qscales,
+            //float *fps_factor, int len )
+
+function x264_mbtree_propagate_cost_neon
+    push            {r4-r6,lr} // save scratch regs + return address (16 bytes, so stack args start at sp+16)
+    ldrd            r4, r5, [sp, #16] // r4 = inv_qscales, r5 = fps_factor (5th and 6th arguments)
+    ldr             r6, [sp, #24] // r6 = len (7th argument), used as the loop counter
+    vld1.32         {d0[0]}, [r5] // load the single float *fps_factor into lane 0 of d0
+    vdup.32         q0, d0[0] // broadcast it to all four lanes of q0
+1:  // main loop: each iteration consumes 4 uint16 from every input and emits 4 int32 to dst (r0)
+    vld1.16         {d4}, [r1]! // 4 x propagate_in, pointer post-incremented
+    vld1.16         {d5}, [r2]! // 4 x intra_costs, pointer post-incremented
+    vld1.16         {d6}, [r3]! // 4 x inter_costs, pointer post-incremented
+    vld1.16         {d7}, [r4]! // 4 x inv_qscales, pointer post-incremented
+    vbic.i16        d6, #0xC000 // clear the top 2 bits of each inter cost, keeping the low 14 (presumably LOWRES_COST_MASK - confirm)
+
+    vmovl.u16       q8, d4 // zero-extend propagate_in to 32 bits
+    vmovl.u16       q9, d5 // zero-extend intra_costs to 32 bits
+    vmovl.u16       q10, d6 // zero-extend masked inter_costs to 32 bits
+    vmovl.u16       q11, d7 // zero-extend inv_qscales to 32 bits
+
+    vcvt.f32.u32    q8, q8 // propagate_in as float
+    vcvt.f32.u32    q9, q9 // intra_costs as float
+    vcvt.f32.u32    q10, q10 // masked inter_costs as float
+    vcvt.f32.u32    q11, q11, #8 // fixed-point convert with 8 fraction bits: inv_qscales / 256.0
+
+    vmul.f32        q11, q9 // q11 = intra_costs * inv_qscales / 256
+    vmul.f32        q11, q0 // q11 *= fps_factor
+    vadd.f32        q11, q8 // q11 = propagate_in + q11 (propagate_amount)
+
+    vsub.f32        q2, q9, q10 // q2 = intra_costs - masked inter_costs (propagate_num)
+    vmul.f32        q2, q11, q2 // q2 = propagate_amount * propagate_num
+
+    vrecpe.f32      q3, q9 // q3 = coarse estimate of 1 / intra_costs
+    vrecps.f32      q9, q3, q9 // q9 = 2 - q3 * intra_costs (Newton-Raphson correction factor)
+    vmul.f32        q3, q9 // q3 = estimate refined by one Newton-Raphson step
+    vmul.f32        q2, q3 // q2 = propagate_amount * propagate_num / intra_costs (approximate division)
+
+    subs            r6, #4 // 4 elements done; sets the flags tested by bgt below
+
+    vcvt.s32.f32    q2, q2 // float -> int32, rounding toward zero. NOTE(review): no rounding term is added first - confirm against the C version
+    vst1.32         {d4, d5}, [r0]! // store 4 int32 results to dst, pointer post-incremented
+    bgt             1b // repeat while elements remain. NOTE(review): a len that is not a multiple of 4 still gets a full group of 4 read and written - confirm buffers are padded
+
+    pop             {r4-r6,pc} // restore saved registers and return
+.endfunc
+
 //void x264_plane_copy_interleave_c( pixel *dst, int i_dst,
                                    //pixel *srcu, int i_srcu,
                                    //pixel *srcv, int i_srcv, int w, int h )
diff --git a/common/arm/mc-c.c b/common/arm/mc-c.c
index 55f0092..dc8607b 100644
--- a/common/arm/mc-c.c
+++ b/common/arm/mc-c.c
@@ -74,6 +74,7 @@ void x264_mc_copy_w16_aligned_neon( uint8_t *, int, uint8_t *, int, int );
 
 void x264_mc_chroma_neon( uint8_t *, int, uint8_t *, int, int, int, int, int );
 void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int, int, int);
+void x264_mbtree_propagate_cost_neon( int *, uint16_t *, uint16_t *, uint16_t *, uint16_t *, float *, int );
 
 void x264_hpel_filter_v_neon( uint8_t *, uint8_t *, int16_t *, int, int );
 void x264_hpel_filter_c_neon( uint8_t *, int16_t *, int );
@@ -270,6 +271,7 @@ void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf )
     pf->store_interleave_chroma = x264_store_interleave_chroma_neon;
     pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_neon;
     pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_neon;
+    pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon;
     pf->plane_copy_interleave = x264_plane_copy_interleave_neon;
     pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_neon;
 #endif // !HIGH_BIT_DEPTH
-- 
1.7.4.1