[x264-devel] [PATCH] x264_mbtree_propagate_cost_neon
George Stephanos
gaf.stephanos at gmail.com
Thu Feb 9 01:08:31 CET 2012
---
common/arm/mc-a.S | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
common/arm/mc-c.c | 2 ++
2 files changed, 51 insertions(+), 0 deletions(-)
diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S
index 07d6436..ea93b78 100644
--- a/common/arm/mc-a.S
+++ b/common/arm/mc-a.S
@@ -1393,6 +1393,55 @@ function x264_load_deinterleave_chroma_\stride\()_neon
LOAD_DEINTERLEAVE_CHROMA fenc
LOAD_DEINTERLEAVE_CHROMA fdec
+//void x264_mbtree_propagate_cost_neon( int *dst, uint16_t *propagate_in,
+ //uint16_t *intra_costs, uint16_t *inter_costs, uint16_t *inv_qscales,
+ //float *fps_factor, int len )
+ //dst[i] = (int)( (propagate_in[i] + intra_costs[i]*inv_qscales[i]/256 * *fps_factor) * (intra_costs[i] - (inter_costs[i] & 0x3FFF)) / intra_costs[i] ), division approximated, 4 elements per pass
+
+function x264_mbtree_propagate_cost_neon
+ push {r4-r6,lr} //save r4-r6 and the return address (16 bytes on the stack)
+ ldrd r4, r5, [sp, #16] //r4 = inv_qscales, r5 = fps_factor (stack args, just above the 16 pushed bytes)
+ ldr r6, [sp, #24] //r6 = len, used as the remaining-element counter
+ vld1.32 {d0[0]}, [r5] //d0[0] = *fps_factor
+ vdup.32 q0, d0[0] //q0 = fps factor broadcast to all four float lanes
+1: //loop body: each pass consumes four uint16 entries from every input array
+ vld1.16 {d4}, [r1]! //d4 = 4 x propagate_in, pointer post-incremented
+ vld1.16 {d5}, [r2]! //d5 = 4 x intra_costs, pointer post-incremented
+ vld1.16 {d6}, [r3]! //d6 = 4 x inter_costs, pointer post-incremented
+ vld1.16 {d7}, [r4]! //d7 = 4 x inv_qscales, pointer post-incremented
+ vbic.i16 d6, #0xC000 //clear the top two bits of each inter cost, keeping the low 14 bits
+
+ vmovl.u16 q8, d4 //widen propagate_in to 32-bit unsigned lanes
+ vmovl.u16 q9, d5 //widen intra_costs to 32-bit unsigned lanes
+ vmovl.u16 q10, d6 //widen masked inter_costs to 32-bit unsigned lanes
+ vmovl.u16 q11, d7 //widen inv_qscales to 32-bit unsigned lanes
+
+ vcvt.f32.u32 q8, q8 //propagate_in as float
+ vcvt.f32.u32 q9, q9 //intra_costs as float
+ vcvt.f32.u32 q10, q10 //masked inter_costs as float
+ vcvt.f32.u32 q11, q11, #8 //inv_qscales as float, read as fixed point with 8 fraction bits (value/256)
+
+ vmul.f32 q11, q9 //q11 = intra_costs * inv_qscales/256
+ vmul.f32 q11, q0 //q11 *= fps factor
+ vadd.f32 q11, q8 //q11 = propagate_amount = propagate_in + scaled intra cost
+
+ vsub.f32 q2, q9, q10 //q2 = propagate_num = intra_costs - masked inter_costs
+ vmul.f32 q2, q11, q2 //q2 = propagate_amount * propagate_num
+
+ vrecpe.f32 q3, q9 //q3 = reciprocal estimate of intra_costs; NOTE(review): a zero intra cost gives an infinite estimate - confirm callers never pass 0
+ vrecps.f32 q9, q3, q9 //q9 = 2 - q3*intra_costs (Newton-Raphson correction factor)
+ vmul.f32 q3, q9 //q3 = refined 1/intra_costs after a single refinement step
+ vmul.f32 q2, q3 //q2 = propagate_amount * propagate_num / intra_costs (approximate)
+
+ subs r6, #4 //four elements done; the flags set here are consumed by the bgt below
+
+ vcvt.s32.f32 q2, q2 //float -> int32, rounding toward zero; NOTE(review): confirm this matches the rounding of the C reference
+ vst1.32 {d4, d5}, [r0]! //store four int32 results to dst, pointer post-incremented
+ bgt 1b //loop while the remaining count is > 0; always runs at least once and works in whole groups of four
+
+ pop {r4-r6,pc} //restore r4-r6 and return
+.endfunc
+
//void x264_plane_copy_interleave_c( pixel *dst, int i_dst,
//pixel *srcu, int i_srcu,
//pixel *srcv, int i_srcv, int w, int h )
diff --git a/common/arm/mc-c.c b/common/arm/mc-c.c
index 55f0092..dc8607b 100644
--- a/common/arm/mc-c.c
+++ b/common/arm/mc-c.c
@@ -74,6 +74,7 @@ void x264_mc_copy_w16_aligned_neon( uint8_t *, int, uint8_t *, int, int );
void x264_mc_chroma_neon( uint8_t *, int, uint8_t *, int, int, int, int, int );
void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int, int, int);
+void x264_mbtree_propagate_cost_neon( int *, uint16_t *, uint16_t *, uint16_t *, uint16_t *, float *, int );
void x264_hpel_filter_v_neon( uint8_t *, uint8_t *, int16_t *, int, int );
void x264_hpel_filter_c_neon( uint8_t *, int16_t *, int );
@@ -270,6 +271,7 @@ void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf )
pf->store_interleave_chroma = x264_store_interleave_chroma_neon;
pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_neon;
pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_neon;
+ pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon;
pf->plane_copy_interleave = x264_plane_copy_interleave_neon;
pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_neon;
#endif // !HIGH_BIT_DEPTH
--
1.7.4.1
More information about the x264-devel
mailing list