[x264-devel] aarch64: x264_mbtree_propagate_{cost,list}_neon
Janne Grunau
git at videolan.org
Sat Dec 20 21:10:48 CET 2014
x264 | branch: master | Janne Grunau <janne-x264 at jannau.net> | Wed Oct 29 18:17:48 2014 +0100 | [8d655b63b4f7bc021ad038ea64b7c4de9d0ef74b] | committer: Anton Mitrofanov
aarch64: x264_mbtree_propagate_{cost,list}_neon
x264_mbtree_propagate_cost_neon is ~7 times faster.
x264_mbtree_propagate_list_neon is 33% faster.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=8d655b63b4f7bc021ad038ea64b7c4de9d0ef74b
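
For reference, the new cost kernel vectorises the same per-macroblock computation as
the generic C implementation (mbtree_propagate_cost in common/mc.c). A rough scalar
sketch of what it computes, reusing x264's existing X264_MIN and LOWRES_COST_MASK
macros; exact rounding/saturation details may differ from upstream:

    /* Scalar model of x264_mbtree_propagate_cost_neon.  Masking the inter cost
     * with LOWRES_COST_MASK (clearing bits 14-15) corresponds to the
     * "bic v3.8h, #0xc0, lsl #8" in the assembly below. */
    static void mbtree_propagate_cost_sketch( int16_t *dst, uint16_t *propagate_in,
                                              uint16_t *intra_costs, uint16_t *inter_costs,
                                              uint16_t *inv_qscales, float *fps_factor, int len )
    {
        float fps = *fps_factor;
        for( int i = 0; i < len; i++ )
        {
            int intra_cost = intra_costs[i];
            int inter_cost = X264_MIN( intra_costs[i], inter_costs[i] & LOWRES_COST_MASK );
            float propagate_intra  = intra_cost * inv_qscales[i];
            float propagate_amount = propagate_in[i] + propagate_intra * fps;
            float propagate_num    = intra_cost - inter_cost;
            float propagate_denom  = intra_cost;
            dst[i] = X264_MIN( (int)(propagate_amount * propagate_num / propagate_denom + 0.5f), 32767 );
        }
    }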
---
common/aarch64/mc-a.S | 117 +++++++++++++++++++++++++++++++++++++++++++++++++
common/aarch64/mc-c.c | 88 +++++++++++++++++++++++++++++++++++++
2 files changed, 205 insertions(+)
diff --git a/common/aarch64/mc-a.S b/common/aarch64/mc-a.S
index 83652f2..8407451 100644
--- a/common/aarch64/mc-a.S
+++ b/common/aarch64/mc-a.S
@@ -1484,3 +1484,120 @@ function integral_init8v_neon, export=1
2:
ret
endfunc
+
+function x264_mbtree_propagate_cost_neon, export=1
+ ld1r {v5.4s}, [x5]
+8:
+ subs w6, w6, #8
+ ld1 {v1.8h}, [x1], #16
+ ld1 {v2.8h}, [x2], #16
+ ld1 {v3.8h}, [x3], #16
+ ld1 {v4.8h}, [x4], #16
+ bic v3.8h, #0xc0, lsl #8
+ umin v3.8h, v2.8h, v3.8h
+ umull v20.4s, v2.4h, v4.4h // propagate_intra
+ umull2 v21.4s, v2.8h, v4.8h // propagate_intra
+ usubl v22.4s, v2.4h, v3.4h // propagate_num
+ usubl2 v23.4s, v2.8h, v3.8h // propagate_num
+ uxtl v26.4s, v2.4h // propagate_denom
+ uxtl2 v27.4s, v2.8h // propagate_denom
+ uxtl v24.4s, v1.4h
+ uxtl2 v25.4s, v1.8h
+ ucvtf v20.4s, v20.4s
+ ucvtf v21.4s, v21.4s
+ ucvtf v26.4s, v26.4s
+ ucvtf v27.4s, v27.4s
+ ucvtf v22.4s, v22.4s
+ ucvtf v23.4s, v23.4s
+ frecpe v28.4s, v26.4s
+ frecpe v29.4s, v27.4s
+ ucvtf v24.4s, v24.4s
+ ucvtf v25.4s, v25.4s
+ frecps v30.4s, v28.4s, v26.4s
+ frecps v31.4s, v29.4s, v27.4s
+ fmla v24.4s, v20.4s, v5.4s // propagate_amount
+ fmla v25.4s, v21.4s, v5.4s // propagate_amount
+ fmul v28.4s, v28.4s, v30.4s
+ fmul v29.4s, v29.4s, v31.4s
+ fmul v16.4s, v24.4s, v22.4s
+ fmul v17.4s, v25.4s, v23.4s
+ fmul v18.4s, v16.4s, v28.4s
+ fmul v19.4s, v17.4s, v29.4s
+ fcvtns v20.4s, v18.4s
+ fcvtns v21.4s, v19.4s
+ sqxtn v0.4h, v20.4s
+ sqxtn2 v0.8h, v21.4s
+ st1 {v0.8h}, [x0], #16
+ b.ge 8b
+ ret
+endfunc
+
+const pw_0to15, align=5
+ .short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+endconst
+
+function x264_mbtree_propagate_list_internal_neon, export=1
+ movrel x11, pw_0to15
+ dup v31.8h, w4 // bipred_weight
+ movi v30.8h, #0xc0, lsl #8
+ ld1 {v29.8h}, [x11] //h->mb.i_mb_x,h->mb.i_mb_y
+ movi v28.4s, #4//, lsl #16
+ movi v27.8h, #31
+ movi v26.8h, #32
+ dup v24.8h, w5 // mb_y
+ zip1 v29.8h, v29.8h, v24.8h
+8:
+ subs w6, w6, #8
+ ld1 {v1.8h}, [x1], #16 // propagate_amount
+ ld1 {v2.8h}, [x2], #16 // lowres_cost
+ and v2.16b, v2.16b, v30.16b
+ cmeq v25.8h, v2.8h, v30.8h
+ umull v16.4s, v1.4h, v31.4h
+ umull2 v17.4s, v1.8h, v31.8h
+ rshrn v16.4h, v16.4s, #6
+ rshrn2 v16.8h, v17.4s, #6
+ bsl v25.16b, v16.16b, v1.16b // if( lists_used == 3 )
+ // propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
+ ld1 {v4.8h,v5.8h}, [x0], #32
+ sshr v6.8h, v4.8h, #5
+ sshr v7.8h, v5.8h, #5
+ add v6.8h, v6.8h, v29.8h
+ add v29.8h, v29.8h, v28.8h
+ add v7.8h, v7.8h, v29.8h
+ add v29.8h, v29.8h, v28.8h
+ st1 {v6.8h,v7.8h}, [x3], #32
+ and v4.16b, v4.16b, v27.16b
+ and v5.16b, v5.16b, v27.16b
+ uzp1 v6.8h, v4.8h, v5.8h // x & 31
+ uzp2 v7.8h, v4.8h, v5.8h // y & 31
+ sub v4.8h, v26.8h, v6.8h // 32 - (x & 31)
+ sub v5.8h, v26.8h, v7.8h // 32 - (y & 31)
+ mul v19.8h, v6.8h, v7.8h // idx3weight = y*x;
+ mul v18.8h, v4.8h, v7.8h // idx2weight = y*(32-x);
+ mul v17.8h, v6.8h, v5.8h // idx1weight = (32-y)*x;
+ mul v16.8h, v4.8h, v5.8h // idx0weight = (32-y)*(32-x) ;
+ umull v6.4s, v19.4h, v25.4h
+ umull2 v7.4s, v19.8h, v25.8h
+ umull v4.4s, v18.4h, v25.4h
+ umull2 v5.4s, v18.8h, v25.8h
+ umull v2.4s, v17.4h, v25.4h
+ umull2 v3.4s, v17.8h, v25.8h
+ umull v0.4s, v16.4h, v25.4h
+ umull2 v1.4s, v16.8h, v25.8h
+ rshrn v19.4h, v6.4s, #10
+ rshrn2 v19.8h, v7.4s, #10
+ rshrn v18.4h, v4.4s, #10
+ rshrn2 v18.8h, v5.4s, #10
+ rshrn v17.4h, v2.4s, #10
+ rshrn2 v17.8h, v3.4s, #10
+ rshrn v16.4h, v0.4s, #10
+ rshrn2 v16.8h, v1.4s, #10
+ zip1 v0.8h, v16.8h, v17.8h
+ zip2 v1.8h, v16.8h, v17.8h
+ zip1 v2.8h, v18.8h, v19.8h
+ zip2 v3.8h, v18.8h, v19.8h
+ st1 {v0.8h,v1.8h}, [x3], #32
+ st1 {v2.8h,v3.8h}, [x3], #32
+ b.ge 8b
+ ret
+endfunc
diff --git a/common/aarch64/mc-c.c b/common/aarch64/mc-c.c
index f40fed6..96582d4 100644
--- a/common/aarch64/mc-c.c
+++ b/common/aarch64/mc-c.c
@@ -96,6 +96,8 @@ void integral_init8h_neon( uint16_t *, uint8_t *, intptr_t );
void integral_init8v_neon( uint16_t *, intptr_t );
void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, intptr_t, int, int );
+void x264_mbtree_propagate_cost_neon( int16_t *, uint16_t *, uint16_t *, uint16_t *, uint16_t *, float *, int );
+
#if !HIGH_BIT_DEPTH
static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w )
{
@@ -201,6 +203,89 @@ void x264_hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
int height, int16_t *buf );
#endif // !HIGH_BIT_DEPTH
+#define CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<15)-1)
+#define CLIP_ADD2(s,x)\
+do\
+{\
+ CLIP_ADD((s)[0], (x)[0]);\
+ CLIP_ADD((s)[1], (x)[1]);\
+} while(0)
+
+void x264_mbtree_propagate_list_internal_neon( int16_t (*mvs)[2],
+ int16_t *propagate_amount,
+ uint16_t *lowres_costs,
+ int16_t *output,
+ int bipred_weight, int mb_y,
+ int len );
+
+static void x264_mbtree_propagate_list_neon( x264_t *h, uint16_t *ref_costs,
+ int16_t (*mvs)[2],
+ int16_t *propagate_amount,
+ uint16_t *lowres_costs,
+ int bipred_weight, int mb_y,
+ int len, int list )
+{
+ int16_t *current = h->scratch_buffer2;
+
+ x264_mbtree_propagate_list_internal_neon( mvs, propagate_amount,
+ lowres_costs, current,
+ bipred_weight, mb_y, len );
+
+ unsigned stride = h->mb.i_mb_stride;
+ unsigned width = h->mb.i_mb_width;
+ unsigned height = h->mb.i_mb_height;
+
+ for( unsigned i = 0; i < len; current += 32 )
+ {
+ int end = X264_MIN( i+8, len );
+ for( ; i < end; i++, current += 2 )
+ {
+ if( !(lowres_costs[i] & (1 << (list+LOWRES_COST_SHIFT))) )
+ continue;
+
+ unsigned mbx = current[0];
+ unsigned mby = current[1];
+ unsigned idx0 = mbx + mby * stride;
+ unsigned idx2 = idx0 + stride;
+
+ /* Shortcut for the simple/common case of zero MV */
+ if( !M32( mvs[i] ) )
+ {
+ CLIP_ADD( ref_costs[idx0], current[16] );
+ continue;
+ }
+
+ if( mbx < width-1 && mby < height-1 )
+ {
+ CLIP_ADD2( ref_costs+idx0, current+16 );
+ CLIP_ADD2( ref_costs+idx2, current+32 );
+ }
+ else
+ {
+ /* Note: this takes advantage of unsigned representation to
+ * catch negative mbx/mby. */
+ if( mby < height )
+ {
+ if( mbx < width )
+ CLIP_ADD( ref_costs[idx0+0], current[16] );
+ if( mbx+1 < width )
+ CLIP_ADD( ref_costs[idx0+1], current[17] );
+ }
+ if( mby+1 < height )
+ {
+ if( mbx < width )
+ CLIP_ADD( ref_costs[idx2+0], current[32] );
+ if( mbx+1 < width )
+ CLIP_ADD( ref_costs[idx2+1], current[33] );
+ }
+ }
+ }
+ }
+}
+
+#undef CLIP_ADD
+#undef CLIP_ADD2
+
void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf )
{
#if !HIGH_BIT_DEPTH
@@ -252,5 +337,8 @@ void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf )
pf->integral_init8h = integral_init8h_neon;
pf->integral_init4v = integral_init4v_neon;
pf->integral_init8v = integral_init8v_neon;
+
+ pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon;
+ pf->mbtree_propagate_list = x264_mbtree_propagate_list_neon;
#endif // !HIGH_BIT_DEPTH
}
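
A note on the division in the cost kernel: instead of an FDIV, the NEON code computes
propagate_num/propagate_denom with FRECPE (a low-precision reciprocal estimate) refined
by one Newton-Raphson step via FRECPS. A minimal scalar model of that sequence,
illustrative only; the real FRECPE estimate comes from a small hardware lookup rather
than the crude power-of-two guess used here:

    #include <math.h>

    /* Models the frecpe/frecps/fmul sequence in x264_mbtree_propagate_cost_neon:
     * FRECPE gives a coarse estimate r0 of 1/d, FRECPS computes (2 - d*r0), and
     * the product r0*(2 - d*r0) is one Newton-Raphson refinement, so
     * num * recip_refined(d) stands in for num / d. */
    static inline float recip_refined( float d )
    {
        int e;
        frexpf( d, &e );                /* d = m * 2^e, 0.5 <= m < 1 */
        float r0 = ldexpf( 1.0f, -e );  /* crude 2^-e guess, standing in for FRECPE */
        float step = 2.0f - d * r0;     /* what FRECPS computes */
        return r0 * step;               /* one refinement step */
    }

One refinement step appears to be sufficient here, since the result only has to be
accurate enough for the saturated 16-bit output written by sqxtn.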