[x264-devel] [PATCH 22/24] RFC: arm: Implement x264_mbtree_propagate_{cost, list}_neon
Martin Storsjö
martin at martin.st
Thu Aug 13 22:59:43 CEST 2015
The cost function could be simplified to avoid having to clobber
q4/q5, but that would require reordering instructions in a way that
increases the total runtime.
The list function could avoid pushing q4-q7 to the stack by
reusing registers and reloading the constants, but in practice that
also turns out to be slightly slower.
checkasm timing             Cortex-A7       A8       A9
mbtree_propagate_cost_c         63584   156719    62616
mbtree_propagate_cost_neon      17316    10833    11290
mbtree_propagate_list_c        110894   108411    84324
mbtree_propagate_list_neon      83313    78821    62271
---
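For reference, this is roughly what the scalar cost function computes,
paraphrased from memory of the C version in common/mc.c (the function name,
helper macros and exact rounding here are approximate, not copied from the
tree); the NEON code below replaces the division with a vrecpe/vrecps
reciprocal estimate plus one Newton-Raphson step:

#include <stdint.h>

#define X264_MIN(a,b)    ((a) < (b) ? (a) : (b))
#define LOWRES_COST_MASK ((1<<14)-1)  /* low 14 bits: cost, top 2 bits: lists used */

/* hedged sketch -- not the actual x264 reference implementation */
static void mbtree_propagate_cost_ref( int16_t *dst, uint16_t *propagate_in,
                                       uint16_t *intra_costs, uint16_t *inter_costs,
                                       uint16_t *inv_qscales, float *fps_factor, int len )
{
    float fps = *fps_factor;
    for( int i = 0; i < len; i++ )
    {
        int intra_cost = intra_costs[i];
        int inter_cost = X264_MIN( intra_cost, inter_costs[i] & LOWRES_COST_MASK );
        float propagate_intra  = intra_cost * inv_qscales[i];
        float propagate_amount = propagate_in[i] + propagate_intra*fps;
        float propagate_num    = intra_cost - inter_cost;
        float propagate_denom  = intra_cost;
        dst[i] = X264_MIN( (int)(propagate_amount * propagate_num / propagate_denom + 0.5f), 32767 );
    }
}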
common/arm/mc-a.S | 128 +++++++++++++++++++++++++++++++++++++++++++++++++++++
common/arm/mc-c.c | 86 +++++++++++++++++++++++++++++++++++
2 files changed, 214 insertions(+)
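And a rough C sketch of the per-macroblock arithmetic done by
x264_mbtree_propagate_list_internal_neon, reconstructed from the comments in
the assembly below: the helper only fills a scratch buffer (48 int16 values
per group of 8 macroblocks: the mbx/mby pairs, then the idx0/idx1 weights,
then the idx2/idx3 weights), and the C wrapper in mc-c.c does the clipped
accumulation into ref_costs. Function name and loop structure here are
illustrative only:

#include <stdint.h>

/* hedged sketch -- output layout matches what the wrapper in mc-c.c consumes */
static void mbtree_propagate_list_internal_ref( int16_t (*mvs)[2], int16_t *propagate_amount,
                                                uint16_t *lowres_costs, int16_t *output,
                                                int bipred_weight, int mb_y, int len )
{
    for( int i = 0; i < len; i++ )
    {
        /* 48 values per group of 8 MBs, 2 values per MB within each sub-block */
        int16_t *current = output + 48*(i/8) + 2*(i%8);
        int amount = propagate_amount[i];

        /* both lists used (top two bits of lowres_cost set): apply the bipred weight */
        if( (lowres_costs[i] & 0xc000) == 0xc000 )
            amount = (amount * bipred_weight + 32) >> 6;

        int x = mvs[i][0], y = mvs[i][1];
        current[0] = (x >> 5) + i;      /* mbx: whole-MB part of the MV plus current position */
        current[1] = (y >> 5) + mb_y;   /* mby */
        x &= 31;                        /* fractional position within the target MB */
        y &= 31;

        /* bilinear weights of the four MBs the propagated area overlaps */
        current[16] = ((32-y)*(32-x)*amount + 512) >> 10;  /* idx0weight */
        current[17] = ((32-y)*  x   *amount + 512) >> 10;  /* idx1weight */
        current[32] = (  y  *(32-x)*amount + 512) >> 10;   /* idx2weight */
        current[33] = (  y  *  x   *amount + 512) >> 10;   /* idx3weight */
    }
}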
diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S
index 6538dec..beae5bd 100644
--- a/common/arm/mc-a.S
+++ b/common/arm/mc-a.S
@@ -28,6 +28,11 @@
#include "asm.S"
+.section .rodata
+.align 4
+pw_0to15:
+.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
.text
// note: prefetch stuff assumes 64-byte cacheline, true for the Cortex-A8
@@ -1760,3 +1765,126 @@ function integral_init8v_neon
2:
bx lr
endfunc
+
+function x264_mbtree_propagate_cost_neon
+ vpush {q4-q5}
+ push {r4-r6}
+ ldrd r4, r5, [sp, #44]
+ ldr r6, [sp, #52]
+ vld1.32 {d6[]}, [r5] @ *fps_factor
+ vmov d7, d6
+8:
+ subs r6, r6, #8
+ vld1.16 {q8}, [r1]! @ propagate_in
+ vld1.16 {q9}, [r2]! @ intra_costs
+ vld1.16 {q10}, [r3]! @ inter_costs
+ vld1.16 {q11}, [r4]! @ inv_qscales
+ vbic.u16 q10, q10, #0xc000 @ inter_costs & LOWRES_COST_MASK
+ vmin.u16 q10, q9, q10 @ inter_cost = min(intra_cost, inter_cost)
+ vmull.u16 q12, d18, d22 @ propagate_intra
+ vmull.u16 q13, d19, d23 @ propagate_intra
+ vsubl.u16 q14, d18, d20 @ propagate_num
+ vsubl.u16 q15, d19, d21 @ propagate_num
+ vmovl.u16 q10, d18 @ propagate_denom
+ vmovl.u16 q11, d19 @ propagate_denom
+ vmovl.u16 q9, d17
+ vmovl.u16 q8, d16
+ vcvt.f32.s32 q12, q12
+ vcvt.f32.s32 q13, q13
+ vcvt.f32.s32 q14, q14
+ vcvt.f32.s32 q15, q15
+ vcvt.f32.s32 q10, q10
+ vcvt.f32.s32 q11, q11
+ vrecpe.f32 q0, q10 @ reciprocal estimate of propagate_denom
+ vrecpe.f32 q1, q11
+ vcvt.f32.s32 q8, q8
+ vcvt.f32.s32 q9, q9
+ vrecps.f32 q4, q0, q10 @ Newton-Raphson correction for the reciprocal
+ vrecps.f32 q5, q1, q11
+ vmla.f32 q8, q12, q3 @ propagate_amount
+ vmla.f32 q9, q13, q3 @ propagate_amount
+ vmul.f32 q0, q0, q4 @ refined 1/propagate_denom
+ vmul.f32 q1, q1, q5
+ vmul.f32 q8, q8, q14 @ propagate_amount * propagate_num
+ vmul.f32 q9, q9, q15
+ vmul.f32 q0, q8, q0 @ ... / propagate_denom
+ vmul.f32 q1, q9, q1
+ vcvt.s32.f32 q0, q0
+ vcvt.s32.f32 q1, q1
+ vqmovn.s32 d0, q0
+ vqmovn.s32 d1, q1
+ vst1.16 {q0}, [r0]!
+ bge 8b
+ pop {r4-r6}
+ vpop {q4-q5}
+ bx lr
+endfunc
+
+function x264_mbtree_propagate_list_internal_neon
+ vpush {q4-q7}
+ push {r4-r6}
+ ldrd r4, r5, [sp, #76]
+ ldr r6, [sp, #84]
+ movrel r12, pw_0to15
+ vdup.16 d12, r4 @ bipred_weight
+ vmov.u16 q4, #0xc000
+ vld1.16 {q0}, [r12, :128] @ h->mb.i_mb_x, h->mb.i_mb_y (x = 0..7, zipped with mb_y below)
+ vmov.u32 q5, #4
+ vmov.u16 q2, #31
+ vmov.u16 q3, #32
+ vdup.u16 q8, r5 @ mb_y
+ vzip.u16 q0, q8 @ q0 = interleaved {mb_x, mb_y} pairs
+8:
+ subs r6, r6, #8
+ vld1.16 {q8}, [r1, :128]! @ propagate_amount
+ vld1.16 {q9}, [r2, :128]! @ lowres_cost
+ vand q9, q9, q4 @ keep the lists_used bits
+ vceq.u16 q1, q9, q4 @ lists_used == 3?
+ vmull.u16 q10, d16, d12
+ vmull.u16 q11, d17, d12
+ vrshrn.u32 d20, q10, #6
+ vrshrn.u32 d21, q11, #6
+ vbsl q1, q10, q8 @ if( lists_used == 3 )
+ @ propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
+ vld1.16 {q8, q9}, [r0, :128]!
+ vshr.s16 q10, q8, #5 @ mv >> 5: whole-MB offset
+ vshr.s16 q11, q9, #5
+ vadd.s16 q10, q10, q0 @ {mbx, mby} for the first four MVs
+ vadd.s16 q0, q0, q5 @ mb_x += 4
+ vadd.s16 q11, q11, q0 @ {mbx, mby} for the next four MVs
+ vadd.s16 q0, q0, q5 @ mb_x += 4 again (+8 per loop iteration)
+ vst1.16 {q10, q11}, [r3, :128]!
+ vand q8, q8, q2
+ vand q9, q9, q2
+ vuzp.16 q8, q9 @ x & 31, y & 31
+ vsub.s16 q10, q3, q8 @ 32 - (x & 31)
+ vsub.s16 q11, q3, q9 @ 32 - (y & 31)
+ vmul.u16 q12, q8, q9 @ idx3weight = y*x
+ vmul.u16 q13, q10, q9 @ idx2weight = y*(32-x)
+ vmul.u16 q14, q8, q11 @ idx1weight = (32-y)*x
+ vmul.u16 q15, q10, q11 @ idx0weight = (32-y)*(32-x)
+ vmull.u16 q8, d24, d2 @ idx3weight
+ vmull.u16 q9, d25, d3
+ vmull.u16 q10, d26, d2 @ idx2weight
+ vmull.u16 q11, d27, d3
+ vmull.u16 q12, d28, d2 @ idx1weight
+ vmull.u16 q13, d29, d3
+ vmull.u16 q14, d30, d2 @ idx0weight
+ vmull.u16 q15, d31, d3
+ vrshrn.u32 d19, q9, #10 @ idx3weight
+ vrshrn.u32 d18, q8, #10
+ vrshrn.u32 d16, q10, #10 @ idx2weight
+ vrshrn.u32 d17, q11, #10
+ vrshrn.u32 d22, q12, #10 @ idx1weight
+ vrshrn.u32 d23, q13, #10
+ vrshrn.u32 d20, q14, #10 @ idx0weight
+ vrshrn.u32 d21, q15, #10
+ vzip.16 q10, q11
+ vzip.16 q8, q9
+ vst1.16 {q10, q11}, [r3, :128]!
+ vst1.16 {q8, q9}, [r3, :128]!
+ bge 8b
+ pop {r4-r6}
+ vpop {q4-q7}
+ bx lr
+endfunc
diff --git a/common/arm/mc-c.c b/common/arm/mc-c.c
index dd86fb2..eb582c2 100644
--- a/common/arm/mc-c.c
+++ b/common/arm/mc-c.c
@@ -4,6 +4,7 @@
* Copyright (C) 2009-2015 x264 project
*
* Authors: David Conrad <lessen42 at gmail.com>
+ * Janne Grunau <janne-x264 at jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -104,6 +105,8 @@ void integral_init4v_neon( uint16_t *, uint16_t *, intptr_t );
void integral_init8h_neon( uint16_t *, uint8_t *, intptr_t );
void integral_init8v_neon( uint16_t *, intptr_t );
+void x264_mbtree_propagate_cost_neon( int16_t *, uint16_t *, uint16_t *, uint16_t *, uint16_t *, float *, int );
+
#if !HIGH_BIT_DEPTH
static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w )
{
@@ -226,6 +229,86 @@ static void hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8
}
#endif // !HIGH_BIT_DEPTH
+#define CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<15)-1)
+#define CLIP_ADD2(s,x)\
+do\
+{\
+ CLIP_ADD((s)[0], (x)[0]);\
+ CLIP_ADD((s)[1], (x)[1]);\
+} while(0)
+
+void x264_mbtree_propagate_list_internal_neon( int16_t (*mvs)[2],
+ int16_t *propagate_amount,
+ uint16_t *lowres_costs,
+ int16_t *output,
+ int bipred_weight, int mb_y,
+ int len );
+
+static void x264_mbtree_propagate_list_neon( x264_t *h, uint16_t *ref_costs,
+ int16_t (*mvs)[2],
+ int16_t *propagate_amount,
+ uint16_t *lowres_costs,
+ int bipred_weight, int mb_y,
+ int len, int list )
+{
+ int16_t *current = h->scratch_buffer2; /* filled by the asm: {mbx,mby}, idx0/1 and idx2/3 weights per MB */
+
+ x264_mbtree_propagate_list_internal_neon( mvs, propagate_amount,
+ lowres_costs, current,
+ bipred_weight, mb_y, len );
+
+ unsigned stride = h->mb.i_mb_stride;
+ unsigned width = h->mb.i_mb_width;
+ unsigned height = h->mb.i_mb_height;
+
+ for( unsigned i = 0; i < len; current += 32 )
+ {
+ int end = X264_MIN( i+8, len );
+ for( ; i < end; i++, current += 2 )
+ {
+ if( !(lowres_costs[i] & (1 << (list+LOWRES_COST_SHIFT))) )
+ continue;
+
+ unsigned mbx = current[0];
+ unsigned mby = current[1];
+ unsigned idx0 = mbx + mby * stride;
+ unsigned idx2 = idx0 + stride;
+
+ /* Shortcut for the simple/common case of zero MV */
+ if( !M32( mvs[i] ) )
+ {
+ CLIP_ADD( ref_costs[idx0], current[16] );
+ continue;
+ }
+
+ if( mbx < width-1 && mby < height-1 )
+ {
+ CLIP_ADD2( ref_costs+idx0, current+16 );
+ CLIP_ADD2( ref_costs+idx2, current+32 );
+ }
+ else
+ {
+ /* Note: this takes advantage of unsigned representation to
+ * catch negative mbx/mby. */
+ if( mby < height )
+ {
+ if( mbx < width )
+ CLIP_ADD( ref_costs[idx0+0], current[16] );
+ if( mbx+1 < width )
+ CLIP_ADD( ref_costs[idx0+1], current[17] );
+ }
+ if( mby+1 < height )
+ {
+ if( mbx < width )
+ CLIP_ADD( ref_costs[idx2+0], current[32] );
+ if( mbx+1 < width )
+ CLIP_ADD( ref_costs[idx2+1], current[33] );
+ }
+ }
+ }
+ }
+}
+
void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf )
{
if( !(cpu&X264_CPU_ARMV6) )
@@ -281,6 +364,9 @@ void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf )
pf->integral_init8h = integral_init8h_neon;
pf->integral_init4v = integral_init4v_neon;
pf->integral_init8v = integral_init8v_neon;
+
+ pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon;
+ pf->mbtree_propagate_list = x264_mbtree_propagate_list_neon;
#endif // !HIGH_BIT_DEPTH
// Apple's gcc stupidly cannot align stack variables, and ALIGNED_ARRAY can't work on structs
--
1.7.10.4