[x264-devel] [PATCH 22/24] RFC: arm: Implement x264_mbtree_propagate_{cost, list}_neon
Martin Storsjö
martin at martin.st
Thu Aug 13 22:59:43 CEST 2015
The cost function could be simplified to avoid having to clobber
q4/q5, but that would require reordering instructions in a way that
increases the total runtime.
The list function could avoid pushing q4-q7 to the stack by
reusing registers and reloading the constants, but in practice that
also turns out to be slightly slower.
checkasm timing             Cortex-A7       A8       A9
mbtree_propagate_cost_c         63584   156719    62616
mbtree_propagate_cost_neon      17316    10833    11290
mbtree_propagate_list_c        110894   108411    84324
mbtree_propagate_list_neon      83313    78821    62271
---
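For reference, this is roughly what the scalar cost function computes,
paraphrased from memory of the C version in common/mc.c (the function name,
helper macros and exact rounding here are approximate, not copied from the
tree); the NEON code below replaces the division with a vrecpe/vrecps
reciprocal estimate plus one Newton-Raphson step:

#include <stdint.h>

#define X264_MIN(a,b)    ((a) < (b) ? (a) : (b))
#define LOWRES_COST_MASK ((1<<14)-1)  /* low 14 bits: cost, top 2 bits: lists used */

/* hedged sketch -- not the actual x264 reference implementation */
static void mbtree_propagate_cost_ref( int16_t *dst, uint16_t *propagate_in,
                                       uint16_t *intra_costs, uint16_t *inter_costs,
                                       uint16_t *inv_qscales, float *fps_factor, int len )
{
    float fps = *fps_factor;
    for( int i = 0; i < len; i++ )
    {
        int intra_cost = intra_costs[i];
        int inter_cost = X264_MIN( intra_cost, inter_costs[i] & LOWRES_COST_MASK );
        float propagate_intra  = intra_cost * inv_qscales[i];
        float propagate_amount = propagate_in[i] + propagate_intra*fps;
        float propagate_num    = intra_cost - inter_cost;
        float propagate_denom  = intra_cost;
        dst[i] = X264_MIN( (int)(propagate_amount * propagate_num / propagate_denom + 0.5f), 32767 );
    }
}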
common/arm/mc-a.S | 128 +++++++++++++++++++++++++++++++++++++++++++++++++++++
common/arm/mc-c.c | 86 +++++++++++++++++++++++++++++++++++
2 files changed, 214 insertions(+)
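And a rough C sketch of the per-macroblock arithmetic done by
x264_mbtree_propagate_list_internal_neon, reconstructed from the comments in
the assembly below: the helper only fills a scratch buffer (48 int16 values
per group of 8 macroblocks: the mbx/mby pairs, then the idx0/idx1 weights,
then the idx2/idx3 weights), and the C wrapper in mc-c.c does the clipped
accumulation into ref_costs. Function name and loop structure here are
illustrative only:

#include <stdint.h>

/* hedged sketch -- output layout matches what the wrapper in mc-c.c consumes */
static void mbtree_propagate_list_internal_ref( int16_t (*mvs)[2], int16_t *propagate_amount,
                                                uint16_t *lowres_costs, int16_t *output,
                                                int bipred_weight, int mb_y, int len )
{
    for( int i = 0; i < len; i++ )
    {
        /* 48 values per group of 8 MBs, 2 values per MB within each sub-block */
        int16_t *current = output + 48*(i/8) + 2*(i%8);
        int amount = propagate_amount[i];

        /* both lists used (top two bits of lowres_cost set): apply the bipred weight */
        if( (lowres_costs[i] & 0xc000) == 0xc000 )
            amount = (amount * bipred_weight + 32) >> 6;

        int x = mvs[i][0], y = mvs[i][1];
        current[0] = (x >> 5) + i;      /* mbx: whole-MB part of the MV plus current position */
        current[1] = (y >> 5) + mb_y;   /* mby */
        x &= 31;                        /* fractional position within the target MB */
        y &= 31;

        /* bilinear weights of the four MBs the propagated area overlaps */
        current[16] = ((32-y)*(32-x)*amount + 512) >> 10;  /* idx0weight */
        current[17] = ((32-y)*  x   *amount + 512) >> 10;  /* idx1weight */
        current[32] = (  y  *(32-x)*amount + 512) >> 10;   /* idx2weight */
        current[33] = (  y  *  x   *amount + 512) >> 10;   /* idx3weight */
    }
}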
diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S
index 6538dec..beae5bd 100644
--- a/common/arm/mc-a.S
+++ b/common/arm/mc-a.S
@@ -28,6 +28,11 @@
#include "asm.S"
+.section .rodata
+.align 4
+pw_0to15:
+.short 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+
.text
// note: prefetch stuff assumes 64-byte cacheline, true for the Cortex-A8
@@ -1760,3 +1765,126 @@ function integral_init8v_neon
2:
bx lr
endfunc
+
+function x264_mbtree_propagate_cost_neon
+ vpush {q4-q5}
+ push {r4-r6}
+ ldrd r4, r5, [sp, #44]
+ ldr r6, [sp, #52]
+ vld1.32 {d6[]}, [r5] @ *fps_factor
+ vmov d7, d6
+8:
+ subs r6, r6, #8
+ vld1.16 {q8}, [r1]! @ propagate_in
+ vld1.16 {q9}, [r2]! @ intra_costs
+ vld1.16 {q10}, [r3]! @ inter_costs
+ vld1.16 {q11}, [r4]! @ inv_qscales
+ vbic.u16 q10, q10, #0xc000 @ inter_costs & LOWRES_COST_MASK
+ vmin.u16 q10, q9, q10 @ inter_cost = min(intra_cost, inter_cost)
+ vmull.u16 q12, d18, d22 @ propagate_intra
+ vmull.u16 q13, d19, d23 @ propagate_intra
+ vsubl.u16 q14, d18, d20 @ propagate_num
+ vsubl.u16 q15, d19, d21 @ propagate_num
+ vmovl.u16 q10, d18 @ propagate_denom
+ vmovl.u16 q11, d19 @ propagate_denom
+ vmovl.u16 q9, d17
+ vmovl.u16 q8, d16
+ vcvt.f32.s32 q12, q12
+ vcvt.f32.s32 q13, q13
+ vcvt.f32.s32 q14, q14
+ vcvt.f32.s32 q15, q15
+ vcvt.f32.s32 q10, q10
+ vcvt.f32.s32 q11, q11
+ vrecpe.f32 q0, q10 @ reciprocal estimate of propagate_denom
+ vrecpe.f32 q1, q11
+ vcvt.f32.s32 q8, q8
+ vcvt.f32.s32 q9, q9
+ vrecps.f32 q4, q0, q10 @ Newton-Raphson correction for the reciprocal
+ vrecps.f32 q5, q1, q11
+ vmla.f32 q8, q12, q3 @ propagate_amount
+ vmla.f32 q9, q13, q3 @ propagate_amount
+ vmul.f32 q0, q0, q4 @ refined 1/propagate_denom
+ vmul.f32 q1, q1, q5
+ vmul.f32 q8, q8, q14 @ propagate_amount * propagate_num
+ vmul.f32 q9, q9, q15
+ vmul.f32 q0, q8, q0 @ ... / propagate_denom
+ vmul.f32 q1, q9, q1
+ vcvt.s32.f32 q0, q0
+ vcvt.s32.f32 q1, q1
+ vqmovn.s32 d0, q0
+ vqmovn.s32 d1, q1
+ vst1.16 {q0}, [r0]!
+ bge 8b
+ pop {r4-r6}
+ vpop {q4-q5}
+ bx lr
+endfunc
+
+function x264_mbtree_propagate_list_internal_neon
+ vpush {q4-q7}
+ push {r4-r6}
+ ldrd r4, r5, [sp, #76]
+ ldr r6, [sp, #84]
+ movrel r12, pw_0to15
+ vdup.16 d12, r4 @ bipred_weight
+ vmov.u16 q4, #0xc000
+ vld1.16 {q0}, [r12, :128] @ h->mb.i_mb_x, h->mb.i_mb_y (x = 0..7, zipped with mb_y below)
+ vmov.u32 q5, #4
+ vmov.u16 q2, #31
+ vmov.u16 q3, #32
+ vdup.u16 q8, r5 @ mb_y
+ vzip.u16 q0, q8 @ q0 = interleaved {mb_x, mb_y} pairs
+8:
+ subs r6, r6, #8
+ vld1.16 {q8}, [r1, :128]! @ propagate_amount
+ vld1.16 {q9}, [r2, :128]! @ lowres_cost
+ vand q9, q9, q4 @ keep the lists_used bits
+ vceq.u16 q1, q9, q4 @ lists_used == 3?
+ vmull.u16 q10, d16, d12
+ vmull.u16 q11, d17, d12
+ vrshrn.u32 d20, q10, #6
+ vrshrn.u32 d21, q11, #6
+ vbsl q1, q10, q8 @ if( lists_used == 3 )
+ @ propagate_amount = (propagate_amount * bipred_weight + 32) >> 6
+ vld1.16 {q8, q9}, [r0, :128]!
+ vshr.s16 q10, q8, #5 @ mv >> 5: whole-MB offset
+ vshr.s16 q11, q9, #5
+ vadd.s16 q10, q10, q0 @ {mbx, mby} for the first four MVs
+ vadd.s16 q0, q0, q5 @ mb_x += 4
+ vadd.s16 q11, q11, q0 @ {mbx, mby} for the next four MVs
+ vadd.s16 q0, q0, q5 @ mb_x += 4 again (+8 per loop iteration)
+ vst1.16 {q10, q11}, [r3, :128]!
+ vand q8, q8, q2
+ vand q9, q9, q2
+ vuzp.16 q8, q9 @ x & 31, y & 31
+ vsub.s16 q10, q3, q8 @ 32 - (x & 31)
+ vsub.s16 q11, q3, q9 @ 32 - (y & 31)
+ vmul.u16 q12, q8, q9 @ idx3weight = y*x
+ vmul.u16 q13, q10, q9 @ idx2weight = y*(32-x)
+ vmul.u16 q14, q8, q11 @ idx1weight = (32-y)*x
+ vmul.u16 q15, q10, q11 @ idx0weight = (32-y)*(32-x)
+ vmull.u16 q8, d24, d2 @ idx3weight
+ vmull.u16 q9, d25, d3
+ vmull.u16 q10, d26, d2 @ idx2weight
+ vmull.u16 q11, d27, d3
+ vmull.u16 q12, d28, d2 @ idx1weight
+ vmull.u16 q13, d29, d3
+ vmull.u16 q14, d30, d2 @ idx0weight
+ vmull.u16 q15, d31, d3
+ vrshrn.u32 d19, q9, #10 @ idx3weight
+ vrshrn.u32 d18, q8, #10
+ vrshrn.u32 d16, q10, #10 @ idx2weight
+ vrshrn.u32 d17, q11, #10
+ vrshrn.u32 d22, q12, #10 @ idx1weight
+ vrshrn.u32 d23, q13, #10
+ vrshrn.u32 d20, q14, #10 @ idx0weight
+ vrshrn.u32 d21, q15, #10
+ vzip.16 q10, q11
+ vzip.16 q8, q9
+ vst1.16 {q10, q11}, [r3, :128]!
+ vst1.16 {q8, q9}, [r3, :128]!
+ bge 8b
+ pop {r4-r6}
+ vpop {q4-q7}
+ bx lr
+endfunc
diff --git a/common/arm/mc-c.c b/common/arm/mc-c.c
index dd86fb2..eb582c2 100644
--- a/common/arm/mc-c.c
+++ b/common/arm/mc-c.c
@@ -4,6 +4,7 @@
* Copyright (C) 2009-2015 x264 project
*
* Authors: David Conrad <lessen42 at gmail.com>
+ * Janne Grunau <janne-x264 at jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -104,6 +105,8 @@ void integral_init4v_neon( uint16_t *, uint16_t *, intptr_t );
void integral_init8h_neon( uint16_t *, uint8_t *, intptr_t );
void integral_init8v_neon( uint16_t *, intptr_t );
+void x264_mbtree_propagate_cost_neon( int16_t *, uint16_t *, uint16_t *, uint16_t *, uint16_t *, float *, int );
+
#if !HIGH_BIT_DEPTH
static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w )
{
@@ -226,6 +229,86 @@ static void hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8
}
#endif // !HIGH_BIT_DEPTH
+#define CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<15)-1)
+#define CLIP_ADD2(s,x)\
+do\
+{\
+ CLIP_ADD((s)[0], (x)[0]);\
+ CLIP_ADD((s)[1], (x)[1]);\
+} while(0)
+
+void x264_mbtree_propagate_list_internal_neon( int16_t (*mvs)[2],
+ int16_t *propagate_amount,
+ uint16_t *lowres_costs,
+ int16_t *output,
+ int bipred_weight, int mb_y,
+ int len );
+
+static void x264_mbtree_propagate_list_neon( x264_t *h, uint16_t *ref_costs,
+ int16_t (*mvs)[2],
+ int16_t *propagate_amount,
+ uint16_t *lowres_costs,
+ int bipred_weight, int mb_y,
+ int len, int list )
+{
+ int16_t *current = h->scratch_buffer2; /* filled by the asm: {mbx,mby}, idx0/1 and idx2/3 weights per MB */
+
+ x264_mbtree_propagate_list_internal_neon( mvs, propagate_amount,
+ lowres_costs, current,
+ bipred_weight, mb_y, len );
+
+ unsigned stride = h->mb.i_mb_stride;
+ unsigned width = h->mb.i_mb_width;
+ unsigned height = h->mb.i_mb_height;
+
+ for( unsigned i = 0; i < len; current += 32 )
+ {
+ int end = X264_MIN( i+8, len );
+ for( ; i < end; i++, current += 2 )
+ {
+ if( !(lowres_costs[i] & (1 << (list+LOWRES_COST_SHIFT))) )
+ continue;
+
+ unsigned mbx = current[0];
+ unsigned mby = current[1];
+ unsigned idx0 = mbx + mby * stride;
+ unsigned idx2 = idx0 + stride;
+
+ /* Shortcut for the simple/common case of zero MV */
+ if( !M32( mvs[i] ) )
+ {
+ CLIP_ADD( ref_costs[idx0], current[16] );
+ continue;
+ }
+
+ if( mbx < width-1 && mby < height-1 )
+ {
+ CLIP_ADD2( ref_costs+idx0, current+16 );
+ CLIP_ADD2( ref_costs+idx2, current+32 );
+ }
+ else
+ {
+ /* Note: this takes advantage of unsigned representation to
+ * catch negative mbx/mby. */
+ if( mby < height )
+ {
+ if( mbx < width )
+ CLIP_ADD( ref_costs[idx0+0], current[16] );
+ if( mbx+1 < width )
+ CLIP_ADD( ref_costs[idx0+1], current[17] );
+ }
+ if( mby+1 < height )
+ {
+ if( mbx < width )
+ CLIP_ADD( ref_costs[idx2+0], current[32] );
+ if( mbx+1 < width )
+ CLIP_ADD( ref_costs[idx2+1], current[33] );
+ }
+ }
+ }
+ }
+}
+
void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf )
{
if( !(cpu&X264_CPU_ARMV6) )
@@ -281,6 +364,9 @@ void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf )
pf->integral_init8h = integral_init8h_neon;
pf->integral_init4v = integral_init4v_neon;
pf->integral_init8v = integral_init8v_neon;
+
+ pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon;
+ pf->mbtree_propagate_list = x264_mbtree_propagate_list_neon;
#endif // !HIGH_BIT_DEPTH
// Apple's gcc stupidly cannot align stack variables, and ALIGNED_ARRAY can't work on structs
--
1.7.10.4