[x264-devel] [PATCH 11/24] arm: Implement neon 8x16c intra predict functions

Thu Aug 13 22:59:32 CEST 2015

This implements the same functions as are implemented for 8x8c
and as for 8x16c on aarch64.

Some of the simpler ones actually turn out to be slower than the
plain C version, at least on some CPUs.

checkasm timing       Cortex-A7      A8     A9
intra_predict_8x16c_dc_c     1347    910    1017
intra_predict_8x16c_dc_neon  1271    1366   1247
intra_predict_8x16c_dcl_c    859     677    692
intra_predict_8x16c_dcl_neon 1006    1209   1065
intra_predict_8x16c_dct_c    871     540    590
intra_predict_8x16c_dct_neon 672     511    657
intra_predict_8x16c_h_c      937     712    719
intra_predict_8x16c_h_neon   722     682    672
intra_predict_8x16c_p_c      10184   9967   8652
intra_predict_8x16c_p_neon   2617    1973   1983
intra_predict_8x16c_v_c      610     380    429
intra_predict_8x16c_v_neon   570     513    507
---
 common/arm/predict-a.S |  158 ++++++++++++++++++++++++++++++++++++++++++++++++
 common/arm/predict-c.c |   15 +++++
 common/arm/predict.h   |    8 +++
 common/predict.c       |    4 ++
 4 files changed, 185 insertions(+)

diff --git a/common/arm/predict-a.S b/common/arm/predict-a.S
index 7e5d9d3..228fd2e 100644
--- a/common/arm/predict-a.S
+++ b/common/arm/predict-a.S
@@ -5,6 +5,7 @@
  *
  * Authors: David Conrad <lessen42 at gmail.com>
  *          Mans Rullgard <mans at mansr.com>
+ *          Martin Storsjo <martin at martin.st>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -552,6 +553,163 @@ function x264_predict_8x8c_p_neon
 endfunc
 
 
+function x264_predict_8x16c_dc_top_neon
+    sub         r2,  r0,  #FDEC_STRIDE
+    mov         r1,  #FDEC_STRIDE
+    vld1.8      {d0}, [r2,:64]
+    vpaddl.u8   d0,  d0
+    vpadd.u16   d0,  d0,  d0
+    vrshrn.u16  d0,  q0,  #2
+    vdup.8      d1,  d0[1]
+    vdup.8      d0,  d0[0]
+    vtrn.32     d0,  d1
+    vmov        q1,  q0
+    b           pred8x16_dc_end
+endfunc
+
+function x264_predict_8x16c_dc_left_neon
+    mov         r1,  #FDEC_STRIDE
+    sub         r2,  r0,  #1
+    ldcol.8     d0,  r2,  r1
+    ldcol.8     d2,  r2,  r1
+    vpaddl.u8   d0,  d0
+    vpaddl.u8   d2,  d2
+    vpadd.u16   d0,  d0,  d0
+    vpadd.u16   d2,  d2,  d2
+    vrshrn.u16  d0,  q0,  #2
+    vrshrn.u16  d2,  q1,  #2
+    vdup.8      d1,  d0[1]
+    vdup.8      d0,  d0[0]
+    vdup.8      d3,  d2[1]
+    vdup.8      d2,  d2[0]
+    b           pred8x16_dc_end
+endfunc
+
+function x264_predict_8x16c_dc_neon
+    sub         r2,  r0,  #FDEC_STRIDE
+    mov         r1,  #FDEC_STRIDE
+    vld1.8      {d0}, [r2,:64]
+    sub         r2,  r0,  #1
+    ldcol.8     d1,  r2,  r1
+    vdup.32     d2,  d0[1]
+    ldcol.8     d3,  r2,  r1
+    vtrn.32     d0,  d1
+    vtrn.32     d2,  d3
+    vpaddl.u8   q0,  q0
+    vpaddl.u8   q1,  q1
+    vpadd.u16   d0,  d0,  d1
+    vpadd.u16   d2,  d2,  d3
+    vpadd.u16   d1,  d0,  d0
+    vpadd.u16   d3,  d2,  d2
+    vrshrn.u16  d4,  q0,  #3
+    vrshrn.u16  d5,  q0,  #2
+    vrshrn.u16  d6,  q1,  #3
+    vrshrn.u16  d7,  q1,  #2
+    vdup.8      d0,  d4[4]
+    vdup.8      d1,  d5[3]
+    vdup.8      d16, d5[2]
+    vdup.8      d17, d4[5]
+    vtrn.32     q0,  q8
+    vdup.8      d2,  d7[1]
+    vdup.8      d3,  d7[3]
+    vdup.8      d16, d6[4]
+    vdup.8      d17, d6[5]
+    vtrn.32     q1,  q8
+pred8x16_dc_end:
+    add         r2,  r0,  r1,  lsl #2
+.rept 4
+    vst1.8      {d0}, [r0,:64], r1
+    vst1.8      {d1}, [r2,:64], r1
+.endr
+    add         r2,  r2,  r1,  lsl #2
+    add         r0,  r0,  r1,  lsl #2
+.rept 4
+    vst1.8      {d2}, [r0,:64], r1
+    vst1.8      {d3}, [r2,:64], r1
+.endr
+    bx          lr
+endfunc
+
+function x264_predict_8x16c_h_neon
+    sub         r1, r0, #1
+    mov         ip, #FDEC_STRIDE
+.rept 8
+    vld1.8      {d0[]}, [r1], ip
+    vld1.8      {d2[]}, [r1], ip
+    vst1.64     {d0}, [r0,:64], ip
+    vst1.64     {d2}, [r0,:64], ip
+.endr
+    bx          lr
+endfunc
+
+function x264_predict_8x16c_v_neon
+    sub         r0, r0, #FDEC_STRIDE
+    mov         ip, #FDEC_STRIDE
+    vld1.64     {d0}, [r0,:64], ip
+.rept 16
+    vst1.64     {d0}, [r0,:64], ip
+.endr
+    bx          lr
+endfunc
+
+function x264_predict_8x16c_p_neon
+    sub         r3,  r0,  #FDEC_STRIDE
+    mov         r1,  #FDEC_STRIDE
+    add         r2,  r3,  #4
+    sub         r3,  r3,  #1
+    vld1.32     {d0[0]}, [r3]
+    vld1.32     {d2[0]}, [r2,:32], r1
+    ldcol.8     d1,  r3,  r1
+    add         r3,  r3,  r1
+    ldcol.8     d3,  r3,  r1
+    vrev64.32   d16, d3
+    vaddl.u8    q8,  d2,  d16
+    vrev32.8    d0,  d0
+    vsubl.u8    q2,  d2,  d0
+    vrev64.8    d1,  d1
+    vsubl.u8    q3,  d3,  d1
+    movrel      r3,  p16weight
+    vld1.16     {q0}, [r3,:128]
+    vmul.s16    d4,  d4,  d0
+    vmul.s16    q3,  q3,  q0
+    vpadd.i16   d4,  d4,  d5
+    vpadd.i16   d6,  d6,  d7
+    vpaddl.s16  d4,  d4        @ d4[0] = H
+    vpaddl.s16  d6,  d6
+    vpadd.s32   d6,  d6        @ d6[0] = V
+    vshl.i32    d5,  d4,  #4
+    vadd.s32    d4,  d4,  d5   @ d4[0] = 17*H
+    vshl.i32    d7,  d6,  #2
+    vrshrn.s32  d4,  q2,  #5   @ d4[0] = b
+    vadd.s32    d6,  d6,  d7   @ d6[0] = 5*V
+    vrshrn.s32  d6,  q3,  #6   @ d6[0] = c
+    mov         r3,  #0
+    vshl.i16    d3,  d4,  #2
+    vsub.i16    d3,  d3,  d4   @ d2[0] = 3 * b
+    vshl.i16    d2,  d6,  #3
+    vadd.i16    d3,  d3,  d2   @ d2[0] = 3 * b + 8 * c
+    vsub.i16    d3,  d3,  d6   @ d2[0] = 3 * b + 7 * c
+    vrev64.16   d16, d16
+    vadd.i16    d16, d16, d0   @ d16[0] = src[]+src[] + 1
+    vshl.i16    d2,  d16, #4   @ d3[0] = a + 16
+    vsub.i16    d2,  d2,  d3   @ i00
+    vext.16     q0,  q0,  q0,  #7
+    vmov.16     d0[0], r3
+    vmul.i16    q0,  q0,  d4[0]
+    vdup.16     q1,  d2[0]
+    vdup.16     q3,  d6[0]
+    vadd.i16    q1,  q1,  q0
+    mov         r3,  #16
+1:
+    vqshrun.s16 d0,  q1,  #5
+    vadd.i16    q1,  q1,  q3
+    vst1.8      {d0}, [r0,:64], r1
+    subs        r3,  r3,  #1
+    bne         1b
+    bx          lr
+endfunc
+
+
 function x264_predict_16x16_dc_top_neon
     sub         r2,  r0,  #FDEC_STRIDE
     mov         r1,  #FDEC_STRIDE
diff --git a/common/arm/predict-c.c b/common/arm/predict-c.c
index e0ba0da..b0aedfc 100644
--- a/common/arm/predict-c.c
+++ b/common/arm/predict-c.c
@@ -61,6 +61,21 @@ void x264_predict_8x8c_init_arm( int cpu, x264_predict_t pf[7] )
 #endif // !HIGH_BIT_DEPTH
 }
 
+void x264_predict_8x16c_init_arm( int cpu, x264_predict_t pf[7] )
+{
+    if (!(cpu&X264_CPU_NEON))
+        return;
+
+#if !HIGH_BIT_DEPTH
+    pf[I_PRED_CHROMA_DC]      = x264_predict_8x16c_dc_neon;
+    pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x16c_dc_top_neon;
+    pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x16c_dc_left_neon;
+    pf[I_PRED_CHROMA_H]       = x264_predict_8x16c_h_neon;
+    pf[I_PRED_CHROMA_V]       = x264_predict_8x16c_v_neon;
+    pf[I_PRED_CHROMA_P]       = x264_predict_8x16c_p_neon;
+#endif // !HIGH_BIT_DEPTH
+}
+
 void x264_predict_8x8_init_arm( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter )
 {
     if (!(cpu&X264_CPU_NEON))
diff --git a/common/arm/predict.h b/common/arm/predict.h
index 242043d..2aa902a 100644
--- a/common/arm/predict.h
+++ b/common/arm/predict.h
@@ -40,6 +40,13 @@ void x264_predict_8x8c_h_neon( uint8_t *src );
 void x264_predict_8x8c_v_neon( uint8_t *src );
 void x264_predict_8x8c_p_neon( uint8_t *src );
 
+void x264_predict_8x16c_v_neon( uint8_t *src );
+void x264_predict_8x16c_h_neon( uint8_t *src );
+void x264_predict_8x16c_dc_neon( uint8_t *src );
+void x264_predict_8x16c_dc_left_neon( uint8_t *src );
+void x264_predict_8x16c_dc_top_neon( uint8_t *src );
+void x264_predict_8x16c_p_neon( uint8_t *src );
+
 void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] );
 void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] );
 void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] );
@@ -60,6 +67,7 @@ void x264_predict_16x16_p_neon( uint8_t *src );
 void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] );
 void x264_predict_8x8_init_arm( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter );
 void x264_predict_8x8c_init_arm( int cpu, x264_predict_t pf[7] );
+void x264_predict_8x16c_init_arm( int cpu, x264_predict_t pf[7] );
 void x264_predict_16x16_init_arm( int cpu, x264_predict_t pf[7] );
 
 #endif
diff --git a/common/predict.c b/common/predict.c
index c0f2a0b..f7080f0 100644
--- a/common/predict.c
+++ b/common/predict.c
@@ -977,6 +977,10 @@ void x264_predict_8x16c_init( int cpu, x264_predict_t pf[7] )
     x264_predict_8x16c_init_mmx( cpu, pf );
 #endif
 
+#if HAVE_ARMV6
+    x264_predict_8x16c_init_arm( cpu, pf );
+#endif
+
 #if ARCH_AARCH64
     x264_predict_8x16c_init_aarch64( cpu, pf );
 #endif
-- 
1.7.10.4