[x264-devel] [PATCH 20/24] RFC: arm: Implement x264_coeff_level_run{4, 8, 15, 16}_neon

Thu Aug 13 22:59:41 CEST 2015

These are mostly slower than the plain C versions;
only on Cortex-A7 are some of them faster.

checkasm timing       Cortex-A7      A8     A9
coeff_level_run4_c           366     331    296
coeff_level_run4_neon        379     496    416
coeff_level_run8_c           518     476    513
coeff_level_run8_neon        499     574    497
coeff_level_run15_c          858     792    572
coeff_level_run15_neon       836     806    708
coeff_level_run16_c          904     798    597
coeff_level_run16_neon       843     811    751
---
 common/arm/quant-a.S |   97 ++++++++++++++++++++++++++++++++++++++++++++++++++
 common/arm/quant.h   |    5 +++
 common/quant.c       |   13 +++----
 3 files changed, 109 insertions(+), 6 deletions(-)

diff --git a/common/arm/quant-a.S b/common/arm/quant-a.S
index 5ec8c04..ce11fd3 100644
--- a/common/arm/quant-a.S
+++ b/common/arm/quant-a.S
@@ -5,6 +5,7 @@
  *
  * Authors: David Conrad <lessen42 at gmail.com>
  *          Janne Grunau <janne-x264 at jannau.net>
+ *          Martin Storsjo <martin at martin.st>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -548,6 +549,102 @@ function x264_coeff_last64_neon
     bx          lr
 endfunc
 
+.macro coeff_level_run_start size
+    add         r6,  r1,  #23            @ with the and below: r6 = (r1 + 23) & ~15, where coeff_level_run stores coefficients (presumably runlevel->level, not ->mask; confirm against x264_run_level_t)
+    mov         r7,  #0                  @ r7 = number of coefficients stored so far
+    mov         r8,  #0                  @ r8 = bitmask of the coefficient indices stored so far
+    mov         r5,  #1                  @ r5 = constant 1, shifted left by an index to form a mask bit
+    and         r6,  r6,  #~15           @ round r6 down to a multiple of 16
+    mov         r4,  #\size - 1          @ r4 = highest coefficient index in the block
+.endm
+
+.macro coeff_level_run shift
+    clz         r3,  r2                  @ r2 = nonzero mask, (1 << shift) bits per coefficient, highest index in the top bits
+    subs        r4,  r4,  r3, lsr #\shift    @ r4 = index of the last nonzero coefficient; flags are consumed by the first ble
+    str         r4,  [r1], #4            @ store that index at [r1], then r1 += 4
+1:
+    lsl         r12, r4,  #1             @ r12 = byte offset of 16-bit coefficient r4
+    ldrh        r12, [r0, r12]           @ r12 = coefficient r4 from the input at r0
+    strh        r12, [r6], #2            @ append it to the output at r6
+    add         r7,  r7,  #1             @ count it
+    lsl         r12, r5,  r4             @ r12 = 1 << r4 (r5 holds 1)
+    orr         r8,  r8,  r12            @ mark index r4 in the mask
+    ble         2f                       @ done once index <= 0 (flags from the latest subs; nothing in between sets them)
+    add         r3,  r3,  #1 << \shift   @ with the and below: r3 = bits up to and including the coefficient just handled
+    sub         r4,  r4,  #1             @ next candidate index
+    and         r3,  r3,  #~((1 << \shift) - 1)    @ round r3 down to whole coefficients
+    lsl         r2,  r2,  r3             @ shift the handled coefficient out of the mask
+    clz         r3,  r2                  @ zero bits ahead of the next nonzero coefficient
+    subs        r4,  r4,  r3, lsr #\shift    @ skip those zero coefficients
+    bge         1b                       @ continue while a nonzero coefficient remains at index >= 0
+2:
+    str         r8,  [r1]                @ store the index bitmask in the word after the last index
+    mov         r0,  r7                  @ return the number of coefficients stored
+.endm
+
+.macro X264_COEFF_LEVEL_RUN_32b size
+function x264_coeff_level_run\size\()_neon
+    push        {r4-r8}                  @ r0 = int16_t coefficients, r1 = run-level output; r4-r8 are used as scratch below
+.if \size < 8
+    .equ        shiftw, 3                @ 8 mask bits per coefficient
+    vld1.16     {d0}, [r0]               @ load 4 coefficients
+    vtst.16     d0,  d0                  @ each 16-bit lane: all ones if nonzero, else zero
+    vshrn.u16   d0,  q0,  #8             @ narrow to one byte per coefficient; the lanes taken from d1 are never read
+.else
+    .equ        shiftw, 2                @ 4 mask bits per coefficient
+    vld1.16     {q0}, [r0]               @ load 8 coefficients
+    vtst.16     q0,  q0                  @ each 16-bit lane: all ones if nonzero, else zero
+    vshrn.u16   d0,  q0,  #8             @ narrow to one byte per coefficient
+    vshrn.u16   d0,  q0,  #4             @ narrow again to one nibble per coefficient in the low 32 bits of d0
+.endif
+    vmov.32     r2,  d0[0]               @ r2 = 32-bit nonzero mask, highest coefficient in the top bits
+    @ set up r4-r8 for the scan
+    coeff_level_run_start \size
+    @ scan from the last nonzero coefficient down to index 0; leaves the count in r0
+    coeff_level_run shiftw
+
+    pop         {r4-r8}
+    bx          lr
+endfunc
+.endm
+
+.macro X264_COEFF_LEVEL_RUN_1x size
+function x264_coeff_level_run\size\()_neon
+    push        {r4-r8}                  @ r0 = int16_t coefficients, r1 = run-level output; r4-r8 are used as scratch below
+.if \size == 15
+    sub         r0,  r0,  #2             @ back up one coefficient so 16 are loaded; presumably this also makes r0 satisfy the :128 hint below (confirm with callers)
+.endif
+    movrel      r2,  pmovmskb_byte
+    vld1.16     {q0, q1}, [r0, :128]     @ load 16 coefficients
+    vtst.16     q0,  q0                  @ each 16-bit lane: all ones if nonzero, else zero
+    vtst.16     q1,  q1
+    vld1.8      {q2}, [r2, :128]         @ q2 = pmovmskb_byte table (defined elsewhere; presumably one distinct bit per byte within each 8-byte half)
+    vshrn.u16   d0,  q0,  #8             @ narrow to one byte per coefficient: d0 = coefficients 0-7
+    vshrn.u16   d1,  q1,  #8             @ d1 = coefficients 8-15
+    vand        q0,  q2                  @ keep only the table bits in each byte
+    vpadd.u8    d0,  d0,  d1             @ three pairwise adds sum the 8 bytes of the old d0 into byte 0
+    vpadd.u8    d0,  d0,  d1             @ and the 8 bytes of d1 into byte 1
+    vpadd.u8    d0,  d0,  d1
+    vmov.u16    r2,  d0[0]               @ r2 = 16-bit nonzero mask
+    lsl         r2,  r2,  #16            @ move it to the top half so clz counts down from the highest coefficient
+.if \size == 15
+    add         r0,  r0,  #2             @ restore r0; the extra element holds the lowest mask bit, which the scan never reaches since it stops at index 0
+.endif
+    @ set up r4-r8 for the scan
+    coeff_level_run_start \size
+    @ scan with one mask bit per coefficient (shift 0); leaves the count in r0
+    coeff_level_run 0
+
+    pop         {r4-r8}
+    bx          lr
+endfunc
+.endm
+
+X264_COEFF_LEVEL_RUN_32b 4               @ defines x264_coeff_level_run4_neon
+X264_COEFF_LEVEL_RUN_32b 8               @ defines x264_coeff_level_run8_neon
+X264_COEFF_LEVEL_RUN_1x 15               @ defines x264_coeff_level_run15_neon
+X264_COEFF_LEVEL_RUN_1x 16               @ defines x264_coeff_level_run16_neon
+
 function x264_denoise_dct_neon
     vpush       {q4-q7}
 1:  subs        r3,  r3,  #16
diff --git a/common/arm/quant.h b/common/arm/quant.h
index 2ec91eb..6c03b59 100644
--- a/common/arm/quant.h
+++ b/common/arm/quant.h
@@ -48,6 +48,11 @@ int x264_coeff_last15_neon( int16_t * );
 int x264_coeff_last16_neon( int16_t * );
 int x264_coeff_last64_neon( int16_t * );
 
+int x264_coeff_level_run4_neon( int16_t *, x264_run_level_t * );  /* args: coefficients, run-level output; returns the number of coefficients stored */
+int x264_coeff_level_run8_neon( int16_t *, x264_run_level_t * );  /* same contract, 8 coefficients */
+int x264_coeff_level_run15_neon( int16_t *, x264_run_level_t * ); /* same contract, 15 coefficients */
+int x264_coeff_level_run16_neon( int16_t *, x264_run_level_t * ); /* same contract, 16 coefficients */
+
 void x264_denoise_dct_neon( dctcoef *, uint32_t *, udctcoef *, int );
 
 #endif
diff --git a/common/quant.c b/common/quant.c
index be000ec..dcec552 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -735,6 +735,10 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->coeff_last4 = x264_coeff_last4_arm;
         pf->coeff_last8 = x264_coeff_last8_arm;
     }
+    if( cpu&X264_CPU_NEON )
+    {
+        pf->coeff_level_run4 = x264_coeff_level_run4_neon;
+    }
 #endif
 #if HAVE_ARMV6 || ARCH_AARCH64
     if( cpu&X264_CPU_NEON )
@@ -754,6 +758,9 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->decimate_score15 = x264_decimate_score15_neon;
         pf->decimate_score16 = x264_decimate_score16_neon;
         pf->decimate_score64 = x264_decimate_score64_neon;
+        pf->coeff_level_run8 = x264_coeff_level_run8_neon;
+        pf->coeff_level_run[  DCT_LUMA_AC] = x264_coeff_level_run15_neon;
+        pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_neon;
     }
 #endif
 #if ARCH_AARCH64
@@ -763,12 +770,6 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->coeff_last8 = x264_coeff_last8_aarch64;
         pf->coeff_level_run4 = x264_coeff_level_run4_aarch64;
     }
-    if( cpu&X264_CPU_NEON )
-    {
-        pf->coeff_level_run8 = x264_coeff_level_run8_neon;
-        pf->coeff_level_run[  DCT_LUMA_AC] = x264_coeff_level_run15_neon;
-        pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_neon;
-    }
 #endif
 
 #if HAVE_MSA
-- 
1.7.10.4