[x264-devel] [PATCH 17/23] aarch64: x264_coeff_level_run{4, 8, 15, 16}

Thu Nov 27 08:56:45 CET 2014

All functions ~33% faster.
---
 common/aarch64/quant-a.S | 77 ++++++++++++++++++++++++++++++++++++++++++++++++
 common/aarch64/quant.h   |  4 +++
 common/quant.c           |  4 +++
 3 files changed, 85 insertions(+)

diff --git a/common/aarch64/quant-a.S b/common/aarch64/quant-a.S
index 2b12304..f80946d 100644
--- a/common/aarch64/quant-a.S
+++ b/common/aarch64/quant-a.S
@@ -496,3 +496,80 @@ function x264_coeff_last64_neon, export=1
     sub         w0,  w3,  w2
     ret
 endfunc
+
+.macro coeff_level_run_start size        // register setup shared by every coeff_level_run user; expects x1 = run-level output struct
+    add         x6,  x1,  #23            // with the and below: x6 = (x1 + 23) & ~15, the 16-byte-aligned cursor coeff_level_run stores coefficient values through (presumably runlevel->level, TODO confirm; the mask itself is written through x1)
+    mov         w7,  #0                  // w7 = number of coefficients stored so far (becomes the return value)
+    mov         w8,  #0                  // w8 = bitmask of the coefficient indices stored so far
+    mov         w9,  #1                  // w9 = constant 1, shifted left by the index to form each mask bit
+    and         x6,  x6,  #~15           // round x6 down to a 16-byte boundary
+    mov         w4,  #\size - 1          // w4 = highest coefficient index of the block
+.endm
+
+.macro coeff_level_run shift             // in: x0 = 16-bit coefficients, x2 = nonzero mask (1 << shift bits per coefficient, highest index in the top bits), other registers as left by coeff_level_run_start; out: w0 = count; modifies x1-x8, w10 and flags
+    clz         x3,  x2                  // x3 = number of zero bits above the highest nonzero mask field
+    subs        w4,  w4,  w3, lsr #\shift // w4 = index of the last nonzero coefficient; these flags are consumed by the b.le below
+    str         w4,  [x1], #4            // store that index at [x1], then x1 += 4
+1:
+    ldrh        w5,  [x0, x4, lsl #1]    // w5 = 16-bit coefficient at index w4
+    strh        w5,  [x6], #2            // append it to the packed output, x6 += 2
+    add         w7,  w7,  #1             // count++
+    lsl         w10, w9, w4              // w10 = 1 << index
+    orr         w8,  w8,  w10            // mask |= 1 << index
+    b.le        2f                       // nothing since the latest subs touched the flags: finished once index <= 0
+    add         w3,  w3,  #1 << \shift   // add one whole field to the leading-zero bit count
+    sub         w4,  w4,  #1             // step to the next lower index
+    and         x3,  x3,  #~((1 << \shift) - 1) // round down to a field boundary: x3 = bits covering the stored field plus the empty fields above it
+    lsl         x2,  x2,  x3             // shift those fields out of the mask
+    clz         x3,  x2                  // zero bits above the next nonzero field (64 when none remain)
+    subs        w4,  w4,  w3, lsr #\shift // skip the empty fields; the result is negative when no nonzero field is left
+    b.ge        1b                       // continue while the index is still valid
+2:
+    str         w8,  [x1]                // store the index bitmask in the word following the last index (x1 was advanced by 4)
+    mov         w0,  w7                  // return the number of coefficients stored
+.endm
+
+function x264_coeff_level_run4_aarch64, export=1 // scalar variant for 4 coefficients: x0 = coefficients, x1 = run-level output, returns count in w0
+    ldr         x2,  [x0]                // load all four 16-bit coefficients as one 64-bit word; it is used directly as the nonzero mask (assumes little-endian data so index 3 lands in the top 16 bits)
+
+    coeff_level_run_start 4              // w4 = 3, the highest index
+
+    coeff_level_run 4                    // shift 4: 16 mask bits per coefficient
+
+    ret                                  // w0 was set by coeff_level_run
+endfunc
+
+.macro X264_COEFF_LEVEL_RUN size         // emits x264_coeff_level_run<size>_neon: builds the nonzero mask with NEON, then runs the shared scalar loop
+function x264_coeff_level_run\size\()_neon, export=1
+.if \size == 15
+    sub         x0,  x0,  #2             // back up one coefficient so the 16-lane load below covers indices -1..14; this reads the int16 just before the array (caller must keep it readable, TODO confirm)
+.endif
+.if         \size < 15
+    .equ        shiftw, 3                // 8 mask bits per coefficient; the symbol is re-assigned on every expansion
+    ld1         {v0.8h}, [x0]            // load 8 coefficients
+    uqxtn       v0.8b,  v0.8h            // saturating narrow to bytes: a byte is nonzero exactly when its coefficient is
+    cmtst       v0.8b,  v0.8b,  v0.8b    // byte = 0xff if nonzero, else 0
+.else
+    .equ        shiftw, 2                // 4 mask bits per coefficient; the symbol is re-assigned on every expansion
+    ld1         {v0.8h,v1.8h}, [x0]      // load 16 coefficients
+    uqxtn       v0.8b,  v0.8h            // saturating narrow of lanes 0-7 (nonzero stays nonzero)
+    uqxtn2      v0.16b, v1.8h            // same for lanes 8-15, written to the upper half of v0
+    cmtst       v0.16b, v0.16b, v0.16b   // byte = 0xff if nonzero, else 0
+    shrn        v0.8b,  v0.8h,  #4       // keep bits 4-11 of each byte pair: 16 bytes collapse to 16 nibbles in 64 bits
+.endif
+    fmov        x2,  d0                  // x2 = 64-bit nonzero mask, highest lane in the top bits
+.if \size == 15
+    add         x0,  x0,  #2             // restore x0 so the loop indexes the original array
+.endif
+
+    coeff_level_run_start \size          // w4 starts at size - 1, the index held by the top mask field in all three variants
+
+    coeff_level_run shiftw               // for size 15 the extra low field (index -1) is never emitted: the loop stops before the index goes negative
+
+    ret                                  // w0 was set by coeff_level_run
+endfunc
+.endm
+
+X264_COEFF_LEVEL_RUN 8                   // defines x264_coeff_level_run8_neon (8-bit mask fields)
+X264_COEFF_LEVEL_RUN 15                  // defines x264_coeff_level_run15_neon (4-bit mask fields, also loads the int16 before the array)
+X264_COEFF_LEVEL_RUN 16                  // defines x264_coeff_level_run16_neon (4-bit mask fields)
\ No newline at end of file
diff --git a/common/aarch64/quant.h b/common/aarch64/quant.h
index 2699129..ef9e200 100644
--- a/common/aarch64/quant.h
+++ b/common/aarch64/quant.h
@@ -48,4 +48,8 @@ int x264_coeff_last15_neon( int16_t * );
 int x264_coeff_last16_neon( int16_t * );
 int x264_coeff_last64_neon( int16_t * );
 
+int x264_coeff_level_run4_aarch64( int16_t *, x264_run_level_t * );
+int x264_coeff_level_run8_neon( int16_t *, x264_run_level_t * );
+int x264_coeff_level_run15_neon( int16_t *, x264_run_level_t * );
+int x264_coeff_level_run16_neon( int16_t *, x264_run_level_t * );
 #endif
diff --git a/common/quant.c b/common/quant.c
index 08a29bd..d15c6b7 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -753,9 +753,13 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
     {
         pf->coeff_last4 = x264_coeff_last4_aarch64;
         pf->coeff_last8 = x264_coeff_last8_aarch64;
+        pf->coeff_level_run4 = x264_coeff_level_run4_aarch64;
     }
     if( cpu&X264_CPU_NEON )
     {
+        pf->coeff_level_run8 = x264_coeff_level_run8_neon;
+        pf->coeff_level_run[  DCT_LUMA_AC] = x264_coeff_level_run15_neon;
+        pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_neon;
         pf->decimate_score15 = x264_decimate_score15_neon;
         pf->decimate_score16 = x264_decimate_score16_neon;
         pf->decimate_score64 = x264_decimate_score64_neon;
-- 
2.1.3