[x264-devel] [PATCH 09/11] aarch64: NEON asm for decimate_score

Fri Aug 22 17:26:42 CEST 2014

decimate_score15 and 16 are 60% faster, decimate_score64 is 4 times
faster than C.
---
 common/aarch64/quant-a.S | 112 +++++++++++++++++++++++++++++++++++++++++++++++
 common/aarch64/quant.h   |   4 ++
 common/quant.c           |   6 +++
 3 files changed, 122 insertions(+)

diff --git a/common/aarch64/quant-a.S b/common/aarch64/quant-a.S
index 02b71b2..2b12304 100644
--- a/common/aarch64/quant-a.S
+++ b/common/aarch64/quant-a.S
@@ -300,6 +300,118 @@ dequant_4x4_dc_rshift:
     ret
 endfunc
 
+.macro decimate_score_1x size
+function x264_decimate_score\size\()_neon, export=1 // in: x0 = int16 coefficients; out: w0 = score, or 9 if any |coef| > 1
+    ld1        {v0.8h,v1.8h}, [x0]           // load 16 int16 coefficients
+    movrel      x5,  X(x264_decimate_table4) // x5 = base of the byte table indexed by run length in the loop
+    movi        v3.16b, #0x01                // v3 = 1 in every byte lane
+    sqxtn       v0.8b,  v0.8h                // saturating-narrow coefs 0-7 to signed bytes
+    sqxtn2      v0.16b, v1.8h                // ... and coefs 8-15 into the upper half of v0
+    abs         v2.16b, v0.16b               // v2 = |coef| per byte lane
+    cmeq        v1.16b, v0.16b, #0           // v1 = 0xff where coef == 0
+    cmhi        v2.16b, v2.16b, v3.16b       // v2 = 0xff where |coef| > 1 (unsigned compare)
+    shrn        v1.8b,  v1.8h,  #4           // compress the byte mask to one nibble per coef (64 bits total)
+    shrn        v2.8b,  v2.8h,  #4           // same compression for the |coef| > 1 mask
+    fmov        x2,  d2                      // x2 = nibble mask of coefs with |coef| > 1
+    fmov        x1,  d1                      // x1 = nibble mask of zero coefs
+    cbnz        x2,  9f                      // any |coef| > 1: return 9
+    mvn         x1,  x1                      // x1 = nibble mask of non-zero coefs
+    mov         w0,  #0                      // score = 0
+    cbz         x1,  0f                      // no non-zero coefs: return 0
+.ifc \size, 15
+    lsr         x1,  x1,  #1                 // one bit, not one nibble: first run becomes one shorter; NOTE(review): appears to assume coef 0 is zero - confirm with callers
+.endif
+    rbit        x1,  x1                      // reverse bits so the lowest coef sits at the MSB end for clz
+1:
+    clz         x3,  x1                      // x3 = zero bits ahead of the next non-zero coef
+    lsr         x6,  x3,  #2                 // x6 = x3 / 4 = run of zero coefs
+    lsl         x1,  x1,  x3                 // bring the nibble of that coef up to the MSB
+    ldrb        w7,  [x5, x6]                // w7 = table byte for this run
+    cbz         x1,  2f                      // NOTE(review): x1 is non-zero at label 1 and has its MSB set here, so this branch looks unreachable
+    lsl         x1,  x1,  #4                 // discard the nibble just scored
+    add         w0,  w0,  w7                 // score += table byte
+    cbnz        x1,  1b                      // loop while non-zero coefs remain
+    ret
+2:
+    add         w0,  w0,  w7
+0:
+    ret
+9:
+    mov         w0,  #9
+    ret
+endfunc
+.endm
+
+decimate_score_1x 15
+decimate_score_1x 16
+
+const mask64, align=6
+    .byte  0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01  // one distinct bit per byte lane, highest bit first
+    .byte  0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01  // same weights repeated for the upper 8 lanes of a 16-byte vector
+endconst
+
+function x264_decimate_score64_neon, export=1 // in: x0 = 64 int16 coefficients; out: w0 = score, or 9 if any |coef| > 1
+    ld1        {v0.8h,v1.8h}, [x0], #32      // coefs 0-15, x0 advances 32 bytes
+    ld1        {v2.8h,v3.8h}, [x0], #32      // coefs 16-31
+    ld1        {v4.8h,v5.8h}, [x0], #32      // coefs 32-47
+    ld1        {v6.8h,v7.8h}, [x0]           // coefs 48-63
+    movrel      x6,  mask64                  // x6 = address of the per-lane bit weights
+    movi        v31.16b, #0x01               // v31 = 1 in every byte lane
+    sqxtn       v16.8b,  v1.8h               // narrow to signed bytes; the later 8 coefs go in the low half ...
+    sqxtn2      v16.16b, v0.8h               // ... and the earlier 8 in the high half (halves swapped)
+    sqxtn       v17.8b,  v3.8h               // same swapped packing for coefs 16-31
+    sqxtn2      v17.16b, v2.8h
+    sqxtn       v18.8b,  v5.8h               // same for coefs 32-47
+    sqxtn2      v18.16b, v4.8h
+    sqxtn       v19.8b,  v7.8h               // same for coefs 48-63
+    sqxtn2      v19.16b, v6.8h
+    abs         v4.16b, v16.16b              // v4-v7 = |coef| per byte lane
+    abs         v5.16b, v17.16b
+    abs         v6.16b, v18.16b
+    abs         v7.16b, v19.16b
+    ld1        {v30.16b}, [x6]               // v30 = mask64 bit weights
+    cmeq        v0.16b, v16.16b, #0          // v0-v3 = 0xff where coef == 0
+    cmeq        v1.16b, v17.16b, #0
+    cmeq        v2.16b, v18.16b, #0
+    cmeq        v3.16b, v19.16b, #0
+    umax        v4.16b, v4.16b, v5.16b       // lane-wise max of |coef| across the four vectors (with L96, L101)
+    umax        v6.16b, v6.16b, v7.16b
+    and         v0.16b, v0.16b, v30.16b      // keep a single weighted bit per zero-coef lane
+    and         v1.16b, v1.16b, v30.16b
+    and         v2.16b, v2.16b, v30.16b
+    and         v3.16b, v3.16b, v30.16b
+    umax        v4.16b, v4.16b, v6.16b       // v4 = max |coef| seen in each lane position
+    addp        v0.16b, v1.16b, v0.16b       // pairwise adds merge the weighted bits of adjacent lanes
+    addp        v2.16b, v3.16b, v2.16b
+    cmhi        v4.16b, v4.16b, v31.16b      // v4 = 0xff in lanes where some |coef| > 1
+    addp        v0.16b, v2.16b, v0.16b       // second folding step: 4 coef bits per byte
+    shrn        v4.8b,  v4.8h,  #4           // compress the > 1 mask to 64 bits for a scalar test
+    addp        v0.16b, v0.16b, v0.16b       // third step: low 64 bits hold one bit per coef, coef 0 in bit 63
+    fmov        x2,  d4                      // x2 != 0 if any |coef| > 1
+    fmov        x1,  d0                      // x1 = bitmask of zero coefs
+    cbnz        x2,  9f                      // any |coef| > 1: return 9
+    mvn         x1,  x1                      // x1 = bitmask of non-zero coefs
+    mov         w0,  #0                      // score = 0
+    cbz         x1,  0f                      // no non-zero coefs: return 0
+    movrel      x5,  X(x264_decimate_table8) // x5 = base of the byte table indexed by run length
+1:
+    clz         x3,  x1                      // x3 = run of zero coefs before the next non-zero one
+    lsl         x1,  x1,  x3                 // bring that coef bit up to the MSB
+    ldrb        w7,  [x5, x3]                // w7 = table byte for this run
+    cbz         x1,  2f                      // NOTE(review): x1 is non-zero at label 1 and has its MSB set here, so this branch looks unreachable
+    lsl         x1,  x1,  #1                 // discard the bit just scored
+    add         w0,  w0,  w7                 // score += table byte
+    cbnz        x1,  1b                      // loop while non-zero coefs remain
+    ret
+2:
+    add         w0,  w0,  w7
+0:
+    ret
+9:
+    mov         w0,  #9
+    ret
+endfunc
+
 // int coeff_last( int16_t *l )
 function x264_coeff_last4_aarch64, export=1
     ldr         x2,  [x0]
diff --git a/common/aarch64/quant.h b/common/aarch64/quant.h
index dfcac25..2699129 100644
--- a/common/aarch64/quant.h
+++ b/common/aarch64/quant.h
@@ -38,6 +38,10 @@ void x264_dequant_4x4_dc_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp
 void x264_dequant_4x4_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
 void x264_dequant_8x8_neon( int16_t dct[64], int dequant_mf[6][64], int i_qp );
 
+int x264_decimate_score15_neon( int16_t * );
+int x264_decimate_score16_neon( int16_t * );
+int x264_decimate_score64_neon( int16_t * );
+
 int x264_coeff_last4_aarch64( int16_t * );
 int x264_coeff_last8_aarch64( int16_t * );
 int x264_coeff_last15_neon( int16_t * );
diff --git a/common/quant.c b/common/quant.c
index d7b6911..42a3a7a 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -754,6 +754,12 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->coeff_last4 = x264_coeff_last4_aarch64;
         pf->coeff_last8 = x264_coeff_last8_aarch64;
     }
+    if( cpu&X264_CPU_NEON )
+    {
+        pf->decimate_score15 = x264_decimate_score15_neon;
+        pf->decimate_score16 = x264_decimate_score16_neon;
+        pf->decimate_score64 = x264_decimate_score64_neon;
+    }
 #endif
 #endif // HIGH_BIT_DEPTH
     pf->coeff_last[DCT_LUMA_DC]     = pf->coeff_last[DCT_CHROMAU_DC]  = pf->coeff_last[DCT_CHROMAV_DC] =
-- 
2.0.4