[x264-devel] [PATCH] x264_decimate_score_15/16_neon

Thu Feb 9 00:25:35 CET 2012

---
 common/arm/quant-a.S |   71 ++++++++++++++++++++++++++++++++++++++++++++++++++
 common/arm/quant.h   |    2 +
 common/quant.c       |    2 +
 tools/checkasm.c     |    1 +
 4 files changed, 76 insertions(+), 0 deletions(-)

diff --git a/common/arm/quant-a.S b/common/arm/quant-a.S
index 1c14c86..c2b50ba 100644
--- a/common/arm/quant-a.S
+++ b/common/arm/quant-a.S
@@ -32,6 +32,9 @@
 pmovmskb_byte:
 .byte 1,2,4,8,16,32,64,128
 .byte 1,2,4,8,16,32,64,128
+decimate_table4:           // per-run score added by DECIMATE_SCORE, indexed by its clz result
+.byte 3,2,2,1,1,1,0,0      // index 0-7
+.byte 0,0,0,0,0,0,0,0      // index 8-15
 
 .text
 
@@ -457,3 +460,71 @@ COEFF_LEVEL_RUN 8
 COEFF_LEVEL_RUN 15
 COEFF_LEVEL_RUN 16
 
+// int x264_decimate_score{15,16}_neon( int16_t *dct )
+// In:  r0 = dct, 16 coefficients, 16-byte aligned (size 15 ignores dct[0])
+// Out: r0 = 9 if any scored |coef| >= 2, else the sum of decimate_table4[run]
+//      over nonzero coefs, run = number of zero coefs directly below each
+// Uses r1-r5 and d0-d4 as scratch; r4/r5 are saved and restored.
+.macro DECIMATE_SCORE size
+function x264_decimate_score\size\()_neon
+    push        {r4, r5, lr}
+    vld1.64     {d0-d3}, [r0,:128]
+    vabs.s16    q0, q0
+    vabs.s16    q1, q1
+    vqmovn.u16  d0, q0              // q0 = 16 x min(|coef|, 255)
+    vqmovn.u16  d1, q1
+
+    vmov.i8     q1, #2
+    vcge.u8     q1, q0, q1          // 0xff in each lane with |coef| >= 2
+.if \size == 15
+    vshr.u64    d2, d2, #8          // drop lane 0: dct[0] is not scored
+.endif
+    vqmovn.u16  d4, q1              // nonzero lanes stay nonzero
+    vmov        r3, r4, d4
+    orrs        r3, r3, r4
+    movne       r0, #9
+    bne         2f
+
+    movrel      r1, pmovmskb_byte
+    vld1.64     {d2, d3}, [r1]
+    vtst.8      q0, q0, q0          // 0xff in each lane with coef != 0
+    vand        q0, q0, q1          // nonzero lane i becomes 1 << (i & 7)
+    vpadd.u8    d0, d0, d1          // fold lanes 0-7 into byte 0 and
+    vpadd.u8    d0, d0, d0          // lanes 8-15 into byte 1; set bits
+    vpadd.u8    d0, d0, d0          // are disjoint so add acts as or
+    vmov.u16    r2, d0[0]           // r2 bit i = (dct[i] != 0)
+
+    mov         r0, #0
+.if \size == 15
+    lsrs        r2, r2, #1          // drop dct[0] before the zero test
+.else
+    cmp         r2, #0
+.endif
+    beq         2f                  // nothing to score
+    // Left-justify the mask so the top bit is the last coefficient, with
+    // four set guard bits directly below the first scored coefficient.
+    orr         r2, r2, #0xf0000000
+    ror         r2, r2, #\size
+
+    movrel      r4, decimate_table4
+    mov         r5, #\size          // r5 = coefficients left to scan
+    clz         r3, r2              // skip the trailing zeros ...
+    add         r3, r3, #1          // ... and the last nonzero coef
+    lsl         r2, r2, r3
+    sub         r5, r5, r3
+1:                                  // top of r2 = run below previous coef
+    clz         r3, r2              // run length (guard bits end it)
+    ldrb        r1, [r4, r3]
+    add         r0, r0, r1
+    add         r3, r3, #1          // consume the run and the next coef
+    lsl         r2, r2, r3
+    subs        r5, r5, r3
+    bge         1b                  // r5 < 0 once the guard was consumed
+2:
+    pop         {r4, r5, pc}
+.endfunc
+.endm
+
+DECIMATE_SCORE 15
+DECIMATE_SCORE 16
+
diff --git a/common/arm/quant.h b/common/arm/quant.h
index a548d15..db48b25 100644
--- a/common/arm/quant.h
+++ b/common/arm/quant.h
@@ -47,4 +47,6 @@ int x264_coeff_level_run8_neon( int16_t * );
 int x264_coeff_level_run15_neon( int16_t * );
 int x264_coeff_level_run16_neon( int16_t * );
 
+int x264_decimate_score15_neon( int16_t * ); // DECIMATE_SCORE 15 in quant-a.S; loads 16 coefs, 16-byte aligned
+int x264_decimate_score16_neon( int16_t * ); // DECIMATE_SCORE 16 in quant-a.S; loads 16 coefs, 16-byte aligned
 #endif
diff --git a/common/quant.c b/common/quant.c
index 5a19d73..605cf28 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -710,6 +710,8 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->coeff_level_run8 = x264_coeff_level_run8_neon;
         pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_neon;
         pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_neon;
+        pf->decimate_score15 = x264_decimate_score15_neon;
+        pf->decimate_score16 = x264_decimate_score16_neon;
     }
 #endif
 #endif // HIGH_BIT_DEPTH
diff --git a/tools/checkasm.c b/tools/checkasm.c
index ae7750f..e65a657 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -1968,6 +1968,7 @@ static int check_quant( int cpu_ref, int cpu_new )
             { \
                 ok = 0; \
                 fprintf( stderr, #decname ": [FAILED]\n" ); \
+                fprintf( stderr, "\nresult_a: %d\nresult_c: %d\n", result_a, result_c ); \
                 break; \
             } \
         } \
-- 
1.7.4.1