[x264-devel] [PATCH 20/24] RFC: arm: Implement x264_coeff_level_run{4, 8, 15, 16}_neon
Martin Storsjö
martin at martin.st
Thu Aug 13 22:59:41 CEST 2015
These are mostly slower than the plain C versions; they are faster
only in a few cases, mainly on Cortex-A7.
checkasm timing Cortex-A7 A8 A9
coeff_level_run4_c 366 331 296
coeff_level_run4_neon 379 496 416
coeff_level_run8_c 518 476 513
coeff_level_run8_neon 499 574 497
coeff_level_run15_c 858 792 572
coeff_level_run15_neon 836 806 708
coeff_level_run16_c 904 798 597
coeff_level_run16_neon 843 811 751
---
common/arm/quant-a.S | 97 ++++++++++++++++++++++++++++++++++++++++++++++++++
common/arm/quant.h | 5 +++
common/quant.c | 13 +++----
3 files changed, 109 insertions(+), 6 deletions(-)
diff --git a/common/arm/quant-a.S b/common/arm/quant-a.S
index 5ec8c04..ce11fd3 100644
--- a/common/arm/quant-a.S
+++ b/common/arm/quant-a.S
@@ -5,6 +5,7 @@
*
* Authors: David Conrad <lessen42 at gmail.com>
* Janne Grunau <janne-x264 at jannau.net>
+ * Martin Storsjo <martin at martin.st>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -548,6 +549,102 @@ function x264_coeff_last64_neon
bx lr
endfunc
+.macro coeff_level_run_start size @ initialise the registers that coeff_level_run consumes (r1 = second argument, the x264_run_level_t pointer)
+ add r6, r1, #23 @ with the and below: r6 = (r1 + 23) & ~15, the 16-byte aligned place where coeff_level_run stores the copied coefficients (the mask word itself is written through r1, not r6)
+ mov r7, #0 @ r7 = number of coefficients stored so far
+ mov r8, #0 @ r8 = accumulated bit mask of stored coefficient indices
+ mov r5, #1 @ r5 = constant 1, shifted by the index to form each mask bit
+ and r6, r6, #~15 @ finish the 16-byte round-up started by the add above
+ mov r4, #\size - 1 @ r4 = highest coefficient index of the block
+.endm
+
+.macro coeff_level_run shift @ walk the nonzero mask in r2 from the highest coefficient downwards, each coefficient occupying (1 << shift) bits of r2
+ clz r3, r2 @ leading zero bits = (1 << shift) times the number of zero coefficients at the top
+ subs r4, r4, r3, lsr #\shift @ r4 = index of the last nonzero coefficient, flags are consumed by the ble below
+ str r4, [r1], #4 @ store that index at [r1], then advance r1 by 4
+1: @ NOTE(review): the body runs once before any check, so the input presumably must contain a nonzero coefficient - confirm against callers
+ lsl r12, r4, #1 @ byte offset of 16-bit coefficient number r4
+ ldrh r12, [r0, r12] @ load that coefficient from the array at r0
+ strh r12, [r6], #2 @ append it to the output at r6
+ add r7, r7, #1 @ count it
+ lsl r12, r5, r4 @ r12 = r5 << r4 (r5 is set to 1 by coeff_level_run_start)
+ orr r8, r8, r12 @ set bit r4 in the mask
+ ble 2f @ flags are still those of the latest subs (no instruction in between has the s suffix): index <= 0 means nothing is left below
+ add r3, r3, #1 << \shift @ shift amount = zeros skipped plus the coefficient just handled
+ sub r4, r4, #1 @ index just below the handled coefficient
+ and r3, r3, #~((1 << \shift) - 1) @ round the shift amount down to a whole number of coefficients
+ lsl r2, r2, r3 @ discard the handled bits from the top of the mask
+ clz r3, r2 @ zero bits above the next nonzero coefficient
+ subs r4, r4, r3, lsr #\shift @ r4 = index of the next nonzero coefficient, negative when none remain
+ bge 1b @ loop while a valid index was found
+2:
+ str r8, [r1] @ store the mask in the word after the index written above
+ mov r0, r7 @ return value = number of coefficients stored
+.endm
+
+.macro X264_COEFF_LEVEL_RUN_32b size @ variant whose nonzero mask is taken from the low 32 bits of d0 (instantiated below for sizes 4 and 8)
+function x264_coeff_level_run\size\()_neon
+ push {r4-r8} @ r4-r8 are used by coeff_level_run_start and coeff_level_run, restored by the pop below
+.if \size < 8
+ .equ shiftw, 3 @ 8 mask bits per coefficient
+ vld1.16 {d0}, [r0] @ load four 16-bit coefficients
+ vtst.16 d0, d0 @ each lane becomes all ones when the coefficient is nonzero, else 0
+ vshrn.u16 d0, q0, #8 @ narrow to one byte per coefficient (d1 was never loaded here, but only the low 32 bits of d0 are read below)
+.else
+ .equ shiftw, 2 @ 4 mask bits per coefficient
+ vld1.16 {q0}, [r0] @ load eight 16-bit coefficients
+ vtst.16 q0, q0 @ each lane becomes all ones when the coefficient is nonzero, else 0
+ vshrn.u16 d0, q0, #8 @ narrow to one byte per coefficient
+ vshrn.u16 d0, q0, #4 @ narrow again to one nibble per coefficient, so all eight fit in the low 32 bits of d0
+.endif
+ vmov.32 r2, d0[0] @ r2 = mask word, highest coefficient index in the most significant bits
+
+ coeff_level_run_start \size
+
+ coeff_level_run shiftw @ leaves the return value in r0
+
+ pop {r4-r8}
+ bx lr
+endfunc
+.endm
+
+.macro X264_COEFF_LEVEL_RUN_1x size @ variant that builds one mask bit per coefficient from a 16-coefficient load (instantiated below for sizes 15 and 16)
+function x264_coeff_level_run\size\()_neon
+ push {r4-r8} @ r4-r8 are used by coeff_level_run_start and coeff_level_run, restored by the pop below
+.if \size == 15
+ sub r0, r0, #2 @ step back one 16-bit element so the 16-element load below (which carries a :128 alignment hint) starts one element earlier
+.endif
+ movrel r2, pmovmskb_byte @ address of a constant table defined outside this hunk - presumably one distinct bit weight per byte within each group of eight, confirm
+ vld1.16 {q0, q1}, [r0, :128] @ load sixteen 16-bit elements
+ vtst.16 q0, q0 @ lanes 0-7: all ones when nonzero, else 0
+ vtst.16 q1, q1 @ lanes 8-15: same
+ vld1.8 {q2}, [r2, :128] @ load the 16-byte table
+ vshrn.u16 d0, q0, #8 @ one byte per element for elements 0-7
+ vshrn.u16 d1, q1, #8 @ one byte per element for elements 8-15
+ vand q0, q2 @ keep the table byte only where the element is nonzero
+ vpadd.u8 d0, d0, d1 @ three pairwise-add rounds fold each group of eight bytes into one byte:
+ vpadd.u8 d0, d0, d1 @ after the third round byte 0 of d0 sums elements 0-7
+ vpadd.u8 d0, d0, d1 @ and byte 1 sums elements 8-15
+ vmov.u16 r2, d0[0] @ r2 = those two bytes as a 16-bit mask
+ lsl r2, r2, #16 @ move the mask to the top half so clz counts down from the highest loaded element
+.if \size == 15
+ add r0, r0, #2 @ restore r0, the extra leading element now sits at bit 16, below index 0 (the loop in coeff_level_run exits after handling index 0)
+.endif
+
+ coeff_level_run_start \size
+
+ coeff_level_run 0 @ shift 0 = one mask bit per coefficient, leaves the return value in r0
+
+ pop {r4-r8}
+ bx lr
+endfunc
+.endm
+
+X264_COEFF_LEVEL_RUN_32b 4 @ defines x264_coeff_level_run4_neon
+X264_COEFF_LEVEL_RUN_32b 8 @ defines x264_coeff_level_run8_neon
+X264_COEFF_LEVEL_RUN_1x 15 @ defines x264_coeff_level_run15_neon
+X264_COEFF_LEVEL_RUN_1x 16 @ defines x264_coeff_level_run16_neon
+
function x264_denoise_dct_neon
vpush {q4-q7}
1: subs r3, r3, #16
diff --git a/common/arm/quant.h b/common/arm/quant.h
index 2ec91eb..6c03b59 100644
--- a/common/arm/quant.h
+++ b/common/arm/quant.h
@@ -48,6 +48,11 @@ int x264_coeff_last15_neon( int16_t * );
int x264_coeff_last16_neon( int16_t * );
int x264_coeff_last64_neon( int16_t * );
+int x264_coeff_level_run4_neon( int16_t *, x264_run_level_t * );
+int x264_coeff_level_run8_neon( int16_t *, x264_run_level_t * );
+int x264_coeff_level_run15_neon( int16_t *, x264_run_level_t * );
+int x264_coeff_level_run16_neon( int16_t *, x264_run_level_t * );
+
void x264_denoise_dct_neon( dctcoef *, uint32_t *, udctcoef *, int );
#endif
diff --git a/common/quant.c b/common/quant.c
index be000ec..dcec552 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -735,6 +735,10 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->coeff_last4 = x264_coeff_last4_arm;
pf->coeff_last8 = x264_coeff_last8_arm;
}
+ if( cpu&X264_CPU_NEON )
+ {
+ pf->coeff_level_run4 = x264_coeff_level_run4_neon;
+ }
#endif
#if HAVE_ARMV6 || ARCH_AARCH64
if( cpu&X264_CPU_NEON )
@@ -754,6 +758,9 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->decimate_score15 = x264_decimate_score15_neon;
pf->decimate_score16 = x264_decimate_score16_neon;
pf->decimate_score64 = x264_decimate_score64_neon;
+ pf->coeff_level_run8 = x264_coeff_level_run8_neon;
+ pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_neon;
+ pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_neon;
}
#endif
#if ARCH_AARCH64
@@ -763,12 +770,6 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->coeff_last8 = x264_coeff_last8_aarch64;
pf->coeff_level_run4 = x264_coeff_level_run4_aarch64;
}
- if( cpu&X264_CPU_NEON )
- {
- pf->coeff_level_run8 = x264_coeff_level_run8_neon;
- pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_neon;
- pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_neon;
- }
#endif
#if HAVE_MSA
--
1.7.10.4
More information about the x264-devel
mailing list