[x264-devel] [PATCH 17/23] aarch64: x264_coeff_level_run{4, 8, 15, 16}
Janne Grunau
janne-x264 at jannau.net
Thu Nov 27 08:56:45 CET 2014
All functions ~33% faster.
---
common/aarch64/quant-a.S | 77 ++++++++++++++++++++++++++++++++++++++++++++++++
common/aarch64/quant.h | 4 +++
common/quant.c | 4 +++
3 files changed, 85 insertions(+)
diff --git a/common/aarch64/quant-a.S b/common/aarch64/quant-a.S
index 2b12304..f80946d 100644
--- a/common/aarch64/quant-a.S
+++ b/common/aarch64/quant-a.S
@@ -496,3 +496,80 @@ function x264_coeff_last64_neon, export=1
sub w0, w3, w2
ret
endfunc
+
+.macro coeff_level_run_start size // register setup consumed by the coeff_level_run macro; x1 = second function argument (output struct)
+ add x6, x1, #23 // with the and below: x6 = (x1 + 8) rounded up to a multiple of 16; coeff_level_run stores the coefficients through x6 (the mask itself is written through x1)
+ mov w7, #0 // w7 = number of coefficients stored so far (becomes the return value)
+ mov w8, #0 // w8 = bitmask of the coefficient indices stored so far
+ mov w9, #1 // w9 = constant 1, shifted left to form each mask bit
+ and x6, x6, #~15 // finish the 16-byte round-up started by the add above
+ mov w4, #\size - 1 // w4 = highest coefficient index, the starting point of the downward scan
+.endm
+
+.macro coeff_level_run shift // scan from the highest index down; x0 = 16-bit coefficients, x1 = output struct, x2 = nonzero bitmap with (1 << shift) bits per coefficient and the highest index in the top bits
+ clz x3, x2 // x3 >> shift = number of zero coefficients above the last nonzero one
+ subs w4, w4, w3, lsr #\shift // w4 = index of the last nonzero coefficient; these flags are tested by the b.le below
+ str w4, [x1], #4 // store that index in the first word of the struct; x1 now points at the second word
+1:
+ ldrh w5, [x0, x4, lsl #1] // load coefficient number w4. NOTE(review): an all-zero x2 leaves w4 negative here; presumably callers only pass blocks with a nonzero coefficient - confirm
+ strh w5, [x6], #2 // append it to the output array set up by coeff_level_run_start
+ add w7, w7, #1 // one more coefficient stored
+ lsl w10, w9, w4 // w10 = 1 << index
+ orr w8, w8, w10 // record the index in the mask
+ b.le 2f // nothing since the last subs touched the flags: stop once the index is <= 0
+ add w3, w3, #1 << \shift // bits to discard = leading zero bits plus the field just handled ...
+ sub w4, w4, #1 // step to the next lower index
+ and x3, x3, #~((1 << \shift) - 1) // ... rounded down to a whole number of fields
+ lsl x2, x2, x3 // drop the handled fields from the bitmap
+ clz x3, x2 // x3 >> shift = zero coefficients before the next nonzero one
+ subs w4, w4, w3, lsr #\shift // skip them; flags feed b.ge here and b.le on the next pass
+ b.ge 1b // a negative index means no nonzero coefficient is left
+2:
+ str w8, [x1] // store the mask in the second word of the struct
+ mov w0, w7 // return the number of coefficients stored
+.endm
+
+function x264_coeff_level_run4_aarch64, export=1 // scalar variant for 4 coefficients; C prototype: int f( int16_t *, x264_run_level_t * )
+ ldr x2, [x0] // all four 16-bit coefficients in one register; index 3 sits in the top 16 bits (assumes little-endian data - confirm)
+
+ coeff_level_run_start 4 // w4 = 3, counters cleared, x6 = output array
+
+ coeff_level_run 4 // 1 << 4 = 16 bitmap bits per coefficient, so the raw coefficients double as the nonzero bitmap
+
+ ret
+endfunc
+
+.macro X264_COEFF_LEVEL_RUN size // emits the NEON variant for one block size; x0 = 16-bit coefficients, x1 = output struct, result count in w0
+function x264_coeff_level_run\size\()_neon, export=1
+.if \size == 15
+ sub x0, x0, #2 // load 16 values starting one element before x0 so that index 14 lands in the top lane (the element before x0 must be readable)
+.endif
+.if \size < 15
+ .equ shiftw, 3 // 8 bitmap bits per coefficient
+ ld1 {v0.8h}, [x0] // 8 coefficients
+ uqxtn v0.8b, v0.8h // unsigned saturating narrow to bytes: every nonzero coefficient stays nonzero
+ cmtst v0.8b, v0.8b, v0.8b // 0xff for each nonzero coefficient, 0x00 otherwise
+.else
+ .equ shiftw, 2 // 4 bitmap bits per coefficient
+ ld1 {v0.8h,v1.8h}, [x0] // 16 coefficients
+ uqxtn v0.8b, v0.8h // unsigned saturating narrow, low 8 coefficients
+ uqxtn2 v0.16b, v1.8h // same for the high 8 coefficients
+ cmtst v0.16b, v0.16b, v0.16b // 0xff for each nonzero coefficient, 0x00 otherwise
+ shrn v0.8b, v0.8h, #4 // shrink every 0xff/0x00 byte to a nibble so that 16 flags fit in 64 bits
+.endif
+ fmov x2, d0 // move the bitmap into x2, the register coeff_level_run scans
+.if \size == 15
+ add x0, x0, #2 // restore the coefficient pointer for the indexed loads in coeff_level_run
+.endif
+
+ coeff_level_run_start \size // w4 = size - 1, counters cleared, x6 = output array
+
+ coeff_level_run shiftw // shiftw was set by the .equ in the branch taken above
+
+ ret
+endfunc
+.endm
+
+X264_COEFF_LEVEL_RUN 8 // x264_coeff_level_run8_neon
+X264_COEFF_LEVEL_RUN 15 // x264_coeff_level_run15_neon
+X264_COEFF_LEVEL_RUN 16 // x264_coeff_level_run16_neon
\ No newline at end of file
diff --git a/common/aarch64/quant.h b/common/aarch64/quant.h
index 2699129..ef9e200 100644
--- a/common/aarch64/quant.h
+++ b/common/aarch64/quant.h
@@ -48,4 +48,8 @@ int x264_coeff_last15_neon( int16_t * );
int x264_coeff_last16_neon( int16_t * );
int x264_coeff_last64_neon( int16_t * );
+int x264_coeff_level_run4_aarch64( int16_t *, x264_run_level_t * );
+int x264_coeff_level_run8_neon( int16_t *, x264_run_level_t * );
+int x264_coeff_level_run15_neon( int16_t *, x264_run_level_t * );
+int x264_coeff_level_run16_neon( int16_t *, x264_run_level_t * );
#endif
diff --git a/common/quant.c b/common/quant.c
index 08a29bd..d15c6b7 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -753,9 +753,13 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
{
pf->coeff_last4 = x264_coeff_last4_aarch64;
pf->coeff_last8 = x264_coeff_last8_aarch64;
+ pf->coeff_level_run4 = x264_coeff_level_run4_aarch64;
}
if( cpu&X264_CPU_NEON )
{
+ pf->coeff_level_run8 = x264_coeff_level_run8_neon;
+ pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_neon;
+ pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_neon;
pf->decimate_score15 = x264_decimate_score15_neon;
pf->decimate_score16 = x264_decimate_score16_neon;
pf->decimate_score64 = x264_decimate_score64_neon;
--
2.1.3
More information about the x264-devel
mailing list