[x264-devel] aarch64: NEON asm for decimate_score
Janne Grunau
git at videolan.org
Sat Dec 20 21:10:45 CET 2014
x264 | branch: master | Janne Grunau <janne-x264 at jannau.net> | Tue Aug 12 17:26:10 2014 +0200| [40d5db342b7f5198db9826a51f31e454bd208596] | committer: Anton Mitrofanov
aarch64: NEON asm for decimate_score
decimate_score15 and 16 are 60% faster, decimate_score64 is 4 times
faster than C.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=40d5db342b7f5198db9826a51f31e454bd208596
---
common/aarch64/quant-a.S | 113 ++++++++++++++++++++++++++++++++++++++++++++++
common/aarch64/quant.h | 5 ++
common/quant.c | 9 +++-
3 files changed, 126 insertions(+), 1 deletion(-)
diff --git a/common/aarch64/quant-a.S b/common/aarch64/quant-a.S
index 02b71b2..ed9b3ca 100644
--- a/common/aarch64/quant-a.S
+++ b/common/aarch64/quant-a.S
@@ -4,6 +4,7 @@
* Copyright (C) 2009-2014 x264 project
*
* Authors: David Conrad <lessen42 at gmail.com>
+ * Janne Grunau <janne-x264 at jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -300,6 +301,118 @@ dequant_4x4_dc_rshift:
ret
endfunc
+.macro decimate_score_1x size
+function x264_decimate_score\size\()_neon, export=1
+ ld1 {v0.8h,v1.8h}, [x0] // in: x0 -> 16 int16 coeffs; out: w0 = sum of table4[zero run] per nonzero coeff, or 9 if any |coeff| > 1
+ movrel x5, X(x264_decimate_table4) // x5 = base of byte table, indexed by zero-run length below
+ movi v3.16b, #0x01 // per-byte threshold 1 for the |coeff| > 1 test
+ sqxtn v0.8b, v0.8h // saturating narrow to int8: coeffs 0-7 in the low half
+ sqxtn2 v0.16b, v1.8h // coeffs 8-15 in the high half
+ abs v2.16b, v0.16b // v2 = |coeff| per byte
+ cmeq v1.16b, v0.16b, #0 // v1 byte = 0xff where coeff == 0
+ cmhi v2.16b, v2.16b, v3.16b // v2 byte = 0xff where |coeff| > 1 (unsigned compare)
+ shrn v1.8b, v1.8h, #4 // squeeze each 0x00/0xff byte to a nibble: 4 mask bits per coeff in d1
+ shrn v2.8b, v2.8h, #4 // same compression for the > 1 flags
+ fmov x2, d2 // x2 != 0 iff some |coeff| > 1
+ fmov x1, d1 // x1 = zero mask, coeff k in bits 4k..4k+3
+ cbnz x2, 9f // large coefficient present: return 9
+ mvn x1, x1 // invert: x1 = mask of nonzero coeffs
+ mov w0, #0 // score = 0
+ cbz x1, 0f // no nonzero coeffs: return 0
+.ifc \size, 15
+ lsr x1, x1, #1 // size 15 only: first clz below is one less, so x3 >> 2 skips the first loaded value; NOTE(review): assumes that value is zero - confirm with callers
+.endif
+ rbit x1, x1 // reverse bits so the lowest-index coeff sits at the MSB end for clz
+1: // one iteration per nonzero coeff
+ clz x3, x1 // x3 = zero mask bits ahead of the next nonzero coeff
+ lsr x6, x3, #2 // 4 mask bits per coeff: x6 = zero-run length
+ lsl x1, x1, x3 // skip the run; nonzero coeff's bits now start at bit 63
+ ldrb w7, [x5, x6] // w7 = table4[run]
+ cbz x1, 2f // NOTE(review): x1 was just shifted by its own clz so bit 63 is set; branch looks never taken
+ lsl x1, x1, #4 // drop the nibble of the coeff just handled
+ add w0, w0, w7 // score += table4[run]
+ cbnz x1, 1b // loop while nonzero coeffs remain
+ ret
+2:
+ add w0, w0, w7 // account for the last table entry before returning
+0:
+ ret
+9:
+ mov w0, #9 // fixed result when any |coeff| > 1
+ ret
+endfunc
+.endm
+
+decimate_score_1x 15
+decimate_score_1x 16
+
+const mask64, align=6
+ .byte 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 // one distinct bit per byte lane, MSB first (low 8 lanes)
+ .byte 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 // same weights repeated for the high 8 lanes
+endconst
+
+function x264_decimate_score64_neon, export=1
+ ld1 {v0.8h,v1.8h}, [x0], #32 // in: x0 -> 64 int16 coeffs; out: w0 = sum of table8[zero run] per nonzero coeff, or 9 if any |coeff| > 1
+ ld1 {v2.8h,v3.8h}, [x0], #32 // coeffs 16-31
+ ld1 {v4.8h,v5.8h}, [x0], #32 // coeffs 32-47
+ ld1 {v6.8h,v7.8h}, [x0] // coeffs 48-63
+ movrel x6, mask64 // x6 = address of the per-lane bit weights
+ movi v31.16b, #0x01 // per-byte threshold 1 for the |coeff| > 1 test
+ sqxtn v16.8b, v1.8h // saturating narrow to int8; halves swapped: coeffs 8-15 go to the low half
+ sqxtn2 v16.16b, v0.8h // coeffs 0-7 go to the high half
+ sqxtn v17.8b, v3.8h // coeffs 24-31 low
+ sqxtn2 v17.16b, v2.8h // coeffs 16-23 high
+ sqxtn v18.8b, v5.8h // coeffs 40-47 low
+ sqxtn2 v18.16b, v4.8h // coeffs 32-39 high
+ sqxtn v19.8b, v7.8h // coeffs 56-63 low
+ sqxtn2 v19.16b, v6.8h // coeffs 48-55 high
+ abs v4.16b, v16.16b // |coeff| per byte, group 0
+ abs v5.16b, v17.16b // group 1
+ abs v6.16b, v18.16b // group 2
+ abs v7.16b, v19.16b // group 3
+ ld1 {v30.16b}, [x6] // v30 = mask64 bit weights
+ cmeq v0.16b, v16.16b, #0 // 0xff where coeff == 0, group 0
+ cmeq v1.16b, v17.16b, #0 // group 1
+ cmeq v2.16b, v18.16b, #0 // group 2
+ cmeq v3.16b, v19.16b, #0 // group 3
+ umax v4.16b, v4.16b, v5.16b // lane-wise max |coeff| of groups 0 and 1
+ umax v6.16b, v6.16b, v7.16b // lane-wise max |coeff| of groups 2 and 3
+ and v0.16b, v0.16b, v30.16b // keep one distinct bit per lane where coeff == 0
+ and v1.16b, v1.16b, v30.16b // group 1
+ and v2.16b, v2.16b, v30.16b // group 2
+ and v3.16b, v3.16b, v30.16b // group 3
+ umax v4.16b, v4.16b, v6.16b // v4 = lane-wise max |coeff| over all four groups
+ addp v0.16b, v1.16b, v0.16b // pairwise add merges neighbouring lanes' bits (bits are distinct, so add acts as or)
+ addp v2.16b, v3.16b, v2.16b // same for groups 3 and 2
+ cmhi v4.16b, v4.16b, v31.16b // 0xff where max |coeff| > 1 (unsigned compare)
+ addp v0.16b, v2.16b, v0.16b // second fold: each byte now covers 4 coeffs
+ shrn v4.8b, v4.8h, #4 // compress the > 1 flags into d4
+ addp v0.16b, v0.16b, v0.16b // final fold: low 8 bytes each cover 8 coeffs
+ fmov x2, d4 // x2 != 0 iff some |coeff| > 1
+ fmov x1, d0 // x1 = zero mask, coeff k at bit 63-k
+ cbnz x2, 9f // large coefficient present: return 9
+ mvn x1, x1 // invert: x1 = mask of nonzero coeffs
+ mov w0, #0 // score = 0
+ cbz x1, 0f // no nonzero coeffs: return 0
+ movrel x5, X(x264_decimate_table8) // x5 = base of byte table, indexed by zero-run length
+1: // one iteration per nonzero coeff
+ clz x3, x1 // x3 = zero-run length ahead of the next nonzero coeff
+ lsl x1, x1, x3 // skip the run; nonzero coeff now at bit 63
+ ldrb w7, [x5, x3] // w7 = table8[run]
+ cbz x1, 2f // NOTE(review): x1 was just shifted by its own clz so bit 63 is set; branch looks never taken
+ lsl x1, x1, #1 // drop the bit of the coeff just handled
+ add w0, w0, w7 // score += table8[run]
+ cbnz x1, 1b // loop while nonzero coeffs remain
+ ret
+2:
+ add w0, w0, w7 // account for the last table entry before returning
+0:
+ ret
+9:
+ mov w0, #9 // fixed result when any |coeff| > 1
+ ret
+endfunc
+
// int coeff_last( int16_t *l )
function x264_coeff_last4_aarch64, export=1
ldr x2, [x0]
diff --git a/common/aarch64/quant.h b/common/aarch64/quant.h
index dfcac25..5a797c1 100644
--- a/common/aarch64/quant.h
+++ b/common/aarch64/quant.h
@@ -4,6 +4,7 @@
* Copyright (C) 2005-2014 x264 project
*
* Authors: David Conrad <lessen42 at gmail.com>
+ * Janne Grunau <janne-x264 at jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -38,6 +39,10 @@ void x264_dequant_4x4_dc_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp
void x264_dequant_4x4_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_neon( int16_t dct[64], int dequant_mf[6][64], int i_qp );
+int x264_decimate_score15_neon( int16_t * ); /* asm: quant-a.S, decimate_score_1x 15 */
+int x264_decimate_score16_neon( int16_t * ); /* asm: quant-a.S, decimate_score_1x 16 */
+int x264_decimate_score64_neon( int16_t * ); /* asm: quant-a.S, reads 64 coefficients */
+
int x264_coeff_last4_aarch64( int16_t * );
int x264_coeff_last8_aarch64( int16_t * );
int x264_coeff_last15_neon( int16_t * );
diff --git a/common/quant.c b/common/quant.c
index 31d8901..d1b89c0 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -714,7 +714,8 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
#endif // HAVE_MMX
#if HAVE_ALTIVEC
- if( cpu&X264_CPU_ALTIVEC ) {
+ if( cpu&X264_CPU_ALTIVEC )
+ {
pf->quant_2x2_dc = x264_quant_2x2_dc_altivec;
pf->quant_4x4_dc = x264_quant_4x4_dc_altivec;
pf->quant_4x4 = x264_quant_4x4_altivec;
@@ -754,6 +755,12 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->coeff_last4 = x264_coeff_last4_aarch64;
pf->coeff_last8 = x264_coeff_last8_aarch64;
}
+ if( cpu&X264_CPU_NEON )
+ {
+ pf->decimate_score15 = x264_decimate_score15_neon;
+ pf->decimate_score16 = x264_decimate_score16_neon;
+ pf->decimate_score64 = x264_decimate_score64_neon;
+ }
#endif
#endif // HIGH_BIT_DEPTH
pf->coeff_last[DCT_LUMA_DC] = pf->coeff_last[DCT_CHROMAU_DC] = pf->coeff_last[DCT_CHROMAV_DC] =
More information about the x264-devel
mailing list