[x264-devel] [PATCH 09/11] aarch64: NEON asm for decimate_score
Janne Grunau
janne-x264 at jannau.net
Fri Aug 22 17:26:42 CEST 2014
decimate_score15 and 16 are 60% faster, decimate_score64 is 4 times
faster than C.
---
common/aarch64/quant-a.S | 112 +++++++++++++++++++++++++++++++++++++++++++++++
common/aarch64/quant.h | 4 ++
common/quant.c | 6 +++
3 files changed, 122 insertions(+)
diff --git a/common/aarch64/quant-a.S b/common/aarch64/quant-a.S
index 02b71b2..2b12304 100644
--- a/common/aarch64/quant-a.S
+++ b/common/aarch64/quant-a.S
@@ -300,6 +300,118 @@ dequant_4x4_dc_rshift:
ret
endfunc
+// int x264_decimate_score{15,16}_neon( int16_t *dct ): x0 = dct, result in w0.
+// Returns 9 if any saturated |coef| > 1, 0 if all 16 loaded coefs are zero,
+// else the sum of x264_decimate_table4[run] over the nonzero coefs, where
+// run = number of zero coefs since the previous nonzero one (low index first).
+.macro decimate_score_1x size
+function x264_decimate_score\size\()_neon, export=1
+ ld1 {v0.8h,v1.8h}, [x0] // 16 int16 coefs
+ movrel x5, X(x264_decimate_table4)
+ movi v3.16b, #0x01
+ sqxtn v0.8b, v0.8h // saturate to int8 so all 16 coefs fit in one vector
+ sqxtn2 v0.16b, v1.8h
+ abs v2.16b, v0.16b
+ cmeq v1.16b, v0.16b, #0 // 0xff for every zero coef
+ cmhi v2.16b, v2.16b, v3.16b // 0xff for every coef with |coef| > 1 (unsigned compare)
+ shrn v1.8b, v1.8h, #4 // compress the byte masks to 4 bits per coef (64 bits total)
+ shrn v2.8b, v2.8h, #4
+ fmov x2, d2
+ fmov x1, d1
+ cbnz x2, 9f // at least one |coef| > 1
+ mvn x1, x1 // x1 = nibble mask of the nonzero coefs
+ mov w0, #0
+ cbz x1, 0f // every coef is zero
+.ifc \size, 15
+ lsr x1, x1, #1 // coef k >= 1 now yields clz = 4k-1, so clz>>2 = k-1; coef 0 presumably an unused DC slot - verify callers
+.endif
+ rbit x1, x1 // coef 0 moves to the top so clz finds the lowest nonzero coef
+1:
+ clz x3, x1
+ lsr x6, x3, #2 // run length in coefs (4 mask bits each)
+ lsl x1, x1, x3 // x1 != 0 and clz < 64, so bit 63 is set now and x1 is still nonzero
+ ldrb w7, [x5, x6]
+ lsl x1, x1, #4 // drop the nibble of the coef just scored
+ add w0, w0, w7
+ cbnz x1, 1b
+0:
+ ret
+9:
+ mov w0, #9
+ ret
+endfunc
+.endm
+
+decimate_score_1x 15
+decimate_score_1x 16
+
+const mask64, align=6
+ .byte 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 // one distinct bit per byte, 0x80 >> i for byte i
+ .byte 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 // same pattern for the upper 8 bytes of the 16-byte vector
+endconst
+
+// int x264_decimate_score64_neon( int16_t *dct ): x0 = dct, result in w0.
+// Returns 9 if any saturated |coef| > 1, 0 if all 64 coefs are zero, else
+// the sum of x264_decimate_table8[run] over the nonzero coefs, where run is
+// the count of zero coefs since the previous nonzero one (low index first).
+function x264_decimate_score64_neon, export=1
+ ld1 {v0.8h,v1.8h}, [x0], #32 // coefs 0-15
+ ld1 {v2.8h,v3.8h}, [x0], #32 // coefs 16-31
+ ld1 {v4.8h,v5.8h}, [x0], #32 // coefs 32-47
+ ld1 {v6.8h,v7.8h}, [x0] // coefs 48-63
+ movrel x6, mask64
+ movi v31.16b, #0x01
+ sqxtn v16.8b, v1.8h // saturate to int8; halves are swapped on purpose:
+ sqxtn2 v16.16b, v0.8h // coefs 8-15 land in the low bytes, 0-7 in the high bytes
+ sqxtn v17.8b, v3.8h
+ sqxtn2 v17.16b, v2.8h
+ sqxtn v18.8b, v5.8h
+ sqxtn2 v18.16b, v4.8h
+ sqxtn v19.8b, v7.8h
+ sqxtn2 v19.16b, v6.8h
+ abs v4.16b, v16.16b
+ abs v5.16b, v17.16b
+ abs v6.16b, v18.16b
+ abs v7.16b, v19.16b
+ ld1 {v30.16b}, [x6] // per-byte bit weights from mask64
+ cmeq v0.16b, v16.16b, #0 // 0xff for every zero coef
+ cmeq v1.16b, v17.16b, #0
+ cmeq v2.16b, v18.16b, #0
+ cmeq v3.16b, v19.16b, #0
+ umax v4.16b, v4.16b, v5.16b // bytewise max of |coef| over the four vectors
+ umax v6.16b, v6.16b, v7.16b
+ and v0.16b, v0.16b, v30.16b // keep one distinct bit per zero coef
+ and v1.16b, v1.16b, v30.16b
+ and v2.16b, v2.16b, v30.16b
+ and v3.16b, v3.16b, v30.16b
+ umax v4.16b, v4.16b, v6.16b
+ addp v0.16b, v1.16b, v0.16b // three pairwise-add rounds fold the 64 byte flags
+ addp v2.16b, v3.16b, v2.16b
+ cmhi v4.16b, v4.16b, v31.16b // 0xff where some |coef| > 1 (unsigned compare)
+ addp v0.16b, v2.16b, v0.16b
+ shrn v4.8b, v4.8h, #4 // narrow to 64 bits so one GPR test covers it
+ addp v0.16b, v0.16b, v0.16b // d0 = zero-coef bitmask, coef k at bit 63-k
+ fmov x2, d4
+ fmov x1, d0
+ cbnz x2, 9f // at least one |coef| > 1
+ mvn x1, x1 // x1 = bitmask of the nonzero coefs
+ mov w0, #0
+ cbz x1, 0f // every coef is zero
+ movrel x5, X(x264_decimate_table8)
+1:
+ clz x3, x1 // run of zero coefs before the next nonzero one
+ lsl x1, x1, x3 // x1 != 0 and clz < 64, so bit 63 is set now and x1 is still nonzero
+ ldrb w7, [x5, x3]
+ lsl x1, x1, #1 // drop the bit of the coef just scored
+ add w0, w0, w7
+ cbnz x1, 1b
+0:
+ ret
+9:
+ mov w0, #9
+ ret
+endfunc
+
// int coeff_last( int16_t *l )
function x264_coeff_last4_aarch64, export=1
ldr x2, [x0]
diff --git a/common/aarch64/quant.h b/common/aarch64/quant.h
index dfcac25..2699129 100644
--- a/common/aarch64/quant.h
+++ b/common/aarch64/quant.h
@@ -38,6 +38,10 @@ void x264_dequant_4x4_dc_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp
void x264_dequant_4x4_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_neon( int16_t dct[64], int dequant_mf[6][64], int i_qp );
+int x264_decimate_score15_neon( int16_t * ); /* NEON asm, common/aarch64/quant-a.S */
+int x264_decimate_score16_neon( int16_t * ); /* NEON asm, common/aarch64/quant-a.S */
+int x264_decimate_score64_neon( int16_t * ); /* NEON asm, common/aarch64/quant-a.S */
+
int x264_coeff_last4_aarch64( int16_t * );
int x264_coeff_last8_aarch64( int16_t * );
int x264_coeff_last15_neon( int16_t * );
diff --git a/common/quant.c b/common/quant.c
index d7b6911..42a3a7a 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -754,6 +754,12 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->coeff_last4 = x264_coeff_last4_aarch64;
pf->coeff_last8 = x264_coeff_last8_aarch64;
}
+ if( cpu&X264_CPU_NEON )
+ {
+ pf->decimate_score15 = x264_decimate_score15_neon;
+ pf->decimate_score16 = x264_decimate_score16_neon;
+ pf->decimate_score64 = x264_decimate_score64_neon;
+ }
#endif
#endif // HIGH_BIT_DEPTH
pf->coeff_last[DCT_LUMA_DC] = pf->coeff_last[DCT_CHROMAU_DC] = pf->coeff_last[DCT_CHROMAV_DC] =
--
2.0.4
More information about the x264-devel
mailing list