[x264-devel] [PATCH 10/11] arm: Implement x264_decimate_score15/16/64_neon
Martin Storsjö
martin at martin.st
Tue Aug 25 13:38:19 CEST 2015
checkasm timing Cortex-A7 A8 A9
decimate_score15_c 764 736 535
decimate_score15_neon 487 494 453
decimate_score16_c 782 727 553
decimate_score16_neon 487 494 521
decimate_score64_c 2361 2597 2011
decimate_score64_neon 1017 802 785
---
Applied most of Janne's comments, except some that were inconclusive
or didn't turn out to help.
---
common/aarch64/quant-a.S | 1 +
common/arm/quant-a.S | 138 ++++++++++++++++++++++++++++++++++++++++++++++
common/arm/quant.h | 4 ++
common/quant.c | 6 +-
4 files changed, 146 insertions(+), 3 deletions(-)
diff --git a/common/aarch64/quant-a.S b/common/aarch64/quant-a.S
index 3e7e35e..4090900 100644
--- a/common/aarch64/quant-a.S
+++ b/common/aarch64/quant-a.S
@@ -5,6 +5,7 @@
*
* Authors: David Conrad <lessen42 at gmail.com>
* Janne Grunau <janne-x264 at jannau.net>
+ * Martin Storsjo <martin at martin.st>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
diff --git a/common/arm/quant-a.S b/common/arm/quant-a.S
index e63170e..7a2667f 100644
--- a/common/arm/quant-a.S
+++ b/common/arm/quant-a.S
@@ -32,6 +32,14 @@ pmovmskb_byte:
.byte 1,2,4,8,16,32,64,128
.byte 1,2,4,8,16,32,64,128
+mask_2bit: // AND masks that give each of 4 adjacent byte lanes its own 2-bit field (used by decimate_score15/16)
+.byte 3,12,48,192,3,12,48,192
+.byte 3,12,48,192,3,12,48,192
+
+mask_1bit: // AND masks that give each of 8 adjacent byte lanes its own bit, lane 0 = MSB (used by decimate_score64)
+.byte 128,64,32,16,8,4,2,1
+.byte 128,64,32,16,8,4,2,1
+
.text
.macro QUANT_TWO bias0 bias1 mf0 mf1 mf2 mf3 mask load_mf=no
@@ -308,6 +316,136 @@ dequant_4x4_dc_rshift:
bx lr
endfunc
+.macro decimate_score_1x size
+function x264_decimate_score\size\()_neon // in: r0 = int16_t coefficients; out: r0 = score, or 9 when any loaded magnitude exceeds 1
+ vld1.16 {q0, q1}, [r0, :128] // q0,q1 = 16 int16 coefficients
+ movrel r3, mask_2bit // r3 = address of the 2-bit lane masks
+ vmov.s8 q3, #0x01 // q3 = 1 in every byte
+ vqmovn.s16 d0, q0 // saturating narrow to int8: d0 = coefficients 0..7
+ vqmovn.s16 d1, q1 // d1 = coefficients 8..15, so q0 holds all 16 as bytes
+ vqabs.s8 q2, q0 // q2 = saturating absolute value of each coefficient
+ vld1.8 {q8}, [r3, :128] // q8 = mask_2bit
+ vceq.s8 q1, q0, #0 // q1 = 0xff in lanes where the coefficient is zero
+ vcgt.s8 q2, q2, q3 // q2 = 0xff in lanes where the magnitude is above 1
+ vand.u8 q1, q1, q8 // reduce each zero lane to its own 2-bit field
+ vshrn.u16 d4, q2, #4 // squeeze the 128-bit magnitude mask into 64 bits
+ vpadd.u8 d2, d2, d3 // merge the zero-mask fields of adjacent lanes
+ vpadd.u8 d4, d4, d4 // fold the magnitude mask into the low 32 bits of d4
+ vpadd.u8 d2, d2, d2 // low 32 bits of d2: bits 2i+1:2i are set when coefficient i is zero
+ vmov.32 r2, d4[0] // r2 = folded magnitude mask
+ vmov.32 r1, d2[0] // r1 = zero mask, 2 bits per coefficient
+ cmp r2, #0
+ beq 0f // no magnitude above 1: go compute the score
+ mov r0, #9 // otherwise the result is 9
+ bx lr
+0:
+ mvns r1, r1 // r1 = nonzero mask; Z is set when every coefficient is zero
+ mov r0, #0 // score accumulator (mov leaves the flags from mvns intact)
+ bxeq lr // all zero: return 0
+.ifc \size, 15
+ lsr r1, r1, #2 // size 15: drop the field of element 0. NOTE(review): the checks above still include element 0; presumably callers keep it zero - confirm
+.endif
+ rbit r1, r1 // bit-reverse so the lowest remaining index sits in the top bits
+ movrel r3, X(x264_decimate_table4) // r3 = table of byte scores, indexed by zero-run length
+1:
+ clz r2, r1 // r2 = 2 * length of the zero run before the next nonzero coefficient
+ lsl r1, r1, r2 // discard that run
+ lsr r12, r2, #1 // r12 = run length
+ ldrb r2, [r3, r12] // r2 = table entry for this run
+ lsls r1, r1, #2 // discard the nonzero coefficient; Z is set when none remain
+ add r0, r0, r2 // accumulate the score
+ bne 1b
+ bx lr
+endfunc
+.endm
+
+decimate_score_1x 15
+decimate_score_1x 16
+
+function x264_decimate_score64_neon // in: r0 = 64 int16_t coefficients; out: r0 = score, or 9 when any magnitude exceeds 1
+ push {lr} // lr is used as a scratch counter below
+ vld1.16 {q8, q9}, [r0, :128]! // coefficients 0..15
+ vld1.16 {q10, q11}, [r0, :128]! // coefficients 16..31
+ vld1.16 {q12, q13}, [r0, :128]! // coefficients 32..47
+ vld1.16 {q14, q15}, [r0, :128] // coefficients 48..63
+ movrel r3, mask_1bit // r3 = address of the 1-bit lane masks
+ vmov.s8 q3, #0x01 // q3 = 1 in every byte
+ vqmovn.s16 d17, q8 // saturating narrow to int8: d17 = coefficients 0..7
+ vqmovn.s16 d16, q9 // d16 = coefficients 8..15
+ vqmovn.s16 d19, q10 // d19 = coefficients 16..23
+ vqmovn.s16 d18, q11 // d18 = coefficients 24..31
+ vqmovn.s16 d21, q12 // d21 = coefficients 32..39
+ vqmovn.s16 d20, q13 // d20 = coefficients 40..47
+ vqmovn.s16 d23, q14 // d23 = coefficients 48..55
+ vqmovn.s16 d22, q15 // d22 = coefficients 56..63
+ vqabs.s8 q12, q8 // q12..q15 = saturating absolute values of q8..q11
+ vqabs.s8 q13, q9
+ vqabs.s8 q14, q10
+ vqabs.s8 q15, q11
+ vld1.8 {q2}, [r3, :128] // q2 = mask_1bit
+ vceq.s8 q8, q8, #0 // q8..q11 = 0xff in lanes where the coefficient is zero
+ vceq.s8 q9, q9, #0
+ vceq.s8 q10, q10, #0
+ vceq.s8 q11, q11, #0
+ vmax.s8 q12, q12, q13 // start reducing the magnitudes to a lane-wise maximum
+ vmax.s8 q14, q14, q15
+ vand.u8 q8, q8, q2 // reduce each zero lane to its own single bit
+ vand.u8 q9, q9, q2
+ vand.u8 q10, q10, q2
+ vand.u8 q11, q11, q2
+ vmax.s8 q12, q12, q14 // q12 = lane-wise maximum magnitude over all four vectors
+ vpadd.u8 d18, d18, d19 // pairwise adds merge the per-lane bits: d18 = 24..31 then 16..23
+ vpadd.u8 d19, d16, d17 // d19 = 8..15 then 0..7
+ vcgt.s8 q12, q12, q3 // q12 = 0xff in lanes where some magnitude is above 1
+ vpadd.u8 d22, d22, d23 // d22 = 56..63 then 48..55
+ vpadd.u8 d23, d20, d21 // d23 = 40..47 then 32..39
+ vshrn.u16 d24, q12, #4 // squeeze the 128-bit magnitude mask into 64 bits
+ vpadd.u8 d16, d22, d23 // d16 = zero bits of coefficients 63..32, two bytes per group of 8
+ vpadd.u8 d17, d18, d19 // d17 = zero bits of coefficients 31..0, two bytes per group of 8
+ vpadd.u8 d24, d24, d24 // fold the magnitude mask into the low 32 bits of d24
+ vpadd.u8 d16, d16, d17 // d16 = 64-bit zero mask, coefficient i at bit 63-i
+ vmov.32 r2, d24[0] // r2 = folded magnitude mask
+ vmov r12, r1, d16 // r1 = coefficients 0..31 (0 at bit 31), r12 = coefficients 32..63 (32 at bit 31)
+ cmp r2, #0
+ beq 0f // no magnitude above 1: go compute the score
+ mov r0, #9 // otherwise the result is 9
+ pop {pc}
+0:
+ mvns r1, r1 // r1 = nonzero mask of 0..31; Z is set when all of them are zero
+ mvn r12, r12 // r12 = nonzero mask of 32..63 (flags untouched)
+ mov r0, #0 // score accumulator
+ mov lr, #32 // lr = number of positions in 0..31 not yet consumed
+ movrel r3, X(x264_decimate_table8) // r3 = table of byte scores, indexed by zero-run length
+ beq 2f // consumes the flags from mvns: skip the first loop when 0..31 are all zero
+1:
+ clz r2, r1 // r2 = length of the zero run before the next nonzero coefficient
+ lsl r1, r1, r2 // discard that run
+ sub lr, lr, r2 // account for the run
+ ldrb r2, [r3, r2] // r2 = table entry for this run
+ lsls r1, r1, #1 // discard the nonzero coefficient; Z is set when none remain
+ sub lr, lr, #1 // account for the nonzero coefficient
+ add r0, r0, r2 // accumulate the score
+ bne 1b
+2:
+ cmp r12, #0
+ popeq {pc} // nothing nonzero in 32..63: return the score so far
+
+ clz r2, r12 // zero run at the start of the second word
+ lsl r1, r12, r2 // continue in r1 with the second word, run discarded
+ add r2, r2, lr // extend the run with the zeros left over at the end of the first word
+ ldrb r2, [r3, r2] // r2 = table entry for the combined run
+ lsls r1, r1, #1 // discard the nonzero coefficient; Z is set when none remain
+ add r0, r0, r2 // accumulate the score
+ popeq {pc}
+3:
+ clz r2, r1 // same loop as 1: but without the lr bookkeeping
+ lsl r1, r1, r2
+ ldrb r2, [r3, r2]
+ lsls r1, r1, #1
+ add r0, r0, r2
+ bne 3b
+ pop {pc}
+endfunc
// int coeff_last( int16_t *l )
function x264_coeff_last4_arm
diff --git a/common/arm/quant.h b/common/arm/quant.h
index 78178e8..2ec91eb 100644
--- a/common/arm/quant.h
+++ b/common/arm/quant.h
@@ -38,6 +38,10 @@ void x264_dequant_4x4_dc_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp
void x264_dequant_4x4_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_neon( int16_t dct[64], int dequant_mf[6][64], int i_qp );
+int x264_decimate_score15_neon( int16_t * );
+int x264_decimate_score16_neon( int16_t * );
+int x264_decimate_score64_neon( int16_t * );
+
int x264_coeff_last4_arm( int16_t * );
int x264_coeff_last8_arm( int16_t * );
int x264_coeff_last15_neon( int16_t * );
diff --git a/common/quant.c b/common/quant.c
index f8279a7..be000ec 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -751,6 +751,9 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_neon;
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_neon;
pf->denoise_dct = x264_denoise_dct_neon;
+ pf->decimate_score15 = x264_decimate_score15_neon;
+ pf->decimate_score16 = x264_decimate_score16_neon;
+ pf->decimate_score64 = x264_decimate_score64_neon;
}
#endif
#if ARCH_AARCH64
@@ -765,9 +768,6 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->coeff_level_run8 = x264_coeff_level_run8_neon;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_neon;
pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_neon;
- pf->decimate_score15 = x264_decimate_score15_neon;
- pf->decimate_score16 = x264_decimate_score16_neon;
- pf->decimate_score64 = x264_decimate_score64_neon;
}
#endif
--
1.7.10.4
More information about the x264-devel
mailing list