[x264-devel] [PATCH 1/1] arm: make the combined x264_pixel_sa8d_satd_16x16_neon faster
Janne Grunau
janne-x264 at jannau.net
Wed Aug 19 14:57:48 CEST 2015
On 2015-08-13 23:59:42 +0300, Martin Storsjö wrote:
> This requires spilling some registers to the stack,
> contrary to the aarch64 version.
There are barely enough registers to use the same approach as on arm64
(AArch32 NEON has only 16 q registers, versus the 32 v registers available
on AArch64); see below.
> checkasm timing                  Cortex-A7     A8     A9
> sa8d_satd_16x16_neon                 14393   7427   9146
> sa8d_satd_16x16_separate_neon        14624   7074   8294
This should make the combined version faster on all three CPUs; see my
Cortex-A9 results below.
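
For context on what the new macro does: integrated_satd in the squashed
patch below finishes the horizontal transform with the usual satd shortcut,
replacing the last butterfly stage plus two absolute values with abs
followed by max, via the identity |a+b| + |a-b| == 2*max(|a|, |b|); satd is
conventionally normalized by a final halving, so the max folds that halving
in for free. A minimal C sketch of the identity (function names are mine,
not from the patch):

#include <assert.h>
#include <stdlib.h>

/* last Hadamard stage done literally: butterfly, then two abs */
static int pair_butterfly(int a, int b)
{
    return abs(a + b) + abs(a - b);
}

/* the shortcut used by integrated_satd: two abs and a max,
 * which equals the butterfly result already divided by 2 */
static int pair_absmax(int a, int b)
{
    int aa = abs(a), ab = abs(b);
    return 2 * (aa > ab ? aa : ab);
}

int main(void)
{
    for (int a = -255; a <= 255; a++)
        for (int b = -255; b <= 255; b++)
            assert(pair_butterfly(a, b) == pair_absmax(a, b));
    return 0;
}
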
Feel free to squash this patch.
Janne
---8<---
On a Cortex-A9:

sa8d_satd_16x16_neon:          9162  (without patch)
sa8d_satd_16x16_neon:          7498  (with patch)
sa8d_satd_16x16_separate_neon: 8584
---
common/arm/pixel-a.S | 60 ++++++++++++++++++++++++++++++++--------------------
1 file changed, 37 insertions(+), 23 deletions(-)
diff --git a/common/arm/pixel-a.S b/common/arm/pixel-a.S
index 0376bf2..47ff03b 100644
--- a/common/arm/pixel-a.S
+++ b/common/arm/pixel-a.S
@@ -1132,6 +1132,33 @@ endfunc
SUMSUB_ABCD \r1, \r3, \r2, \r4, \t1, \t3, \t2, \t4
.endm
+.macro integrated_satd dst, s0, s1, s2, s3
+ vmov q0, \s0
+ vmov q1, \s1
+ vmov q2, \s2
+ vmov q3, \s3
+
+ vtrn.16 q0, q1
+ vtrn.16 q2, q3
+
+ SUMSUB_AB q6, q7, q0, q1
+ SUMSUB_AB q0, q1, q2, q3
+
+ vtrn.32 q6, q0
+ vtrn.32 q7, q1
+
+ vabs.s16 q6, q6
+ vabs.s16 q0, q0
+ vabs.s16 q7, q7
+ vabs.s16 q1, q1
+
+ vmax.u16 q6, q6, q0
+ vmax.u16 q7, q7, q1
+
+ vadd.i16 q6, q6, q7
+ vpadal.u16 \dst, q6
+.endm
+
.macro sa8d_satd_8x8 satd=
function x264_sa8d_\satd\()8x8_neon, export=0
LOAD_DIFF_8x4 q8, q9, q10, q11
@@ -1152,23 +1179,13 @@ function x264_sa8d_\satd\()8x8_neon, export=0
vld1.64 {d0}, [r0,:64], r1
vsubl.u8 q15, d0, d1
+ HADAMARD4_V q12, q13, q14, q15, q0, q1, q2, q3
+
.ifc \satd, satd_
- vmov q6, q8
- vmov q7, q9
- vpush {q10-q15}
- mov ip, lr
- bl x264_satd_8x4v_8x8h_neon
- vpadal.u16 q4, q12
- vpadal.u16 q4, q13
- vpadal.u16 q4, q14
- vpadal.u16 q4, q15
- mov lr, ip
- vpop {q10-q15}
- vmov q8, q6
- vmov q9, q7
+ integrated_satd q4, q8, q9, q10, q11
+ integrated_satd q4, q12, q13, q14, q15
.endif
- HADAMARD4_V q12, q13, q14, q15, q0, q1, q2, q3
SUMSUB_ABCD q0, q8, q1, q9, q8, q12, q9, q13
SUMSUB_AB q2, q10, q10, q14
vtrn.16 q8, q9
@@ -1220,8 +1237,6 @@ function x264_pixel_sa8d_satd_16x16_neon
vpush {q4-q7}
vmov.u32 q4, #0
vmov.u32 q5, #0
- vmov.u32 q6, #0
- vmov.u32 q7, #0
bl x264_sa8d_satd_8x8_neon
bl x264_sa8d_satd_8x8_neon
sub r0, r0, r1, lsl #4
@@ -1230,15 +1245,14 @@ function x264_pixel_sa8d_satd_16x16_neon
add r2, r2, #8
bl x264_sa8d_satd_8x8_neon
bl x264_sa8d_satd_8x8_neon
+ vadd.u32 d1, d10, d11
vadd.u32 d0, d8, d9
- vadd.u32 d2, d10, d11
- vpaddl.u32 d0, d0
- vpaddl.u32 d2, d2
- vpop {q4-q7}
- vmov.32 r0, d2[0]
- add r0, r0, #1
- lsr r0, r0, #1
+ vpadd.u32 d1, d1, d1
+ vpadd.u32 d0, d0, d0
+ vrshr.u32 d1, d1, #1
vmov.32 r1, d0[0]
+ vmov.32 r0, d1[0]
+ vpop {q4-q7}
pop {pc}
endfunc
--
2.5.0
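
A note on the epilogue change above: vrshr.u32 #n is NEON's rounding shift
right, computing (x + (1 << (n - 1))) >> n per lane, so the single
vrshr.u32 d1, d1, #1 reproduces the scalar add #1 / lsr #1 pair it replaces
without leaving the NEON register file. A quick C sketch of the per-lane
equivalence (helper names are mine, not from the patch):

#include <assert.h>
#include <stdint.h>

/* the old scalar epilogue: add r0, r0, #1; lsr r0, r0, #1 */
static uint32_t scalar_halve(uint32_t x)
{
    return (x + 1) >> 1;
}

/* per-lane effect of vrshr.u32 #n, here with n = 1 */
static uint32_t vrshr_u32_1(uint32_t x)
{
    return (x + (1u << 0)) >> 1;
}

int main(void)
{
    for (uint32_t x = 0; x < (1u << 20); x++)
        assert(scalar_halve(x) == vrshr_u32_1(x));
    return 0;
}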