[x264-devel] arm: Implement x264_pixel_sa8d_satd_16x16_neon
Martin Storsjö
git at videolan.org
Sun Oct 11 19:01:05 CEST 2015
x264 | branch: master | Martin Storsjö <martin at martin.st> | Tue Aug 25 14:38:17 2015 +0300| [e8b95e92792d9353277995043757430cf3dc3bf7] | committer: Henrik Gramner
arm: Implement x264_pixel_sa8d_satd_16x16_neon
This requires spilling some registers to the stack,
contrary to the aarch64 version.
checkasm timing Cortex-A7 A8 A9
sa8d_satd_16x16_neon 12936 6365 7492
sa8d_satd_16x16_separate_neon 14841 6605 8324
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=e8b95e92792d9353277995043757430cf3dc3bf7
---
common/arm/pixel-a.S | 68 +++++++++++++++++++++++++++++++++++++++++++++++++-
common/arm/pixel.h | 1 +
common/pixel.c | 1 +
3 files changed, 69 insertions(+), 1 deletion(-)
diff --git a/common/arm/pixel-a.S b/common/arm/pixel-a.S
index bbe082d..41559b8 100644
--- a/common/arm/pixel-a.S
+++ b/common/arm/pixel-a.S
@@ -1130,7 +1130,35 @@ endfunc
SUMSUB_ABCD \r1, \r3, \r2, \r4, \t1, \t3, \t2, \t4
.endm
-function x264_sa8d_8x8_neon, export=0
+.macro integrated_satd dst, s0, s1, s2, s3
+ vmov q0, \s0
+ vmov q1, \s1
+ vmov q2, \s2
+ vmov q3, \s3
+
+ vtrn.16 q0, q1
+ vtrn.16 q2, q3
+
+ SUMSUB_AB q6, q7, q0, q1
+ SUMSUB_AB q0, q1, q2, q3
+
+ vtrn.32 q6, q0
+ vtrn.32 q7, q1
+
+ vabs.s16 q6, q6
+ vabs.s16 q0, q0
+ vabs.s16 q7, q7
+ vabs.s16 q1, q1
+
+ vmax.u16 q6, q6, q0
+ vmax.u16 q7, q7, q1
+
+ vadd.i16 q6, q6, q7
+ vpadal.u16 \dst, q6
+.endm
+
+.macro sa8d_satd_8x8 satd=
+function x264_sa8d_\satd\()8x8_neon, export=0
LOAD_DIFF_8x4 q8, q9, q10, q11
vld1.64 {d7}, [r2], r3
SUMSUB_AB q0, q1, q8, q9
@@ -1150,6 +1178,12 @@ function x264_sa8d_8x8_neon, export=0
vsubl.u8 q15, d0, d1
HADAMARD4_V q12, q13, q14, q15, q0, q1, q2, q3
+
+.ifc \satd, satd_
+ integrated_satd q4, q8, q9, q10, q11
+ integrated_satd q4, q12, q13, q14, q15
+.endif
+
SUMSUB_ABCD q0, q8, q1, q9, q8, q12, q9, q13
SUMSUB_AB q2, q10, q10, q14
vtrn.16 q8, q9
@@ -1185,8 +1219,40 @@ function x264_sa8d_8x8_neon, export=0
vmax.s16 q11, q3, q15
vadd.i16 q8, q8, q9
vadd.i16 q9, q10, q11
+.ifc \satd, satd_
+ vpadal.u16 q5, q8
+ vpadal.u16 q5, q9
+.endif
bx lr
endfunc
+.endm
+
+sa8d_satd_8x8
+sa8d_satd_8x8 satd_
+
+function x264_pixel_sa8d_satd_16x16_neon
+ push {lr}
+ vpush {q4-q7}
+ vmov.u32 q4, #0
+ vmov.u32 q5, #0
+ bl x264_sa8d_satd_8x8_neon
+ bl x264_sa8d_satd_8x8_neon
+ sub r0, r0, r1, lsl #4
+ sub r2, r2, r3, lsl #4
+ add r0, r0, #8
+ add r2, r2, #8
+ bl x264_sa8d_satd_8x8_neon
+ bl x264_sa8d_satd_8x8_neon
+ vadd.u32 d1, d10, d11
+ vadd.u32 d0, d8, d9
+ vpadd.u32 d1, d1, d1
+ vpadd.u32 d0, d0, d0
+ vrshr.u32 d1, d1, #1
+ vmov.32 r1, d0[0]
+ vmov.32 r0, d1[0]
+ vpop {q4-q7}
+ pop {pc}
+endfunc
.macro HADAMARD_AC w h
diff --git a/common/arm/pixel.h b/common/arm/pixel.h
index 81c21dc..d84808b 100644
--- a/common/arm/pixel.h
+++ b/common/arm/pixel.h
@@ -58,6 +58,7 @@ int x264_pixel_vsad_neon( uint8_t *, intptr_t, int );
int x264_pixel_sa8d_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t );
int x264_pixel_sa8d_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
+uint64_t x264_pixel_sa8d_satd_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
uint64_t x264_pixel_var_8x8_neon ( uint8_t *, intptr_t );
uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t );
diff --git a/common/pixel.c b/common/pixel.c
index 9904b17..7da0340 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -1375,6 +1375,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT4( hadamard_ac, _neon );
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_neon;
pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_neon;
+ pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_neon;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_neon;
pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_neon;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_neon;
More information about the x264-devel
mailing list