[x264-devel] [PATCH 02/11] arm: Add neon versions of vsad, asd8 and ssd_nv12_core
Martin Storsjö
martin at martin.st
Tue Aug 25 13:38:11 CEST 2015
These are straight translations of the aarch64 versions.
checkasm timing (cycles)   Cortex-A7   Cortex-A8   Cortex-A9
vsad_c                         16234       10984        9850
vsad_neon                       2132        1020         789
asd8_c                          5859        3561        3543
asd8_neon                       1407        1279        1250
ssd_nv12_c                    608096      591072      426285
ssd_nv12_neon                  72752       33549       41347
---
Avoid needlessly clobbering registers, use bic instead of and.
---
common/arm/pixel-a.S | 127 ++++++++++++++++++++++++++++++++++++++++++++++++++
common/arm/pixel.h | 6 +++
common/pixel.c | 3 ++
3 files changed, 136 insertions(+)
diff --git a/common/arm/pixel-a.S b/common/arm/pixel-a.S
index 36858bc..bbe082d 100644
--- a/common/arm/pixel-a.S
+++ b/common/arm/pixel-a.S
@@ -4,6 +4,7 @@
* Copyright (C) 2009-2015 x264 project
*
* Authors: David Conrad <lessen42 at gmail.com>
+ * Janne Grunau <janne-x264 at jannau.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -388,6 +389,59 @@ SAD_X_FUNC 4, 8, 16
SAD_X_FUNC 4, 16, 8
SAD_X_FUNC 4, 16, 16
+@ int x264_pixel_vsad_neon( uint8_t *src, intptr_t stride, int height )
+@ Vertical SAD of a 16-byte-wide column: sums |row[y] - row[y+1]| over all
+@ adjacent row pairs.  AAPCS: r0 = src, r1 = stride, r2 = height.
+@ Rows alternate between q0 and q1 so each loaded row is reused for the
+@ pair above it and the pair below it.
+function x264_pixel_vsad_neon
+ subs r2, r2, #2 @ the setup below consumes two rows
+ vld1.8 {q0}, [r0], r1 @ q0 = row 0
+ vld1.8 {q1}, [r0], r1 @ q1 = row 1
+ vabdl.u8 q2, d0, d2 @ q2/q3 = widened |row0 - row1| (u16 accumulators)
+ vabdl.u8 q3, d1, d3
+ ble 2f @ height <= 2: nothing further to accumulate
+1:
+ subs r2, r2, #2
+ vld1.8 {q0}, [r0], r1 @ q0 = next even row
+ vabal.u8 q2, d2, d0 @ accumulate |odd row - even row|
+ vabal.u8 q3, d3, d1
+ vld1.8 {q1}, [r0], r1 @ q1 = next odd row
+ blt 2f @ NOTE(review): for odd height this q1 load reads one
+ vabal.u8 q2, d0, d2 @ row past the last used pair - confirm callers
+ vabal.u8 q3, d1, d3 @ only pass even heights
+ bgt 1b
+2:
+ vadd.u16 q0, q2, q3 @ fold the two u16 accumulators
+ HORIZ_ADD d0, d0, d1 @ horizontal sum -> scalar in d0[0]
+ vmov.32 r0, d0[0] @ return value
+ bx lr
+endfunc
+
+@ int x264_pixel_asd8_neon( uint8_t *pix1, intptr_t stride1,
+@                           uint8_t *pix2, intptr_t stride2, int height )
+@ Absolute sum of differences of an 8-byte-wide block: |sum(pix1 - pix2)|
+@ (sum first, absolute value last - not a SAD).  The fifth argument
+@ (height) is passed on the stack per AAPCS.
+function x264_pixel_asd8_neon
+ ldr r12, [sp, #0] @ r12 = height
+ sub r12, r12, #2 @ two rows are handled by the pipelined pre/postamble
+ vld1.8 {d0}, [r0], r1 @ row 0 of pix1
+ vld1.8 {d1}, [r2], r3 @ row 0 of pix2
+ vld1.8 {d2}, [r0], r1 @ row 1 of pix1
+ vld1.8 {d3}, [r2], r3 @ row 1 of pix2
+ vsubl.u8 q8, d0, d1 @ q8 = widened diff accumulator (s16 two's complement)
+1:
+ subs r12, r12, #2
+ vld1.8 {d4}, [r0], r1
+ vld1.8 {d5}, [r2], r3
+ vsubl.u8 q9, d2, d3 @ diff of the row preloaded last iteration
+ vsubl.u8 q10, d4, d5 @ diff of the row just loaded
+ vadd.s16 q8, q9
+ vld1.8 {d2}, [r0], r1 @ preload next row for the following iteration
+ vld1.8 {d3}, [r2], r3
+ vadd.s16 q8, q10
+ bgt 1b
+ vsubl.u8 q9, d2, d3 @ fold in the final preloaded row
+ vadd.s16 q8, q9
+ vpaddl.s16 q8, q8 @ 8 x s16 -> 4 x s32
+ vpadd.s32 d16, d16, d17 @ -> 2 partial sums
+ vpadd.s32 d16, d16, d17 @ d16[0] = total (d16[1] holds a stale value)
+ vabs.s32 d16, d16 @ |sum|
+ vmov.32 r0, d16[0] @ return value
+ bx lr
+endfunc
+
.macro SSD_START_4
vld1.32 {d16[]}, [r0,:32], r1
@@ -489,6 +543,79 @@ SSD_FUNC 8, 16
SSD_FUNC 16, 8
SSD_FUNC 16, 16
+@ void x264_pixel_ssd_nv12_core_neon( uint8_t *pix1, intptr_t stride1,
+@                                     uint8_t *pix2, intptr_t stride2,
+@                                     int width, int height,
+@                                     uint64_t *ssd_u, uint64_t *ssd_v )
+@ SSD of an interleaved (UVUV...) NV12 chroma plane; U and V sums are
+@ accumulated separately and returned through the two pointer arguments.
+@ After the push, stack args sit at [sp,#8] (width,height) and
+@ [sp,#16] (ssd_u,ssd_v).  vld2.8 de-interleaves U into d0/d4, V into d1/d5.
+function x264_pixel_ssd_nv12_core_neon
+ push {r4-r5}
+ ldrd r4, r5, [sp, #8] @ r4 = width, r5 = height
+ add r12, r4, #8
+ bic r12, r12, #15 @ r12 = width rounded to a multiple of 16 pixels
+ vmov.u64 q8, #0 @ q8 = U ssd accumulator (2 x u64)
+ vmov.u64 q9, #0 @ q9 = V ssd accumulator
+ sub r1, r1, r12, lsl #1 @ strides minus the 2*r12 bytes the loads advance
+ sub r3, r3, r12, lsl #1
+1: @ per-row loop; first 16 pixels loaded up front
+ subs r12, r4, #16 @ width remaining after this first chunk
+ vld2.8 {d0,d1}, [r0]! @ d0 = 8 U samples, d1 = 8 V samples
+ vld2.8 {d2,d3}, [r2]!
+ vld2.8 {d4,d5}, [r0]! @ second 8 U / 8 V of the chunk
+ vld2.8 {d6,d7}, [r2]!
+
+ vsubl.u8 q10, d0, d2 @ U diffs, first half
+ vsubl.u8 q11, d1, d3 @ V diffs, first half
+ vmull.s16 q14, d20, d20 @ q14 = row U ssd (4 x s32 partials)
+ vmull.s16 q15, d22, d22 @ q15 = row V ssd
+ vsubl.u8 q12, d4, d6 @ U diffs, second half (accumulated later)
+ vsubl.u8 q13, d5, d7 @ V diffs, second half
+ vmlal.s16 q14, d21, d21
+ vmlal.s16 q15, d23, d23
+
+ blt 4f @ width < 16: second half-load is padding, drop it
+ beq 3f @ width == 16: just fold in the pending diffs
+2: @ steady state: 16 pixels per iteration
+ vmlal.s16 q14, d24, d24 @ fold previous iteration's second half
+ vmlal.s16 q15, d26, d26
+ vld2.8 {d0,d1}, [r0]!
+ vld2.8 {d2,d3}, [r2]!
+ vmlal.s16 q14, d25, d25
+ vmlal.s16 q15, d27, d27
+
+ subs r12, r12, #16
+ vsubl.u8 q10, d0, d2
+ vsubl.u8 q11, d1, d3
+ vmlal.s16 q14, d20, d20
+ vmlal.s16 q15, d22, d22
+ vld2.8 {d4,d5}, [r0]!
+ vld2.8 {d6,d7}, [r2]!
+ vmlal.s16 q14, d21, d21
+ vmlal.s16 q15, d23, d23
+ blt 4f @ tail < 8 pixels of real data: skip the pending half
+
+ vsubl.u8 q12, d4, d6
+ vsubl.u8 q13, d5, d7
+ bgt 2b
+3: @ exactly one pending half-chunk left: accumulate it
+ vmlal.s16 q14, d24, d24
+ vmlal.s16 q15, d26, d26
+ vmlal.s16 q14, d25, d25
+ vmlal.s16 q15, d27, d27
+4: @ end of row: widen the s32 row sums into the u64 accumulators
+ subs r5, r5, #1
+ vaddw.s32 q8, q8, d28
+ vaddw.s32 q9, q9, d30
+ add r0, r0, r1 @ advance to next row (strides pre-adjusted above)
+ add r2, r2, r3
+ vaddw.s32 q8, q8, d29
+ vaddw.s32 q9, q9, d31
+ bgt 1b
+
+ vadd.u64 d16, d16, d17 @ fold each accumulator's two u64 halves
+ vadd.u64 d18, d18, d19
+ ldrd r4, r5, [sp, #16] @ r4 = ssd_u, r5 = ssd_v
+ vst1.64 {d16}, [r4] @ *ssd_u
+ vst1.64 {d18}, [r5] @ *ssd_v
+
+ pop {r4-r5}
+ bx lr
+endfunc
.macro VAR_SQR_SUM qsqr_sum qsqr_last qsqr dsrc vpadal=vpadal.u16
vmull.u8 \qsqr, \dsrc, \dsrc
diff --git a/common/arm/pixel.h b/common/arm/pixel.h
index f361b9d..81c21dc 100644
--- a/common/arm/pixel.h
+++ b/common/arm/pixel.h
@@ -52,6 +52,10 @@ DECL_X4( sad, neon )
DECL_X1( satd, neon )
DECL_X1( ssd, neon )
+/* SSD of an interleaved NV12 chroma plane; the U and V sums are returned
+ * through the trailing uint64_t* arguments. */
+void x264_pixel_ssd_nv12_core_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, uint64_t *, uint64_t * );
+
+/* Vertical SAD of a 16-wide column: (src, stride, height). */
+int x264_pixel_vsad_neon( uint8_t *, intptr_t, int );
+
int x264_pixel_sa8d_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t );
int x264_pixel_sa8d_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
@@ -71,4 +75,6 @@ void x264_pixel_ssim_4x4x2_core_neon( const uint8_t *, intptr_t,
int sums[2][4] );
float x264_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width );
+/* Absolute sum of differences of an 8-wide block:
+ * (pix1, stride1, pix2, stride2, height). */
+int x264_pixel_asd8_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+
#endif
diff --git a/common/pixel.c b/common/pixel.c
index e0ad76c..9904b17 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -1380,6 +1380,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_neon;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_neon;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_neon;
+ pixf->vsad = x264_pixel_vsad_neon;
+ pixf->asd8 = x264_pixel_asd8_neon;
pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_neon;
pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_neon;
@@ -1392,6 +1394,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_neon;
pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_neon;
+ pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_neon;
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_neon;
pixf->ssim_end4 = x264_pixel_ssim_end4_neon;
--
1.7.10.4
More information about the x264-devel
mailing list