[x264-devel] arm: implement x264_pixel_var2_8x16_neon
Janne Grunau
git at videolan.org
Thu Mar 13 21:23:53 CET 2014
x264 | branch: master | Janne Grunau <janne-x264 at jannau.net> | Wed Mar 12 14:35:31 2014 +0100| [8a3b5338d324a0ae65a2296aa9aa7e80cd3a4392] | committer: Jason Garrett-Glaser
arm: implement x264_pixel_var2_8x16_neon
checkasm --bench on a cortex-a9:
var2_8x16_c: 5677
var2_8x16_neon: 1421
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=8a3b5338d324a0ae65a2296aa9aa7e80cd3a4392
---
common/arm/pixel-a.S | 43 +++++++++++++++++++++++++++++++++++++++++++
common/arm/pixel.h | 3 ++-
common/pixel.c | 1 +
3 files changed, 46 insertions(+), 1 deletion(-)
diff --git a/common/arm/pixel-a.S b/common/arm/pixel-a.S
index 6bc904d..0b996a8 100644
--- a/common/arm/pixel-a.S
+++ b/common/arm/pixel-a.S
@@ -635,6 +635,49 @@ function x264_pixel_var2_8x8_neon
bx lr
.endfunc
+function x264_pixel_var2_8x16_neon // 8x16 var2: r0/r1 = first block pointer/stride, r2/r3 = second block pointer/stride, [sp] = int* output; result returned in r0
+ vld1.64 {d16}, [r0,:64], r1 // row 0 of the first block (8 bytes, 64-bit aligned), then r0 += r1
+ vld1.64 {d17}, [r2,:64], r3 // row 0 of the second block, then r2 += r3
+ vld1.64 {d18}, [r0,:64], r1 // row 1 of the first block
+ vld1.64 {d19}, [r2,:64], r3 // row 1 of the second block
+ vsubl.u8 q10, d16, d17 // q10 = row 0 differences, widened to 16 bits
+ vsubl.u8 q11, d18, d19 // q11 = row 1 differences, widened to 16 bits
+ SQR_ACC q1, d20, d21, vmull.s16 // SQR_ACC is defined outside this hunk; presumably the vmull.s16 argument initialises q1 with the squares of q10 rather than accumulating - confirm against the macro
+ vld1.64 {d16}, [r0,:64], r1 // preload row 2 of the first block for the loop
+ vadd.s16 q0, q10, q11 // q0 = per-lane running sum of differences (rows 0 and 1)
+ vld1.64 {d17}, [r2,:64], r3 // preload row 2 of the second block
+ SQR_ACC q2, d22, d23, vmull.s16 // presumably initialises q2 with the squares of q11 (second squared accumulator)
+ mov ip, #14 // 14 rows remain after the two handled above
+1: subs ip, ip, #2 // two rows per loop iteration; the beq below tests this result
+ vld1.64 {d18}, [r0,:64], r1 // odd row of the first block
+ vsubl.u8 q10, d16, d17 // differences of the even row loaded earlier
+ vld1.64 {d19}, [r2,:64], r3 // odd row of the second block
+ vadd.s16 q0, q0, q10 // add even-row differences to the running sum
+ SQR_ACC q1, d20, d21 // presumably accumulates the squares of q10 into q1
+ vsubl.u8 q11, d18, d19 // differences of the odd row
+ beq 2f // counter reached zero: all 16 rows are loaded, finish at label 2 without loading further
+ vld1.64 {d16}, [r0,:64], r1 // preload the next even row of the first block
+ vadd.s16 q0, q0, q11 // add odd-row differences to the running sum
+ vld1.64 {d17}, [r2,:64], r3 // preload the next even row of the second block
+ SQR_ACC q2, d22, d23 // presumably accumulates the squares of q11 into q2
+ b 1b
+2:
+ vadd.s16 q0, q0, q11 // fold in the last row's differences
+ SQR_ACC q2, d22, d23 // and, presumably, their squares
+
+ ldr ip, [sp] // fifth argument (the int* in the C prototype) is passed on the stack
+ vadd.s16 d0, d0, d1 // fold the eight 16-bit sums down to four
+ vadd.s32 q1, q1, q2 // merge the two squared accumulators
+ vpaddl.s16 d0, d0 // pairwise add and widen: two 32-bit partial sums
+ vadd.s32 d1, d2, d3 // fold the four 32-bit squared totals down to two (d1 is free, q0's high half was consumed above)
+ vpadd.s32 d0, d0, d1 // d0[0] = total of differences, d0[1] = total from the squared accumulators
+
+ vmov r0, r1, d0 // r0 = d0[0], r1 = d0[1]
+ vst1.32 {d0[1]}, [ip,:32] // store d0[1] (same value as r1) through the output pointer
+ mul r0, r0, r0 // r0 = (total of differences)^2
+ sub r0, r1, r0, lsr #7 // return r1 - (r0 >> 7); 1 << 7 = 128 = 8*16 pixels
+ bx lr
+.endfunc
.macro LOAD_DIFF_8x4 q0 q1 q2 q3
vld1.32 {d1}, [r2], r3
diff --git a/common/arm/pixel.h b/common/arm/pixel.h
index acd517b..c55ed9a 100644
--- a/common/arm/pixel.h
+++ b/common/arm/pixel.h
@@ -58,7 +58,8 @@ int x264_pixel_sa8d_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
uint64_t x264_pixel_var_8x8_neon ( uint8_t *, intptr_t );
uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t );
uint64_t x264_pixel_var_16x16_neon( uint8_t *, intptr_t );
-int x264_pixel_var2_8x8_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
+int x264_pixel_var2_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
+int x264_pixel_var2_8x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
uint64_t x264_pixel_hadamard_ac_8x8_neon ( uint8_t *, intptr_t );
uint64_t x264_pixel_hadamard_ac_8x16_neon ( uint8_t *, intptr_t );
diff --git a/common/pixel.c b/common/pixel.c
index d362879..e16f292 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -1350,6 +1350,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_neon;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_neon;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_neon;
+ pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_neon;
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_neon;
pixf->ssim_end4 = x264_pixel_ssim_end4_neon;
More information about the x264-devel
mailing list