[x264-devel] arm: implement x264_pixel_var_8x16_neon
Janne Grunau
git at videolan.org
Thu Mar 13 21:23:53 CET 2014
x264 | branch: master | Janne Grunau <janne-x264 at jannau.net> | Wed Mar 12 13:16:00 2014 +0100| [e194e40f24f63dcfd60b5037e5a4d38870e3bc8c] | committer: Jason Garrett-Glaser
arm: implement x264_pixel_var_8x16_neon
checkasm --bench on a cortex-a9:
var_8x16_c: 4306
var_8x16_neon: 791
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=e194e40f24f63dcfd60b5037e5a4d38870e3bc8c
---
common/arm/pixel-a.S | 32 ++++++++++++++++++++++++++++++++
common/arm/pixel.h | 1 +
common/pixel.c | 1 +
3 files changed, 34 insertions(+)
diff --git a/common/arm/pixel-a.S b/common/arm/pixel-a.S
index 644e449..6bc904d 100644
--- a/common/arm/pixel-a.S
+++ b/common/arm/pixel-a.S
@@ -519,6 +519,38 @@ function x264_pixel_var_8x8_neon
b x264_var_end
.endfunc
+function x264_pixel_var_8x16_neon // uint64_t f( uint8_t *, intptr_t ) per pixel.h: r0 = pixel pointer, r1 = byte step between rows; reads 16 rows of 8 bytes
+ vld1.64 {d16}, [r0,:64], r1 // row 0: load 8 bytes, then r0 += r1 (:64 = address must be 8-byte aligned)
+ vld1.64 {d18}, [r0,:64], r1 // row 1
+ vmull.u8 q1, d16, d16 // q1 = row 0 squared per pixel, widened to u16
+ vmovl.u8 q0, d16 // q0 = row 0 widened to u16, seeds the per-column pixel sum
+ vld1.64 {d20}, [r0,:64], r1 // row 2
+ vmull.u8 q2, d18, d18 // q2 = row 1 squared per pixel, widened to u16
+ vaddw.u8 q0, q0, d18 // q0 += row 1 (widened to u16)
+
+ mov ip, #12 // 12 rows (4..15) are left for the loop, which handles 4 per pass
+
+ vld1.64 {d22}, [r0,:64], r1 // row 3
+ VAR_SQR_SUM q1, q1, q14, d20, vpaddl.u16 // NOTE(review): macro is defined outside this hunk; presumably q14 = d20 squared, q0 += d20, q1 = pairwise-widened q1 - confirm
+ vld1.64 {d16}, [r0,:64], r1 // row 4, loaded ahead of its use in the loop
+ VAR_SQR_SUM q2, q2, q15, d22, vpaddl.u16 // same step for row 3 on the second accumulator chain (q2/q15)
+
+1: subs ip, ip, #4 // Z is set on the third pass and tested by the beq below (assumes nothing in between touches the flags)
+ vld1.64 {d18}, [r0,:64], r1 // each load runs one macro step ahead of the row being accumulated
+ VAR_SQR_SUM q1, q14, q12, d16
+ vld1.64 {d20}, [r0,:64], r1
+ VAR_SQR_SUM q2, q15, q13, d18
+ vld1.64 {d22}, [r0,:64], r1
+ VAR_SQR_SUM q1, q12, q14, d20
+ beq 2f // last pass: skip the look-ahead load so that exactly 16 rows are read
+ vld1.64 {d16}, [r0,:64], r1 // first row of the next pass
+ VAR_SQR_SUM q2, q13, q15, d22
+ b 1b
+2: // exit path: row 15 is already in d22
+ VAR_SQR_SUM q2, q13, q15, d22 // same operands as the in-loop copy, without the preceding load
+ b x264_var_end // NOTE(review): shared tail defined outside this hunk; presumably reduces q0/q1/q2/q14/q15 into the uint64_t result and returns - confirm
+.endfunc
+
function x264_pixel_var_16x16_neon
vld1.64 {d16-d17}, [r0,:128], r1
vmull.u8 q12, d16, d16
diff --git a/common/arm/pixel.h b/common/arm/pixel.h
index 1024ee7..acd517b 100644
--- a/common/arm/pixel.h
+++ b/common/arm/pixel.h
@@ -56,6 +56,7 @@ int x264_pixel_sa8d_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t );
int x264_pixel_sa8d_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
uint64_t x264_pixel_var_8x8_neon ( uint8_t *, intptr_t );
+uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t );
uint64_t x264_pixel_var_16x16_neon( uint8_t *, intptr_t );
int x264_pixel_var2_8x8_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
diff --git a/common/pixel.c b/common/pixel.c
index 2e82812..d362879 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -1347,6 +1347,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_neon;
pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_neon;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_neon;
+ pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_neon;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_neon;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_neon;
More information about the x264-devel
mailing list