[x264-devel] [PATCH 04/23] aarch64: implement x264_pixel_vsad_neon
Janne Grunau
janne-x264 at jannau.net
Thu Nov 27 08:56:32 CET 2014
35 times faster than C.
---
common/aarch64/pixel-a.S | 24 ++++++++++++++++++++++++
common/aarch64/pixel.h | 2 ++
common/pixel.c | 1 +
3 files changed, 27 insertions(+)
diff --git a/common/aarch64/pixel-a.S b/common/aarch64/pixel-a.S
index 8c7b927..e3ca63d 100644
--- a/common/aarch64/pixel-a.S
+++ b/common/aarch64/pixel-a.S
@@ -248,6 +248,30 @@ SAD_X_FUNC 4, 16, 8
SAD_X_FUNC 4, 16, 16
+function x264_pixel_vsad_neon, export=1 // w0 = sum of |a - b| over each pair of consecutive 16-byte rows; x0 = first row, x1 = step between rows, w2 = row count
+ subs w2, w2, #2 // account for the two rows loaded next; flags are tested by the b.le below
+ ld1 {v0.16b}, [x0], x1 // v0 = row 0, then x0 += x1
+ ld1 {v1.16b}, [x0], x1 // v1 = row 1, then x0 += x1
+ uabdl v6.8h, v0.8b, v1.8b // v6 = |v0 - v1| for bytes 0-7, widened to 16-bit lanes
+ uabdl2 v7.8h, v0.16b, v1.16b // v7 = |v0 - v1| for bytes 8-15, widened to 16-bit lanes
+ b.le 2f // row count was <= 2: only this first pair contributes
+1:
+ subs w2, w2, #2 // two more rows per iteration; these flags drive both b.lt and b.gt below
+ ld1 {v0.16b}, [x0], x1 // v0 = next row
+ uabal v6.8h, v1.8b, v0.8b // accumulate |previous row - v0|, bytes 0-7
+ uabal2 v7.8h, v1.16b, v0.16b // accumulate |previous row - v0|, bytes 8-15
+ ld1 {v1.16b}, [x0], x1 // v1 = row after v0; loaded before the b.lt, so it is read even when the branch skips its use
+ b.lt 2f // count went negative: only one row was left, so the v0/v1 pair is not added
+ uabal v6.8h, v0.8b, v1.8b // accumulate |v0 - v1|, bytes 0-7
+ uabal2 v7.8h, v0.16b, v1.16b // accumulate |v0 - v1|, bytes 8-15
+ b.gt 1b // rows remain: loop (v1 is the "previous row" of the next iteration)
+2:
+ add v5.8h, v6.8h, v7.8h // merge both accumulators; 16-bit lanes, so this wraps once more than 128 row pairs of maximal differences are summed
+ uaddlv s0, v5.8h // add the 8 lanes into one 32-bit scalar
+ fmov w0, s0 // move the total into w0 for the return
+ ret
+endfunc
+
.macro SSD_START_4
ld1 {v16.s}[0], [x0], x1
ld1 {v17.s}[0], [x2], x3
diff --git a/common/aarch64/pixel.h b/common/aarch64/pixel.h
index 99ab16d..e1f1b7c 100644
--- a/common/aarch64/pixel.h
+++ b/common/aarch64/pixel.h
@@ -47,6 +47,8 @@ DECL_X4( sad, neon )
DECL_X1( satd, neon )
DECL_X1( ssd, neon )
+int x264_pixel_vsad_neon( uint8_t *, intptr_t, int ); /* args: first row, step between rows in bytes, row count; returns the summed absolute differences of consecutive 16-byte rows */
+
int x264_pixel_sa8d_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t );
int x264_pixel_sa8d_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
uint64_t x264_pixel_sa8d_satd_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
diff --git a/common/pixel.c b/common/pixel.c
index 421a67c..d467151 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -1429,6 +1429,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_neon;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_neon;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_neon;
+ pixf->vsad = x264_pixel_vsad_neon;
pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_neon;
pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_neon;
--
2.1.3
More information about the x264-devel
mailing list