[x264-devel] [PATCH 06/23] aarch64: NEON asm for 4x16 sad, satd and ssd
Janne Grunau
janne-x264 at jannau.net
Thu Nov 27 08:56:34 CET 2014
pixel_sad_4x16_neon: 33% faster than C
pixel_satd_4x16_neon: 5 times faster
pixel_ssd_4x16_neon: 4 times faster
---
common/aarch64/pixel-a.S | 57 ++++++++++++++++++++++++++++++++++++++++++++++++
common/aarch64/pixel.h | 1 +
common/pixel.c | 8 +++----
3 files changed, 62 insertions(+), 4 deletions(-)
diff --git a/common/aarch64/pixel-a.S b/common/aarch64/pixel-a.S
index f07021e..d19f595 100644
--- a/common/aarch64/pixel-a.S
+++ b/common/aarch64/pixel-a.S
@@ -114,6 +114,7 @@ endfunc
SAD_FUNC 4, 4
SAD_FUNC 4, 8
+SAD_FUNC 4, 16
SAD_FUNC 8, 4
SAD_FUNC 8, 8
SAD_FUNC 8, 16
@@ -367,6 +368,7 @@ endfunc
SSD_FUNC 4, 4
SSD_FUNC 4, 8
+SSD_FUNC 4, 16
SSD_FUNC 8, 4
SSD_FUNC 8, 8
SSD_FUNC 8, 16
@@ -895,6 +897,61 @@ function x264_satd_16x4_neon
b x264_satd_8x4v_8x8h_neon
endfunc
+function x264_pixel_satd_4x16_neon, export=1 // 4x16 SATD of two blocks of 8-bit samples, result in w0; presumably x0/x1 = pix1/stride1 and x2/x3 = pix2/stride2 (confirm against the C prototype)
+ mov x4, x30 // keep the return address in x4: the bl below overwrites x30 (relies on the helper leaving x4 untouched)
+ ld1 {v1.s}[0], [x2], x3 // row 0 of the x2 block (4 bytes) -> v1 lane 0, then x2 += x3
+ ld1 {v0.s}[0], [x0], x1 // row 0 of the x0 block (4 bytes) -> v0 lane 0, then x0 += x1
+ ld1 {v3.s}[0], [x2], x3 // row 1 -> v3/v2 lane 0
+ ld1 {v2.s}[0], [x0], x1
+ ld1 {v5.s}[0], [x2], x3 // row 2 -> v5/v4 lane 0
+ ld1 {v4.s}[0], [x0], x1
+ ld1 {v7.s}[0], [x2], x3 // row 3 -> v7/v6 lane 0
+ ld1 {v6.s}[0], [x0], x1
+ ld1 {v1.s}[1], [x2], x3 // rows 4-7 go into lane 1 of the same registers, so each register pairs row n with row n+4
+ ld1 {v0.s}[1], [x0], x1
+ ld1 {v3.s}[1], [x2], x3
+ ld1 {v2.s}[1], [x0], x1
+ ld1 {v5.s}[1], [x2], x3
+ ld1 {v4.s}[1], [x0], x1
+ ld1 {v7.s}[1], [x2], x3
+ ld1 {v6.s}[1], [x0], x1
+ usubl v16.8h, v0.8b, v1.8b // v16 = widened 16-bit differences (x0 block - x2 block) for rows 0 and 4
+ usubl v17.8h, v2.8b, v3.8b // v17 = differences for rows 1 and 5
+ usubl v18.8h, v4.8b, v5.8b // v18 = differences for rows 2 and 6
+ usubl v19.8h, v6.8b, v7.8b // v19 = differences for rows 3 and 7
+ ld1 {v1.s}[0], [x2], x3 // same load pattern for the lower half: rows 8-11 -> lane 0
+ ld1 {v0.s}[0], [x0], x1
+ ld1 {v3.s}[0], [x2], x3
+ ld1 {v2.s}[0], [x0], x1
+ ld1 {v5.s}[0], [x2], x3
+ ld1 {v4.s}[0], [x0], x1
+ ld1 {v7.s}[0], [x2], x3
+ ld1 {v6.s}[0], [x0], x1
+ ld1 {v1.s}[1], [x2], x3 // rows 12-15 -> lane 1
+ ld1 {v0.s}[1], [x0], x1
+ ld1 {v3.s}[1], [x2], x3
+ ld1 {v2.s}[1], [x0], x1
+ ld1 {v5.s}[1], [x2], x3
+ ld1 {v4.s}[1], [x0], x1
+ ld1 {v7.s}[1], [x2], x3
+ ld1 {v6.s}[1], [x0], x1
+ usubl v20.8h, v0.8b, v1.8b // v20 = differences for rows 8 and 12
+ usubl v21.8h, v2.8b, v3.8b // v21 = differences for rows 9 and 13
+ usubl v22.8h, v4.8b, v5.8b // v22 = differences for rows 10 and 14
+ usubl v23.8h, v6.8b, v7.8b // v23 = differences for rows 11 and 15
+
+ SUMSUB_AB v0.8h, v1.8h, v16.8h, v17.8h // macro defined elsewhere; presumably v0 = v16 + v17, v1 = v16 - v17 (first butterfly on the upper half)
+ SUMSUB_AB v2.8h, v3.8h, v18.8h, v19.8h // likewise for v18/v19; v20-v23 are left as raw differences, presumably consumed by the helper below
+
+ bl x264_satd_8x4v_8x8h_neon // shared helper defined outside this hunk; its results are read from v0-v3 below
+
+ add v30.8h, v0.8h, v1.8h // fold the four result vectors v0-v3 into a single vector
+ add v31.8h, v2.8h, v3.8h
+ add v0.8h, v30.8h, v31.8h
+ uaddlv s0, v0.8h // add the eight 16-bit lanes across the vector into one 32-bit scalar
+ mov w0, v0.s[0] // move the total into w0 as the return value
+ ret x4 // return through the address saved at entry
+endfunc
function x264_pixel_sa8d_8x8_neon, export=1
mov x4, x30
diff --git a/common/aarch64/pixel.h b/common/aarch64/pixel.h
index 4a56013..1eb8718 100644
--- a/common/aarch64/pixel.h
+++ b/common/aarch64/pixel.h
@@ -32,6 +32,7 @@
ret x264_pixel_##name##_8x16_##suffix args;\
ret x264_pixel_##name##_8x8_##suffix args;\
ret x264_pixel_##name##_8x4_##suffix args;\
+ ret x264_pixel_##name##_4x16_##suffix args;\
ret x264_pixel_##name##_4x8_##suffix args;\
ret x264_pixel_##name##_4x4_##suffix args;\
diff --git a/common/pixel.c b/common/pixel.c
index bb1894a..6bdbbca 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -1409,13 +1409,13 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
#if ARCH_AARCH64
if( cpu&X264_CPU_NEON )
{
- INIT7( sad, _neon );
+ INIT8( sad, _neon );
// AArch64 has no distinct instructions for aligned load/store
- INIT7_NAME( sad_aligned, sad, _neon );
+ INIT8_NAME( sad_aligned, sad, _neon );
INIT7( sad_x3, _neon );
INIT7( sad_x4, _neon );
- INIT7( ssd, _neon );
- INIT7( satd, _neon );
+ INIT8( ssd, _neon );
+ INIT8( satd, _neon );
INIT7( satd_x3, _neon );
INIT7( satd_x4, _neon );
INIT4( hadamard_ac, _neon );
--
2.1.3
More information about the x264-devel
mailing list