[x264-devel] aarch64: NEON asm for 4x16 sad, satd and ssd

Sat Dec 20 21:10:44 CET 2014

x264 | branch: master | Janne Grunau <janne-x264 at jannau.net> | Thu Aug  7 16:49:12 2014 +0200| [f8f8d13d5978b13fc831e041e52aa617550bbdf3] | committer: Anton Mitrofanov

aarch64: NEON asm for 4x16 sad, satd and ssd

pixel_sad_4x16_neon: 33% faster than C
pixel_satd_4x16_neon: 5 times faster
pixel_ssd_4x16_neon:  4 times faster

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=f8f8d13d5978b13fc831e041e52aa617550bbdf3
---

 common/aarch64/pixel-a.S |   57 ++++++++++++++++++++++++++++++++++++++++++++++
 common/aarch64/pixel.h   |    1 +
 common/pixel.c           |    8 +++----
 3 files changed, 62 insertions(+), 4 deletions(-)

diff --git a/common/aarch64/pixel-a.S b/common/aarch64/pixel-a.S
index d2c3de6..92912ed 100644
--- a/common/aarch64/pixel-a.S
+++ b/common/aarch64/pixel-a.S
@@ -114,6 +114,7 @@ endfunc
 
 SAD_FUNC  4,  4
 SAD_FUNC  4,  8
+SAD_FUNC  4,  16
 SAD_FUNC  8,  4
 SAD_FUNC  8,  8
 SAD_FUNC  8,  16
@@ -367,6 +368,7 @@ endfunc
 
 SSD_FUNC   4, 4
 SSD_FUNC   4, 8
+SSD_FUNC   4, 16
 SSD_FUNC   8, 4
 SSD_FUNC   8, 8
 SSD_FUNC   8, 16
@@ -895,6 +897,61 @@ function x264_satd_16x4_neon
     b           x264_satd_8x4v_8x8h_neon
 endfunc
 
+function x264_pixel_satd_4x16_neon, export=1
+    mov         x4,  x30
+    ld1        {v1.s}[0],  [x2], x3
+    ld1        {v0.s}[0],  [x0], x1
+    ld1        {v3.s}[0],  [x2], x3
+    ld1        {v2.s}[0],  [x0], x1
+    ld1        {v5.s}[0],  [x2], x3
+    ld1        {v4.s}[0],  [x0], x1
+    ld1        {v7.s}[0],  [x2], x3
+    ld1        {v6.s}[0],  [x0], x1
+    ld1        {v1.s}[1],  [x2], x3
+    ld1        {v0.s}[1],  [x0], x1
+    ld1        {v3.s}[1],  [x2], x3
+    ld1        {v2.s}[1],  [x0], x1
+    ld1        {v5.s}[1],  [x2], x3
+    ld1        {v4.s}[1],  [x0], x1
+    ld1        {v7.s}[1],  [x2], x3
+    ld1        {v6.s}[1],  [x0], x1
+    usubl       v16.8h, v0.8b,  v1.8b
+    usubl       v17.8h, v2.8b,  v3.8b
+    usubl       v18.8h, v4.8b,  v5.8b
+    usubl       v19.8h, v6.8b,  v7.8b
+    ld1        {v1.s}[0],  [x2], x3
+    ld1        {v0.s}[0],  [x0], x1
+    ld1        {v3.s}[0],  [x2], x3
+    ld1        {v2.s}[0],  [x0], x1
+    ld1        {v5.s}[0],  [x2], x3
+    ld1        {v4.s}[0],  [x0], x1
+    ld1        {v7.s}[0],  [x2], x3
+    ld1        {v6.s}[0],  [x0], x1
+    ld1        {v1.s}[1],  [x2], x3
+    ld1        {v0.s}[1],  [x0], x1
+    ld1        {v3.s}[1],  [x2], x3
+    ld1        {v2.s}[1],  [x0], x1
+    ld1        {v5.s}[1],  [x2], x3
+    ld1        {v4.s}[1],  [x0], x1
+    ld1        {v7.s}[1],  [x2], x3
+    ld1        {v6.s}[1],  [x0], x1
+    usubl       v20.8h, v0.8b,  v1.8b
+    usubl       v21.8h, v2.8b,  v3.8b
+    usubl       v22.8h, v4.8b,  v5.8b
+    usubl       v23.8h, v6.8b,  v7.8b
+
+    SUMSUB_AB   v0.8h,  v1.8h,  v16.8h, v17.8h
+    SUMSUB_AB   v2.8h,  v3.8h,  v18.8h, v19.8h
+
+    bl          x264_satd_8x4v_8x8h_neon
+
+    add         v30.8h, v0.8h,  v1.8h
+    add         v31.8h, v2.8h,  v3.8h
+    add         v0.8h,  v30.8h, v31.8h
+    uaddlv      s0,  v0.8h
+    mov         w0,  v0.s[0]
+    ret         x4
+endfunc
 
 function x264_pixel_sa8d_8x8_neon, export=1
     mov         x4,  x30
diff --git a/common/aarch64/pixel.h b/common/aarch64/pixel.h
index c7c386a..7d51964 100644
--- a/common/aarch64/pixel.h
+++ b/common/aarch64/pixel.h
@@ -33,6 +33,7 @@
     ret x264_pixel_##name##_8x16_##suffix args;\
     ret x264_pixel_##name##_8x8_##suffix args;\
     ret x264_pixel_##name##_8x4_##suffix args;\
+    ret x264_pixel_##name##_4x16_##suffix args;\
     ret x264_pixel_##name##_4x8_##suffix args;\
     ret x264_pixel_##name##_4x4_##suffix args;\
 
diff --git a/common/pixel.c b/common/pixel.c
index bb1894a..6bdbbca 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -1409,13 +1409,13 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
 #if ARCH_AARCH64
     if( cpu&X264_CPU_NEON )
     {
-        INIT7( sad, _neon );
+        INIT8( sad, _neon );
         // AArch64 has no distinct instructions for aligned load/store
-        INIT7_NAME( sad_aligned, sad, _neon );
+        INIT8_NAME( sad_aligned, sad, _neon );
         INIT7( sad_x3, _neon );
         INIT7( sad_x4, _neon );
-        INIT7( ssd, _neon );
-        INIT7( satd, _neon );
+        INIT8( ssd, _neon );
+        INIT8( satd, _neon );
         INIT7( satd_x3, _neon );
         INIT7( satd_x4, _neon );
         INIT4( hadamard_ac, _neon );