[x264-devel] [PATCH 21/24] RFC: arm: Implement x264_pixel_sa8d_satd_16x16_neon

Thu Aug 13 22:59:42 CEST 2015

This requires spilling some registers to the stack,
unlike the aarch64 version.

checkasm timing        Cortex-A7      A8     A9
sa8d_satd_16x16_neon          14393   7427   9146
sa8d_satd_16x16_separate_neon 14624   7074   8294
---
 common/arm/pixel-a.S |   54 +++++++++++++++++++++++++++++++++++++++++++++++++-
 common/arm/pixel.h   |    1 +
 common/pixel.c       |    1 +
 3 files changed, 55 insertions(+), 1 deletion(-)

diff --git a/common/arm/pixel-a.S b/common/arm/pixel-a.S
index f60fdb5..0376bf2 100644
--- a/common/arm/pixel-a.S
+++ b/common/arm/pixel-a.S
@@ -1132,7 +1132,8 @@ endfunc
     SUMSUB_ABCD \r1, \r3, \r2, \r4, \t1, \t3, \t2, \t4
 .endm
 
-function x264_sa8d_8x8_neon, export=0
+.macro sa8d_satd_8x8 satd=
+function x264_sa8d_\satd\()8x8_neon, export=0
     LOAD_DIFF_8x4   q8,  q9,  q10, q11
     vld1.64         {d7}, [r2], r3
     SUMSUB_AB       q0,  q1,  q8,  q9
@@ -1151,6 +1152,22 @@ function x264_sa8d_8x8_neon, export=0
     vld1.64         {d0}, [r0,:64], r1
     vsubl.u8        q15, d0,  d1
 
+.ifc \satd, satd_
+    vmov            q6, q8
+    vmov            q7, q9
+    vpush           {q10-q15}
+    mov             ip, lr
+    bl              x264_satd_8x4v_8x8h_neon
+    vpadal.u16      q4, q12
+    vpadal.u16      q4, q13
+    vpadal.u16      q4, q14
+    vpadal.u16      q4, q15
+    mov             lr, ip
+    vpop            {q10-q15}
+    vmov            q8, q6
+    vmov            q9, q7
+.endif
+
     HADAMARD4_V     q12, q13, q14, q15,  q0,  q1,  q2,  q3
     SUMSUB_ABCD     q0,  q8,  q1,  q9,   q8,  q12, q9,  q13
     SUMSUB_AB       q2,  q10, q10, q14
@@ -1187,8 +1204,43 @@ function x264_sa8d_8x8_neon, export=0
     vmax.s16        q11, q3,  q15
     vadd.i16        q8,  q8,  q9
     vadd.i16        q9,  q10, q11
+.ifc \satd, satd_
+    vpadal.u16      q5,  q8
+    vpadal.u16      q5,  q9
+.endif
     bx              lr
 endfunc
+.endm
+
+sa8d_satd_8x8                  @ empty satd argument: emits x264_sa8d_8x8_neon (the .ifc satd_ blocks are omitted)
+sa8d_satd_8x8 satd_            @ satd_ argument: emits x264_sa8d_satd_8x8_neon, which also accumulates into q4 and q5
+
+function x264_pixel_sa8d_satd_16x16_neon    @ uint64_t f(uint8_t *r0, intptr_t r1, uint8_t *r2, intptr_t r3): four 8x8 helper calls, two totals returned in r0 and r1
+    push            {lr}                    @ the bl calls below overwrite lr
+    vpush           {q4-q7}                 @ q4-q7 (d8-d15) are callee-saved in the AAPCS and are written below
+    vmov.u32        q4,  #0                 @ q4: 32-bit accumulator fed from the x264_satd_8x4v_8x8h_neon results in the helper
+    vmov.u32        q5,  #0                 @ q5: 32-bit accumulator fed from the sa8d sums at the end of the helper
+    vmov.u32        q6,  #0                 @ NOTE(review): the satd_ helper overwrites q6 with q8 before reading it, so this looks redundant - confirm
+    vmov.u32        q7,  #0                 @ NOTE(review): likewise overwritten with q9 by the helper - confirm
+    bl              x264_sa8d_satd_8x8_neon @ first 8x8 block; the helper loads post-increment r0 by r1 and r2 by r3
+    bl              x264_sa8d_satd_8x8_neon @ second 8x8 block, continuing from the advanced pointers
+    sub             r0,  r0,  r1,  lsl #4   @ rewind r0 by 16 strides (presumably 8 rows consumed per call - verify against the full helper)
+    sub             r2,  r2,  r3,  lsl #4   @ same rewind for the second source pointer
+    add             r0,  r0,  #8            @ step 8 bytes to the right-hand 8-pixel column
+    add             r2,  r2,  #8            @ same column step for the second source pointer
+    bl              x264_sa8d_satd_8x8_neon @ third 8x8 block (right column)
+    bl              x264_sa8d_satd_8x8_neon @ fourth 8x8 block (right column, continuing downward)
+    vadd.u32        d0,  d8,  d9            @ fold the two halves of q4 into d0
+    vadd.u32        d2,  d10, d11           @ fold the two halves of q5 into d2
+    vpaddl.u32      d0,  d0                 @ add both 32-bit lanes: d0 holds the q4 total
+    vpaddl.u32      d2,  d2                 @ add both 32-bit lanes: d2 holds the q5 total
+    vpop            {q4-q7}                 @ restore callee-saved registers; totals already live in d0 and d2
+    vmov.32         r0,  d2[0]              @ r0 = low word of the q5 (sa8d) total
+    add             r0,  r0,  #1            @ rounding bias for the halving below
+    lsr             r0,  r0,  #1            @ r0 = (sa8d total + 1) >> 1
+    vmov.32         r1,  d0[0]              @ r1 = low word of the q4 (satd) total, returned unscaled
+    pop             {pc}                    @ return; r0 and r1 together carry the uint64_t result (r0 is the low word on little-endian)
+endfunc
 
 
 .macro HADAMARD_AC w h
diff --git a/common/arm/pixel.h b/common/arm/pixel.h
index 81c21dc..d84808b 100644
--- a/common/arm/pixel.h
+++ b/common/arm/pixel.h
@@ -58,6 +58,7 @@ int x264_pixel_vsad_neon( uint8_t *, intptr_t, int );
 
 int x264_pixel_sa8d_8x8_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t );
 int x264_pixel_sa8d_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
+uint64_t x264_pixel_sa8d_satd_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
 
 uint64_t x264_pixel_var_8x8_neon  ( uint8_t *, intptr_t );
 uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t );
diff --git a/common/pixel.c b/common/pixel.c
index 9904b17..7da0340 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -1375,6 +1375,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         INIT4( hadamard_ac, _neon );
         pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_neon;
         pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_neon;
+        pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_neon;
         pixf->var[PIXEL_8x8]    = x264_pixel_var_8x8_neon;
         pixf->var[PIXEL_8x16]   = x264_pixel_var_8x16_neon;
         pixf->var[PIXEL_16x16]  = x264_pixel_var_16x16_neon;
-- 
1.7.10.4