[x264-devel] [PATCH 1/2] arm: implement x264_pixel_var_8x16_neon

Wed Mar 12 18:53:39 CET 2014

checkasm --bench on a cortex-a9:
var_8x16_c: 4306
var_8x16_neon: 791
---
 common/arm/pixel-a.S | 32 ++++++++++++++++++++++++++++++++
 common/arm/pixel.h   |  1 +
 common/pixel.c       |  1 +
 3 files changed, 34 insertions(+)

diff --git a/common/arm/pixel-a.S b/common/arm/pixel-a.S
index 644e449..6bc904d 100644
--- a/common/arm/pixel-a.S
+++ b/common/arm/pixel-a.S
@@ -519,6 +519,38 @@ function x264_pixel_var_8x8_neon
     b               x264_var_end
 .endfunc
 
+function x264_pixel_var_8x16_neon               @ 8-wide, 16-row variant of var_8x8. r0 = uint8_t pointer, r1 = intptr_t row step (presumably the stride - confirm)
+    vld1.64         {d16}, [r0,:64], r1         @ row 0: load 8 bytes (address hinted 64-bit aligned), then r0 += r1
+    vld1.64         {d18}, [r0,:64], r1         @ row 1
+    vmull.u8        q1,  d16, d16               @ q1 = row 0 squared, widened to 8 x u16
+    vmovl.u8        q0,  d16                    @ q0 = row 0 widened to u16, the start of the running pixel sum
+    vld1.64         {d20}, [r0,:64], r1         @ row 2
+    vmull.u8        q2,  d18, d18               @ q2 = row 1 squared, widened to 8 x u16
+    vaddw.u8        q0,  q0,  d18               @ q0 += row 1
+
+    mov             ip,  #12                    @ loop counter: the loop subtracts 4 per pass and exits at zero, so it runs 3 passes
+
+    vld1.64         {d22}, [r0,:64], r1         @ row 3
+    VAR_SQR_SUM     q1,  q1,   q14,  d20, vpaddl.u16 @ macro defined outside this hunk. Presumably q14 = d20 squared, q0 += d20, q1 = pairwise-widened q1 - TODO confirm
+    vld1.64         {d16}, [r0,:64], r1         @ row 4, reusing d16 now that row 0 is consumed
+    VAR_SQR_SUM     q2,  q2,   q15,  d22, vpaddl.u16 @ same step for the second accumulator q2, with the row 3 squares going to q15
+
+1:  subs            ip,  ip,  #4                @ sets the flags tested by the beq below, assuming nothing in between alters them
+    vld1.64         {d18}, [r0,:64], r1         @ next row, issued ahead of the arithmetic on the previous one
+    VAR_SQR_SUM     q1,  q14,  q12, d16         @ presumably folds the earlier squares in q14 into q1 while squaring d16 into q12
+    vld1.64         {d20}, [r0,:64], r1
+    VAR_SQR_SUM     q2,  q15,  q13, d18         @ q14/q15 and q12/q13 alternate as the previous and current square registers
+    vld1.64         {d22}, [r0,:64], r1         @ third and last load that every pass performs
+    VAR_SQR_SUM     q1,  q12,  q14, d20
+    beq             2f                          @ counter reached zero: skip the extra load so no row past the 16th is read
+    vld1.64         {d16}, [r0,:64], r1         @ non-final passes only. Row total: 5 before the loop + 3*3 + 2 here = 16
+    VAR_SQR_SUM     q2,  q13,  q15, d22
+    b               1b
+2:
+    VAR_SQR_SUM     q2,  q13,  q15, d22         @ same step as the non-final path, without loading another row
+    b               x264_var_end                @ branch to the tail that var_8x8 also uses. Presumably it reduces q0/q1/q2 (and pending squares) into the result - confirm
+.endfunc
+
 function x264_pixel_var_16x16_neon
     vld1.64         {d16-d17}, [r0,:128], r1
     vmull.u8        q12, d16, d16
diff --git a/common/arm/pixel.h b/common/arm/pixel.h
index 1024ee7..acd517b 100644
--- a/common/arm/pixel.h
+++ b/common/arm/pixel.h
@@ -56,6 +56,7 @@ int x264_pixel_sa8d_8x8_neon  ( uint8_t *, intptr_t, uint8_t *, intptr_t );
 int x264_pixel_sa8d_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
 
 uint64_t x264_pixel_var_8x8_neon  ( uint8_t *, intptr_t );
+uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t );
 uint64_t x264_pixel_var_16x16_neon( uint8_t *, intptr_t );
 int x264_pixel_var2_8x8_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
 
diff --git a/common/pixel.c b/common/pixel.c
index 2e82812..d362879 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -1347,6 +1347,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_neon;
         pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_neon;
         pixf->var[PIXEL_8x8]    = x264_pixel_var_8x8_neon;
+        pixf->var[PIXEL_8x16]   = x264_pixel_var_8x16_neon;
         pixf->var[PIXEL_16x16]  = x264_pixel_var_16x16_neon;
         pixf->var2[PIXEL_8x8]   = x264_pixel_var2_8x8_neon;
 
-- 
1.9.0