[x264-devel] [PATCH 2/4] x264_intra_sad_x3_8x8_neon

George Stephanos gaf.stephanos at gmail.com
Sat Jan 28 19:51:30 CET 2012


---
 common/arm/pixel-a.S |   65 ++++++++++++++++++++++++++++++++++++++++++++++++++
 common/arm/pixel.h   |    1 +
 common/pixel.c       |    1 +
 3 files changed, 67 insertions(+), 0 deletions(-)

diff --git a/common/arm/pixel-a.S b/common/arm/pixel-a.S
index de442e9..ece299c 100644
--- a/common/arm/pixel-a.S
+++ b/common/arm/pixel-a.S
@@ -1299,3 +1299,68 @@ function x264_intra_sad_x3_4x4_armv6
     pop         {r4-r6,pc}
 .endfunc
 
+@ void x264_intra_sad_x3_8x8_neon( uint8_t *fenc, uint8_t *edge, int *res )
+@ ABI:   AAPCS (ARM, NEON); only caller-saved registers are touched.
+@ In:    r0 = fenc (8 rows of 8 bytes, FENC_STRIDE apart), r1 = edge, r2 = res
+@ Out:   res[0] = SAD against edge[16..23] repeated on every row
+@        res[1] = SAD against edge[14-y] broadcast across row y
+@        res[2] = SAD against (sum of those 16 edge bytes + 8) >> 4
+@        (these look like the V, H and DC 8x8 predictors; confirm with caller)
+@ Clobb: r0-r3, d0, q1-q3, q8, q12, q13. Leaf function, no stack use.
+function x264_intra_sad_x3_8x8_neon
+    @ load the two 8-byte edge segments used by the predictions
+    add         r1, #7
+    vld1.8      {d4}, [r1]              @ d4 = edge[7..14]
+    add         r1, #9
+    vrev64.8    d4, d4                  @ now d4[y] = edge[14-y]
+    vld1.64     {d0}, [r1]              @ d0 = edge[16..23]
+    mov         r3, #FENC_STRIDE
+
+    @ d24 = (sum of the 16 edge bytes + 8) >> 4, copied to all 8 byte lanes
+    vaddl.u8    q12, d0, d4
+    vadd.u16    d24, d24, d25
+    vpaddl.u16  d24, d24
+    vpaddl.u32  d24, d24
+    vrshr.u16   d24, d24, #4
+    vdup.8      d24, d24[0]
+
+    @ three accumulators of 8 x u16; each lane adds at most 8 * 255
+    vmov.i8     q1, #0
+    vmov.i8     q3, #0
+    vmov.i8     q13, #0
+
+    @ two rows per iteration; Y is the assemble-time row index into d4
+    @ q1 += |row - d0|, q3 += |row - dup(d4[Y])|, q13 += |row - d24|
+.set Y, 0
+.rept 4
+    vld1.64     {d16}, [r0], r3
+    vld1.64     {d17}, [r0], r3
+    vdup.8      d5, d4[Y]
+    vabal.u8    q1, d16, d0
+    vabal.u8    q3, d16, d5
+    vabal.u8    q13, d16, d24
+    vdup.8      d5, d4[Y+1]
+    vabal.u8    q1, d17, d0
+    vabal.u8    q3, d17, d5
+    vabal.u8    q13, d17, d24
+.set Y, Y+2
+.endr
+
+    @ horizontal reduction of all three accumulators into one d register
+    @ (64 pixels * 255 = 16320 still fits in u16, so no widening is needed yet)
+    vadd.u16    d2, d2, d3
+    vadd.u16    d6, d6, d7
+    vadd.u16    d26, d26, d27
+    vpadd.i16   d2, d2, d6
+    vpadd.i16   d26, d26, d26
+    vpadd.i16   d2, d2, d26             @ d2 = { sad0, sad1, sad2, sad2 }
+
+    @ res[] holds 32-bit ints, so widen before storing. A 16-bit store
+    @ would leave the upper half of every element uninitialised.
+    vmovl.u16   q1, d2                  @ d2 = { sad0, sad1 }, d3 = { sad2, sad2 }
+    vst1.32     {d2}, [r2]!             @ res[0], res[1]
+    vst1.32     {d3[0]}, [r2]           @ res[2]
+
+    bx          lr
+.endfunc
+
diff --git a/common/arm/pixel.h b/common/arm/pixel.h
index 3e02982..07a72c2 100644
--- a/common/arm/pixel.h
+++ b/common/arm/pixel.h
@@ -70,4 +70,5 @@ void x264_pixel_ssim_4x4x2_core_neon( const uint8_t *, int,
 float x264_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width );
 
 void x264_intra_sad_x3_4x4_armv6( uint8_t *, uint8_t *, int * );
+void x264_intra_sad_x3_8x8_neon( uint8_t *, uint8_t *, int * );
 #endif
diff --git a/common/pixel.c b/common/pixel.c
index 0949405..af7006f 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -1212,6 +1212,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
     }
     if( cpu&X264_CPU_NEON )
     {
+        pixf->intra_sad_x3_8x8  = x264_intra_sad_x3_8x8_neon;
         INIT5( sad, _neon );
         INIT5( sad_aligned, _neon );
         INIT7( sad_x3, _neon );
-- 
1.7.4.1



More information about the x264-devel mailing list