[x264-devel] [PATCH 2/4] x264_intra_sad_x3_8x8_neon
George Stephanos
gaf.stephanos at gmail.com
Thu Feb 2 03:10:49 CET 2012
---
common/arm/pixel-a.S | 47 +++++++++++++++++++++++++++++++++++++++++++++++
common/arm/pixel.h | 1 +
common/pixel.c | 1 +
3 files changed, 49 insertions(+), 0 deletions(-)
diff --git a/common/arm/pixel-a.S b/common/arm/pixel-a.S
index da5f36c..995049b 100644
--- a/common/arm/pixel-a.S
+++ b/common/arm/pixel-a.S
@@ -1289,3 +1289,50 @@ function x264_intra_sad_x3_4x4_armv6
str r5, [r2, #8]
pop {r4-r8,pc}
.endfunc
+
+function x264_intra_sad_x3_8x8_neon // r0 = source rows, r1 = neighbour bytes, r2 = int[3] result (args per the C prototype); computes three 8x8 SADs
+ add r1, #7 // r1 -> byte 7 of the neighbour buffer
+ vld1.8 {d4}, [r1] // d4 = neighbour bytes 7..14 (presumably the left column, stored in reverse -- TODO confirm against caller)
+ add r1, #9 // r1 -> byte 16 of the neighbour buffer
+ vrev64.8 d4, d4 // reverse the byte order so lane y lines up with row y in the loop below
+ vld1.8 {d0}, [r1] // d0 = neighbour bytes 16..23 (presumably the top row -- TODO confirm)
+ // r3 is the post-increment step applied to r0 after each 8-byte row load
+ mov r3, #FENC_STRIDE
+ // Mean of the 16 loaded neighbour bytes; the three accumulators are zeroed in between the dependent adds
+ vaddl.u8 q12, d0, d4 // widen to u16 and add: 8 partial sums
+ vadd.u16 d24, d25 // 8 -> 4 partial sums
+ vmov.i8 q1, #0 // q1 = accumulator for |row - d0|
+ vpadd.u16 d24, d24 // 4 -> 2 partial sums (duplicated)
+ vmov.i8 q3, #0 // q3 = accumulator for |row - broadcast byte of d4|
+ vpadd.u16 d24, d24 // every lane now holds the sum of all 16 bytes
+ vmov.i8 q13, #0 // q13 = accumulator for |row - mean|
+ vrshr.u16 d24, #4 // rounded divide by 16: (sum + 8) >> 4
+ vdup.8 d24, d24[0] // broadcast the mean to all 8 byte lanes
+ // Fully unrolled: 4 iterations x 2 rows = 8 rows; Y is the d4 lane index of the first row of each pair
+.irpc Y,0246
+ vld1.8 {d16}, [r0], r3 // row Y (8 bytes), then advance r0 by FENC_STRIDE
+ vld1.8 {d17}, [r0], r3 // row Y+1
+ vdup.8 d5, d4[\Y] // broadcast the d4 byte belonging to row Y
+ vabal.u8 q1, d16, d0 // q1 += |row - d0|, widened to u16 lanes
+ vabal.u8 q3, d16, d5 // q3 += |row - broadcast d4 byte|
+ vabal.u8 q13, d16, d24 // q13 += |row - mean|
+ vdup.8 d5, d4[\Y+1] // broadcast the d4 byte belonging to row Y+1
+ vabal.u8 q1, d17, d0 // same three accumulations for the second row of the pair
+ vabal.u8 q3, d17, d5
+ vabal.u8 q13, d17, d24
+.endr
+ vmov.i8 d0, #0 // d0 is no longer needed as data; reuse it as a zero operand for the pairwise adds
+ // Horizontal reduction of each accumulator to one total, zero-extended to 32 bits for the int stores
+ vadd.u16 d2, d3 // fold each q accumulator from 8 to 4 u16 lanes
+ vadd.u16 d6, d7
+ vadd.u16 d26, d27
+ vpadd.u16 d2, d0 // 4 -> 2 lanes; the upper two lanes become zero
+ vpadd.u16 d6, d0
+ vpadd.u16 d26, d0
+ vpadd.u16 d2, d6 // d2 = { q1 total, 0, q3 total, 0 } as u16, i.e. two 32-bit values
+ vpadd.u16 d26, d26 // d26 low 32 bits = q13 total with a zero upper half
+ vst1.64 {d2}, [r2]! // result[0] = q1 total, result[1] = q3 total; r2 advances by 8
+ vst1.32 {d26[0]}, [r2,:32] // result[2] = q13 total (the :32 qualifier asserts a 4-byte aligned address)
+ // Only r1, r2, r3 and NEON registers d0-d7, d16-d17, d24-d27 are written; nothing is pushed
+ bx lr
+.endfunc
diff --git a/common/arm/pixel.h b/common/arm/pixel.h
index 3e02982..07a72c2 100644
--- a/common/arm/pixel.h
+++ b/common/arm/pixel.h
@@ -70,4 +70,5 @@ void x264_pixel_ssim_4x4x2_core_neon( const uint8_t *, int,
float x264_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width );
void x264_intra_sad_x3_4x4_armv6( uint8_t *, uint8_t *, int * );
+void x264_intra_sad_x3_8x8_neon( uint8_t *, uint8_t *, int * );
#endif
diff --git a/common/pixel.c b/common/pixel.c
index 0949405..af7006f 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -1212,6 +1212,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
}
if( cpu&X264_CPU_NEON )
{
+ pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_neon;
INIT5( sad, _neon );
INIT5( sad_aligned, _neon );
INIT7( sad_x3, _neon );
--
1.7.4.1
More information about the x264-devel
mailing list