[x264-devel] [PATCH 4/4] x264_intra_sad_x3_16x16_neon
George Stephanos
gaf.stephanos at gmail.com
Sat Jan 28 19:51:32 CET 2012
---
common/arm/pixel-a.S | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++
common/arm/pixel.h | 2 +
common/pixel.c | 1 +
3 files changed, 76 insertions(+), 0 deletions(-)
diff --git a/common/arm/pixel-a.S b/common/arm/pixel-a.S
index 8e9b5b1..3443da9 100644
--- a/common/arm/pixel-a.S
+++ b/common/arm/pixel-a.S
@@ -1491,3 +1491,76 @@ function x264_intra_sad_x3_8x8c_neon
pop {r4, pc}
.endfunc
+function x264_intra_sad_x3_16x16_neon
+    @ void x264_intra_sad_x3_16x16_neon( uint8_t *, uint8_t *, int * )
+    @ In:  r0 = source block, rows FENC_STRIDE apart
+    @      r1 = block being predicted, rows FDEC_STRIDE apart; the row above it
+    @           (r1 - FDEC_STRIDE) and the column left of it (r1 - 1) are read
+    @      r2 = int[3] result
+    @ Out: r2[0] = SAD against the row above replicated down all 16 rows
+    @      r2[1] = SAD against the left column replicated across all 16 columns
+    @      r2[2] = SAD against the rounded mean of those 32 neighbouring pixels
+    @ Uses q0-q3 and q8-q14 only, so d8-d15 need no saving.
+    push        {r4, lr}
+    vmov.i8     q2,  #0                 @ row-above SAD, columns 0-7
+    vmov.i8     q3,  #0                 @ row-above SAD, columns 8-15
+    vmov.i8     q8,  #0                 @ lane 0: running sum of the left column
+    vmov.i8     q10, #0                 @ left-column SAD, columns 0-7
+    vmov.i8     q11, #0                 @ left-column SAD, columns 8-15
+    sub         lr,  r1,  #FDEC_STRIDE
+    mov         r3,  #FENC_STRIDE
+    vld1.8      {q0}, [lr]              @ q0 = 16 pixels of the row above
+    mov         r4,  #FDEC_STRIDE
+    sub         lr,  r1,  #1            @ lr -> pixel left of row 0
+
+.rept 16
+    vld1.8      {q1}, [r0], r3
+    vld1.8      {d28[0]}, [lr], r4      @ only lane 0 of d28 is defined
+    vabal.u8    q2,  d0,  d2
+    vabal.u8    q3,  d1,  d3
+    vdup.8      q9,  d28[0]
+    vaddw.u8    q8,  q8,  d28           @ lanes 1-7 collect junk; lane 0 is the sum
+    vabal.u8    q10, d18, d2
+    vabal.u8    q11, d19, d3
+.endr
+
+    @ Reduce each accumulator to one total. The final steps widen (vpaddl) so
+    @ the value stored is a zero-extended 32-bit int: a 16x16 SAD can reach
+    @ 65280, and a 16-bit store would leave the upper half of the int stale.
+    vadd.u16    q2,  q2,  q3            @ fold column halves; each lane <= 8160
+    vadd.u16    q10, q10, q11
+    vaddl.u8    q0,  d0,  d1            @ widen and pair-add the row above
+    vadd.u16    d4,  d4,  d5
+    vadd.u16    d20, d20, d21
+    vadd.u16    d0,  d0,  d1
+    vpaddl.u16  d4,  d4
+    vpaddl.u16  d20, d20
+    vpadd.i16   d0,  d0,  d0
+    vpaddl.u32  d4,  d4                 @ low 32 bits of d4  = res[0]
+    vpaddl.u32  d20, d20                @ low 32 bits of d20 = res[1]
+    vpadd.i16   d0,  d0,  d0            @ every lane = sum of the row above
+    vadd.u16    d0,  d0,  d16           @ lane 0 += left-column sum (only lane 0 used)
+
+    vst1.32     {d4[0]},  [r2]!         @ res[0], r2 += 4
+    vmov.i8     q12, #0
+    vmov.i8     q13, #0
+    vst1.32     {d20[0]}, [r2]!         @ res[1], r2 += 4
+
+    vrshr.u16   d0,  d0,  #5            @ (sum + 16) >> 5
+    sub         r0,  r0,  r3, lsl #4    @ rewind r0 by the 16 rows read above
+    vdup.8      d0,  d0[0]
+
+    @ Second pass over the source rows against the flat mean value.
+.rept 16
+    vld1.8      {q1}, [r0], r3
+    vabal.u8    q12, d0,  d2
+    vabal.u8    q13, d0,  d3
+.endr
+    vadd.u16    q12, q12, q13
+    vadd.u16    d24, d24, d25
+    vpaddl.u16  d24, d24
+    vpaddl.u32  d24, d24
+    vst1.32     {d24[0]}, [r2]          @ res[2]
+
+    pop         {r4, pc}
+.endfunc
diff --git a/common/arm/pixel.h b/common/arm/pixel.h
index 506cf59..f29ddb3 100644
--- a/common/arm/pixel.h
+++ b/common/arm/pixel.h
@@ -72,4 +72,6 @@ float x264_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width );
void x264_intra_sad_x3_4x4_armv6( uint8_t *, uint8_t *, int * );
void x264_intra_sad_x3_8x8_neon( uint8_t *, uint8_t *, int * );
void x264_intra_sad_x3_8x8c_neon( uint8_t *, uint8_t *, int * );
+void x264_intra_sad_x3_16x16_neon( uint8_t *, uint8_t *, int * );
+
#endif
diff --git a/common/pixel.c b/common/pixel.c
index f6d6a04..d4ae1df 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -1214,6 +1214,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
{
pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_neon;
pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_neon;
+ pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_neon;
INIT5( sad, _neon );
INIT5( sad_aligned, _neon );
INIT7( sad_x3, _neon );
--
1.7.4.1
More information about the x264-devel
mailing list