[x264-devel] [PATCH 3/4] x264_intra_sad_x3_8x8c_neon
George Stephanos
gaf.stephanos at gmail.com
Thu Feb 2 13:56:43 CET 2012
---
common/arm/pixel-a.S | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++
common/arm/pixel.h | 1 +
common/pixel.c | 1 +
3 files changed, 80 insertions(+), 0 deletions(-)
diff --git a/common/arm/pixel-a.S b/common/arm/pixel-a.S
index 995049b..db32671 100644
--- a/common/arm/pixel-a.S
+++ b/common/arm/pixel-a.S
@@ -1336,3 +1336,81 @@ function x264_intra_sad_x3_8x8_neon
bx lr
.endfunc
+
+function x264_intra_sad_x3_8x8c_neon    // SADs of the 8x8 block at r0 against three predictions built from the top row and left column of the block at r1; three ints are written to r2
+ push {r4, lr}    // r4 and lr serve as scratch registers below
+ vmov.i8 q8, #0    // q8: sum of the left-column pixels of rows 0-3 (same value in every u16 lane)
+ vmov.i8 q11, #0    // q11: sum of the left-column pixels of rows 4-7 (same value in every u16 lane)
+ add r2, #8    // r2 -> third int of the result array
+ sub lr, r1, #FDEC_STRIDE    // lr -> row directly above the r1 block
+ mov r3, #FENC_STRIDE    // r3 = row step for the r0 block
+ vld1.8 {d0}, [lr]    // d0 = the 8 pixels above the r1 block
+ mov r4, #FDEC_STRIDE    // r4 = row step for walking the left column
+ sub lr, r1, #1    // lr -> pixel to the left of row 0
+// One expansion per row: rows of the r0 block are loaded into d24-d31 and stay there for the final pass.
+.irp Y,24,25,26,27,28,29,30,31
+ vld1.8 {d\Y}, [r0], r3    // load the next 8-pixel row of the r0 block
+ vld1.8 {d18[]}, [lr], r4    // left neighbour of this row, replicated to all 8 lanes
+.if \Y == 24
+ vabdl.u8 q2, d0, d\Y    // first row initialises q2 = |top - row|
+ vabdl.u8 q10, d18, d\Y    // first row initialises q10 = |left - row|
+.else
+ vabal.u8 q2, d0, d\Y    // accumulate |top - row| into q2
+ vabal.u8 q10, d18, d\Y    // accumulate |left - row| into q10
+.endif
+.if \Y < 28
+ vaddw.u8 q8, d18    // rows 0-3: add the left pixel into q8
+.else
+ vaddw.u8 q11, d18    // rows 4-7: add the left pixel into q11
+.endif
+.endr
+// Reduce both SAD accumulators and the two halves of the top row down to lane 0.
+ vmovl.u8 q3, d0    // widen the top row to u16: d6 = top[0..3], d7 = top[4..7]
+ vmov.i8 d17, #0    // zero operand for the pairwise adds (upper half of q8 is not read again)
+ vadd.u16 d20, d21    // fold the left-column SAD accumulator to 4 lanes
+ vadd.u16 d4, d5    // fold the top-row SAD accumulator to 4 lanes
+ vpadd.u16 d20, d17    // pairwise add, upper lanes become zero
+ vpadd.u16 d4, d17
+ vpadd.u16 d20, d17    // lane 0 = total left-column SAD, remaining lanes zero
+ vpadd.u16 d4, d17    // lane 0 = total top-row SAD, remaining lanes zero
+ vpadd.u16 d6, d17
+ vpadd.u16 d7, d17
+ vpadd.u16 d6, d17    // d6 lane 0 = sum of top[0..3]
+ vpadd.u16 d7, d17    // d7 lane 0 = sum of top[4..7]
+// Write the two directional SADs as 32-bit values (upper u16 lane is zero).
+ vst1.32 {d4[0]}, [r2,:32]    // third result int = SAD against the replicated top row
+ sub r2, #4    // r2 -> second result int
+ vst1.32 {d20[0]}, [r2,:32]    // second result int = SAD against the replicated left column
+// Per-quadrant rounded averages of the neighbour sums (looks like chroma DC prediction; confirm against the C reference).
+ vadd.u16 d3, d7, d22    // lane 0 = top[4..7] sum + left rows 4-7 sum
+ vadd.u16 d0, d6, d16    // lane 0 = top[0..3] sum + left rows 0-3 sum
+ vrshr.u16 d1, d7, #2    // top-right value: top[4..7] sum / 4, rounded
+ vrshr.u16 d3, #3    // bottom-right value: 8-pixel sum / 8, rounded
+ vrshr.u16 d0, #3    // top-left value: 8-pixel sum / 8, rounded
+ vrshr.u16 d2, d22, #2    // bottom-left value: left rows 4-7 sum / 4, rounded
+ vdup.8 d1, d1[0]    // replicate the low byte of lane 0 to all 8 bytes
+ vdup.8 d3, d3[0]
+ vdup.8 d0, d0[0]
+ vdup.8 d2, d2[0]
+// Pack the four values into two prediction rows.
+ vext.8 d0, d0, d1, #4    // d0 = 4x top-left, 4x top-right (used for rows 0-3)
+ vext.8 d1, d2, d3, #4    // d1 = 4x bottom-left, 4x bottom-right (used for rows 4-7)
+// SAD of the saved rows against the averaged prediction, split over two accumulators.
+ vabdl.u8 q11, d0, d24    // row 0 starts q11
+ vabdl.u8 q10, d0, d25    // row 1 starts q10
+ vabal.u8 q11, d0, d26    // row 2
+ vabal.u8 q10, d0, d27    // row 3
+ vabal.u8 q11, d1, d28    // row 4
+ vabal.u8 q10, d1, d29    // row 5
+ vabal.u8 q11, d1, d30    // row 6
+ vabal.u8 q10, d1, d31    // row 7
+// Reduce to a single value and store it.
+ sub r2, #4    // r2 -> first result int
+ vadd.u16 q11, q10    // merge the two accumulators
+ vadd.u16 d22, d23    // fold to 4 lanes
+ vpadd.u16 d22, d17    // d17 is still zero here
+ vpadd.u16 d22, d17    // lane 0 = total, remaining lanes zero
+ vst1.32 {d22[0]}, [r2,:32]    // first result int = SAD against the averaged prediction
+
+ pop {r4, pc}    // restore r4 and return
+.endfunc
diff --git a/common/arm/pixel.h b/common/arm/pixel.h
index 07a72c2..506cf59 100644
--- a/common/arm/pixel.h
+++ b/common/arm/pixel.h
@@ -71,4 +71,5 @@ float x264_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width );
void x264_intra_sad_x3_4x4_armv6( uint8_t *, uint8_t *, int * );
void x264_intra_sad_x3_8x8_neon( uint8_t *, uint8_t *, int * );
+void x264_intra_sad_x3_8x8c_neon( uint8_t *, uint8_t *, int * ); /* args as used by the asm: 8x8 source block (FENC_STRIDE rows), block whose top/left neighbours are read (FDEC_STRIDE rows), array of 3 ints written */
#endif
diff --git a/common/pixel.c b/common/pixel.c
index af7006f..f6d6a04 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -1213,6 +1213,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
if( cpu&X264_CPU_NEON )
{
pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_neon;
+ pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_neon;
INIT5( sad, _neon );
INIT5( sad_aligned, _neon );
INIT7( sad_x3, _neon );
--
1.7.4.1
More information about the x264-devel
mailing list