[x264-devel] [PATCH 3/4] x264_intra_sad_x3_8x8c_neon
George Stephanos
gaf.stephanos at gmail.com
Thu Feb 2 13:57:19 CET 2012
1400 to 1070.
On Thu, Feb 2, 2012 at 2:56 PM, George Stephanos <gaf.stephanos at gmail.com> wrote:
> ---
> common/arm/pixel-a.S | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++
> common/arm/pixel.h | 1 +
> common/pixel.c | 1 +
> 3 files changed, 80 insertions(+), 0 deletions(-)
>
> diff --git a/common/arm/pixel-a.S b/common/arm/pixel-a.S
> index 995049b..db32671 100644
> --- a/common/arm/pixel-a.S
> +++ b/common/arm/pixel-a.S
> @@ -1336,3 +1336,81 @@ function x264_intra_sad_x3_8x8_neon
>
> bx lr
> .endfunc
> +
> +function x264_intra_sad_x3_8x8c_neon @ C prototype: void (uint8_t *r0, uint8_t *r1, int *r2). Writes three SADs of the 8x8 block at r0: r2[0] vs quadrant averages, r2[1] vs left column, r2[2] vs row above (presumably the DC/H/V chroma predictions - confirm against caller)
> + push {r4, lr} @ r4 and lr are both used as scratch below
> + vmov.i8 q8, #0 @ q8 accumulates the left-neighbour bytes of rows 0-3
> + vmov.i8 q11, #0 @ q11 accumulates the left-neighbour bytes of rows 4-7
> + add r2, #8 @ r2 -> third int of the result array, which is stored first
> + sub lr, r1, #FDEC_STRIDE @ lr = r1 - FDEC_STRIDE (the row above r1, assuming FDEC_STRIDE is the row pitch of the r1 buffer)
> + mov r3, #FENC_STRIDE @ r3 = per-row step for r0
> + vld1.8 {d0}, [lr] @ d0 = 8 bytes of the row above
> + mov r4, #FDEC_STRIDE @ r4 = per-row step for the left-neighbour pointer
> + sub lr, r1, #1 @ lr = byte immediately left of r1
> +
> +.irp Y,24,25,26,27,28,29,30,31 @ unrolled over the 8 rows; row (Y-24) of the r0 block is kept in dY for the later pass
> + vld1.8 {d\Y}, [r0], r3 @ load 8 bytes of this row, advance r0 by r3
> + vld1.8 {d18[]}, [lr], r4 @ load this row's left byte into every lane of d18, advance lr by r4
> +.if \Y == 24 @ first row initialises the SAD accumulators instead of adding to them
> + vabdl.u8 q2, d0, d\Y @ q2 = |above - row| widened to u16
> + vabdl.u8 q10, d18, d\Y @ q10 = |left - row| widened to u16
> +.else
> + vabal.u8 q2, d0, d\Y @ q2 += |above - row|
> + vabal.u8 q10, d18, d\Y @ q10 += |left - row|
> +.endif
> +.if \Y < 28 @ left bytes are summed separately for the upper and lower four rows
> + vaddw.u8 q8, d18 @ every u16 lane of q8 receives the same left byte (d18 is replicated)
> +.else
> + vaddw.u8 q11, d18 @ same, for rows 4-7
> +.endif
> +.endr
> +
> + vmovl.u8 q3, d0 @ widen the row above to u16: d6 = bytes 0-3, d7 = bytes 4-7
> + vmov.i8 d17, #0 @ d17 = zero operand for the pairwise adds below; d16 still holds the rows 0-3 left sum
> + vadd.u16 d20, d21 @ fold the high half of the left-column SAD onto the low half
> + vadd.u16 d4, d5 @ fold the high half of the above-row SAD onto the low half
> + vpadd.u16 d20, d17 @ two pairwise adds against zero leave the total in u16 lane 0
> + vpadd.u16 d4, d17
> + vpadd.u16 d20, d17 @ d20 lanes 1-3 are now zero, so the 32-bit lane 0 equals the left-column SAD
> + vpadd.u16 d4, d17 @ likewise the 32-bit lane 0 of d4 equals the above-row SAD
> + vpadd.u16 d6, d17 @ reduce the widened above-row bytes the same way
> + vpadd.u16 d7, d17
> + vpadd.u16 d6, d17 @ d6[0] = sum of above bytes 0-3
> + vpadd.u16 d7, d17 @ d7[0] = sum of above bytes 4-7
> +
> + vst1.32 {d4[0]}, [r2,:32] @ result[2] = SAD against the row above
> + sub r2, #4 @ r2 -> result[1]
> + vst1.32 {d20[0]}, [r2,:32] @ result[1] = SAD against the left column
> +
> + vadd.u16 d3, d7, d22 @ d3 = above bytes 4-7 + left rows 4-7 (8 samples)
> + vadd.u16 d0, d6, d16 @ d0 = above bytes 0-3 + left rows 0-3 (8 samples)
> + vrshr.u16 d1, d7, #2 @ d1 = rounded average of above bytes 4-7 (4 samples)
> + vrshr.u16 d3, #3 @ d3 = rounded average over its 8 samples
> + vrshr.u16 d0, #3 @ d0 = rounded average over its 8 samples
> + vrshr.u16 d2, d22, #2 @ d2 = rounded average of left rows 4-7 (4 samples)
> + vdup.8 d1, d1[0] @ broadcast each average to all 8 byte lanes
> + vdup.8 d3, d3[0]
> + vdup.8 d0, d0[0]
> + vdup.8 d2, d2[0]
> +
> + vext.8 d0, d0, d1, #4 @ d0 = predicted row for rows 0-3: low 4 bytes from d0, high 4 bytes from d1
> + vext.8 d1, d2, d3, #4 @ d1 = predicted row for rows 4-7: low 4 bytes from d2, high 4 bytes from d3
> +
> + vabdl.u8 q11, d0, d24 @ SAD against the quadrant averages, alternating between two accumulators (q11, q10)
> + vabdl.u8 q10, d0, d25
> + vabal.u8 q11, d0, d26
> + vabal.u8 q10, d0, d27
> + vabal.u8 q11, d1, d28 @ rows 4-7 use the lower-half prediction row
> + vabal.u8 q10, d1, d29
> + vabal.u8 q11, d1, d30
> + vabal.u8 q10, d1, d31
> +
> + sub r2, #4 @ r2 -> result[0]
> + vadd.u16 q11, q10 @ merge the two accumulators
> + vadd.u16 d22, d23 @ then reduce to u16 lane 0 as before (d17 is still zero)
> + vpadd.u16 d22, d17
> + vpadd.u16 d22, d17
> + vst1.32 {d22[0]}, [r2,:32] @ result[0] = SAD against the quadrant averages
> +
> + pop {r4, pc} @ restore r4 and return by loading the saved lr into pc
> +.endfunc
> diff --git a/common/arm/pixel.h b/common/arm/pixel.h
> index 07a72c2..506cf59 100644
> --- a/common/arm/pixel.h
> +++ b/common/arm/pixel.h
> @@ -71,4 +71,5 @@ float x264_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width );
>
> void x264_intra_sad_x3_4x4_armv6( uint8_t *, uint8_t *, int * );
> void x264_intra_sad_x3_8x8_neon( uint8_t *, uint8_t *, int * );
> +void x264_intra_sad_x3_8x8c_neon( uint8_t *, uint8_t *, int * );
> #endif
> diff --git a/common/pixel.c b/common/pixel.c
> index af7006f..f6d6a04 100644
> --- a/common/pixel.c
> +++ b/common/pixel.c
> @@ -1213,6 +1213,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
> if( cpu&X264_CPU_NEON )
> {
> pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_neon;
> + pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_neon;
> INIT5( sad, _neon );
> INIT5( sad_aligned, _neon );
> INIT7( sad_x3, _neon );
> --
> 1.7.4.1
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x264-devel/attachments/20120202/25cd8cde/attachment.html>
More information about the x264-devel
mailing list