[x264-devel] [PATCH 2/4] x264_intra_sad_x3_8x8_neon
Måns Rullgård
mans at mansr.com
Tue Jan 31 01:49:33 CET 2012
George Stephanos <gaf.stephanos at gmail.com> writes:
> ---
> common/arm/pixel-a.S | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++
> common/arm/pixel.h | 1 +
> common/pixel.c | 1 +
> 3 files changed, 67 insertions(+), 0 deletions(-)
>
> diff --git a/common/arm/pixel-a.S b/common/arm/pixel-a.S
> index de442e9..ece299c 100644
> --- a/common/arm/pixel-a.S
> +++ b/common/arm/pixel-a.S
> @@ -1299,3 +1299,68 @@ function x264_intra_sad_x3_4x4_armv6
> pop {r4-r6,pc}
> .endfunc
>
> +function x264_intra_sad_x3_8x8_neon
> + push {r4, lr}
> +
> + add r1, #7
> + vld1.8 {d4}, [r1]
> + add r1, #9
> + vrev64.8 d4, d4
> + vld1.64 {d0}, [r1]
Use vld1.8 consistently.
> + mov r3, #4
> + mov r4, #FENC_STRIDE
> +
> + vaddl.u8 q12, d0, d4
> + vadd.u16 d24, d25
> + vshr.u64 d25, d24, #32
> + vadd.u16 d24, d25
> + vshr.u64 d25, d24, #16
> + vadd.u16 d24, d25
Go look up VPADD.
> + vrshr.u16 d24, #4
> + vdup.8 d24, d24[0]
> +
> + vmov.i8 q1, #0
> + vmov.i8 q3, #0
> + vmov.i8 q13, #0
> +
> +.set Y, 0
> +.rept 4
Use .irpc Y, 0246 here instead of the .set/.rept pair.
> + vld1.64 {d16}, [r0], r4
> + vld1.64 {d17}, [r0], r4
> + vdup.8 d5, d4[Y]
> + vabal.u8 q1, d16, d0
> + vabal.u8 q3, d16, d5
> + vabal.u8 q13, d16, d24
> + vdup.8 d5, d4[Y+1]
> + vabal.u8 q1, d17, d0
> + vabal.u8 q3, d17, d5
> + vabal.u8 q13, d17, d24
> +.set Y, Y+2
> +.endr
> +
> + vadd.u16 d2, d3
> + vshr.u64 d3, d2, #32
> + vadd.u16 d2, d3
> + vshr.u64 d3, d2, #16
> + vadd.u16 d2, d3
Again, VPADD.
> + vst1.16 {d2[0]}, [r2], r3
These stores can be aligned (:32).
> + vadd.u16 d6, d7
> + vshr.u64 d7, d6, #32
> + vadd.u16 d6, d7
> + vshr.u64 d7, d6, #16
> + vadd.u16 d6, d7
> + vst1.16 {d6[0]}, [r2], r3
> +
> + vadd.u16 d26, d27
> + vshr.u64 d27, d26, #32
> + vadd.u16 d26, d27
> + vshr.u64 d27, d26, #16
> + vadd.u16 d26, d27
> + vst1.16 {d26[0]}, [r2]
More VPADD, and do these three horizontal summing sequences interleaved.
Also allocate registers a bit more cleverly so you can use 128-bit
operations.
> + pop {r4, lr}
> + bx lr
Use pop {r4,pc} instead of the pop {r4, lr} / bx lr pair.
--
Måns Rullgård
mans at mansr.com
More information about the x264-devel mailing list