[x264-devel] arm: Implement x264_deblock_h_chroma_mbaff_neon
Martin Storsjö
git at videolan.org
Sun Oct 11 19:01:05 CEST 2015
x264 | branch: master | Martin Storsjö <martin at martin.st> | Tue Aug 25 14:38:16 2015 +0300| [6bbaa2758d53d0d6d645142d7d818c960d137a0e] | committer: Henrik Gramner
arm: Implement x264_deblock_h_chroma_mbaff_neon
checkasm timing                Cortex-A7      A8      A9
deblock_chroma_420_mbaff_c          1944    1706    1526
deblock_chroma_420_mbaff_neon       1210     873     865
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=6bbaa2758d53d0d6d645142d7d818c960d137a0e
---
common/arm/deblock-a.S | 57 ++++++++++++++++++++++++++++++++++++++++++++++++
common/deblock.c | 4 ++--
2 files changed, 59 insertions(+), 2 deletions(-)
diff --git a/common/arm/deblock-a.S b/common/arm/deblock-a.S
index a300220..1ef708e 100644
--- a/common/arm/deblock-a.S
+++ b/common/arm/deblock-a.S
@@ -308,6 +308,63 @@ function x264_deblock_h_chroma_422_neon
b deblock_h_chroma
endfunc
+.macro h264_loop_filter_chroma8 @ filters 8 byte lanes: in p1=d18 p0=d16 q0=d0 q1=d2, r2=alpha, r3=beta, d24=clip values; out new p0 in d16, new q0 in d0
+ vdup.8 d22, r2 @ alpha: broadcast r2 to every byte lane of d22
+ vmovl.u8 q12, d24 @ zero-extend the low bytes of d24 to 16-bit lanes (d24 presumably holds tc set up before this macro - confirm)
+ vabd.u8 d26, d16, d0 @ abs(p0 - q0)
+ vabd.u8 d28, d18, d16 @ abs(p1 - p0)
+ vsubl.u8 q2, d0, d16 @ q2 = q0 - p0, widened to 16 bit
+ vsli.16 d24, d24, #8 @ copy the low byte of each 16-bit lane into its high byte, so each clip value covers 2 adjacent bytes
+ vshl.i16 q2, q2, #2 @ q2 = 4*(q0 - p0)
+ vabd.u8 d30, d2, d0 @ abs(q1 - q0)
+ vaddw.u8 q2, q2, d18 @ q2 += p1
+ vclt.u8 d26, d26, d22 @ mask: abs(p0 - q0) < alpha
+ vsubw.u8 q2, q2, d2 @ q2 -= q1
+ vdup.8 d22, r3 @ beta: broadcast r3 (d22 is reused, alpha is no longer needed)
+ vclt.s8 d20, d24, #0 @ mask of lanes whose clip value is negative
+ vrshrn.i16 d4, q2, #3 @ delta = (4*(q0 - p0) + p1 - q1 + 4) >> 3, narrowed to 8 bit
+ vclt.u8 d28, d28, d22 @ mask: abs(p1 - p0) < beta
+ vbic d26, d26, d20 @ remove lanes with a negative clip value from the alpha mask
+ vclt.u8 d30, d30, d22 @ mask: abs(q1 - q0) < beta
+ vand d26, d26, d28 @ combine with the p1/p0 beta test
+ vneg.s8 d20, d24 @ d20 = -clip (lower bound for delta)
+ vand d26, d26, d30 @ d26 = final mask: all three threshold tests passed and clip is not negative
+ vmin.s8 d4, d4, d24 @ clamp delta from above to clip
+ vmovl.u8 q14, d16 @ widen p0 to 16 bit
+ vand d4, d4, d26 @ zero delta in lanes that are masked out
+ vmax.s8 d4, d4, d20 @ clamp delta from below to -clip
+ vmovl.u8 q11, d0 @ widen q0 to 16 bit
+ vaddw.s8 q14, q14, d4 @ p0 + delta
+ vsubw.s8 q11, q11, d4 @ q0 - delta
+ vqmovun.s16 d16, q14 @ new p0, saturated to unsigned 8 bit
+ vqmovun.s16 d0, q11 @ new q0, saturated to unsigned 8 bit
+.endm
+
+function x264_deblock_h_chroma_mbaff_neon @ filters 4 rows across one edge; per the C prototype r0=pix, r1=stride, r2=alpha, r3=beta, tc0 pointer presumably on the stack
+ h264_loop_filter_start @ macro defined elsewhere; presumably loads tc0 into d24 and returns early when no lane needs filtering - confirm
+
+ sub r0, r0, #4 @ back up 4 bytes so each 8-byte row load spans both sides of r0
+ vld1.8 {d18}, [r0], r1 @ row 0 (8 bytes), then advance r0 by stride
+ vld1.8 {d16}, [r0], r1 @ row 1
+ vld1.8 {d0}, [r0], r1 @ row 2
+ vld1.8 {d2}, [r0], r1 @ row 3
+
+ TRANSPOSE4x4_16 d18, d16, d0, d2 @ macro defined elsewhere; presumably transposes 4x4 16-bit elements so d18/d16/d0/d2 become p1/p0/q0/q1 - confirm
+
+ h264_loop_filter_chroma8 @ rewrites d16 (p0) and d0 (q0); d18 and d2 are only read
+
+ vtrn.16 d16, d0 @ interleave 16-bit lanes: d16 = p0,q0 of rows 0 and 2, d0 = p0,q0 of rows 1 and 3
+
+ sub r0, r0, r1, lsl #2 @ rewind the 4 strides added by the loads
+ add r0, r0, #2 @ skip the first 16-bit element (p1) of the row; p0,q0 are the next 4 bytes
+ vst1.32 {d16[0]}, [r0], r1 @ row 0: write 4 bytes (p0, q0)
+ vst1.32 {d0[0]}, [r0], r1 @ row 1
+ vst1.32 {d16[1]}, [r0], r1 @ row 2
+ vst1.32 {d0[1]}, [r0] @ row 3 (no post-increment needed)
+
+ bx lr @ return to caller
+endfunc
+
function x264_deblock_strength_neon
ldr ip, [sp]
vmov.i8 q8, #0
diff --git a/common/deblock.c b/common/deblock.c
index 83bda62..1d398ad 100644
--- a/common/deblock.c
+++ b/common/deblock.c
@@ -740,8 +740,8 @@ void x264_deblock_strength_neon( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X26
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
int mvy_limit, int bframe );
void x264_deblock_h_chroma_422_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
-#if ARCH_AARCH64
void x264_deblock_h_chroma_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
+#if ARCH_AARCH64
void x264_deblock_h_chroma_intra_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_h_chroma_422_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
@@ -874,8 +874,8 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
pf->deblock_chroma[1] = x264_deblock_v_chroma_neon;
pf->deblock_h_chroma_420 = x264_deblock_h_chroma_neon;
pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_neon;
-#if ARCH_AARCH64
pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_neon;
+#if ARCH_AARCH64
pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_neon;
pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_neon;
pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_neon;
More information about the x264-devel
mailing list