[x264-devel] [PATCH 09/11] arm: Implement chroma intra deblock
Martin Storsjö
martin at martin.st
Tue Aug 25 22:36:44 CEST 2015
checkasm timing Cortex-A7 A8 A9
deblock_chroma_420_intra_mbaff_c 1469 1276 1181
deblock_chroma_420_intra_mbaff_neon 981 717 644
deblock_chroma_intra[1]_c 2954 2402 2321
deblock_chroma_intra[1]_neon 947 581 575
deblock_h_chroma_420_intra_c 2859 2509 2264
deblock_h_chroma_420_intra_neon 1480 1119 1028
deblock_h_chroma_422_intra_c 6211 5030 4792
deblock_h_chroma_422_intra_neon 2894 1990 2077
---
Removed extra functions from the listing, returning directly
from the second round in x264_deblock_h_chroma_422_intra_neon.
---
common/arm/deblock-a.S | 116 ++++++++++++++++++++++++++++++++++++++++++++++++
common/deblock.c | 4 +-
2 files changed, 118 insertions(+), 2 deletions(-)
diff --git a/common/arm/deblock-a.S b/common/arm/deblock-a.S
index 1ef708e..5a386a5 100644
--- a/common/arm/deblock-a.S
+++ b/common/arm/deblock-a.S
@@ -365,6 +365,122 @@ function x264_deblock_h_chroma_mbaff_neon
bx lr
endfunc
+.macro h264_loop_filter_chroma_intra, width=16 @ intra chroma edge filter. In: q9=p1 q8=p0 q0=q0 q1=q1, r2=alpha, r3=beta. Out: q8=p0', q0=q0'. Clobbers q1, q2, q11, q13, q14 (plus q3, q10, q12, q15 when width is 16)
+ vdup.8 q11, r2 @ q11 = alpha replicated into every byte lane
+ vabd.u8 q13, q8, q0 @ abs(p0 - q0)
+ vabd.u8 q14, q9, q8 @ abs(p1 - p0)
+ vabd.u8 q15, q1, q0 @ abs(q1 - q0)
+ vclt.u8 q13, q13, q11 @ mask: abs(p0 - q0) < alpha
+ vdup.8 q11, r3 @ q11 = beta replicated into every byte lane (alpha is no longer needed)
+ vclt.u8 q14, q14, q11 @ mask: abs(p1 - p0) < beta
+ vclt.u8 q15, q15, q11 @ mask: abs(q1 - q0) < beta
+ vand q13, q13, q14 @ combine the three per-lane tests ...
+ vand q13, q13, q15 @ ... q13 = all-ones in lanes that get filtered
+
+ vshll.u8 q14, d18, #1 @ 2*p1 widened to 16 bit (low 8 lanes)
+ vshll.u8 q2, d2, #1 @ 2*q1 widened to 16 bit (low 8 lanes)
+.ifc \width, 16
+ vshll.u8 q15, d19, #1 @ 2*p1 (high 8 lanes)
+ vshll.u8 q3, d3, #1 @ 2*q1 (high 8 lanes)
+ vaddl.u8 q12, d17, d3 @ p0 + q1 (high 8 lanes)
+ vaddl.u8 q10, d1, d19 @ q0 + p1 (high 8 lanes)
+.endif
+ vaddl.u8 q11, d16, d2 @ p0 + q1 (low 8 lanes); reuses q11, beta is no longer needed
+ vaddl.u8 q1, d18, d0 @ p1 + q0 (low 8 lanes); overwrites q1 after its last read -- or vaddw q2, to not clobber q1
+ vadd.u16 q14, q14, q11 @ 2*p1 + p0 + q1 (low)
+ vadd.u16 q2, q2, q1 @ 2*q1 + q0 + p1 (low)
+.ifc \width, 16
+ vadd.u16 q15, q15, q12 @ 2*p1 + p0 + q1 (high)
+ vadd.u16 q3, q3, q10 @ 2*q1 + q0 + p1 (high)
+.endif
+ vqrshrn.u16 d28, q14, #2 @ p0' = (2*p1 + p0 + q1 + 2) >> 2, narrowed back to 8 bit
+ vqrshrn.u16 d4, q2, #2 @ q0' = (2*q1 + q0 + p1 + 2) >> 2, narrowed back to 8 bit
+.ifc \width, 16
+ vqrshrn.u16 d29, q15, #2 @ p0' for the high 8 lanes
+ vqrshrn.u16 d5, q3, #2 @ q0' for the high 8 lanes
+.endif
+ vbit q8, q14, q13 @ replace p0 with p0' only in lanes where the mask is set
+ vbit q0, q2, q13 @ replace q0 with q0' only in lanes where the mask is set; with width=8 only d16/d0 hold meaningful results
+.endm
+
+function x264_deblock_v_chroma_intra_neon @ void f(uint8_t *pix, intptr_t stride, int alpha, int beta): r0=pix r1=stride r2=alpha r3=beta; filters the rows at pix - stride and pix using two rows on each side
+ sub r0, r0, r1, lsl #1 @ r0 = pix - 2*stride (p1 row)
+ vld2.8 {d18,d19}, [r0,:128], r1 @ p1 row, 16 bytes; vld2 puts even bytes in d18 and odd bytes in d19 (presumably interleaved U/V -- confirm)
+ vld2.8 {d16,d17}, [r0,:128], r1 @ p0 row
+ vld2.8 {d0, d1}, [r0,:128], r1 @ q0 row
+ vld2.8 {d2, d3}, [r0,:128] @ q1 row; no post-increment, r0 stays at pix + stride
+
+ h264_loop_filter_chroma_intra @ default width=16: updates q8 (p0) and q0 (q0) in place
+
+ sub r0, r0, r1, lsl #1 @ r0 = pix - stride (p0 row)
+ vst2.8 {d16,d17}, [r0,:128], r1 @ store filtered p0 row, re-interleaving the two planes
+ vst2.8 {d0, d1}, [r0,:128], r1 @ store filtered q0 row at pix; p1 and q1 rows are not written
+
+ bx lr
+endfunc
+
+function x264_deblock_h_chroma_intra_neon @ void f(uint8_t *pix, intptr_t stride, int alpha, int beta): r0=pix r1=stride r2=alpha r3=beta; filters 8 rows, reading 8 bytes per row from pix - 4 and rewriting the middle 4
+ sub r0, r0, #4 @ r0 = pix - 4: start of the 8-byte window on each row
+ vld1.8 {d18}, [r0], r1 @ row 0
+ vld1.8 {d16}, [r0], r1 @ row 1
+ vld1.8 {d0}, [r0], r1 @ row 2
+ vld1.8 {d2}, [r0], r1 @ row 3
+ vld1.8 {d19}, [r0], r1 @ row 4 (high halves of the same q registers)
+ vld1.8 {d17}, [r0], r1 @ row 5
+ vld1.8 {d1}, [r0], r1 @ row 6
+ vld1.8 {d3}, [r0], r1 @ row 7
+
+ TRANSPOSE4x4_16 q9, q8, q0, q1 @ macro defined outside this hunk; presumably a 4x4 transpose of 16-bit pairs leaving q9=p1 q8=p0 q0=q0 q1=q1 with one pair per row -- confirm
+
+ h264_loop_filter_chroma_intra @ default width=16: updates q8 (p0) and q0 (q0) in place
+
+ vtrn.16 q8, q0 @ pair up p0'/q0' per row: 32-bit lanes of q8 hold even rows, lanes of q0 hold odd rows
+
+ sub r0, r0, r1, lsl #3 @ rewind the 8 rows advanced by the loads
+ add r0, r0, #2 @ r0 = pix - 2: the p0 bytes of row 0
+ vst1.32 {d16[0]}, [r0], r1 @ row 0: 4 bytes, p0' pair then q0' pair
+ vst1.32 {d0[0]}, [r0], r1 @ row 1
+ vst1.32 {d16[1]}, [r0], r1 @ row 2
+ vst1.32 {d0[1]}, [r0], r1 @ row 3
+ vst1.32 {d17[0]}, [r0], r1 @ row 4
+ vst1.32 {d1[0]}, [r0], r1 @ row 5
+ vst1.32 {d17[1]}, [r0], r1 @ row 6
+ vst1.32 {d1[1]}, [r0], r1 @ row 7; the post-increment leaves r0 = pix - 2 + 8*stride on return
+
+ bx lr
+endfunc
+
+function x264_deblock_h_chroma_422_intra_neon @ void f(uint8_t *pix, intptr_t stride, int alpha, int beta): runs the 8-row x264_deblock_h_chroma_intra_neon twice to cover 16 rows
+ push {lr} @ keep our return address across the bl below
+ bl X(x264_deblock_h_chroma_intra_neon) @ rows 0-7; callee leaves r1-r3 untouched and r0 = pix - 2 + 8*stride
+ add r0, r0, #2 @ r0 = pix + 8*stride: edge position for rows 8-15
+ pop {lr} @ restore the caller's return address before the tail call
+ b X(x264_deblock_h_chroma_intra_neon) @ tail call for rows 8-15; the callee's bx lr returns directly to our caller
+endfunc
+
+function x264_deblock_h_chroma_intra_mbaff_neon @ void f(uint8_t *pix, intptr_t stride, int alpha, int beta): r0=pix r1=stride r2=alpha r3=beta; 4-row variant of the horizontal intra chroma filter, using d registers only
+ sub r0, r0, #4 @ r0 = pix - 4: start of the 8-byte window on each row
+ vld1.8 {d18}, [r0], r1 @ row 0
+ vld1.8 {d16}, [r0], r1 @ row 1
+ vld1.8 {d0}, [r0], r1 @ row 2
+ vld1.8 {d2}, [r0], r1 @ row 3
+
+ TRANSPOSE4x4_16 d18, d16, d0, d2 @ macro defined outside this hunk; presumably leaves d18=p1 d16=p0 d0=q0 d2=q1 with one 16-bit pair per row -- confirm
+
+ h264_loop_filter_chroma_intra, width=8 @ 8-lane variant: skips the high-half arithmetic, results in d16 (p0') and d0 (q0')
+
+ vtrn.16 d16, d0 @ pair up p0'/q0' per row: d16 lanes hold rows 0 and 2, d0 lanes hold rows 1 and 3
+
+ sub r0, r0, r1, lsl #2 @ rewind the 4 rows advanced by the loads
+ add r0, r0, #2 @ r0 = pix - 2: the p0 bytes of row 0
+ vst1.32 {d16[0]}, [r0], r1 @ row 0: 4 bytes, p0' pair then q0' pair
+ vst1.32 {d0[0]}, [r0], r1 @ row 1
+ vst1.32 {d16[1]}, [r0], r1 @ row 2
+ vst1.32 {d0[1]}, [r0] @ row 3; no post-increment on the last store
+
+ bx lr
+endfunc
+
function x264_deblock_strength_neon
ldr ip, [sp]
vmov.i8 q8, #0
diff --git a/common/deblock.c b/common/deblock.c
index 1d398ad..46379ec 100644
--- a/common/deblock.c
+++ b/common/deblock.c
@@ -741,11 +741,11 @@ void x264_deblock_strength_neon( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X26
int mvy_limit, int bframe );
void x264_deblock_h_chroma_422_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
-#if ARCH_AARCH64
void x264_deblock_h_chroma_intra_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_h_chroma_422_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_v_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
+#if ARCH_AARCH64
void x264_deblock_h_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_v_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
#endif
@@ -875,11 +875,11 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
pf->deblock_h_chroma_420 = x264_deblock_h_chroma_neon;
pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_neon;
pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_neon;
-#if ARCH_AARCH64
pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_neon;
pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_neon;
pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_neon;
pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_neon;
+#if ARCH_AARCH64
pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_neon;
pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_neon;
#endif
--
1.7.10.4
More information about the x264-devel
mailing list