[x264-devel] [PATCH 16/23] aarch64: NEON asm for intra luma deblocking
Janne Grunau
janne-x264 at jannau.net
Thu Nov 27 08:56:44 CET 2014
deblock_luma_intra[0]_neon is 2 times fastes,
deblock_luma_intra[1]_neon is ~4 times faster.
---
common/aarch64/deblock-a.S | 206 ++++++++++++++++++++++++++++++++++++++++++---
common/deblock.c | 4 +
2 files changed, 200 insertions(+), 10 deletions(-)
diff --git a/common/aarch64/deblock-a.S b/common/aarch64/deblock-a.S
index 79db5b4..2c707eb 100644
--- a/common/aarch64/deblock-a.S
+++ b/common/aarch64/deblock-a.S
@@ -180,6 +180,202 @@ function x264_deblock_h_luma_neon, export=1
ret
endfunc
+ .macro h264_loop_filter_start_intra
+ orr w4, w2, w3
+ cmp w4, #0
+ b.ne 1f
+ ret
+1:
+ dup v30.16b, w2 // alpha
+ dup v31.16b, w3 // beta
+.endm
+
+.macro h264_loop_filter_luma_intra
+ uabd v16.16b, v7.16b, v0.16b // abs(p0 - q0)
+ uabd v17.16b, v6.16b, v7.16b // abs(p1 - p0)
+ uabd v18.16b, v1.16b, v0.16b // abs(q1 - q0)
+ cmhi v19.16b, v30.16b, v16.16b // < alpha
+ cmhi v17.16b, v31.16b, v17.16b // < beta
+ cmhi v18.16b, v31.16b, v18.16b // < beta
+
+ movi v29.16b, #2
+ ushr v30.16b, v30.16b, #2 // alpha >> 2
+ add v30.16b, v30.16b, v29.16b // (alpha >> 2) + 2
+ cmhi v16.16b, v30.16b, v16.16b // < (alpha >> 2) + 2
+
+ and v19.16b, v19.16b, v17.16b
+ and v19.16b, v19.16b, v18.16b
+ shrn v20.8b, v19.8h, #4
+ mov x4, v20.d[0]
+ cbz x4, 9f
+
+ ushll v20.8h, v6.8b, #1
+ ushll v22.8h, v1.8b, #1
+ ushll2 v21.8h, v6.16b, #1
+ ushll2 v23.8h, v1.16b, #1
+ uaddw v20.8h, v20.8h, v7.8b
+ uaddw v22.8h, v22.8h, v0.8b
+ uaddw2 v21.8h, v21.8h, v7.16b
+ uaddw2 v23.8h, v23.8h, v0.16b
+ uaddw v20.8h, v20.8h, v1.8b
+ uaddw v22.8h, v22.8h, v6.8b
+ uaddw2 v21.8h, v21.8h, v1.16b
+ uaddw2 v23.8h, v23.8h, v6.16b
+
+ rshrn v24.8b, v20.8h, #2 // p0'_1
+ rshrn v25.8b, v22.8h, #2 // q0'_1
+ rshrn2 v24.16b, v21.8h, #2 // p0'_1
+ rshrn2 v25.16b, v23.8h, #2 // q0'_1
+
+ uabd v17.16b, v5.16b, v7.16b // abs(p2 - p0)
+ uabd v18.16b, v2.16b, v0.16b // abs(q2 - q0)
+ cmhi v17.16b, v31.16b, v17.16b // < beta
+ cmhi v18.16b, v31.16b, v18.16b // < beta
+
+ and v17.16b, v16.16b, v17.16b // if_2 && if_3
+ and v18.16b, v16.16b, v18.16b // if_2 && if_4
+
+ not v30.16b, v17.16b
+ not v31.16b, v18.16b
+
+ and v30.16b, v30.16b, v19.16b // if_1 && !(if_2 && if_3)
+ and v31.16b, v31.16b, v19.16b // if_1 && !(if_2 && if_4)
+
+ and v17.16b, v19.16b, v17.16b // if_1 && if_2 && if_3
+ and v18.16b, v19.16b, v18.16b // if_1 && if_2 && if_4
+
+ //calc p, v7, v6, v5, v4, v17, v7, v6, v5, v4
+ uaddl v26.8h, v5.8b, v7.8b
+ uaddl2 v27.8h, v5.16b, v7.16b
+ uaddw v26.8h, v26.8h, v0.8b
+ uaddw2 v27.8h, v27.8h, v0.16b
+ add v20.8h, v20.8h, v26.8h
+ add v21.8h, v21.8h, v27.8h
+ uaddw v20.8h, v20.8h, v0.8b
+ uaddw2 v21.8h, v21.8h, v0.16b
+ rshrn v20.8b, v20.8h, #3 // p0'_2
+ rshrn2 v20.16b, v21.8h, #3 // p0'_2
+ uaddw v26.8h, v26.8h, v6.8b
+ uaddw2 v27.8h, v27.8h, v6.16b
+ rshrn v21.8b, v26.8h, #2 // p1'_2
+ rshrn2 v21.16b, v27.8h, #2 // p1'_2
+ uaddl v28.8h, v4.8b, v5.8b
+ uaddl2 v29.8h, v4.16b, v5.16b
+ shl v28.8h, v28.8h, #1
+ shl v29.8h, v29.8h, #1
+ add v28.8h, v28.8h, v26.8h
+ add v29.8h, v29.8h, v27.8h
+ rshrn v19.8b, v28.8h, #3 // p2'_2
+ rshrn2 v19.16b, v29.8h, #3 // p2'_2
+
+ //calc q, v0, v1, v2, v3, v18, v0, v1, v2, v3
+ uaddl v26.8h, v2.8b, v0.8b
+ uaddl2 v27.8h, v2.16b, v0.16b
+ uaddw v26.8h, v26.8h, v7.8b
+ uaddw2 v27.8h, v27.8h, v7.16b
+ add v22.8h, v22.8h, v26.8h
+ add v23.8h, v23.8h, v27.8h
+ uaddw v22.8h, v22.8h, v7.8b
+ uaddw2 v23.8h, v23.8h, v7.16b
+ rshrn v22.8b, v22.8h, #3 // q0'_2
+ rshrn2 v22.16b, v23.8h, #3 // q0'_2
+ uaddw v26.8h, v26.8h, v1.8b
+ uaddw2 v27.8h, v27.8h, v1.16b
+ rshrn v23.8b, v26.8h, #2 // q1'_2
+ rshrn2 v23.16b, v27.8h, #2 // q1'_2
+ uaddl v28.8h, v2.8b, v3.8b
+ uaddl2 v29.8h, v2.16b, v3.16b
+ shl v28.8h, v28.8h, #1
+ shl v29.8h, v29.8h, #1
+ add v28.8h, v28.8h, v26.8h
+ add v29.8h, v29.8h, v27.8h
+ rshrn v26.8b, v28.8h, #3 // q2'_2
+ rshrn2 v26.16b, v29.8h, #3 // q2'_2
+
+ bit v7.16b, v24.16b, v30.16b // p0'_1
+ bit v0.16b, v25.16b, v31.16b // q0'_1
+ bit v7.16b, v20.16b, v17.16b // p0'_2
+ bit v6.16b, v21.16b, v17.16b // p1'_2
+ bit v5.16b, v19.16b, v17.16b // p2'_2
+ bit v0.16b, v22.16b, v18.16b // q0'_2
+ bit v1.16b, v23.16b, v18.16b // q1'_2
+ bit v2.16b, v26.16b, v18.16b // q2'_2
+.endm
+
+function x264_deblock_v_luma_intra_neon, export=1
+ h264_loop_filter_start_intra
+
+ ld1 {v0.16b}, [x0], x1 // q0
+ ld1 {v1.16b}, [x0], x1 // q1
+ ld1 {v2.16b}, [x0], x1 // q2
+ ld1 {v3.16b}, [x0], x1 // q3
+ sub x0, x0, x1, lsl #3
+ ld1 {v4.16b}, [x0], x1 // p3
+ ld1 {v5.16b}, [x0], x1 // p2
+ ld1 {v6.16b}, [x0], x1 // p1
+ ld1 {v7.16b}, [x0] // p0
+
+ h264_loop_filter_luma_intra
+
+ sub x0, x0, x1, lsl #1
+ st1 {v5.16b}, [x0], x1 // p2
+ st1 {v6.16b}, [x0], x1 // p1
+ st1 {v7.16b}, [x0], x1 // p0
+ st1 {v0.16b}, [x0], x1 // q0
+ st1 {v1.16b}, [x0], x1 // q1
+ st1 {v2.16b}, [x0] // q2
+9:
+ ret
+endfunc
+
+function x264_deblock_h_luma_intra_neon, export=1
+ h264_loop_filter_start_intra
+
+ sub x0, x0, #4
+ ld1 {v4.8b}, [x0], x1
+ ld1 {v5.8b}, [x0], x1
+ ld1 {v6.8b}, [x0], x1
+ ld1 {v7.8b}, [x0], x1
+ ld1 {v0.8b}, [x0], x1
+ ld1 {v1.8b}, [x0], x1
+ ld1 {v2.8b}, [x0], x1
+ ld1 {v3.8b}, [x0], x1
+ ld1 {v4.d}[1], [x0], x1
+ ld1 {v5.d}[1], [x0], x1
+ ld1 {v6.d}[1], [x0], x1
+ ld1 {v7.d}[1], [x0], x1
+ ld1 {v0.d}[1], [x0], x1
+ ld1 {v1.d}[1], [x0], x1
+ ld1 {v2.d}[1], [x0], x1
+ ld1 {v3.d}[1], [x0], x1
+
+ transpose_8x16.b v4, v5, v6, v7, v0, v1, v2, v3, v21, v23
+
+ h264_loop_filter_luma_intra
+
+ transpose_8x16.b v4, v5, v6, v7, v0, v1, v2, v3, v21, v23
+
+ sub x0, x0, x1, lsl #4
+ st1 {v4.8b}, [x0], x1
+ st1 {v5.8b}, [x0], x1
+ st1 {v6.8b}, [x0], x1
+ st1 {v7.8b}, [x0], x1
+ st1 {v0.8b}, [x0], x1
+ st1 {v1.8b}, [x0], x1
+ st1 {v2.8b}, [x0], x1
+ st1 {v3.8b}, [x0], x1
+ st1 {v4.d}[1], [x0], x1
+ st1 {v5.d}[1], [x0], x1
+ st1 {v6.d}[1], [x0], x1
+ st1 {v7.d}[1], [x0], x1
+ st1 {v0.d}[1], [x0], x1
+ st1 {v1.d}[1], [x0], x1
+ st1 {v2.d}[1], [x0], x1
+ st1 {v3.d}[1], [x0], x1
+9:
+ ret
+endfunc
+
.macro h264_loop_filter_chroma
dup v22.16b, w2 // alpha
uxtl v24.8h, v24.8b
@@ -342,16 +538,6 @@ function x264_deblock_h_chroma_mbaff_neon, export=1
ret
endfunc
- .macro h264_loop_filter_start_intra
- orr w4, w2, w3
- cmp w4, #0
- b.ne 1f
- ret
-1:
- dup v30.16b, w2 // alpha
- dup v31.16b, w3 // beta
-.endm
-
.macro h264_loop_filter_chroma_intra, width=16
uabd v26.16b, v16.16b, v17.16b // abs(p0 - q0)
uabd v27.16b, v18.16b, v16.16b // abs(p1 - p0)
diff --git a/common/deblock.c b/common/deblock.c
index fc56ee6..c4d29d5 100644
--- a/common/deblock.c
+++ b/common/deblock.c
@@ -744,6 +744,8 @@ void x264_deblock_h_chroma_intra_mbaff_neon( uint8_t *pix, intptr_t stride, int
void x264_deblock_h_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta);
void x264_deblock_h_chroma_422_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta);
void x264_deblock_v_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta);
+void x264_deblock_h_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta);
+void x264_deblock_v_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta);
#endif
#endif
@@ -860,6 +862,8 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_neon;
pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_neon;
pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_neon;
+ pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_neon;
+ pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_neon;
#endif
pf->deblock_strength = x264_deblock_strength_neon;
}
--
2.1.3
More information about the x264-devel
mailing list