[x264-devel] aarch64: x264_deblock_h_chroma_mbaff_neon
Janne Grunau
git at videolan.org
Sat Dec 20 21:10:46 CET 2014
x264 | branch: master | Janne Grunau <janne-x264 at jannau.net> | Mon Oct 13 12:43:50 2014 +0200| [44cb1dcdbdaafeddd98d2ebe3d02408bc380713e] | committer: Anton Mitrofanov
aarch64: x264_deblock_h_chroma_mbaff_neon
deblock_chroma_420_mbaff_neon 2 times faster
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=44cb1dcdbdaafeddd98d2ebe3d02408bc380713e
---
common/aarch64/deblock-a.S | 54 ++++++++++++++++++++++++++++++++++++++++++++
common/deblock.c | 2 ++
2 files changed, 56 insertions(+)
diff --git a/common/aarch64/deblock-a.S b/common/aarch64/deblock-a.S
index 9bcd6ad..9618665 100644
--- a/common/aarch64/deblock-a.S
+++ b/common/aarch64/deblock-a.S
@@ -275,6 +275,60 @@ function x264_deblock_h_chroma_neon, export=1
ret
endfunc
+.macro h264_loop_filter_chroma8 // normal-strength chroma edge filter on 8 byte lanes; in: v18=p1 v16=p0 v17=q0 v19=q1, w2=alpha, w3=beta, v24=clip bound; out: v16=p0', v17=q0'
+ dup v22.8b, w2 // alpha broadcast to all 8 byte lanes
+ uxtl v24.8h, v24.8b // zero-extend the clip bytes in v24 to 16 bit (v24 is set before this macro; presumably tc from h264_loop_filter_start - confirm)
+ uabd v26.8b, v16.8b, v17.8b // abs(p0 - q0)
+ uxtl v4.8h, v17.8b // v4 = q0 widened to 16 bit
+ uabd v28.8b, v18.8b, v16.8b // abs(p1 - p0)
+ usubw v4.8h, v4.8h, v16.8b // v4 = q0 - p0
+ sli v24.8h, v24.8h, #8 // copy each low byte into the high byte of its halfword: every clip value now covers 2 adjacent bytes (presumably one interleaved U/V pair - confirm)
+ shl v4.8h, v4.8h, #2 // v4 = 4 * (q0 - p0)
+ uabd v30.8b, v19.8b, v17.8b // abs(q1 - q0)
+ uaddw v4.8h, v4.8h, v18.8b // v4 += p1
+ cmhi v26.8b, v22.8b, v26.8b // < alpha (lane mask: alpha > abs(p0 - q0), unsigned)
+ usubw v4.8h, v4.8h, v19.8b // v4 -= q1
+ dup v22.8b, w3 // beta (v22 reused; alpha is no longer needed)
+ rshrn v4.8b, v4.8h, #3 // delta = (4*(q0 - p0) + p1 - q1 + 4) >> 3, narrowed to bytes
+ cmhi v28.8b, v22.8b, v28.8b // < beta (lane mask: beta > abs(p1 - p0))
+ cmhi v30.8b, v22.8b, v30.8b // < beta (lane mask: beta > abs(q1 - q0))
+ smin v4.8b, v4.8b, v24.8b // clamp delta from above to +v24 (signed)
+ neg v25.8b, v24.8b // v25 = -v24, the lower clamp bound
+ and v26.8b, v26.8b, v28.8b // combine alpha mask with the p-side beta mask
+ smax v4.8b, v4.8b, v25.8b // clamp delta from below to -v24 (signed)
+ and v26.8b, v26.8b, v30.8b // v26 = final mask: all three threshold tests passed
+ uxtl v22.8h, v17.8b // v22 = q0 widened to 16 bit (v22 reused again)
+ and v4.8b, v4.8b, v26.8b // zero delta in lanes that must not be filtered
+ uxtl v28.8h, v16.8b // v28 = p0 widened to 16 bit
+ saddw v28.8h, v28.8h, v4.8b // p0 + delta (delta sign-extended)
+ ssubw v22.8h, v22.8h, v4.8b // q0 - delta
+ sqxtun v16.8b, v28.8h // p0' = result saturated to unsigned 8 bit
+ sqxtun v17.8b, v22.8h // q0' = result saturated to unsigned 8 bit
+.endm
+
+function x264_deblock_h_chroma_mbaff_neon, export=1 // per the C prototype in common/deblock.c: x0=pix, x1=stride, w2=alpha, w3=beta, x4=tc0; filters 4 rows across the edge at pix
+ h264_loop_filter_start // macro defined outside this hunk; x4 (tc0) is overwritten just below, so tc0 must be consumed here (presumably loaded into v24, with an early exit when nothing needs filtering - confirm)
+
+ sub x4, x0, #4 // x4 = load pointer, 4 bytes before pix
+ sub x0, x0, #2 // x0 = store pointer, 2 bytes before pix
+
+ ld1 {v18.8b}, [x4], x1 // row 0: 8 bytes from pix-4, then advance by stride
+ ld1 {v16.8b}, [x4], x1 // row 1
+ ld1 {v17.8b}, [x4], x1 // row 2
+ ld1 {v19.8b}, [x4] // row 3 (last row, no pointer advance)
+
+ transpose4x4.h v18, v16, v17, v19, v28, v29, v30, v31 // macro defined outside this hunk; presumably a 4x4 transpose of 16-bit elements with v28-v31 as scratch, leaving v18=p1 v16=p0 v17=q0 v19=q1 as the filter macro expects - confirm
+
+ h264_loop_filter_chroma8 // updates v16 (p0) and v17 (q0) in place
+
+ st2 {v16.h,v17.h}[0], [x0], x1 // row 0: store halfword lane 0 of v16 then v17 (4 bytes) at pix-2, then advance by stride
+ st2 {v16.h,v17.h}[1], [x0], x1 // row 1
+ st2 {v16.h,v17.h}[2], [x0], x1 // row 2
+ st2 {v16.h,v17.h}[3], [x0] // row 3 (no pointer advance)
+
+ ret
+endfunc
+
.macro h264_loop_filter_start_intra
orr w4, w2, w3
cmp w4, #0
diff --git a/common/deblock.c b/common/deblock.c
index 101d0bb..b0b8d2b 100644
--- a/common/deblock.c
+++ b/common/deblock.c
@@ -738,6 +738,7 @@ void x264_deblock_strength_neon( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X26
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
int mvy_limit, int bframe );
#if ARCH_AARCH64
+void x264_deblock_h_chroma_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_intra_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_h_chroma_422_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
@@ -852,6 +853,7 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
pf->deblock_chroma[1] = x264_deblock_v_chroma_neon;
pf->deblock_h_chroma_420 = x264_deblock_h_chroma_neon;
#if ARCH_AARCH64
+ pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_neon;
pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_neon;
pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_neon;
pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_neon;
More information about the x264-devel mailing list