[x264-devel] aarch64: x264_deblock_h_chroma_mbaff_neon

Janne Grunau git at videolan.org
Sat Dec 20 21:10:46 CET 2014


x264 | branch: master | Janne Grunau <janne-x264 at jannau.net> | Mon Oct 13 12:43:50 2014 +0200| [44cb1dcdbdaafeddd98d2ebe3d02408bc380713e] | committer: Anton Mitrofanov

aarch64: x264_deblock_h_chroma_mbaff_neon

deblock_chroma_420_mbaff_neon is 2 times faster.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=44cb1dcdbdaafeddd98d2ebe3d02408bc380713e
---

 common/aarch64/deblock-a.S |   54 ++++++++++++++++++++++++++++++++++++++++++++
 common/deblock.c           |    2 ++
 2 files changed, 56 insertions(+)

diff --git a/common/aarch64/deblock-a.S b/common/aarch64/deblock-a.S
index 9bcd6ad..9618665 100644
--- a/common/aarch64/deblock-a.S
+++ b/common/aarch64/deblock-a.S
@@ -275,6 +275,60 @@ function x264_deblock_h_chroma_neon, export=1
     ret
 endfunc
 
+.macro h264_loop_filter_chroma8                 // clipped p0/q0 edge filter on 8 byte lanes. in: v18=p1 v16=p0 v17=q0 v19=q1, w2=alpha, w3=beta, v24=clip limit (presumably tc set up by h264_loop_filter_start - confirm). out: v16/v17. clobbers v4, v22, v24-v26, v28, v30
+    dup             v22.8b,  w2                 // v22 = alpha broadcast to every byte lane
+    uxtl            v24.8h,  v24.8b             // zero-extend the low 8 bytes of v24 to 16-bit lanes
+    uabd            v26.8b,  v16.8b,  v17.8b    // v26 = abs(p0 - q0)
+    uxtl            v4.8h,   v17.8b             // v4 = q0 widened to 16 bit
+    uabd            v28.8b,  v18.8b,  v16.8b    // v28 = abs(p1 - p0)
+    usubw           v4.8h,   v4.8h,   v16.8b    // v4 = q0 - p0
+    sli             v24.8h,  v24.8h,  #8        // copy each lane's low byte into its high byte: every clip value now fills two adjacent byte lanes
+    shl             v4.8h,   v4.8h,   #2        // v4 = (q0 - p0) * 4
+    uabd            v30.8b,  v19.8b,  v17.8b    // v30 = abs(q1 - q0)
+    uaddw           v4.8h,   v4.8h,   v18.8b    // v4 += p1
+    cmhi            v26.8b,  v22.8b,  v26.8b    // v26 = all-ones where abs(p0 - q0) < alpha
+    usubw           v4.8h,   v4.8h,   v19.8b    // v4 -= q1
+    dup             v22.8b,  w3                 // v22 = beta broadcast (alpha no longer needed)
+    rshrn           v4.8b,   v4.8h,   #3        // delta = (v4 + 4) >> 3, narrowed to 8 bit without saturation
+    cmhi            v28.8b,  v22.8b,  v28.8b    // v28 = all-ones where abs(p1 - p0) < beta
+    cmhi            v30.8b,  v22.8b,  v30.8b    // v30 = all-ones where abs(q1 - q0) < beta
+    smin            v4.8b,   v4.8b,   v24.8b    // delta = min(delta, clip)
+    neg             v25.8b,  v24.8b             // v25 = -clip
+    and             v26.8b,  v26.8b,  v28.8b    // combine alpha test with first beta test
+    smax            v4.8b,   v4.8b,   v25.8b    // delta = max(delta, -clip)
+    and             v26.8b,  v26.8b,  v30.8b    // v26 = final mask: all three threshold tests passed
+    uxtl            v22.8h,  v17.8b             // v22 = q0 widened to 16 bit (beta no longer needed)
+    and             v4.8b,   v4.8b,   v26.8b    // zero delta in lanes that must stay unfiltered
+    uxtl            v28.8h,  v16.8b             // v28 = p0 widened to 16 bit
+    saddw           v28.8h,  v28.8h,  v4.8b     // v28 = p0 + delta (delta sign-extended)
+    ssubw           v22.8h,  v22.8h,  v4.8b     // v22 = q0 - delta
+    sqxtun          v16.8b,  v28.8h             // p0' = saturate to unsigned 8 bit
+    sqxtun          v17.8b,  v22.8h             // q0' = saturate to unsigned 8 bit
+.endm
+
+function x264_deblock_h_chroma_mbaff_neon, export=1    // C: void f(uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0) -> x0, x1, w2, w3, x4
+    h264_loop_filter_start                      // macro defined outside this hunk; presumably loads tc0 (x4) into v24 and may return early - confirm
+
+    sub             x4,  x0,  #4                // x4 = load pointer, 4 bytes left of pix (incoming x4 is overwritten here)
+    sub             x0,  x0,  #2                // x0 = store pointer, 2 bytes left of pix
+
+    ld1             {v18.8b}, [x4], x1          // row 0: 8 bytes, then advance by stride
+    ld1             {v16.8b}, [x4], x1          // row 1
+    ld1             {v17.8b},  [x4], x1         // row 2
+    ld1             {v19.8b},  [x4]             // row 3 (no post-increment needed)
+
+    transpose4x4.h  v18, v16, v17, v19, v28, v29, v30, v31  // macro defined elsewhere; presumably transposes 16-bit elements (v28-v31 scratch) so v18=p1 v16=p0 v17=q0 v19=q1 - confirm
+
+    h264_loop_filter_chroma8                    // filter: new p0 in v16, new q0 in v17
+
+    st2             {v16.h,v17.h}[0], [x0], x1  // row 0: write 16-bit lane 0 of p0 and q0 interleaved (4 bytes), advance by stride
+    st2             {v16.h,v17.h}[1], [x0], x1  // row 1
+    st2             {v16.h,v17.h}[2], [x0], x1  // row 2
+    st2             {v16.h,v17.h}[3], [x0]      // row 3
+
+    ret
+endfunc
+
 .macro h264_loop_filter_start_intra
     orr             w4,  w2,  w3
     cmp             w4,  #0
diff --git a/common/deblock.c b/common/deblock.c
index 101d0bb..b0b8d2b 100644
--- a/common/deblock.c
+++ b/common/deblock.c
@@ -738,6 +738,7 @@ void x264_deblock_strength_neon( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X26
                                  int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
                                  int mvy_limit, int bframe );
 #if ARCH_AARCH64
+void x264_deblock_h_chroma_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
 void x264_deblock_h_chroma_intra_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
 void x264_deblock_h_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
 void x264_deblock_h_chroma_422_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
@@ -852,6 +853,7 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
         pf->deblock_chroma[1] = x264_deblock_v_chroma_neon;
         pf->deblock_h_chroma_420 = x264_deblock_h_chroma_neon;
 #if ARCH_AARCH64
+        pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_neon;
         pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_neon;
         pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_neon;
         pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_neon;



More information about the x264-devel mailing list