[x264-devel] [PATCH 16/24] arm: Implement x264_deblock_h_chroma_422_neon

Martin Storsjö martin at martin.st
Thu Aug 13 22:59:37 CEST 2015


checkasm timing       Cortex-A7      A8     A9
deblock_h_chroma_422_c       6928    6194   5172
deblock_h_chroma_422_neon    3697    2720   2641
---
 common/arm/deblock-a.S |   19 +++++++++++++++++++
 common/deblock.c       |    4 ++--
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/common/arm/deblock-a.S b/common/arm/deblock-a.S
index 446e678..26e95ed 100644
--- a/common/arm/deblock-a.S
+++ b/common/arm/deblock-a.S
@@ -4,6 +4,7 @@
  * Copyright (C) 2009-2015 x264 project
  *
  * Authors: Mans Rullgard <mans at mansr.com>
+ *          Martin Storsjo <martin at martin.st>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -261,6 +262,7 @@ function x264_deblock_h_chroma_neon
     h264_loop_filter_start
 
     sub             r0,  r0,  #4
+deblock_h_chroma:
     vld1.8          {d18}, [r0], r1
     vld1.8          {d16}, [r0], r1
     vld1.8          {d0},  [r0], r1
@@ -290,6 +292,23 @@ function x264_deblock_h_chroma_neon
     bx              lr
 endfunc
 
+// void x264_deblock_h_chroma_422_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 )
+function x264_deblock_h_chroma_422_neon     // two deblock_h_chroma passes at doubled stride, second one offset by one row
+    h264_loop_filter_start                  // loads tc0 via [sp]; NOTE(review): presumably returns to our caller when no tc0 is >= 0 - confirm in macro
+    push            {lr}                    // bl below overwrites lr; the tc0 pointer argument is now at [sp, #4]
+    sub             r0,  r0,  #4            // same pre-adjustment x264_deblock_h_chroma_neon makes before deblock_h_chroma
+    add             r1,  r1,  r1            // r1 = 2 * stride, so each pass visits every other row
+    bl              deblock_h_chroma        // first pass, starting at the row pix points to
+    ldr             ip,  [sp, #4]           // ip = tc0 pointer (stack argument, shifted by the push above)
+    ldr             ip,  [ip]               // ip = the four tc0 bytes
+    vdup.32         d24, ip                 // d24 = tc0 again; assumes the first pass did not preserve it
+    sub             r0,  r0,  r1, lsl #3    // rewind 8 * r1; assumes the pass advanced r0 by 8 doubled-stride rows
+    add             r0,  r0,  r1, lsr #1    // step down one original row (r1 / 2 == stride)
+    sub             r0,  r0,  #2            // presumably the pass leaves r0 at column pix - 2; restore pix - 4
+    pop             {lr}                    // restore the caller's return address
+    b               deblock_h_chroma        // tail call: second pass returns straight to our caller
+endfunc
+
 function x264_deblock_strength_neon
     ldr             ip,  [sp]
     vmov.i8         q8,  #0
diff --git a/common/deblock.c b/common/deblock.c
index 374e293..83bda62 100644
--- a/common/deblock.c
+++ b/common/deblock.c
@@ -739,8 +739,8 @@ void x264_deblock_h_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int b
 void x264_deblock_strength_neon( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
                                  int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
                                  int mvy_limit, int bframe );
-#if ARCH_AARCH64
 void x264_deblock_h_chroma_422_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
+#if ARCH_AARCH64
 void x264_deblock_h_chroma_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
 void x264_deblock_h_chroma_intra_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
 void x264_deblock_h_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
@@ -873,11 +873,11 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
         pf->deblock_luma[0] = x264_deblock_h_luma_neon;
         pf->deblock_chroma[1] = x264_deblock_v_chroma_neon;
         pf->deblock_h_chroma_420 = x264_deblock_h_chroma_neon;
+        pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_neon;
 #if ARCH_AARCH64
         pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_neon;
         pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_neon;
         pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_neon;
-        pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_neon;
         pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_neon;
         pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_neon;
         pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_neon;
-- 
1.7.10.4



More information about the x264-devel mailing list