[x264-devel] [PATCH 06/11] arm: Implement x264_deblock_h_chroma_422_neon
Martin Storsjö
martin at martin.st
Tue Aug 25 13:38:15 CEST 2015
checkasm timing             Cortex-A7    A8      A9
deblock_h_chroma_422_c      6953         6269    5145
deblock_h_chroma_422_neon   3905         2569    2551
---
Applied Janne's comments, calling h264_loop_filter_start
manually to make sure the early exit skips the whole
function, restoring the stack before the second round
to allow returning directly from there.
---
common/arm/deblock-a.S | 18 ++++++++++++++++++
common/deblock.c | 4 ++--
2 files changed, 20 insertions(+), 2 deletions(-)
diff --git a/common/arm/deblock-a.S b/common/arm/deblock-a.S
index 446e678..a300220 100644
--- a/common/arm/deblock-a.S
+++ b/common/arm/deblock-a.S
@@ -4,6 +4,7 @@
* Copyright (C) 2009-2015 x264 project
*
* Authors: Mans Rullgard <mans at mansr.com>
+ * Martin Storsjo <martin at martin.st>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -261,6 +262,7 @@ function x264_deblock_h_chroma_neon
h264_loop_filter_start
sub r0, r0, #4
+deblock_h_chroma:
vld1.8 {d18}, [r0], r1
vld1.8 {d16}, [r0], r1
vld1.8 {d0}, [r0], r1
@@ -290,6 +292,22 @@ function x264_deblock_h_chroma_neon
bx lr
endfunc
+function x264_deblock_h_chroma_422_neon    @ 4:2:2 variant: runs the shared deblock_h_chroma body twice
+ h264_loop_filter_start    @ invoked here rather than inside the shared body so its early exit skips both passes
+ push {lr}    @ save the return address; the bl below overwrites lr
+ sub r0, r0, #4    @ same pointer pre-adjustment x264_deblock_h_chroma_neon applies before deblock_h_chroma
+ add r1, r1, r1    @ r1 = 2 * stride, used by both passes
+ bl deblock_h_chroma    @ first pass; control returns here
+ ldr ip, [sp, #4]    @ first stack argument, now at sp+4 because of the push (presumably the tc0 pointer -- confirm)
+ ldr ip, [ip]    @ load 32 bits through that pointer
+ vdup.32 d24, ip    @ replicate the word into both lanes of d24 (presumably clobbered by the first pass -- confirm)
+ sub r0, r0, r1, lsl #3    @ r0 -= 8 * r1
+ add r0, r0, r1, lsr #1    @ r0 += r1 / 2, i.e. one original (undoubled) stride
+ sub r0, r0, #2    @ NOTE(review): assumes the first pass left r0 two bytes past its row start -- confirm
+ pop {lr}    @ restore lr and the stack before the second pass
+ b deblock_h_chroma    @ tail branch: the second pass returns directly to our caller
+endfunc
+
function x264_deblock_strength_neon
ldr ip, [sp]
vmov.i8 q8, #0
diff --git a/common/deblock.c b/common/deblock.c
index 374e293..83bda62 100644
--- a/common/deblock.c
+++ b/common/deblock.c
@@ -739,8 +739,8 @@ void x264_deblock_h_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int b
void x264_deblock_strength_neon( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4],
int mvy_limit, int bframe );
-#if ARCH_AARCH64
void x264_deblock_h_chroma_422_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
+#if ARCH_AARCH64
void x264_deblock_h_chroma_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_chroma_intra_mbaff_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
void x264_deblock_h_chroma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
@@ -873,11 +873,11 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
pf->deblock_luma[0] = x264_deblock_h_luma_neon;
pf->deblock_chroma[1] = x264_deblock_v_chroma_neon;
pf->deblock_h_chroma_420 = x264_deblock_h_chroma_neon;
+ pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_neon;
#if ARCH_AARCH64
pf->deblock_chroma_420_mbaff = x264_deblock_h_chroma_mbaff_neon;
pf->deblock_chroma_420_intra_mbaff = x264_deblock_h_chroma_intra_mbaff_neon;
pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_neon;
- pf->deblock_h_chroma_422 = x264_deblock_h_chroma_422_neon;
pf->deblock_h_chroma_422_intra = x264_deblock_h_chroma_422_intra_neon;
pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_neon;
pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_neon;
--
1.7.10.4
More information about the x264-devel mailing list