[x264-devel] [PATCH 9/9] aarch64: deblocking NEON asm
Janne Grunau
janne-x264 at jannau.net
Sat Jul 19 20:57:55 CEST 2014
The deblock luma/chroma functions are based on libav's h264 AArch64 NEON
deblocking filter, which I ported from the existing ARM NEON asm. There are
no additional persons to ask for a relicense.
---
Makefile | 1 +
common/aarch64/asm.S | 61 +++++++
common/aarch64/deblock-a.S | 392 +++++++++++++++++++++++++++++++++++++++++++++
common/deblock.c | 4 +-
4 files changed, 456 insertions(+), 2 deletions(-)
create mode 100644 common/aarch64/deblock-a.S
diff --git a/Makefile b/Makefile
index 397b54d..171b46d 100644
--- a/Makefile
+++ b/Makefile
@@ -127,6 +127,7 @@ endif
ifeq ($(ARCH),AARCH64)
ifneq ($(AS),)
ASMSRC += common/aarch64/dct-a.S \
+ common/aarch64/deblock-a.S \
common/aarch64/mc-a.S \
common/aarch64/pixel-a.S \
common/aarch64/predict-a.S \
diff --git a/common/aarch64/asm.S b/common/aarch64/asm.S
index 5e5aca9..0d0901c 100644
--- a/common/aarch64/asm.S
+++ b/common/aarch64/asm.S
@@ -100,6 +100,11 @@ ELF .size \name, . - \name
sub \sub, \a, \b
.endm
+// De-interleave: \t1 = even-indexed elements of the pair \s1:\s2,
+// \t2 = odd-indexed elements (uzp1/uzp2). Element size is taken from
+// the register suffixes passed by the caller.
+.macro unzip t1, t2, s1, s2
+ uzp1 \t1, \s1, \s2
+ uzp2 \t2, \s1, \s2
+.endm
+
.macro transpose t1, t2, s1, s2
trn1 \t1, \s1, \s2
trn2 \t2, \s1, \s2
@@ -151,3 +156,59 @@ ELF .size \name, . - \name
trn1 \r3\().2D, \r9\().2D, \r7\().2D
trn2 \r7\().2D, \r9\().2D, \r7\().2D
.endm
+
+// Byte-granularity transpose of 8 rows of 16 bytes (two 8x8 blocks),
+// done as trn passes at 8-bit, 16-bit and 32-bit element width.
+// \t0/\t1 are scratch. NOTE(review): the results end up redistributed
+// across \r0-\r7 in a specific order that callers depend on (see
+// deblock_h_luma) — do not reorder these instructions.
+.macro transpose_8x16.b r0, r1, r2, r3, r4, r5, r6, r7, t0, t1
+ trn1 \t0\().16b, \r0\().16b, \r1\().16b
+ trn2 \t1\().16b, \r0\().16b, \r1\().16b
+ trn1 \r1\().16b, \r2\().16b, \r3\().16b
+ trn2 \r3\().16b, \r2\().16b, \r3\().16b
+ trn1 \r0\().16b, \r4\().16b, \r5\().16b
+ trn2 \r5\().16b, \r4\().16b, \r5\().16b
+ trn1 \r2\().16b, \r6\().16b, \r7\().16b
+ trn2 \r7\().16b, \r6\().16b, \r7\().16b
+
+// second pass: interleave 16-bit pairs
+ trn1 \r4\().8h, \r0\().8h, \r2\().8h
+ trn2 \r2\().8h, \r0\().8h, \r2\().8h
+ trn1 \r6\().8h, \r5\().8h, \r7\().8h
+ trn2 \r7\().8h, \r5\().8h, \r7\().8h
+ trn1 \r5\().8h, \t1\().8h, \r3\().8h
+ trn2 \t1\().8h, \t1\().8h, \r3\().8h
+ trn1 \r3\().8h, \t0\().8h, \r1\().8h
+ trn2 \t0\().8h, \t0\().8h, \r1\().8h
+
+// final pass: interleave 32-bit quads
+ trn1 \r0\().4s, \r3\().4s, \r4\().4s
+ trn2 \r4\().4s, \r3\().4s, \r4\().4s
+
+ trn1 \r1\().4s, \r5\().4s, \r6\().4s
+ trn2 \r5\().4s, \r5\().4s, \r6\().4s
+
+ trn2 \r6\().4s, \t0\().4s, \r2\().4s
+ trn1 \r2\().4s, \t0\().4s, \r2\().4s
+
+ trn1 \r3\().4s, \t1\().4s, \r7\().4s
+ trn2 \r7\().4s, \t1\().4s, \r7\().4s
+.endm
+
+// Transpose 4 rows of 16 bytes within each 4x4 sub-block (trn at 8-bit
+// then 16-bit width). \t4-\t7 are scratch. After this, each 32-bit lane
+// of \r0-\r3 holds one transposed 4-byte column group, which is what the
+// per-row .s[n] stores in deblock_h_luma rely on.
+.macro transpose_4x16.b r0, r1, r2, r3, t4, t5, t6, t7
+ trn1 \t4\().16b, \r0\().16b, \r1\().16b
+ trn2 \t5\().16b, \r0\().16b, \r1\().16b
+ trn1 \t6\().16b, \r2\().16b, \r3\().16b
+ trn2 \t7\().16b, \r2\().16b, \r3\().16b
+
+ trn1 \r0\().8h, \t4\().8h, \t6\().8h
+ trn2 \r2\().8h, \t4\().8h, \t6\().8h
+ trn1 \r1\().8h, \t5\().8h, \t7\().8h
+ trn2 \r3\().8h, \t5\().8h, \t7\().8h
+.endm
+
+// 64-bit-register variant of transpose_4x16.b: transposes 4 rows of
+// 8 bytes within each 4x4 sub-block. \t4-\t7 are scratch.
+.macro transpose_4x8.b r0, r1, r2, r3, t4, t5, t6, t7
+ trn1 \t4\().8b, \r0\().8b, \r1\().8b
+ trn2 \t5\().8b, \r0\().8b, \r1\().8b
+ trn1 \t6\().8b, \r2\().8b, \r3\().8b
+ trn2 \t7\().8b, \r2\().8b, \r3\().8b
+
+ trn1 \r0\().4h, \t4\().4h, \t6\().4h
+ trn2 \r2\().4h, \t4\().4h, \t6\().4h
+ trn1 \r1\().4h, \t5\().4h, \t7\().4h
+ trn2 \r3\().4h, \t5\().4h, \t7\().4h
+.endm
diff --git a/common/aarch64/deblock-a.S b/common/aarch64/deblock-a.S
new file mode 100644
index 0000000..00be8e7
--- /dev/null
+++ b/common/aarch64/deblock-a.S
@@ -0,0 +1,392 @@
+/*****************************************************************************
+ * deblock.S: aarch64 deblocking
+ *****************************************************************************
+ * Copyright (C) 2009-2014 x264 project
+ *
+ * Authors: Mans Rullgard <mans at mansr.com>
+ * Janne Grunau <janne-x264 at jannau.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+// Common entry check for all the deblock functions below.
+//   w2 = alpha, w3 = beta, x4 = tc0 (four signed bytes)
+// Loads tc0 into v24.s[0] and returns from the enclosing function when
+// alpha == 0, beta == 0, or all four tc0 bytes are negative (nothing to
+// filter); otherwise falls through at label 2.
+.macro h264_loop_filter_start
+ cmp w2, #0 // alpha == 0?
+ ldr w6, [x4] // all four tc0 bytes at once
+ ccmp w3, #0, #0, ne // ... or beta == 0?
+ mov v24.s[0], w6 // keep tc0 for the filter macro
+ and w6, w6, w6, lsl #16 // AND the tc0 sign bits together:
+ b.eq 1f
+ ands w6, w6, w6, lsl #8 // N set => all four tc0 < 0
+ b.ge 2f
+1:
+ ret // early return from the caller
+2:
+.endm
+
+// Luma edge filter (bS < 4 path) on 16 pixels in parallel.
+// In:  v20=p2 v18=p1 v16=p0 | v0=q0 v2=q1 v4=q2,
+//      w2 = alpha, w3 = beta, v24.s[0] = tc0 (one signed byte per
+//      group of 4 pixels)
+// Out: v17=p1' v16=p0' v0=q0' v19=q1'
+.macro h264_loop_filter_luma
+ dup v22.16b, w2 // alpha
+ uxtl v24.8h, v24.8b // spread tc0 so each byte covers
+ uabd v21.16b, v16.16b, v0.16b // abs(p0 - q0)
+ uxtl v24.4s, v24.4h // its 4 pixels (with the two
+ uabd v28.16b, v18.16b, v16.16b // abs(p1 - p0)
+ sli v24.8h, v24.8h, #8 // sli inserts)
+ uabd v30.16b, v2.16b, v0.16b // abs(q1 - q0)
+ sli v24.4s, v24.4s, #16
+ cmhi v21.16b, v22.16b, v21.16b // < alpha
+ dup v22.16b, w3 // beta
+ cmlt v23.16b, v24.16b, #0 // tc0 < 0 => no filtering
+ cmhi v28.16b, v22.16b, v28.16b // < beta
+ cmhi v30.16b, v22.16b, v30.16b // < beta
+ bic v21.16b, v21.16b, v23.16b // filter mask &= tc0 >= 0
+ uabd v17.16b, v20.16b, v16.16b // abs(p2 - p0)
+ and v21.16b, v21.16b, v28.16b
+ uabd v19.16b, v4.16b, v0.16b // abs(q2 - q0)
+ cmhi v17.16b, v22.16b, v17.16b // < beta
+ and v21.16b, v21.16b, v30.16b
+ cmhi v19.16b, v22.16b, v19.16b // < beta
+ and v17.16b, v17.16b, v21.16b // p1-update condition
+ and v19.16b, v19.16b, v21.16b // q1-update condition
+ and v24.16b, v24.16b, v21.16b // tc0 masked to active pixels
+ urhadd v28.16b, v16.16b, v0.16b // (p0 + q0 + 1) >> 1
+ sub v21.16b, v24.16b, v17.16b // tc += 1 where |p2-p0| < beta
+ uqadd v23.16b, v18.16b, v24.16b // (masks are -1)
+ uhadd v20.16b, v20.16b, v28.16b // p1' candidate
+ sub v21.16b, v21.16b, v19.16b // tc += 1 where |q2-q0| < beta
+ uhadd v28.16b, v4.16b, v28.16b // q1' candidate
+ umin v23.16b, v23.16b, v20.16b // clip p1' to p1 +/- tc0
+ uqsub v22.16b, v18.16b, v24.16b
+ uqadd v4.16b, v2.16b, v24.16b
+ umax v23.16b, v23.16b, v22.16b
+ uqsub v22.16b, v2.16b, v24.16b
+ umin v28.16b, v4.16b, v28.16b // clip q1' to q1 +/- tc0
+ uxtl v4.8h, v0.8b
+ umax v28.16b, v28.16b, v22.16b
+ uxtl2 v20.8h, v0.16b
+ usubw v4.8h, v4.8h, v16.8b // q0 - p0
+ usubw2 v20.8h, v20.8h, v16.16b
+ shl v4.8h, v4.8h, #2 // * 4
+ shl v20.8h, v20.8h, #2
+ uaddw v4.8h, v4.8h, v18.8b // + p1
+ uaddw2 v20.8h, v20.8h, v18.16b
+ usubw v4.8h, v4.8h, v2.8b // - q1
+ usubw2 v20.8h, v20.8h, v2.16b
+ rshrn v4.8b, v4.8h, #3 // delta = (... + 4) >> 3
+ rshrn2 v4.16b, v20.8h, #3
+ bsl v17.16b, v23.16b, v18.16b // p1' where cond, else p1
+ bsl v19.16b, v28.16b, v2.16b // q1' where cond, else q1
+ neg v23.16b, v21.16b
+ uxtl v28.8h, v16.8b
+ smin v4.16b, v4.16b, v21.16b // clamp delta to +/- tc
+ uxtl2 v21.8h, v16.16b
+ smax v4.16b, v4.16b, v23.16b
+ uxtl v22.8h, v0.8b
+ uxtl2 v24.8h, v0.16b
+ saddw v28.8h, v28.8h, v4.8b // p0 + delta
+ saddw2 v21.8h, v21.8h, v4.16b
+ ssubw v22.8h, v22.8h, v4.8b // q0 - delta
+ ssubw2 v24.8h, v24.8h, v4.16b
+ sqxtun v16.8b, v28.8h // p0' with saturation to u8
+ sqxtun2 v16.16b, v21.8h
+ sqxtun v0.8b, v22.8h // q0' with saturation to u8
+ sqxtun2 v0.16b, v24.8h
+.endm
+
+// void x264_deblock_v_luma_neon( uint8_t *pix, intptr_t stride,
+//                                int alpha, int beta, int8_t *tc0 )
+// Filters a horizontal luma edge; pix points at the first q row.
+function x264_deblock_v_luma_neon, export=1
+ h264_loop_filter_start
+
+ ld1 {v0.16b}, [x0], x1 // q0
+ ld1 {v2.16b}, [x0], x1 // q1
+ ld1 {v4.16b}, [x0], x1 // q2
+ sub x0, x0, x1, lsl #2 // rewind 6 rows in total:
+ sub x0, x0, x1, lsl #1 // x0 = pix - 3*stride
+ ld1 {v20.16b}, [x0], x1 // p2
+ ld1 {v18.16b}, [x0], x1 // p1
+ ld1 {v16.16b}, [x0], x1 // p0
+
+ h264_loop_filter_luma
+
+ sub x0, x0, x1, lsl #1 // back to the p1 row
+ st1 {v17.16b}, [x0], x1 // p1'
+ st1 {v16.16b}, [x0], x1 // p0'
+ st1 {v0.16b}, [x0], x1 // q0'
+ st1 {v19.16b}, [x0] // q1'
+
+ ret
+endfunc
+
+// void x264_deblock_h_luma_neon( uint8_t *pix, intptr_t stride,
+//                                int alpha, int beta, int8_t *tc0 )
+// Filters a vertical luma edge: gathers 16 rows x 8 columns starting at
+// pix-4, transposes so columns become filter rows, runs the shared luma
+// filter, then transposes back and stores only the 4 modified columns.
+function x264_deblock_h_luma_neon, export=1
+ h264_loop_filter_start
+
+ sub x0, x0, #4 // read p3..q3 of each row
+ ld1 {v6.8b}, [x0], x1
+ ld1 {v20.8b}, [x0], x1
+ ld1 {v18.8b}, [x0], x1
+ ld1 {v16.8b}, [x0], x1
+ ld1 {v0.8b}, [x0], x1
+ ld1 {v2.8b}, [x0], x1
+ ld1 {v4.8b}, [x0], x1
+ ld1 {v26.8b}, [x0], x1
+ ld1 {v6.d}[1], [x0], x1 // rows 8-15 into the high halves
+ ld1 {v20.d}[1], [x0], x1
+ ld1 {v18.d}[1], [x0], x1
+ ld1 {v16.d}[1], [x0], x1
+ ld1 {v0.d}[1], [x0], x1
+ ld1 {v2.d}[1], [x0], x1
+ ld1 {v4.d}[1], [x0], x1
+ ld1 {v26.d}[1], [x0], x1
+
+// after this: v20=p2 v18=p1 v16=p0 v0=q0 v2=q1 v4=q2 (v6/v26 = p3/q3, unused)
+ transpose_8x16.b v6, v20, v18, v16, v0, v2, v4, v26, v21, v23
+
+ h264_loop_filter_luma
+
+// each .s lane of v17/v16/v0/v19 now holds a p1'..q1' group for 4 rows
+ transpose_4x16.b v17, v16, v0, v19, v21, v23, v25, v27
+
+ sub x0, x0, x1, lsl #4 // back to the first row,
+ add x0, x0, #2 // column of p1'
+ st1 {v17.s}[0], [x0], x1
+ st1 {v16.s}[0], [x0], x1
+ st1 {v0.s}[0], [x0], x1
+ st1 {v19.s}[0], [x0], x1
+ st1 {v17.s}[1], [x0], x1
+ st1 {v16.s}[1], [x0], x1
+ st1 {v0.s}[1], [x0], x1
+ st1 {v19.s}[1], [x0], x1
+ st1 {v17.s}[2], [x0], x1
+ st1 {v16.s}[2], [x0], x1
+ st1 {v0.s}[2], [x0], x1
+ st1 {v19.s}[2], [x0], x1
+ st1 {v17.s}[3], [x0], x1
+ st1 {v16.s}[3], [x0], x1
+ st1 {v0.s}[3], [x0], x1
+ st1 {v19.s}[3], [x0], x1
+
+ ret
+endfunc
+
+// Chroma edge filter (bS < 4 path) on 16 bytes in parallel.
+// In:  v18=p1 v16=p0 | v0=q0 v2=q1, w2 = alpha, w3 = beta,
+//      v24.s[0] = tc0 (one signed byte per group of 4 pixels)
+// Out: v16=p0' v0=q0' (p1/q1 are left unchanged for chroma)
+.macro h264_loop_filter_chroma
+ dup v22.16b, w2 // alpha
+ uxtl v24.8h, v24.8b // spread tc0 across its 4 pixels
+ uabd v26.16b, v16.16b, v0.16b // abs(p0 - q0)
+ uxtl v4.8h, v0.8b
+ uxtl2 v5.8h, v0.16b
+ uabd v28.16b, v18.16b, v16.16b // abs(p1 - p0)
+ usubw v4.8h, v4.8h, v16.8b // q0 - p0
+ usubw2 v5.8h, v5.8h, v16.16b
+ sli v24.8h, v24.8h, #8
+ shl v4.8h, v4.8h, #2 // * 4
+ shl v5.8h, v5.8h, #2
+ uabd v30.16b, v2.16b, v0.16b // abs(q1 - q0)
+ uxtl v24.4s, v24.4h
+ uaddw v4.8h, v4.8h, v18.8b // + p1
+ uaddw2 v5.8h, v5.8h, v18.16b
+ cmhi v26.16b, v22.16b, v26.16b // < alpha
+ usubw v4.8h, v4.8h, v2.8b // - q1
+ usubw2 v5.8h, v5.8h, v2.16b
+ sli v24.4s, v24.4s, #16
+ dup v22.16b, w3 // beta
+ rshrn v4.8b, v4.8h, #3 // delta = (... + 4) >> 3
+ rshrn2 v4.16b, v5.8h, #3
+ cmhi v28.16b, v22.16b, v28.16b // < beta
+ cmhi v30.16b, v22.16b, v30.16b // < beta
+ smin v4.16b, v4.16b, v24.16b // clamp delta to +/- tc
+ neg v25.16b, v24.16b
+ and v26.16b, v26.16b, v28.16b // combine the three conditions
+ smax v4.16b, v4.16b, v25.16b
+ and v26.16b, v26.16b, v30.16b
+ uxtl v22.8h, v0.8b
+ uxtl2 v23.8h, v0.16b
+ and v4.16b, v4.16b, v26.16b // delta = 0 where not filtered
+ uxtl v28.8h, v16.8b
+ uxtl2 v29.8h, v16.16b
+ saddw v28.8h, v28.8h, v4.8b // p0 + delta
+ saddw2 v29.8h, v29.8h, v4.16b
+ ssubw v22.8h, v22.8h, v4.8b // q0 - delta
+ ssubw2 v23.8h, v23.8h, v4.16b
+ sqxtun v16.8b, v28.8h // p0' saturated to u8
+ sqxtun v0.8b, v22.8h // q0' saturated to u8
+ sqxtun2 v16.16b, v29.8h
+ sqxtun2 v0.16b, v23.8h
+.endm
+
+// void x264_deblock_v_chroma_neon( uint8_t *pix, intptr_t stride,
+//                                  int alpha, int beta, int8_t *tc0 )
+// Filters a horizontal chroma edge; pix points at the q0 row.
+function x264_deblock_v_chroma_neon, export=1
+ h264_loop_filter_start
+
+ sub x0, x0, x1, lsl #1 // x0 = pix - 2*stride (p1 row)
+ ld1 {v18.16b}, [x0], x1 // p1
+ ld1 {v16.16b}, [x0], x1 // p0
+ ld1 {v0.16b}, [x0], x1 // q0
+ ld1 {v2.16b}, [x0] // q1
+
+ h264_loop_filter_chroma
+
+ sub x0, x0, x1, lsl #1 // back to the p0 row
+ st1 {v16.16b}, [x0], x1 // p0'
+ st1 {v0.16b}, [x0], x1 // q0'
+
+ ret
+endfunc
+
+// void x264_deblock_h_chroma_neon( uint8_t *pix, intptr_t stride,
+//                                  int alpha, int beta, int8_t *tc0 )
+// Filters a vertical chroma edge: 8 rows of 8 bytes starting at pix-4,
+// transposed at halfword granularity.
+// NOTE(review): relies on transpose4x8.h being defined elsewhere in
+// asm.S (not part of this hunk) — verify it exists before applying.
+function x264_deblock_h_chroma_neon, export=1
+ h264_loop_filter_start
+
+ sub x0, x0, #4
+ ld1 {v18.d}[0], [x0], x1
+ ld1 {v16.d}[0], [x0], x1
+ ld1 {v0.d}[0], [x0], x1
+ ld1 {v2.d}[0], [x0], x1
+ ld1 {v18.d}[1], [x0], x1 // rows 4-7 into the high halves
+ ld1 {v16.d}[1], [x0], x1
+ ld1 {v0.d}[1], [x0], x1
+ ld1 {v2.d}[1], [x0], x1
+
+// after this: v18=p1 v16=p0 v0=q0 v2=q1 as filter rows
+ transpose4x8.h v18, v16, v0, v2, v28, v29, v30, v31
+
+ h264_loop_filter_chroma
+
+ transpose4x8.h v18, v16, v0, v2, v28, v29, v30, v31 // back to row order
+
+ sub x0, x0, x1, lsl #3 // back to the first row (pix-4)
+ st1 {v18.d}[0], [x0], x1
+ st1 {v16.d}[0], [x0], x1
+ st1 {v0.d}[0], [x0], x1
+ st1 {v2.d}[0], [x0], x1
+ st1 {v18.d}[1], [x0], x1
+ st1 {v16.d}[1], [x0], x1
+ st1 {v0.d}[1], [x0], x1
+ st1 {v2.d}[1], [x0], x1
+
+ ret
+endfunc
+
+
+//static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE],
+// int8_t ref[2][X264_SCAN8_LUMA_SIZE],
+// int16_t mv[2][X264_SCAN8_LUMA_SIZE][2],
+// uint8_t bs[2][8][4], int mvy_limit,
+// int bframe )
+function x264_deblock_strength_neon, export=1
+ // x0 = nnz, x1 = ref, x2 = mv, x3 = bs, w4 = mvy_limit, w5 = bframe.
+ // v4/v5 accumulate the "edge needs filtering" conditions for the two
+ // edge directions; the mv/ref pass below runs twice (lists 0 and 1)
+ // when bframe != 0.
+ movi v4.16b, #0
+ lsl w4, w4, #8 // pack per-halfword byte thresholds:
+ add x3, x3, #32
+ sub w4, w4, #(1<<8)-3 // w4 = ((mvy_limit-1)<<8) | 3
+ movi v5.16b, #0
+ dup v6.8h, w4 // so one uqsub.16b tests |mvx|>=4 and |mvy|>=mvy_limit
+ mov x6, #-32
+
+bframe:
+ // load bytes ref
+ add x2, x2, #16
+ ld1 {v31.d}[1], [x1], #8
+ ld1 {v1.16b}, [x1], #16
+ movi v0.16b, #0
+ ld1 {v2.16b}, [x1], #16
+ ext v3.16b, v0.16b, v1.16b, #15 // refs shifted by one byte
+ ext v0.16b, v0.16b, v2.16b, #15
+ unzip v21.4s, v22.4s, v1.4s, v2.4s
+ unzip v23.4s, v20.4s, v3.4s, v0.4s
+ ext v21.16b, v31.16b, v22.16b, #12 // (replaces v21 from the unzip above)
+
+ eor v0.16b, v20.16b, v22.16b // nonzero where refs differ
+ eor v1.16b, v21.16b, v22.16b
+ orr v4.16b, v4.16b, v0.16b
+ orr v5.16b, v5.16b, v1.16b
+
+ ld1 {v21.8h}, [x2], #16 // mv + 0x10
+ ld1 {v19.8h}, [x2], #16 // mv + 0x20
+ ld1 {v22.8h}, [x2], #16 // mv + 0x30
+ ld1 {v18.8h}, [x2], #16 // mv + 0x40
+ ld1 {v23.8h}, [x2], #16 // mv + 0x50
+ ext v19.16b, v19.16b, v22.16b, #12
+ ext v18.16b, v18.16b, v23.16b, #12
+ sabd v0.8h, v22.8h, v19.8h // |mv - neighbour mv| per component
+ ld1 {v19.8h}, [x2], #16 // mv + 0x60
+ sabd v1.8h, v23.8h, v18.8h
+ ld1 {v24.8h}, [x2], #16 // mv + 0x70
+ uqxtn v0.8b, v0.8h // saturating narrow to bytes
+ ld1 {v18.8h}, [x2], #16 // mv + 0x80
+ ld1 {v25.8h}, [x2], #16 // mv + 0x90
+ uqxtn2 v0.16b, v1.8h
+ ext v19.16b, v19.16b, v24.16b, #12
+ ext v18.16b, v18.16b, v25.16b, #12
+ sabd v1.8h, v24.8h, v19.8h
+ sabd v2.8h, v25.8h, v18.8h
+ uqxtn v1.8b, v1.8h
+ uqxtn2 v1.16b, v2.8h
+
+ uqsub v0.16b, v0.16b, v6.16b // byte-wise threshold test (x and y)
+ uqsub v1.16b, v1.16b, v6.16b
+ uqxtn v0.8b, v0.8h // collapse (x,y) byte pairs:
+ uqxtn2 v0.16b, v1.8h // nonzero iff either exceeded
+
+ sabd v1.8h, v22.8h, v23.8h // same test for the other direction
+ orr v4.16b, v4.16b, v0.16b
+
+ sabd v0.8h, v21.8h, v22.8h
+ sabd v2.8h, v23.8h, v24.8h
+ sabd v3.8h, v24.8h, v25.8h
+ uqxtn v0.8b, v0.8h
+ uqxtn2 v0.16b, v1.8h
+ uqxtn v1.8b, v2.8h
+ uqxtn2 v1.16b, v3.8h
+
+ uqsub v0.16b, v0.16b, v6.16b
+ uqsub v1.16b, v1.16b, v6.16b
+ uqxtn v0.8b, v0.8h
+ uqxtn2 v0.16b, v1.8h
+ subs w5, w5, #1
+ orr v5.16b, v5.16b, v0.16b
+ b.eq bframe // second pass (list 1) iff bframe != 0
+
+ movi v6.16b, #1
+ // load bytes nnz
+ ld1 {v31.d}[1], [x0], #8
+ ld1 {v1.16b}, [x0], #16
+ movi v0.16b, #0
+ ld1 {v2.16b}, [x0], #16
+ ext v3.16b, v0.16b, v1.16b, #15
+ ext v0.16b, v0.16b, v2.16b, #15
+ unzip v21.4s, v22.4s, v1.4s, v2.4s
+ unzip v23.4s, v20.4s, v3.4s, v0.4s
+ ext v21.16b, v31.16b, v22.16b, #12
+
+ movrel x7, transpose_table
+ ld1 {v7.16b}, [x7]
+ orr v0.16b, v20.16b, v22.16b // nnz set on either side of the edge?
+ orr v1.16b, v21.16b, v22.16b
+ umin v0.16b, v0.16b, v6.16b
+ umin v1.16b, v1.16b, v6.16b
+ umin v4.16b, v4.16b, v6.16b // mv ? 1 : 0
+ umin v5.16b, v5.16b, v6.16b
+ add v0.16b, v0.16b, v0.16b // nnz ? 2 : 0
+ add v1.16b, v1.16b, v1.16b
+ umax v4.16b, v4.16b, v0.16b // bs = max(nnz cond, mv cond)
+ umax v5.16b, v5.16b, v1.16b
+ tbl v6.16b, {v4.16b}, v7.16b // 4x4 transpose of bs[0]
+ st1 {v5.16b}, [x3], x6 // bs[1]
+ st1 {v6.16b}, [x3] // bs[0]
+ ret
+endfunc
+
+// tbl permutation indices that transpose a 16-byte vector viewed as a
+// 4x4 byte matrix; used to reorder bs[0] before its final store in
+// deblock_strength.
+const transpose_table
+ .byte 0, 4, 8, 12
+ .byte 1, 5, 9, 13
+ .byte 2, 6, 10, 14
+ .byte 3, 7, 11, 15
+endconst
diff --git a/common/deblock.c b/common/deblock.c
index 6b369f2..382eb72 100644
--- a/common/deblock.c
+++ b/common/deblock.c
@@ -729,7 +729,7 @@ void x264_deblock_v_luma_altivec( uint8_t *pix, intptr_t stride, int alpha, int
void x264_deblock_h_luma_altivec( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
#endif // ARCH_PPC
-#if HAVE_ARMV6
+#if HAVE_ARMV6 || ARCH_AARCH64
void x264_deblock_v_luma_neon ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_luma_neon ( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_chroma_neon( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
@@ -838,7 +838,7 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf, int b_mbaff )
}
#endif // HAVE_ALTIVEC
-#if HAVE_ARMV6
+#if HAVE_ARMV6 || ARCH_AARCH64
if( cpu&X264_CPU_NEON )
{
pf->deblock_luma[1] = x264_deblock_v_luma_neon;
--
2.0.0
More information about the x264-devel
mailing list