[x264-devel] [PATCH 8/9] aarch64: intra prediction NEON asm
Janne Grunau
janne-x264 at jannau.net
Sat Jul 19 20:57:54 CEST 2014
Ported from the ARM NEON asm.
---
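Notes (after the fold, not part of the commit message): the asm below
implements the usual H.264 intra predictors (DC/H/V, the diagonal modes
and plane prediction). As a rough C sketch of what e.g. the 4x4 DC
predictor computes -- illustration only, hypothetical names, not the
x264 reference code:

    /* dc = rounded average of the 4 top and 4 left neighbours;
     * FDEC_STRIDE is x264's decode-buffer stride (32 for 8-bit) */
    static void predict_4x4_dc_sketch( uint8_t *src )
    {
        int dc = 4; /* rounding term for the >>3 below */
        for( int i = 0; i < 4; i++ )
            dc += src[i - FDEC_STRIDE]     /* row above */
                + src[i*FDEC_STRIDE - 1];  /* column to the left */
        dc >>= 3;
        for( int y = 0; y < 4; y++ )
            for( int x = 0; x < 4; x++ )
                src[y*FDEC_STRIDE + x] = (uint8_t)dc;
    }

The NEON version computes the same sums with uaddlp/uaddl and folds the
+4 and >>3 into a single rounding narrow (rshrn #3).
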
Makefile | 4 +-
common/aarch64/predict-a.S | 662 +++++++++++++++++++++++++++++++++++++++++++++
common/aarch64/predict-c.c | 114 ++++++++
common/aarch64/predict.h | 52 ++++
common/pixel.c | 19 +-
common/predict.c | 19 ++
6 files changed, 858 insertions(+), 12 deletions(-)
create mode 100644 common/aarch64/predict-a.S
create mode 100644 common/aarch64/predict-c.c
create mode 100644 common/aarch64/predict.h
diff --git a/Makefile b/Makefile
index b0d4a14..397b54d 100644
--- a/Makefile
+++ b/Makefile
@@ -129,8 +129,10 @@ ifneq ($(AS),)
ASMSRC += common/aarch64/dct-a.S \
common/aarch64/mc-a.S \
common/aarch64/pixel-a.S \
+ common/aarch64/predict-a.S \
common/aarch64/quant-a.S
-SRCS += common/aarch64/mc-c.c
+SRCS += common/aarch64/mc-c.c \
+ common/aarch64/predict-c.c
OBJASM = $(ASMSRC:%.S=%.o)
endif
endif
diff --git a/common/aarch64/predict-a.S b/common/aarch64/predict-a.S
new file mode 100644
index 0000000..d3271b8
--- /dev/null
+++ b/common/aarch64/predict-a.S
@@ -0,0 +1,662 @@
+/*****************************************************************************
+ * predict.S: aarch64 intra prediction
+ *****************************************************************************
+ * Copyright (C) 2009-2014 x264 project
+ *
+ * Authors: David Conrad <lessen42 at gmail.com>
+ * Mans Rullgard <mans at mansr.com>
+ * Janne Grunau <janne-x264 at jannau.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.section .rodata
+.align 4
+
+p8weight: .short 1,2,3,4,1,2,3,4
+p16weight: .short 1,2,3,4,5,6,7,8
+
+.text
+
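+// load an \n-byte column from [\xn] (stride \xm) into the lanes of \vd;
+// with \n == 4, hi selects the low (hi=0) or high (hi=1) half of \vd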
+.macro ldcol.8 vd, xn, xm, n=8, hi=0
+.if \n == 8 || \hi == 0
+ ld1 {\vd\().b}[0], [\xn], \xm
+ ld1 {\vd\().b}[1], [\xn], \xm
+ ld1 {\vd\().b}[2], [\xn], \xm
+ ld1 {\vd\().b}[3], [\xn], \xm
+.endif
+.if \n == 8 || \hi == 1
+ ld1 {\vd\().b}[4], [\xn], \xm
+ ld1 {\vd\().b}[5], [\xn], \xm
+ ld1 {\vd\().b}[6], [\xn], \xm
+ ld1 {\vd\().b}[7], [\xn], \xm
+.endif
+.endm
+
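+// as ldcol.8, but fills all 16 byte lanes of \vd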
+.macro ldcol.16 vd, xn, xm
+ ldcol.8 \vd, \xn, \xm
+ ld1 {\vd\().b}[ 8], [\xn], \xm
+ ld1 {\vd\().b}[ 9], [\xn], \xm
+ ld1 {\vd\().b}[10], [\xn], \xm
+ ld1 {\vd\().b}[11], [\xn], \xm
+ ld1 {\vd\().b}[12], [\xn], \xm
+ ld1 {\vd\().b}[13], [\xn], \xm
+ ld1 {\vd\().b}[14], [\xn], \xm
+ ld1 {\vd\().b}[15], [\xn], \xm
+.endm
+
+
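+// pure scalar version: each left neighbour is splatted across its
+// 4-byte row by shift-and-add, then stored as one 32-bit word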
+function x264_predict_4x4_h_aarch64, export=1
+ ldrb w1, [x0, #0*FDEC_STRIDE-1]
+ ldrb w2, [x0, #1*FDEC_STRIDE-1]
+ ldrb w3, [x0, #2*FDEC_STRIDE-1]
+ ldrb w4, [x0, #3*FDEC_STRIDE-1]
+ add w1, w1, w1, lsl #8
+ add w2, w2, w2, lsl #8
+ add w3, w3, w3, lsl #8
+ add w4, w4, w4, lsl #8
+ add w1, w1, w1, lsl #16
+ str w1, [x0, #0*FDEC_STRIDE]
+ add w2, w2, w2, lsl #16
+ str w2, [x0, #1*FDEC_STRIDE]
+ add w3, w3, w3, lsl #16
+ str w3, [x0, #2*FDEC_STRIDE]
+ add w4, w4, w4, lsl #16
+ str w4, [x0, #3*FDEC_STRIDE]
+ ret
+endfunc
+
+function x264_predict_4x4_v_aarch64, export=1
+ ldr w1, [x0, #0 - 1 * FDEC_STRIDE]
+ str w1, [x0, #0 + 0 * FDEC_STRIDE]
+ str w1, [x0, #0 + 1 * FDEC_STRIDE]
+ str w1, [x0, #0 + 2 * FDEC_STRIDE]
+ str w1, [x0, #0 + 3 * FDEC_STRIDE]
+ ret
+endfunc
+
+function x264_predict_4x4_dc_neon, export=1
+ sub x1, x0, #FDEC_STRIDE
+ sub x2, x0, #1
+ mov x7, #FDEC_STRIDE
+ ld1 {v0.8b}, [x1]
+ ld1r {v1.8b}, [x2], x7
+ ld1r {v2.8b}, [x2], x7
+ ld1r {v3.8b}, [x2], x7
+ ld1r {v4.8b}, [x2], x7
+ uaddlp v0.4h, v0.8b
+ uaddl v1.8h, v1.8b, v2.8b
+ uaddl v2.8h, v3.8b, v4.8b
+ addp v0.4h, v0.4h, v0.4h
+ add v1.4h, v1.4h, v2.4h
+ dup v0.4h, v0.h[0]
+ add v0.4h, v0.4h, v1.4h
+ rshrn v0.8b, v0.8h, #3
+ str s0, [x0], #FDEC_STRIDE
+ str s0, [x0], #FDEC_STRIDE
+ str s0, [x0], #FDEC_STRIDE
+ str s0, [x0]
+ ret
+endfunc
+
+function x264_predict_4x4_dc_top_neon, export=1
+ sub x1, x0, #FDEC_STRIDE
+ mov x7, #FDEC_STRIDE
+ ld1 {v0.8b}, [x1]
+ uaddlp v0.4h, v0.8b
+ addp v0.4h, v0.4h, v0.4h
+ dup v0.4h, v0.h[0]
+ rshrn v0.8b, v0.8h, #2
+ str s0, [x0], #FDEC_STRIDE
+ str s0, [x0], #FDEC_STRIDE
+ str s0, [x0], #FDEC_STRIDE
+ str s0, [x0]
+ ret
+endfunc
+
+function x264_predict_4x4_ddr_neon, export=1
+ sub x1, x0, #FDEC_STRIDE+1
+ mov x7, #FDEC_STRIDE
+ ld1 {v0.8b}, [x1], x7 // # -FDEC_STRIDE-1
+ ld1r {v1.8b}, [x1], x7 // #0*FDEC_STRIDE-1
+ ld1r {v2.8b}, [x1], x7 // #1*FDEC_STRIDE-1
+ ext v0.8b, v1.8b, v0.8b, #7
+ ld1r {v3.8b}, [x1], x7 // #2*FDEC_STRIDE-1
+ ext v0.8b, v2.8b, v0.8b, #7 // a
+ ld1r {v4.8b}, [x1], x7 // #3*FDEC_STRIDE-1
+ ext v1.8b, v3.8b, v0.8b, #7 // b
+ ext v2.8b, v4.8b, v1.8b, #7 // c
+ uaddl v0.8h, v0.8b, v1.8b
+ uaddl v1.8h, v1.8b, v2.8b
+ add v0.8h, v0.8h, v1.8h
+ rshrn v0.8b, v0.8h, #2
+
+ ext v3.8b, v0.8b, v0.8b, #3
+ ext v2.8b, v0.8b, v0.8b, #2
+ ext v1.8b, v0.8b, v0.8b, #1
+
+ str s3, [x0], #FDEC_STRIDE
+ str s2, [x0], #FDEC_STRIDE
+ str s1, [x0], #FDEC_STRIDE
+ str s0, [x0]
+ ret
+endfunc
+
+function x264_predict_4x4_ddl_neon, export=1
+ sub x0, x0, #FDEC_STRIDE
+ mov x7, #FDEC_STRIDE
+ ld1 {v0.8b}, [x0], x7
+ dup v3.8b, v0.b[7]
+ ext v1.8b, v0.8b, v0.8b, #1
+ ext v2.8b, v0.8b, v3.8b, #2
+ uhadd v0.8b, v0.8b, v2.8b
+ urhadd v0.8b, v0.8b, v1.8b
+ str s0, [x0], #FDEC_STRIDE
+ ext v1.8b, v0.8b, v0.8b, #1
+ ext v2.8b, v0.8b, v0.8b, #2
+ str s1, [x0], #FDEC_STRIDE
+ ext v3.8b, v0.8b, v0.8b, #3
+ str s2, [x0], #FDEC_STRIDE
+ str s3, [x0]
+ ret
+endfunc
+
+function x264_predict_8x8_dc_neon, export=1
+ mov x7, #FDEC_STRIDE
+ ld1 {v0.16b}, [x1], #16
+ ld1 {v1.8b}, [x1]
+ ext v0.16b, v0.16b, v0.16b, #7
+ uaddlv h1, v1.8b
+ uaddlv h0, v0.8b
+ add v0.8h, v0.8h, v1.8h
+ dup v0.8h, v0.h[0]
+ rshrn v0.8b, v0.8h, #4
+.rept 8
+ st1 {v0.8b}, [x0], x7
+.endr
+ ret
+endfunc
+
+function x264_predict_8x8_h_neon, export=1
+ mov x7, #FDEC_STRIDE
+ ld1 {v16.16b}, [x1]
+ dup v0.8b, v16.b[14]
+ dup v1.8b, v16.b[13]
+ st1 {v0.8b}, [x0], x7
+ dup v2.8b, v16.b[12]
+ st1 {v1.8b}, [x0], x7
+ dup v3.8b, v16.b[11]
+ st1 {v2.8b}, [x0], x7
+ dup v4.8b, v16.b[10]
+ st1 {v3.8b}, [x0], x7
+ dup v5.8b, v16.b[9]
+ st1 {v4.8b}, [x0], x7
+ dup v6.8b, v16.b[8]
+ st1 {v5.8b}, [x0], x7
+ dup v7.8b, v16.b[7]
+ st1 {v6.8b}, [x0], x7
+ st1 {v7.8b}, [x0], x7
+ ret
+endfunc
+
+function x264_predict_8x8_v_neon, export=1
+ add x1, x1, #16
+ mov x7, #FDEC_STRIDE
+ ld1 {v0.8b}, [x1]
+.rept 8
+ st1 {v0.8b}, [x0], x7
+.endr
+ ret
+endfunc
+
+function x264_predict_8x8_ddl_neon, export=1
+ add x1, x1, #16
+ mov x7, #FDEC_STRIDE
+ ld1 {v0.16b}, [x1]
+ movi v3.16b, #0
+ dup v2.16b, v0.b[15]
+ ext v4.16b, v3.16b, v0.16b, #15
+ ext v2.16b, v0.16b, v2.16b, #1
+ uhadd v4.16b, v4.16b, v2.16b
+ urhadd v0.16b, v0.16b, v4.16b
+ ext v1.16b, v0.16b, v0.16b, #1
+ ext v2.16b, v0.16b, v0.16b, #2
+ st1 {v1.8b}, [x0], x7
+ ext v3.16b, v0.16b, v0.16b, #3
+ st1 {v2.8b}, [x0], x7
+ ext v4.16b, v0.16b, v0.16b, #4
+ st1 {v3.8b}, [x0], x7
+ ext v5.16b, v0.16b, v0.16b, #5
+ st1 {v4.8b}, [x0], x7
+ ext v6.16b, v0.16b, v0.16b, #6
+ st1 {v5.8b}, [x0], x7
+ ext v7.16b, v0.16b, v0.16b, #7
+ st1 {v6.8b}, [x0], x7
+ ext v0.16b, v0.16b, v0.16b, #8
+ st1 {v7.8b}, [x0], x7
+ st1 {v0.8b}, [x0], x7
+ ret
+endfunc
+
+function x264_predict_8x8_ddr_neon, export=1
+ ld1 {v0.16b,v1.16b}, [x1]
+ ext v2.16b, v0.16b, v1.16b, #7
+ ext v4.16b, v0.16b, v1.16b, #9
+ ext v3.16b, v0.16b, v1.16b, #8
+
+ uhadd v2.16b, v2.16b, v4.16b
+ urhadd v7.16b, v3.16b, v2.16b
+
+ add x0, x0, #7*FDEC_STRIDE
+ mov x7, #-1*FDEC_STRIDE
+
+ ext v6.16b, v7.16b, v7.16b, #1
+ st1 {v7.8b}, [x0], x7
+ ext v5.16b, v7.16b, v7.16b, #2
+ st1 {v6.8b}, [x0], x7
+ ext v4.16b, v7.16b, v7.16b, #3
+ st1 {v5.8b}, [x0], x7
+ ext v3.16b, v7.16b, v7.16b, #4
+ st1 {v4.8b}, [x0], x7
+ ext v2.16b, v7.16b, v7.16b, #5
+ st1 {v3.8b}, [x0], x7
+ ext v1.16b, v7.16b, v7.16b, #6
+ st1 {v2.8b}, [x0], x7
+ ext v0.16b, v7.16b, v7.16b, #7
+ st1 {v1.8b}, [x0], x7
+ st1 {v0.8b}, [x0], x7
+ ret
+endfunc
+
+function x264_predict_8x8_vl_neon, export=1
+ add x1, x1, #16
+ mov x7, #FDEC_STRIDE
+
+ ld1 {v0.16b}, [x1]
+ ext v1.16b, v1.16b, v0.16b, #15
+ ext v2.16b, v0.16b, v2.16b, #1
+
+ uhadd v1.16b, v1.16b, v2.16b
+ urhadd v3.16b, v0.16b, v2.16b
+
+ urhadd v0.16b, v0.16b, v1.16b
+
+ ext v4.16b, v0.16b, v0.16b, #1
+ st1 {v3.8b}, [x0], x7
+ ext v5.16b, v3.16b, v3.16b, #1
+ st1 {v4.8b}, [x0], x7
+ ext v6.16b, v0.16b, v0.16b, #2
+ st1 {v5.8b}, [x0], x7
+ ext v7.16b, v3.16b, v3.16b, #2
+ st1 {v6.8b}, [x0], x7
+ ext v4.16b, v0.16b, v0.16b, #3
+ st1 {v7.8b}, [x0], x7
+ ext v5.16b, v3.16b, v3.16b, #3
+ st1 {v4.8b}, [x0], x7
+ ext v6.16b, v0.16b, v0.16b, #4
+ st1 {v5.8b}, [x0], x7
+ st1 {v6.8b}, [x0], x7
+ ret
+endfunc
+
+function x264_predict_8x8_vr_neon, export=1
+ add x1, x1, #8
+ mov x7, #FDEC_STRIDE
+ ld1 {v2.16b}, [x1]
+
+ ext v1.16b, v2.16b, v2.16b, #14
+ ext v0.16b, v2.16b, v2.16b, #15
+
+ uhadd v3.16b, v2.16b, v1.16b
+ urhadd v2.16b, v2.16b, v0.16b
+ urhadd v0.16b, v0.16b, v3.16b
+
+ ext v1.16b, v2.16b, v2.16b, #8
+ uzp1 v2.8b, v0.8b, v0.8b
+ uzp2 v3.8b, v0.8b, v0.8b
+ ext v0.16b, v0.16b, v0.16b, #8
+
+ st1 {v1.8b}, [x0], x7
+ st1 {v0.8b}, [x0], x7
+ ext v4.8b, v3.8b, v1.8b, #7
+ ext v5.8b, v2.8b, v0.8b, #7
+ st1 {v4.8b}, [x0], x7
+ st1 {v5.8b}, [x0], x7
+ ext v6.8b, v3.8b, v1.8b, #6
+ ext v7.8b, v2.8b, v0.8b, #6
+ st1 {v6.8b}, [x0], x7
+ st1 {v7.8b}, [x0], x7
+ ext v1.8b, v3.8b, v1.8b, #5
+ ext v0.8b, v2.8b, v0.8b, #5
+ st1 {v1.8b}, [x0], x7
+ st1 {v0.8b}, [x0], x7
+ ret
+endfunc
+
+function x264_predict_8x8_hd_neon, export=1
+ add x1, x1, #7
+ mov x7, #FDEC_STRIDE
+
+ ld1 {v1.16b}, [x1]
+ ext v3.16b, v1.16b, v1.16b, #1
+ ext v2.16b, v1.16b, v1.16b, #2
+
+ urhadd v4.16b, v1.16b, v3.16b
+
+ uhadd v1.16b, v1.16b, v2.16b
+ urhadd v0.16b, v1.16b, v3.16b
+
+ zip1 v16.8b, v4.8b, v0.8b
+ zip2 v17.8b, v4.8b, v0.8b
+ ext v7.16b, v0.16b, v0.16b, #8
+
+ ext v0.8b, v17.8b, v7.8b, #6
+ ext v1.8b, v17.8b, v7.8b, #4
+ st1 {v0.8b}, [x0], x7
+ ext v2.8b, v17.8b, v7.8b, #2
+ st1 {v1.8b}, [x0], x7
+ st1 {v2.8b}, [x0], x7
+ ext v3.8b, v16.8b, v17.8b, #6
+ st1 {v17.8b}, [x0], x7
+ ext v4.8b, v16.8b, v17.8b, #4
+ st1 {v3.8b}, [x0], x7
+ ext v5.8b, v16.8b, v17.8b, #2
+ st1 {v4.8b}, [x0], x7
+ st1 {v5.8b}, [x0], x7
+ st1 {v16.8b}, [x0], x7
+
+ ret
+endfunc
+
+function x264_predict_8x8_hu_neon, export=1
+ add x1, x1, #7
+ mov x7, #FDEC_STRIDE
+ ld1 {v7.8b}, [x1]
+ dup v6.8b, v7.b[0]
+ rev64 v7.8b, v7.8b
+
+ ext v4.8b, v7.8b, v6.8b, #2
+ ext v2.8b, v7.8b, v6.8b, #1
+
+ uhadd v5.8b, v7.8b, v4.8b
+ urhadd v0.8b, v2.8b, v7.8b
+ urhadd v1.8b, v5.8b, v2.8b
+
+ zip1 v16.8b, v0.8b, v1.8b
+ zip2 v17.8b, v0.8b, v1.8b
+
+ dup v18.4h, v17.h[3]
+
+ ext v0.8b, v16.8b, v17.8b, #2
+ ext v1.8b, v16.8b, v17.8b, #4
+ ext v2.8b, v16.8b, v17.8b, #6
+ st1 {v16.8b}, [x0], x7
+ st1 {v0.8b}, [x0], x7
+ st1 {v1.8b}, [x0], x7
+ st1 {v2.8b}, [x0], x7
+
+ ext v4.8b, v17.8b, v18.8b, #2
+ ext v5.8b, v17.8b, v18.8b, #4
+ ext v6.8b, v17.8b, v18.8b, #6
+ st1 {v17.8b}, [x0], x7
+ st1 {v4.8b}, [x0], x7
+ st1 {v5.8b}, [x0], x7
+ st1 {v6.8b}, [x0]
+ ret
+endfunc
+
+
+function x264_predict_8x8c_dc_top_neon, export=1
+ sub x2, x0, #FDEC_STRIDE
+ mov x1, #FDEC_STRIDE
+ ld1 {v0.8b}, [x2]
+ uaddlp v0.4h, v0.8b
+ addp v0.4h, v0.4h, v0.4h
+ rshrn v0.8b, v0.8h, #2
+ dup v3.8b, v0.b[1]
+ dup v2.8b, v0.b[0]
+ transpose v0.2s, v1.2s, v2.2s, v3.2s
+ b pred8x8c_dc_end
+endfunc
+
+function x264_predict_8x8c_dc_left_neon, export=1
+ sub x2, x0, #1
+ mov x1, #FDEC_STRIDE
+ ldcol.8 v0, x2, x1
+ uaddlp v0.4h, v0.8b
+ addp v0.4h, v0.4h, v0.4h
+ rshrn v0.8b, v0.8h, #2
+ dup v1.8b, v0.b[1]
+ dup v0.8b, v0.b[0]
+ b pred8x8c_dc_end
+endfunc
+
+function x264_predict_8x8c_dc_neon, export=1
+ sub x2, x0, #FDEC_STRIDE
+ sub x3, x0, #1
+ mov x1, #FDEC_STRIDE
+ ld1 {v2.8b}, [x2]
+ ldcol.8 v3, x3, x1
+ transpose v0.2s, v1.2s, v2.2s, v3.2s
+ uaddlp v0.4h, v0.8b // s0, s2
+ uaddlp v1.4h, v1.8b // s1, s3
+ addp v0.4h, v0.4h, v1.4h // s0, s2, s1, s3
+ addp v1.4h, v0.4h, v0.4h
+ rshrn v2.8b, v0.8h, #2
+ rshrn v3.8b, v1.8h, #3
+ dup v5.8b, v2.b[2] // dc1
+ dup v6.8b, v3.b[1] // dc2
+ dup v4.8b, v3.b[0] // dc0
+ dup v7.8b, v2.b[3] // dc3
+ trn1 v0.2s, v4.2s, v5.2s
+ trn1 v1.2s, v7.2s, v6.2s
+pred8x8c_dc_end:
+ add x2, x0, x1, lsl #2
+.rept 4
+ st1 {v0.8b}, [x0], x1
+ st1 {v1.8b}, [x2], x1
+.endr
+ ret
+endfunc
+
+function x264_predict_8x8c_h_neon, export=1
+ sub x1, x0, #1
+ mov x7, #FDEC_STRIDE
+.rept 4
+ ld1r {v0.8b}, [x1], x7
+ ld1r {v1.8b}, [x1], x7
+ st1 {v0.8b}, [x0], x7
+ st1 {v1.8b}, [x0], x7
+.endr
+ ret
+endfunc
+
+function x264_predict_8x8c_v_neon, export=1
+ sub x0, x0, #FDEC_STRIDE
+ mov x7, #FDEC_STRIDE
+ ld1 {v0.8b}, [x0], x7
+.rept 8
+ st1 {v0.8b}, [x0], x7
+.endr
+ ret
+endfunc
+
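+// chroma plane prediction:
+// pred[y][x] = clip((a + b*(x-3) + c*(y-3) + 16) >> 5), with a, b, c
+// derived from the top and left borders as in the H.264 spec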
+function x264_predict_8x8c_p_neon, export=1
+ sub x3, x0, #FDEC_STRIDE
+ mov x1, #FDEC_STRIDE
+ add x2, x3, #4
+ sub x3, x3, #1
+ ld1 {v0.s}[0], [x3]
+ ld1 {v2.s}[0], [x2], x1
+ ldcol.8 v0, x3, x1, 4, hi=1
+ add x3, x3, x1
+ ldcol.8 v3, x3, x1, 4
+ movrel x4, p8weight
+ movrel x5, p16weight
+ uaddl v4.8h, v2.8b, v3.8b
+ rev32 v0.8b, v0.8b
+ trn1 v2.2s, v2.2s, v3.2s
+ ld1 {v7.8h}, [x4]
+ usubl v2.8h, v2.8b, v0.8b
+ mul v2.8h, v2.8h, v7.8h
+ ld1 {v0.8h}, [x5]
+ saddlp v2.4s, v2.8h
+ addp v2.4s, v2.4s, v2.4s
+ shl v3.2s, v2.2s, #4
+ add v2.2s, v2.2s, v3.2s
+ rshrn v5.4h, v2.4s, #5 // b, c, x, x
+ addp v2.4h, v5.4h, v5.4h
+ shl v3.4h, v2.4h, #2
+ sub v3.4h, v3.4h, v2.4h // 3 * (b + c)
+ rev64 v4.4h, v4.4h
+ add v4.4h, v4.4h, v0.4h
+ shl v2.4h, v4.4h, #4 // a
+ sub v2.4h, v2.4h, v3.4h // a - 3 * (b + c) + 16
+ ext v0.16b, v0.16b, v0.16b, #14
+ sub v6.4h, v5.4h, v3.4h
+ mov v0.h[0], wzr
+ mul v0.8h, v0.8h, v5.h[0] // 0,1,2,3,4,5,6,7 * b
+ dup v1.8h, v2.h[0] // pix
+ dup v2.8h, v5.h[1] // c
+ add v1.8h, v1.8h, v0.8h // pix + x*b
+ mov x3, #8
+1:
+ subs x3, x3, #1
+ sqshrun v0.8b, v1.8h, #5
+ add v1.8h, v1.8h, v2.8h
+ st1 {v0.8b}, [x0], x1
+ b.ne 1b
+ ret
+endfunc
+
+
+function x264_predict_16x16_dc_top_neon, export=1
+ sub x2, x0, #FDEC_STRIDE
+ mov x1, #FDEC_STRIDE
+ ld1 {v0.16b}, [x2]
+ uaddlv h0, v0.16b
+ rshrn v0.8b, v0.8h, #4
+ dup v0.16b, v0.b[0]
+ b pred16x16_dc_end
+endfunc
+
+function x264_predict_16x16_dc_left_neon, export=1
+ sub x2, x0, #1
+ mov x1, #FDEC_STRIDE
+ ldcol.16 v0, x2, x1
+ uaddlv h0, v0.16b
+ rshrn v0.8b, v0.8h, #4
+ dup v0.16b, v0.b[0]
+ b pred16x16_dc_end
+endfunc
+
+function x264_predict_16x16_dc_neon, export=1
+ sub x3, x0, #FDEC_STRIDE
+ sub x2, x0, #1
+ mov x1, #FDEC_STRIDE
+ ld1 {v0.16b}, [x3]
+ ldcol.16 v1, x2, x1
+ uaddlv h0, v0.16b
+ uaddlv h1, v1.16b
+ add v0.4h, v0.4h, v1.4h
+ rshrn v0.8b, v0.8h, #5
+ dup v0.16b, v0.b[0]
+pred16x16_dc_end:
+.rept 16
+ st1 {v0.16b}, [x0], x1
+.endr
+ ret
+endfunc
+
+function x264_predict_16x16_h_neon, export=1
+ sub x1, x0, #1
+ mov x7, #FDEC_STRIDE
+.rept 8
+ ld1r {v0.16b}, [x1], x7
+ ld1r {v1.16b}, [x1], x7
+ st1 {v0.16b}, [x0], x7
+ st1 {v1.16b}, [x0], x7
+.endr
+ ret
+endfunc
+
+function x264_predict_16x16_v_neon, export=1
+ sub x0, x0, #FDEC_STRIDE
+ mov x7, #FDEC_STRIDE
+ ld1 {v0.16b}, [x0], x7
+.rept 16
+ st1 {v0.16b}, [x0], x7
+.endr
+ ret
+endfunc
+
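+// luma plane prediction:
+// pred[y][x] = clip((a + b*(x-7) + c*(y-7) + 16) >> 5)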
+function x264_predict_16x16_p_neon, export=1
+ sub x3, x0, #FDEC_STRIDE
+ mov x1, #FDEC_STRIDE
+ add x2, x3, #8
+ sub x3, x3, #1
+ ld1 {v0.8b}, [x3]
+ ld1 {v2.8b}, [x2], x1
+ ldcol.8 v1, x3, x1
+ add x3, x3, x1
+ ldcol.8 v3, x3, x1
+ rev64 v0.8b, v0.8b
+ rev64 v1.8b, v1.8b
+ movrel x4, p16weight
+ uaddl v4.8h, v2.8b, v3.8b
+ ld1 {v7.8h}, [x4]
+ usubl v2.8h, v2.8b, v0.8b
+ usubl v3.8h, v3.8b, v1.8b
+ mul v2.8h, v2.8h, v7.8h
+ mul v3.8h, v3.8h, v7.8h
+ saddlp v2.4s, v2.8h
+ saddlp v3.4s, v3.8h
+ addp v2.4s, v2.4s, v3.4s
+ addp v2.4s, v2.4s, v2.4s
+ shl v3.2s, v2.2s, #2
+ add v2.2s, v2.2s, v3.2s
+ rshrn v5.4h, v2.4s, #6 // b, c, x, x
+ addp v2.4h, v5.4h, v5.4h
+ shl v3.4h, v2.4h, #3
+ sub v3.4h, v3.4h, v2.4h // 7 * (b + c)
+ ext v4.16b, v4.16b, v4.16b, #14
+ add v4.4h, v4.4h, v7.4h
+ shl v2.4h, v4.4h, #4 // a
+ sub v2.4h, v2.4h, v3.4h // a - 7 * (b + c) + 16
+ ext v7.16b, v7.16b, v7.16b, #14
+ mov v7.h[0], wzr
+ dup v3.8h, v5.h[0]
+ mul v0.8h, v7.8h, v5.h[0] // 0,1,2,3,4,5,6,7 * b
+ dup v1.8h, v2.h[0] // pix
+ dup v2.8h, v5.h[1] // c
+ shl v3.8h, v3.8h, #3
+ add v1.8h, v1.8h, v0.8h // pix + x*b
+ add v3.8h, v3.8h, v1.8h // pix + x{8-15}*b
+ mov x3, #16
+1:
+ subs x3, x3, #1
+ sqshrun v0.8b, v1.8h, #5
+ add v1.8h, v1.8h, v2.8h
+ sqshrun2 v0.16b, v3.8h, #5
+ add v3.8h, v3.8h, v2.8h
+ st1 {v0.16b}, [x0], x1
+ b.ne 1b
+ ret
+endfunc
diff --git a/common/aarch64/predict-c.c b/common/aarch64/predict-c.c
new file mode 100644
index 0000000..3803b57
--- /dev/null
+++ b/common/aarch64/predict-c.c
@@ -0,0 +1,114 @@
+/*****************************************************************************
+ * predict.c: aarch64 intra prediction
+ *****************************************************************************
+ * Copyright (C) 2009-2014 x264 project
+ *
+ * Authors: David Conrad <lessen42 at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+#include "common/common.h"
+#include "predict.h"
+#include "pixel.h"
+
+void x264_predict_4x4_dc_top_neon( uint8_t *src );
+void x264_predict_4x4_ddr_neon( uint8_t *src );
+void x264_predict_4x4_ddl_neon( uint8_t *src );
+
+void x264_predict_8x8c_dc_top_neon( uint8_t *src );
+void x264_predict_8x8c_dc_left_neon( uint8_t *src );
+void x264_predict_8x8c_p_neon( uint8_t *src );
+
+void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_vl_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_vr_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_hd_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_hu_neon( uint8_t *src, uint8_t edge[36] );
+
+void x264_predict_16x16_dc_top_neon( uint8_t *src );
+void x264_predict_16x16_dc_left_neon( uint8_t *src );
+void x264_predict_16x16_p_neon( uint8_t *src );
+
+void x264_predict_4x4_init_aarch64( int cpu, x264_predict_t pf[12] )
+{
+#if !HIGH_BIT_DEPTH
+ if (cpu&X264_CPU_ARMV8)
+ {
+ pf[I_PRED_4x4_H] = x264_predict_4x4_h_aarch64;
+ pf[I_PRED_4x4_V] = x264_predict_4x4_v_aarch64;
+ }
+
+ if (cpu&X264_CPU_NEON)
+ {
+ pf[I_PRED_4x4_DC] = x264_predict_4x4_dc_neon;
+ pf[I_PRED_4x4_DC_TOP] = x264_predict_4x4_dc_top_neon;
+ pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_neon;
+ pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_neon;
+ }
+#endif // !HIGH_BIT_DEPTH
+}
+
+void x264_predict_8x8c_init_aarch64( int cpu, x264_predict_t pf[7] )
+{
+ if (!(cpu&X264_CPU_NEON))
+ return;
+
+#if !HIGH_BIT_DEPTH
+ pf[I_PRED_CHROMA_DC] = x264_predict_8x8c_dc_neon;
+ pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x8c_dc_top_neon;
+ pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left_neon;
+ pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_neon;
+ pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_neon;
+ pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_neon;
+#endif // !HIGH_BIT_DEPTH
+}
+
+void x264_predict_8x8_init_aarch64( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter )
+{
+ if (!(cpu&X264_CPU_NEON))
+ return;
+
+#if !HIGH_BIT_DEPTH
+ pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_neon;
+ pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_neon;
+ pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_neon;
+ pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_neon;
+ pf[I_PRED_8x8_DC] = x264_predict_8x8_dc_neon;
+ pf[I_PRED_8x8_H] = x264_predict_8x8_h_neon;
+ pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_neon;
+ pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_neon;
+ pf[I_PRED_8x8_V] = x264_predict_8x8_v_neon;
+#endif // !HIGH_BIT_DEPTH
+}
+
+void x264_predict_16x16_init_aarch64( int cpu, x264_predict_t pf[7] )
+{
+ if (!(cpu&X264_CPU_NEON))
+ return;
+
+#if !HIGH_BIT_DEPTH
+ pf[I_PRED_16x16_DC ] = x264_predict_16x16_dc_neon;
+ pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_neon;
+ pf[I_PRED_16x16_DC_LEFT]= x264_predict_16x16_dc_left_neon;
+ pf[I_PRED_16x16_H ] = x264_predict_16x16_h_neon;
+ pf[I_PRED_16x16_V ] = x264_predict_16x16_v_neon;
+ pf[I_PRED_16x16_P ] = x264_predict_16x16_p_neon;
+#endif // !HIGH_BIT_DEPTH
+}
diff --git a/common/aarch64/predict.h b/common/aarch64/predict.h
new file mode 100644
index 0000000..2d26a05
--- /dev/null
+++ b/common/aarch64/predict.h
@@ -0,0 +1,52 @@
+/*****************************************************************************
+ * predict.h: aarch64 intra prediction
+ *****************************************************************************
+ * Copyright (C) 2009-2014 x264 project
+ *
+ * Authors: David Conrad <lessen42 at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+#ifndef X264_AARCH64_PREDICT_H
+#define X264_AARCH64_PREDICT_H
+
+void x264_predict_4x4_h_aarch64( uint8_t *src );
+void x264_predict_4x4_v_aarch64( uint8_t *src );
+
+// for the merged 4x4 intra sad/satd functions, which expect a unified suffix
+#define x264_predict_4x4_h_neon x264_predict_4x4_h_aarch64
+#define x264_predict_4x4_v_neon x264_predict_4x4_v_aarch64
+
+void x264_predict_4x4_dc_neon( uint8_t *src );
+void x264_predict_8x8_v_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8c_dc_neon( uint8_t *src );
+void x264_predict_8x8c_h_neon( uint8_t *src );
+void x264_predict_8x8c_v_neon( uint8_t *src );
+void x264_predict_16x16_v_neon( uint8_t *src );
+void x264_predict_16x16_h_neon( uint8_t *src );
+void x264_predict_16x16_dc_neon( uint8_t *src );
+
+void x264_predict_4x4_init_aarch64( int cpu, x264_predict_t pf[12] );
+void x264_predict_8x8_init_aarch64( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter );
+void x264_predict_8x8c_init_aarch64( int cpu, x264_predict_t pf[7] );
+void x264_predict_16x16_init_aarch64( int cpu, x264_predict_t pf[7] );
+
+#endif /* X264_AARCH64_PREDICT_H */
diff --git a/common/pixel.c b/common/pixel.c
index 3a8333d..9f603f2 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -40,6 +40,7 @@
#endif
#if ARCH_AARCH64
# include "aarch64/pixel.h"
+# include "aarch64/predict.h"
#endif
@@ -523,14 +524,10 @@ INTRA_MBCMP_8x8(sa8d,, _c )
INTRA_MBCMP_8x8( sad, _mmx2, _c )
INTRA_MBCMP_8x8(sa8d, _sse2, _sse2 )
#endif
-#if !HIGH_BIT_DEPTH && HAVE_ARMV6
+#if !HIGH_BIT_DEPTH && (HAVE_ARMV6 || ARCH_AARCH64)
INTRA_MBCMP_8x8( sad, _neon, _neon )
INTRA_MBCMP_8x8(sa8d, _neon, _neon )
#endif
-#if !HIGH_BIT_DEPTH && ARCH_AARCH64
-INTRA_MBCMP_8x8( sad, _neon, _c )
-INTRA_MBCMP_8x8(sa8d, _neon, _c )
-#endif
#define INTRA_MBCMP( mbcmp, size, pred1, pred2, pred3, chroma, cpu, cpu2 )\
void x264_intra_##mbcmp##_x3_##size##chroma##cpu( pixel *fenc, pixel *fdec, int res[3] )\
@@ -597,14 +594,14 @@ INTRA_MBCMP( sad, 16x16, v, h, dc, , _neon, _neon )
INTRA_MBCMP(satd, 16x16, v, h, dc, , _neon, _neon )
#endif
#if !HIGH_BIT_DEPTH && ARCH_AARCH64
-INTRA_MBCMP( sad, 4x4, v, h, dc, , _neon, _c )
-INTRA_MBCMP(satd, 4x4, v, h, dc, , _neon, _c )
-INTRA_MBCMP( sad, 8x8, dc, h, v, c, _neon, _c )
-INTRA_MBCMP(satd, 8x8, dc, h, v, c, _neon, _c )
+INTRA_MBCMP( sad, 4x4, v, h, dc, , _neon, _neon )
+INTRA_MBCMP(satd, 4x4, v, h, dc, , _neon, _neon )
+INTRA_MBCMP( sad, 8x8, dc, h, v, c, _neon, _neon )
+INTRA_MBCMP(satd, 8x8, dc, h, v, c, _neon, _neon )
INTRA_MBCMP( sad, 8x16, dc, h, v, c, _neon, _c )
INTRA_MBCMP(satd, 8x16, dc, h, v, c, _neon, _c )
-INTRA_MBCMP( sad, 16x16, v, h, dc, , _neon, _c )
-INTRA_MBCMP(satd, 16x16, v, h, dc, , _neon, _c )
+INTRA_MBCMP( sad, 16x16, v, h, dc, , _neon, _neon )
+INTRA_MBCMP(satd, 16x16, v, h, dc, , _neon, _neon )
#endif
// No C implementation of intra_satd_x9. See checkasm for its behavior,
diff --git a/common/predict.c b/common/predict.c
index cbc018d..f9c4615 100644
--- a/common/predict.c
+++ b/common/predict.c
@@ -40,6 +40,9 @@
#if ARCH_ARM
# include "arm/predict.h"
#endif
+#if ARCH_AARCH64
+# include "aarch64/predict.h"
+#endif
/****************************************************************************
* 16x16 prediction for intra luma block
@@ -899,6 +902,10 @@ void x264_predict_16x16_init( int cpu, x264_predict_t pf[7] )
#if HAVE_ARMV6
x264_predict_16x16_init_arm( cpu, pf );
#endif
+
+#if ARCH_AARCH64
+ x264_predict_16x16_init_aarch64( cpu, pf );
+#endif
}
void x264_predict_8x8c_init( int cpu, x264_predict_t pf[7] )
@@ -923,6 +930,10 @@ void x264_predict_8x8c_init( int cpu, x264_predict_t pf[7] )
#if HAVE_ARMV6
x264_predict_8x8c_init_arm( cpu, pf );
#endif
+
+#if ARCH_AARCH64
+ x264_predict_8x8c_init_aarch64( cpu, pf );
+#endif
}
void x264_predict_8x16c_init( int cpu, x264_predict_t pf[7] )
@@ -963,6 +974,10 @@ void x264_predict_8x8_init( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_
#if HAVE_ARMV6
x264_predict_8x8_init_arm( cpu, pf, predict_filter );
#endif
+
+#if ARCH_AARCH64
+ x264_predict_8x8_init_aarch64( cpu, pf, predict_filter );
+#endif
}
void x264_predict_4x4_init( int cpu, x264_predict_t pf[12] )
@@ -987,5 +1002,9 @@ void x264_predict_4x4_init( int cpu, x264_predict_t pf[12] )
#if HAVE_ARMV6
x264_predict_4x4_init_arm( cpu, pf );
#endif
+
+#if ARCH_AARCH64
+ x264_predict_4x4_init_aarch64( cpu, pf );
+#endif
}
--
2.0.0