[x264-devel] [PATCH 4/9] aarch64: pixel metrics NEON asm
Janne Grunau
janne-x264 at jannau.net
Sat Jul 19 20:57:50 CEST 2014
Ported from the 32-bit ARM NEON assembly (common/arm/pixel-a.S).
---
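The SAD routines compute a plain sum of absolute differences between a WxH
block of the encoded frame (FENC) and one of the reference frame, widening to
16-bit lanes with uabdl/uabal and reducing with uaddlv. As a reminder of the
metric only (the in-tree C reference in common/pixel.c is macro-generated),
a minimal scalar sketch:

    #include <stdint.h>
    #include <stdlib.h>

    /* sum of absolute differences between two w x h blocks of 8-bit pixels */
    static int sad_wxh( const uint8_t *pix1, intptr_t stride1,
                        const uint8_t *pix2, intptr_t stride2, int w, int h )
    {
        int sum = 0;
        for( int y = 0; y < h; y++, pix1 += stride1, pix2 += stride2 )
            for( int x = 0; x < w; x++ )
                sum += abs( pix1[x] - pix2[x] );
        return sum;
    }

The sad_x3/sad_x4 variants score one FENC block against three or four
reference candidates in a single call and store the sums through the int
pointer argument; the FENC stride is the fixed FENC_STRIDE (kept in x7) and a
single reference stride (x5) is shared by all candidates.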
Makefile | 2 +-
common/aarch64/asm.S | 14 +
common/aarch64/pixel-a.S | 1162 ++++++++++++++++++++++++++++++++++++++++++++++
common/aarch64/pixel.h | 72 +++
common/pixel.c | 59 ++-
5 files changed, 1307 insertions(+), 2 deletions(-)
create mode 100644 common/aarch64/pixel-a.S
create mode 100644 common/aarch64/pixel.h
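On the SATD/SA8D side, the kernels in pixel-a.S transform the block of pixel
differences with a 4x4 (respectively 8x8) Hadamard transform and sum the
absolute transform coefficients; the final butterfly stage is folded into
abs + umax using the identity |a+b| + |a-b| = 2*max(|a|, |b|), which yields
half the plain coefficient sum, i.e. the same halving x264's SATD uses. A
plain C sketch of the 4x4 case (illustration only, not the in-tree reference
implementation):

    #include <stdint.h>
    #include <stdlib.h>

    /* SATD of a 4x4 block: sum of absolute 2-D Hadamard coefficients of
     * the difference block, halved as in x264's scalar reference. */
    static int satd_4x4_sketch( const uint8_t *pix1, intptr_t stride1,
                                const uint8_t *pix2, intptr_t stride2 )
    {
        int d[4][4], t[4][4], sum = 0;

        for( int y = 0; y < 4; y++, pix1 += stride1, pix2 += stride2 )
            for( int x = 0; x < 4; x++ )
                d[y][x] = pix1[x] - pix2[x];

        for( int y = 0; y < 4; y++ )        /* horizontal pass */
        {
            int s01 = d[y][0] + d[y][1], d01 = d[y][0] - d[y][1];
            int s23 = d[y][2] + d[y][3], d23 = d[y][2] - d[y][3];
            t[y][0] = s01 + s23; t[y][2] = s01 - s23;
            t[y][1] = d01 + d23; t[y][3] = d01 - d23;
        }
        for( int x = 0; x < 4; x++ )        /* vertical pass + abs sum */
        {
            int s01 = t[0][x] + t[1][x], d01 = t[0][x] - t[1][x];
            int s23 = t[2][x] + t[3][x], d23 = t[2][x] - t[3][x];
            sum += abs( s01 + s23 ) + abs( s01 - s23 )
                 + abs( d01 + d23 ) + abs( d01 - d23 );
        }
        return sum >> 1;
    }

pixel_var and hadamard_ac pack two 32-bit sums into their uint64_t return
value (for var: the pixel sum in the low half and the sum of squares in the
high half, see the final orr in x264_var_end), matching the prototypes added
in common/aarch64/pixel.h.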
diff --git a/Makefile b/Makefile
index 8bc7f9a..d68d3d8 100644
--- a/Makefile
+++ b/Makefile
@@ -126,7 +126,7 @@ endif
# AArch64 NEON optims
ifeq ($(ARCH),AARCH64)
ifneq ($(AS),)
-ASMSRC +=
+ASMSRC += common/aarch64/pixel-a.S
SRCS +=
OBJASM = $(ASMSRC:%.S=%.o)
endif
diff --git a/common/aarch64/asm.S b/common/aarch64/asm.S
index 1c1012d..d28f263 100644
--- a/common/aarch64/asm.S
+++ b/common/aarch64/asm.S
@@ -90,3 +90,17 @@ ELF .size \name, . - \name
#define GLUE(a, b) a ## b
#define JOIN(a, b) GLUE(a, b)
#define X(s) JOIN(EXTERN_ASM, s)
+
+#define FDEC_STRIDE 32
+#define FENC_STRIDE 16
+
+
+.macro SUMSUB_AB sum, sub, a, b
+ add \sum, \a, \b
+ sub \sub, \a, \b
+.endm
+
+.macro transpose t1, t2, s1, s2
+ trn1 \t1, \s1, \s2
+ trn2 \t2, \s1, \s2
+.endm
diff --git a/common/aarch64/pixel-a.S b/common/aarch64/pixel-a.S
new file mode 100644
index 0000000..34edf48
--- /dev/null
+++ b/common/aarch64/pixel-a.S
@@ -0,0 +1,1162 @@
+/*****************************************************************************
+ * pixel.S: aarch64 pixel metrics
+ *****************************************************************************
+ * Copyright (C) 2009-2014 x264 project
+ *
+ * Authors: David Conrad <lessen42 at gmail.com>
+ * Janne Grunau <janne-x264 at jannau.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+const mask
+.rept 16
+.byte 0xff
+.endr
+.rept 16
+.byte 0x00
+.endr
+endconst
+
+const mask_ac_4_8
+.short 0, -1, -1, -1, 0, -1, -1, -1
+.short 0, -1, -1, -1, -1, -1, -1, -1
+endconst
+
+.macro SAD_START_4
+ ld1 {v1.s}[0], [x2], x3
+ ld1 {v0.s}[0], [x0], x1
+ ld1 {v1.s}[1], [x2], x3
+ ld1 {v0.s}[1], [x0], x1
+ uabdl v16.8h, v0.8b, v1.8b
+.endm
+
+.macro SAD_4
+ ld1 {v1.s}[0], [x2], x3
+ ld1 {v0.s}[0], [x0], x1
+ ld1 {v1.s}[1], [x2], x3
+ ld1 {v0.s}[1], [x0], x1
+ uabal v16.8h, v0.8b, v1.8b
+.endm
+
+.macro SAD_START_8
+ ld1 {v1.8b}, [x2], x3
+ ld1 {v0.8b}, [x0], x1
+ ld1 {v3.8b}, [x2], x3
+ ld1 {v2.8b}, [x0], x1
+ uabdl v16.8h, v0.8b, v1.8b
+ uabdl v17.8h, v2.8b, v3.8b
+.endm
+
+.macro SAD_8
+ ld1 {v1.8b}, [x2], x3
+ ld1 {v0.8b}, [x0], x1
+ ld1 {v3.8b}, [x2], x3
+ ld1 {v2.8b}, [x0], x1
+ uabal v16.8h, v0.8b, v1.8b
+ uabal v17.8h, v2.8b, v3.8b
+.endm
+
+.macro SAD_START_16
+ ld1 {v1.16b}, [x2], x3
+ ld1 {v0.16b}, [x0], x1
+ ld1 {v3.16b}, [x2], x3
+ ld1 {v2.16b}, [x0], x1
+ uabdl v16.8h, v0.8b, v1.8b
+ uabdl2 v17.8h, v0.16b, v1.16b
+ uabal v16.8h, v2.8b, v3.8b
+ uabal2 v17.8h, v2.16b, v3.16b
+.endm
+
+.macro SAD_16
+ ld1 {v1.16b}, [x2], x3
+ ld1 {v0.16b}, [x0], x1
+ ld1 {v3.16b}, [x2], x3
+ ld1 {v2.16b}, [x0], x1
+ uabal v16.8h, v0.8b, v1.8b
+ uabal2 v17.8h, v0.16b, v1.16b
+ uabal v16.8h, v2.8b, v3.8b
+ uabal2 v17.8h, v2.16b, v3.16b
+.endm
+
+.macro SAD_FUNC w, h, name, align:vararg
+function x264_pixel_sad\name\()_\w\()x\h\()_neon, export=1
+ SAD_START_\w
+
+.rept \h / 2 - 1
+ SAD_\w
+.endr
+.if \w > 4
+ add v16.8h, v16.8h, v17.8h
+.endif
+ uaddlv s0, v16.8h
+ fmov w0, s0
+ ret
+endfunc
+.endm
+
+SAD_FUNC 4, 4
+SAD_FUNC 4, 8
+SAD_FUNC 8, 4
+SAD_FUNC 8, 8
+SAD_FUNC 8, 16
+SAD_FUNC 16, 8
+SAD_FUNC 16, 16
+
+.macro SAD_X_4 x, first=uabal
+ ld1 {v0.s}[0], [x0], x7
+ ld1 {v1.s}[0], [x1], x5
+ ld1 {v0.s}[1], [x0], x7
+ ld1 {v1.s}[1], [x1], x5
+ \first v16.8h, v1.8b, v0.8b
+ ld1 {v2.s}[0], [x2], x5
+ ld1 {v2.s}[1], [x2], x5
+ \first v17.8h, v2.8b, v0.8b
+ ld1 {v3.s}[0], [x3], x5
+ ld1 {v3.s}[1], [x3], x5
+ \first v18.8h, v3.8b, v0.8b
+.if \x == 4
+ ld1 {v4.s}[0], [x4], x5
+ ld1 {v4.s}[1], [x4], x5
+ \first v19.8h, v4.8b, v0.8b
+.endif
+.endm
+
+.macro SAD_X_8 x, first=uabal
+ ld1 {v0.8b}, [x0], x7
+ ld1 {v1.8b}, [x1], x5
+ \first v16.8h, v1.8b, v0.8b
+ ld1 {v2.8b}, [x2], x5
+ ld1 {v5.8b}, [x0], x7
+ \first v17.8h, v2.8b, v0.8b
+ ld1 {v3.8b}, [x3], x5
+ ld1 {v1.8b}, [x1], x5
+ \first v18.8h, v3.8b, v0.8b
+ uabal v16.8h, v1.8b, v5.8b
+ ld1 {v2.8b}, [x2], x5
+ ld1 {v3.8b}, [x3], x5
+ uabal v17.8h, v2.8b, v5.8b
+ uabal v18.8h, v3.8b, v5.8b
+.if \x == 4
+ ld1 {v4.8b}, [x4], x5
+ \first v19.8h, v4.8b, v0.8b
+ ld1 {v4.8b}, [x4], x5
+ uabal v19.8h, v4.8b, v5.8b
+.endif
+.endm
+
+.macro SAD_X_16 x, first=uabal
+ ld1 {v0.16b}, [x0], x7
+ ld1 {v1.16b}, [x1], x5
+ \first v16.8h, v1.8b, v0.8b
+ \first\()2 v20.8h, v1.16b, v0.16b
+ ld1 {v2.16b}, [x2], x5
+ ld1 {v5.16b}, [x0], x7
+ \first v17.8h, v2.8b, v0.8b
+ \first\()2 v21.8h, v2.16b, v0.16b
+ ld1 {v3.16b}, [x3], x5
+ ld1 {v1.16b}, [x1], x5
+ \first v18.8h, v3.8b, v0.8b
+ \first\()2 v22.8h, v3.16b, v0.16b
+ uabal v16.8h, v1.8b, v5.8b
+ uabal2 v20.8h, v1.16b, v5.16b
+ ld1 {v2.16b}, [x2], x5
+ ld1 {v3.16b}, [x3], x5
+ uabal v17.8h, v2.8b, v5.8b
+ uabal2 v21.8h, v2.16b, v5.16b
+ uabal v18.8h, v3.8b, v5.8b
+ uabal2 v22.8h, v3.16b, v5.16b
+.if \x == 4
+ ld1 {v4.16b}, [x4], x5
+ \first v19.8h, v4.8b, v0.8b
+ \first\()2 v23.8h, v4.16b, v0.16b
+ ld1 {v4.16b}, [x4], x5
+ uabal v19.8h, v4.8b, v5.8b
+ uabal2 v23.8h, v4.16b, v5.16b
+.endif
+.endm
+
+.macro SAD_X_FUNC x, w, h
+function x264_pixel_sad_x\x\()_\w\()x\h\()_neon, export=1
+.if \x == 3
+ mov x6, x5
+ mov x5, x4
+.endif
+ mov x7, #FENC_STRIDE
+
+ SAD_X_\w \x, uabdl
+
+.rept \h / 2 - 1
+ SAD_X_\w \x
+.endr
+
+.if \w > 8
+ add v16.8h, v16.8h, v20.8h
+ add v17.8h, v17.8h, v21.8h
+ add v18.8h, v18.8h, v22.8h
+.if \x == 4
+ add v19.8h, v19.8h, v23.8h
+.endif
+.endif
+// add up the sads
+ uaddlv s0, v16.8h
+ uaddlv s1, v17.8h
+ uaddlv s2, v18.8h
+
+ stp s0, s1, [x6], #8
+.if \x == 3
+ str s2, [x6]
+.else
+ uaddlv s3, v19.8h
+ stp s2, s3, [x6]
+.endif
+ ret
+endfunc
+.endm
+
+SAD_X_FUNC 3, 4, 4
+SAD_X_FUNC 3, 4, 8
+SAD_X_FUNC 3, 8, 4
+SAD_X_FUNC 3, 8, 8
+SAD_X_FUNC 3, 8, 16
+SAD_X_FUNC 3, 16, 8
+SAD_X_FUNC 3, 16, 16
+
+SAD_X_FUNC 4, 4, 4
+SAD_X_FUNC 4, 4, 8
+SAD_X_FUNC 4, 8, 4
+SAD_X_FUNC 4, 8, 8
+SAD_X_FUNC 4, 8, 16
+SAD_X_FUNC 4, 16, 8
+SAD_X_FUNC 4, 16, 16
+
+
+.macro SSD_START_4
+ ld1 {v16.s}[0], [x0], x1
+ ld1 {v17.s}[0], [x2], x3
+ usubl v2.8h, v16.8b, v17.8b
+ ld1 {v16.s}[0], [x0], x1
+ ld1 {v17.s}[0], [x2], x3
+ smull v0.4s, v2.4h, v2.4h
+.endm
+
+.macro SSD_4
+ usubl v2.8h, v16.8b, v17.8b
+ ld1 {v16.s}[0], [x0], x1
+ ld1 {v17.s}[0], [x2], x3
+ smlal v0.4s, v2.4h, v2.4h
+.endm
+
+.macro SSD_END_4
+ usubl v2.8h, v16.8b, v17.8b
+ smlal v0.4s, v2.4h, v2.4h
+.endm
+
+.macro SSD_START_8
+ ld1 {v16.8b}, [x0], x1
+ ld1 {v17.8b}, [x2], x3
+ usubl v2.8h, v16.8b, v17.8b
+ ld1 {v16.8b}, [x0], x1
+ smull v0.4s, v2.4h, v2.4h
+ ld1 {v17.8b}, [x2], x3
+ smlal2 v0.4s, v2.8h, v2.8h
+.endm
+
+.macro SSD_8
+ usubl v2.8h, v16.8b, v17.8b
+ ld1 {v16.8b}, [x0], x1
+ smlal v0.4s, v2.4h, v2.4h
+ ld1 {v17.8b}, [x2], x3
+ smlal2 v0.4s, v2.8h, v2.8h
+.endm
+
+.macro SSD_END_8
+ usubl v2.8h, v16.8b, v17.8b
+ smlal v0.4s, v2.4h, v2.4h
+ smlal2 v0.4s, v2.8h, v2.8h
+.endm
+
+.macro SSD_START_16
+ ld1 {v16.16b}, [x0], x1
+ ld1 {v17.16b}, [x2], x3
+ usubl v2.8h, v16.8b, v17.8b
+ usubl2 v3.8h, v16.16b, v17.16b
+ ld1 {v16.16b}, [x0], x1
+ smull v0.4s, v2.4h, v2.4h
+ smull2 v1.4s, v2.8h, v2.8h
+ ld1 {v17.16b}, [x2], x3
+ smlal v0.4s, v3.4h, v3.4h
+ smlal2 v1.4s, v3.8h, v3.8h
+.endm
+
+.macro SSD_16
+ usubl v2.8h, v16.8b, v17.8b
+ usubl2 v3.8h, v16.16b, v17.16b
+ ld1 {v16.16b}, [x0], x1
+ smlal v0.4s, v2.4h, v2.4h
+ smlal2 v1.4s, v2.8h, v2.8h
+ ld1 {v17.16b}, [x2], x3
+ smlal v0.4s, v3.4h, v3.4h
+ smlal2 v1.4s, v3.8h, v3.8h
+.endm
+
+.macro SSD_END_16
+ usubl v2.8h, v16.8b, v17.8b
+ usubl2 v3.8h, v16.16b, v17.16b
+ smlal v0.4s, v2.4h, v2.4h
+ smlal2 v1.4s, v2.8h, v2.8h
+ smlal v0.4s, v3.4h, v3.4h
+ smlal2 v1.4s, v3.8h, v3.8h
+ add v0.4s, v0.4s, v1.4s
+.endm
+
+.macro SSD_FUNC w h
+function x264_pixel_ssd_\w\()x\h\()_neon, export=1
+ SSD_START_\w
+.rept \h-2
+ SSD_\w
+.endr
+ SSD_END_\w
+
+ addv s0, v0.4s
+ mov w0, v0.s[0]
+ ret
+endfunc
+.endm
+
+SSD_FUNC 4, 4
+SSD_FUNC 4, 8
+SSD_FUNC 8, 4
+SSD_FUNC 8, 8
+SSD_FUNC 8, 16
+SSD_FUNC 16, 8
+SSD_FUNC 16, 16
+
+.macro pixel_var_8 h
+function x264_pixel_var_8x\h\()_neon, export=1
+ ld1 {v16.8b}, [x0], x1
+ ld1 {v17.8b}, [x0], x1
+ mov x2, \h - 4
+ umull v1.8h, v16.8b, v16.8b
+ uxtl v0.8h, v16.8b
+ umull v2.8h, v17.8b, v17.8b
+ uaddw v0.8h, v0.8h, v17.8b
+ ld1 {v18.8b}, [x0], x1
+ uaddlp v1.4s, v1.8h
+ uaddlp v2.4s, v2.8h
+ ld1 {v19.8b}, [x0], x1
+
+1: subs x2, x2, #4
+ uaddw v0.8h, v0.8h, v18.8b
+ umull v24.8h, v18.8b, v18.8b
+ ld1 {v20.8b}, [x0], x1
+ uaddw v0.8h, v0.8h, v19.8b
+ umull v25.8h, v19.8b, v19.8b
+ uadalp v1.4s, v24.8h
+ ld1 {v21.8b}, [x0], x1
+ uaddw v0.8h, v0.8h, v20.8b
+ umull v26.8h, v20.8b, v20.8b
+ uadalp v2.4s, v25.8h
+ ld1 {v18.8b}, [x0], x1
+ uaddw v0.8h, v0.8h, v21.8b
+ umull v27.8h, v21.8b, v21.8b
+ uadalp v1.4s, v26.8h
+ ld1 {v19.8b}, [x0], x1
+ uadalp v2.4s, v27.8h
+ b.gt 1b
+
+ uaddw v0.8h, v0.8h, v18.8b
+ umull v28.8h, v18.8b, v18.8b
+ uaddw v0.8h, v0.8h, v19.8b
+ umull v29.8h, v19.8b, v19.8b
+ uadalp v1.4s, v28.8h
+ uadalp v2.4s, v29.8h
+
+ b x264_var_end
+endfunc
+.endm
+
+pixel_var_8 8
+pixel_var_8 16
+
+function x264_pixel_var_16x16_neon, export=1
+ ld1 {v16.16b}, [x0], x1
+ ld1 {v17.16b}, [x0], x1
+ mov x2, #14
+ umull v1.8h, v16.8b, v16.8b
+ umull2 v2.8h, v16.16b, v16.16b
+ uxtl v0.8h, v16.8b
+ uaddlp v1.4s, v1.8h
+ uaddlp v2.4s, v2.8h
+ uaddw2 v0.8h, v0.8h, v16.16b
+
+1: subs x2, x2, #2
+ ld1 {v18.16b}, [x0], x1
+ uaddw v0.8h, v0.8h, v17.8b
+ umull v3.8h, v17.8b, v17.8b
+ uaddw2 v0.8h, v0.8h, v17.16b
+ umull2 v4.8h, v17.16b, v17.16b
+ uadalp v1.4s, v3.8h
+ uadalp v2.4s, v4.8h
+
+ ld1 {v17.16b}, [x0], x1
+ uaddw v0.8h, v0.8h, v18.8b
+ umull v5.8h, v18.8b, v18.8b
+ uaddw2 v0.8h, v0.8h, v18.16b
+ umull2 v6.8h, v18.16b, v18.16b
+ uadalp v1.4s, v5.8h
+ uadalp v2.4s, v6.8h
+ b.gt 1b
+
+ uaddw v0.8h, v0.8h, v17.8b
+ umull v3.8h, v17.8b, v17.8b
+ uaddw2 v0.8h, v0.8h, v17.16b
+ umull2 v4.8h, v17.16b, v17.16b
+ uadalp v1.4s, v3.8h
+ uadalp v2.4s, v4.8h
+endfunc
+
+function x264_var_end
+ add v1.4s, v1.4s, v2.4s
+ uaddlv s0, v0.8h
+ uaddlv d1, v1.4s
+ mov w0, v0.s[0]
+ mov x1, v1.d[0]
+ orr x0, x0, x1, lsl #32
+ ret
+endfunc
+
+
+.macro pixel_var2_8 h
+function x264_pixel_var2_8x\h\()_neon, export=1
+ ld1 {v16.8b}, [x0], x1
+ ld1 {v18.8b}, [x2], x3
+ ld1 {v17.8b}, [x0], x1
+ ld1 {v19.8b}, [x2], x3
+ mov x5, \h - 4
+ usubl v6.8h, v16.8b, v18.8b
+ usubl v7.8h, v17.8b, v19.8b
+ ld1 {v16.8b}, [x0], x1
+ ld1 {v18.8b}, [x2], x3
+ smull v2.4s, v6.4h, v6.4h
+ smull2 v3.4s, v6.8h, v6.8h
+ add v0.8h, v6.8h, v7.8h
+ smlal v2.4s, v7.4h, v7.4h
+ smlal2 v3.4s, v7.8h, v7.8h
+
+ usubl v6.8h, v16.8b, v18.8b
+
+1: subs x5, x5, #2
+ ld1 {v17.8b}, [x0], x1
+ ld1 {v19.8b}, [x2], x3
+ smlal v2.4s, v6.4h, v6.4h
+ smlal2 v3.4s, v6.8h, v6.8h
+ usubl v7.8h, v17.8b, v19.8b
+ add v0.8h, v0.8h, v6.8h
+ ld1 {v16.8b}, [x0], x1
+ ld1 {v18.8b}, [x2], x3
+ smlal v2.4s, v7.4h, v7.4h
+ smlal2 v3.4s, v7.8h, v7.8h
+ usubl v6.8h, v16.8b, v18.8b
+ add v0.8h, v0.8h, v7.8h
+ b.gt 1b
+
+ ld1 {v17.8b}, [x0], x1
+ ld1 {v19.8b}, [x2], x3
+ smlal v2.4s, v6.4h, v6.4h
+ smlal2 v3.4s, v6.8h, v6.8h
+ usubl v7.8h, v17.8b, v19.8b
+ add v0.8h, v0.8h, v6.8h
+ smlal v2.4s, v7.4h, v7.4h
+ add v0.8h, v0.8h, v7.8h
+ smlal2 v3.4s, v7.8h, v7.8h
+
+ saddlv s0, v0.8h
+ add v2.4s, v2.4s, v3.4s
+ mov w0, v0.s[0]
+ addv s1, v2.4s
+ sxtw x0, w0
+ mov w1, v1.s[0]
+ mul x0, x0, x0
+ str w1, [x4]
+ sub x0, x1, x0, lsr # 6 + (\h >> 4)
+
+ ret
+endfunc
+.endm
+
+pixel_var2_8 8
+pixel_var2_8 16
+
+
+function x264_pixel_satd_4x4_neon, export=1
+ ld1 {v1.s}[0], [x2], x3
+ ld1 {v0.s}[0], [x0], x1
+ ld1 {v3.s}[0], [x2], x3
+ ld1 {v2.s}[0], [x0], x1
+ ld1 {v1.s}[1], [x2], x3
+ ld1 {v0.s}[1], [x0], x1
+ ld1 {v3.s}[1], [x2], x3
+ ld1 {v2.s}[1], [x0], x1
+
+ usubl v0.8h, v0.8b, v1.8b
+ usubl v1.8h, v2.8b, v3.8b
+
+ add v2.8h, v0.8h, v1.8h
+ sub v3.8h, v0.8h, v1.8h
+
+ zip1 v0.2d, v2.2d, v3.2d
+ zip2 v1.2d, v2.2d, v3.2d
+ add v2.8h, v0.8h, v1.8h
+ sub v3.8h, v0.8h, v1.8h
+
+ trn1 v0.8h, v2.8h, v3.8h
+ trn2 v1.8h, v2.8h, v3.8h
+ add v2.8h, v0.8h, v1.8h
+ sub v3.8h, v0.8h, v1.8h
+
+ trn1 v0.4s, v2.4s, v3.4s
+ trn2 v1.4s, v2.4s, v3.4s
+ abs v0.8h, v0.8h
+ abs v1.8h, v1.8h
+ umax v0.8h, v0.8h, v1.8h
+
+ uaddlv s0, v0.8h
+ mov w0, v0.s[0]
+ ret
+endfunc
+
+function x264_pixel_satd_4x8_neon, export=1
+ ld1 {v1.s}[0], [x2], x3
+ ld1 {v0.s}[0], [x0], x1
+ ld1 {v3.s}[0], [x2], x3
+ ld1 {v2.s}[0], [x0], x1
+ ld1 {v5.s}[0], [x2], x3
+ ld1 {v4.s}[0], [x0], x1
+ ld1 {v7.s}[0], [x2], x3
+ ld1 {v6.s}[0], [x0], x1
+ ld1 {v1.s}[1], [x2], x3
+ ld1 {v0.s}[1], [x0], x1
+ ld1 {v3.s}[1], [x2], x3
+ ld1 {v2.s}[1], [x0], x1
+ ld1 {v5.s}[1], [x2], x3
+ ld1 {v4.s}[1], [x0], x1
+ ld1 {v7.s}[1], [x2], x3
+ ld1 {v6.s}[1], [x0], x1
+ b x264_satd_4x8_8x4_end_neon
+endfunc
+
+function x264_pixel_satd_8x4_neon, export=1
+ ld1 {v1.8b}, [x2], x3
+ ld1 {v0.8b}, [x0], x1
+ ld1 {v3.8b}, [x2], x3
+ ld1 {v2.8b}, [x0], x1
+ ld1 {v5.8b}, [x2], x3
+ ld1 {v4.8b}, [x0], x1
+ ld1 {v7.8b}, [x2], x3
+ ld1 {v6.8b}, [x0], x1
+endfunc
+
+function x264_satd_4x8_8x4_end_neon
+ usubl v0.8h, v0.8b, v1.8b
+ usubl v1.8h, v2.8b, v3.8b
+ usubl v2.8h, v4.8b, v5.8b
+ usubl v3.8h, v6.8b, v7.8b
+
+ add v16.8h, v0.8h, v1.8h
+ sub v17.8h, v0.8h, v1.8h
+ add v18.8h, v2.8h, v3.8h
+ sub v19.8h, v2.8h, v3.8h
+
+ add v4.8h, v16.8h, v18.8h
+ add v5.8h, v17.8h, v19.8h
+ sub v6.8h, v16.8h, v18.8h
+ sub v7.8h, v17.8h, v19.8h
+
+ trn1 v0.8h, v4.8h, v5.8h
+ trn2 v1.8h, v4.8h, v5.8h
+ trn1 v2.8h, v6.8h, v7.8h
+ trn2 v3.8h, v6.8h, v7.8h
+ add v16.8h, v0.8h, v1.8h
+ sub v17.8h, v0.8h, v1.8h
+ add v18.8h, v2.8h, v3.8h
+ sub v19.8h, v2.8h, v3.8h
+
+ trn1 v0.4s, v16.4s, v18.4s
+ trn2 v1.4s, v16.4s, v18.4s
+ trn1 v2.4s, v17.4s, v19.4s
+ trn2 v3.4s, v17.4s, v19.4s
+ abs v0.8h, v0.8h
+ abs v1.8h, v1.8h
+ abs v2.8h, v2.8h
+ abs v3.8h, v3.8h
+ umax v0.8h, v0.8h, v1.8h
+ umax v1.8h, v2.8h, v3.8h
+ add v0.8h, v0.8h, v1.8h
+ uaddlv s0, v0.8h
+ mov w0, v0.s[0]
+ ret
+endfunc
+
+function x264_pixel_satd_8x8_neon, export=1
+ mov x4, x30
+
+ bl x264_satd_8x8_neon
+ add v0.8h, v0.8h, v1.8h
+ add v1.8h, v2.8h, v3.8h
+ add v0.8h, v0.8h, v1.8h
+ uaddlv s0, v0.8h
+ mov w0, v0.s[0]
+ ret x4
+endfunc
+
+function x264_pixel_satd_8x16_neon, export=1
+ mov x4, x30
+
+ bl x264_satd_8x8_neon
+ add v0.8h, v0.8h, v1.8h
+ add v1.8h, v2.8h, v3.8h
+ add v30.8h, v0.8h, v1.8h
+
+ bl x264_satd_8x8_neon
+ add v0.8h, v0.8h, v1.8h
+ add v1.8h, v2.8h, v3.8h
+ add v31.8h, v0.8h, v1.8h
+ add v0.8h, v30.8h, v31.8h
+ uaddlv s0, v0.8h
+ mov w0, v0.s[0]
+ ret x4
+endfunc
+
+.macro SUMSUBL_AB sum, sub, a, b
+ uaddl \sum, \a, \b
+ usubl \sub, \a, \b
+.endm
+
+.macro load_diff_fly_8x8
+ ld1 {v1.8b}, [x2], x3
+ ld1 {v0.8b}, [x0], x1
+ ld1 {v3.8b}, [x2], x3
+ ld1 {v2.8b}, [x0], x1
+ usubl v16.8h, v0.8b, v1.8b
+ ld1 {v5.8b}, [x2], x3
+ ld1 {v4.8b}, [x0], x1
+ usubl v17.8h, v2.8b, v3.8b
+ ld1 {v7.8b}, [x2], x3
+ ld1 {v6.8b}, [x0], x1
+ usubl v18.8h, v4.8b, v5.8b
+ ld1 {v1.8b}, [x2], x3
+ ld1 {v0.8b}, [x0], x1
+ usubl v19.8h, v6.8b, v7.8b
+ ld1 {v3.8b}, [x2], x3
+ ld1 {v2.8b}, [x0], x1
+ usubl v20.8h, v0.8b, v1.8b
+ ld1 {v5.8b}, [x2], x3
+ ld1 {v4.8b}, [x0], x1
+ usubl v21.8h, v2.8b, v3.8b
+ ld1 {v7.8b}, [x2], x3
+ ld1 {v6.8b}, [x0], x1
+
+ SUMSUB_AB v0.8h, v1.8h, v16.8h, v17.8h
+ SUMSUB_AB v2.8h, v3.8h, v18.8h, v19.8h
+
+ usubl v22.8h, v4.8b, v5.8b
+ usubl v23.8h, v6.8b, v7.8b
+
+ SUMSUB_AB v16.8h, v18.8h, v0.8h, v2.8h
+ SUMSUB_AB v17.8h, v19.8h, v1.8h, v3.8h
+.endm
+
+.macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d
+ SUMSUB_AB \s1, \d1, \a, \b
+ SUMSUB_AB \s2, \d2, \c, \d
+.endm
+
+.macro HADAMARD4_V r1, r2, r3, r4, t1, t2, t3, t4
+ SUMSUB_ABCD \t1, \t2, \t3, \t4, \r1, \r2, \r3, \r4
+ SUMSUB_ABCD \r1, \r3, \r2, \r4, \t1, \t3, \t2, \t4
+.endm
+
+function x264_satd_8x8_neon
+ load_diff_fly_8x8
+endfunc
+
+// one vertical hadamard pass and two horizontal
+function x264_satd_8x4v_8x8h_neon
+ HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h
+
+ transpose v0.8h, v1.8h, v16.8h, v17.8h
+ transpose v2.8h, v3.8h, v18.8h, v19.8h
+ transpose v4.8h, v5.8h, v20.8h, v21.8h
+ transpose v6.8h, v7.8h, v22.8h, v23.8h
+
+ SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h
+ SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h
+ SUMSUB_AB v20.8h, v21.8h, v4.8h, v5.8h
+ SUMSUB_AB v22.8h, v23.8h, v6.8h, v7.8h
+
+ transpose v0.4s, v2.4s, v16.4s, v18.4s
+ transpose v1.4s, v3.4s, v17.4s, v19.4s
+ transpose v4.4s, v6.4s, v20.4s, v22.4s
+ transpose v5.4s, v7.4s, v21.4s, v23.4s
+
+ abs v0.8h, v0.8h
+ abs v1.8h, v1.8h
+ abs v2.8h, v2.8h
+ abs v3.8h, v3.8h
+ abs v4.8h, v4.8h
+ abs v5.8h, v5.8h
+ abs v6.8h, v6.8h
+ abs v7.8h, v7.8h
+
+ umax v0.8h, v0.8h, v2.8h
+ umax v1.8h, v1.8h, v3.8h
+ umax v2.8h, v4.8h, v6.8h
+ umax v3.8h, v5.8h, v7.8h
+
+ ret
+endfunc
+
+function x264_pixel_satd_16x8_neon, export=1
+ mov x4, x30
+
+ bl x264_satd_16x4_neon
+ add v30.8h, v0.8h, v1.8h
+ add v31.8h, v2.8h, v3.8h
+
+ bl x264_satd_16x4_neon
+ add v0.8h, v0.8h, v1.8h
+ add v1.8h, v2.8h, v3.8h
+ add v30.8h, v30.8h, v0.8h
+ add v31.8h, v31.8h, v1.8h
+
+ add v0.8h, v30.8h, v31.8h
+ uaddlv s0, v0.8h
+ mov w0, v0.s[0]
+ ret x4
+endfunc
+
+function x264_pixel_satd_16x16_neon, export=1
+ mov x4, x30
+
+ bl x264_satd_16x4_neon
+ add v30.8h, v0.8h, v1.8h
+ add v31.8h, v2.8h, v3.8h
+
+ bl x264_satd_16x4_neon
+ add v0.8h, v0.8h, v1.8h
+ add v1.8h, v2.8h, v3.8h
+ add v30.8h, v30.8h, v0.8h
+ add v31.8h, v31.8h, v1.8h
+
+ bl x264_satd_16x4_neon
+ add v0.8h, v0.8h, v1.8h
+ add v1.8h, v2.8h, v3.8h
+ add v30.8h, v30.8h, v0.8h
+ add v31.8h, v31.8h, v1.8h
+
+ bl x264_satd_16x4_neon
+ add v0.8h, v0.8h, v1.8h
+ add v1.8h, v2.8h, v3.8h
+ add v30.8h, v30.8h, v0.8h
+ add v31.8h, v31.8h, v1.8h
+
+ add v0.8h, v30.8h, v31.8h
+ uaddlv s0, v0.8h
+ mov w0, v0.s[0]
+ ret x4
+endfunc
+
+function x264_satd_16x4_neon
+ ld1 {v1.16b}, [x2], x3
+ ld1 {v0.16b}, [x0], x1
+ ld1 {v3.16b}, [x2], x3
+ ld1 {v2.16b}, [x0], x1
+ ld1 {v5.16b}, [x2], x3
+ ld1 {v4.16b}, [x0], x1
+ ld1 {v7.16b}, [x2], x3
+ ld1 {v6.16b}, [x0], x1
+
+ usubl v16.8h, v0.8b, v1.8b
+ usubl2 v20.8h, v0.16b, v1.16b
+ usubl v17.8h, v2.8b, v3.8b
+ usubl2 v21.8h, v2.16b, v3.16b
+ usubl v18.8h, v4.8b, v5.8b
+ usubl2 v22.8h, v4.16b, v5.16b
+ usubl v19.8h, v6.8b, v7.8b
+ usubl2 v23.8h, v6.16b, v7.16b
+
+ SUMSUB_AB v0.8h, v1.8h, v16.8h, v17.8h
+ SUMSUB_AB v2.8h, v3.8h, v18.8h, v19.8h
+
+ SUMSUB_AB v16.8h, v18.8h, v0.8h, v2.8h
+ SUMSUB_AB v17.8h, v19.8h, v1.8h, v3.8h
+
+ b x264_satd_8x4v_8x8h_neon
+endfunc
+
+
+function x264_pixel_sa8d_8x8_neon, export=1
+ mov x4, x30
+ bl x264_sa8d_8x8_neon
+ add v0.8h, v0.8h, v1.8h
+ uaddlv s0, v0.8h
+ mov w0, v0.s[0]
+ add w0, w0, #1
+ lsr w0, w0, #1
+ ret x4
+endfunc
+
+function x264_pixel_sa8d_16x16_neon, export=1
+ mov x4, x30
+ bl x264_sa8d_8x8_neon
+ uaddlp v30.4s, v0.8h
+ uaddlp v31.4s, v1.8h
+ bl x264_sa8d_8x8_neon
+ uadalp v30.4s, v0.8h
+ uadalp v31.4s, v1.8h
+ sub x0, x0, x1, lsl #4
+ sub x2, x2, x3, lsl #4
+ add x0, x0, #8
+ add x2, x2, #8
+ bl x264_sa8d_8x8_neon
+ uadalp v30.4s, v0.8h
+ uadalp v31.4s, v1.8h
+ bl x264_sa8d_8x8_neon
+ uadalp v30.4s, v0.8h
+ uadalp v31.4s, v1.8h
+ add v0.4s, v30.4s, v31.4s
+ addv s0, v0.4s
+ mov w0, v0.s[0]
+ add w0, w0, #1
+ lsr w0, w0, #1
+ ret x4
+endfunc
+
+function x264_sa8d_8x8_neon
+ load_diff_fly_8x8
+
+ HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h
+ SUMSUB_AB v0.8h, v16.8h, v16.8h, v20.8h
+ SUMSUB_AB v1.8h, v17.8h, v17.8h, v21.8h
+ SUMSUB_AB v2.8h, v18.8h, v18.8h, v22.8h
+ SUMSUB_AB v3.8h, v19.8h, v19.8h, v23.8h
+
+ transpose v20.8h, v21.8h, v16.8h, v17.8h
+ transpose v4.8h, v5.8h, v0.8h, v1.8h
+ transpose v22.8h, v23.8h, v18.8h, v19.8h
+ transpose v6.8h, v7.8h, v2.8h, v3.8h
+
+ SUMSUB_AB v28.8h, v29.8h, v20.8h, v21.8h
+ SUMSUB_AB v24.8h, v25.8h, v4.8h, v5.8h
+ SUMSUB_AB v0.8h, v1.8h, v22.8h, v23.8h
+ SUMSUB_AB v26.8h, v27.8h, v6.8h, v7.8h
+
+ transpose v20.4s, v22.4s, v28.4s, v0.4s
+ transpose v21.4s, v23.4s, v29.4s, v1.4s
+ transpose v16.4s, v18.4s, v24.4s, v26.4s
+ transpose v17.4s, v19.4s, v25.4s, v27.4s
+
+ SUMSUB_AB v0.8h, v2.8h, v20.8h, v22.8h
+ SUMSUB_AB v1.8h, v3.8h, v21.8h, v23.8h
+ SUMSUB_AB v4.8h, v6.8h, v16.8h, v18.8h
+ SUMSUB_AB v5.8h, v7.8h, v17.8h, v19.8h
+
+ transpose v16.2d, v20.2d, v0.2d, v4.2d
+ transpose v17.2d, v21.2d, v1.2d, v5.2d
+ transpose v18.2d, v22.2d, v2.2d, v6.2d
+ transpose v19.2d, v23.2d, v3.2d, v7.2d
+
+ abs v16.8h, v16.8h
+ abs v20.8h, v20.8h
+ abs v17.8h, v17.8h
+ abs v21.8h, v21.8h
+ abs v18.8h, v18.8h
+ abs v22.8h, v22.8h
+ abs v19.8h, v19.8h
+ abs v23.8h, v23.8h
+
+ umax v16.8h, v16.8h, v20.8h
+ umax v17.8h, v17.8h, v21.8h
+ umax v18.8h, v18.8h, v22.8h
+ umax v19.8h, v19.8h, v23.8h
+
+ add v0.8h, v16.8h, v17.8h
+ add v1.8h, v18.8h, v19.8h
+
+ ret
+endfunc
+
+
+.macro HADAMARD_AC w h
+function x264_pixel_hadamard_ac_\w\()x\h\()_neon, export=1
+ movrel x5, mask_ac_4_8
+ mov x4, x30
+ ld1 {v30.8h,v31.8h}, [x5]
+ movi v28.16b, #0
+ movi v29.16b, #0
+
+ bl x264_hadamard_ac_8x8_neon
+.if \h > 8
+ bl x264_hadamard_ac_8x8_neon
+.endif
+.if \w > 8
+ sub x0, x0, x1, lsl #3
+ add x0, x0, #8
+ bl x264_hadamard_ac_8x8_neon
+.endif
+.if \w * \h == 256
+ sub x0, x0, x1, lsl #4
+ bl x264_hadamard_ac_8x8_neon
+.endif
+
+ addv s1, v29.4s
+ addv s0, v28.4s
+ mov w1, v1.s[0]
+ mov w0, v0.s[0]
+ lsr w1, w1, #2
+ lsr w0, w0, #1
+ orr x0, x0, x1, lsl #32
+ ret x4
+endfunc
+.endm
+
+HADAMARD_AC 8, 8
+HADAMARD_AC 8, 16
+HADAMARD_AC 16, 8
+HADAMARD_AC 16, 16
+
+// v28: satd v29: sa8d v30: mask_ac4 v31: mask_ac8
+function x264_hadamard_ac_8x8_neon
+ ld1 {v16.8b}, [x0], x1
+ ld1 {v17.8b}, [x0], x1
+ ld1 {v18.8b}, [x0], x1
+ ld1 {v19.8b}, [x0], x1
+ SUMSUBL_AB v0.8h, v1.8h, v16.8b, v17.8b
+ ld1 {v20.8b}, [x0], x1
+ ld1 {v21.8b}, [x0], x1
+ SUMSUBL_AB v2.8h, v3.8h, v18.8b, v19.8b
+ ld1 {v22.8b}, [x0], x1
+ ld1 {v23.8b}, [x0], x1
+ SUMSUBL_AB v4.8h, v5.8h, v20.8b, v21.8b
+ SUMSUBL_AB v6.8h, v7.8h, v22.8b, v23.8b
+
+ SUMSUB_ABCD v16.8h, v18.8h, v17.8h, v19.8h, v0.8h, v2.8h, v1.8h, v3.8h
+ SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h, v6.8h, v5.8h, v7.8h
+
+ transpose v0.8h, v1.8h, v16.8h, v17.8h
+ transpose v2.8h, v3.8h, v18.8h, v19.8h
+ transpose v4.8h, v5.8h, v20.8h, v21.8h
+ transpose v6.8h, v7.8h, v22.8h, v23.8h
+
+ SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h
+ SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h
+ SUMSUB_AB v20.8h, v21.8h, v4.8h, v5.8h
+ SUMSUB_AB v22.8h, v23.8h, v6.8h, v7.8h
+
+ transpose v0.4s, v2.4s, v16.4s, v18.4s
+ transpose v1.4s, v3.4s, v17.4s, v19.4s
+ transpose v4.4s, v6.4s, v20.4s, v22.4s
+ transpose v5.4s, v7.4s, v21.4s, v23.4s
+
+ SUMSUB_AB v16.8h, v18.8h, v0.8h, v2.8h
+ SUMSUB_AB v17.8h, v19.8h, v1.8h, v3.8h
+ SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h, v6.8h, v5.8h, v7.8h
+
+ abs v0.8h, v16.8h
+ abs v4.8h, v20.8h
+ abs v1.8h, v17.8h
+ abs v5.8h, v21.8h
+ abs v2.8h, v18.8h
+ abs v6.8h, v22.8h
+ abs v3.8h, v19.8h
+ abs v7.8h, v23.8h
+
+ add v0.8h, v0.8h, v4.8h
+ add v1.8h, v1.8h, v5.8h
+ and v0.16b, v0.16b, v30.16b
+ add v2.8h, v2.8h, v6.8h
+ add v3.8h, v3.8h, v7.8h
+ add v0.8h, v0.8h, v2.8h
+ add v1.8h, v1.8h, v3.8h
+ uadalp v28.4s, v0.8h
+ uadalp v28.4s, v1.8h
+
+ SUMSUB_AB v6.8h, v7.8h, v23.8h, v19.8h
+ SUMSUB_AB v4.8h, v5.8h, v22.8h, v18.8h
+ SUMSUB_AB v2.8h, v3.8h, v21.8h, v17.8h
+ SUMSUB_AB v1.8h, v0.8h, v16.8h, v20.8h
+
+ transpose v16.2d, v17.2d, v6.2d, v7.2d
+ transpose v18.2d, v19.2d, v4.2d, v5.2d
+ transpose v20.2d, v21.2d, v2.2d, v3.2d
+
+ abs v16.8h, v16.8h
+ abs v17.8h, v17.8h
+ abs v18.8h, v18.8h
+ abs v19.8h, v19.8h
+ abs v20.8h, v20.8h
+ abs v21.8h, v21.8h
+
+ transpose v7.2d, v6.2d, v1.2d, v0.2d
+
+ umax v3.8h, v16.8h, v17.8h
+ umax v2.8h, v18.8h, v19.8h
+ umax v1.8h, v20.8h, v21.8h
+
+ SUMSUB_AB v4.8h, v5.8h, v7.8h, v6.8h
+
+ add v2.8h, v2.8h, v3.8h
+ add v2.8h, v2.8h, v1.8h
+ and v4.16b, v4.16b, v31.16b
+ add v2.8h, v2.8h, v2.8h
+ abs v5.8h, v5.8h
+ abs v4.8h, v4.8h
+ add v2.8h, v2.8h, v5.8h
+ add v2.8h, v2.8h, v4.8h
+ uadalp v29.4s, v2.8h
+ ret
+endfunc
+
+
+function x264_pixel_ssim_4x4x2_core_neon, export=1
+ ld1 {v0.8b}, [x0], x1
+ ld1 {v2.8b}, [x2], x3
+ umull v16.8h, v0.8b, v0.8b
+ umull v17.8h, v0.8b, v2.8b
+ umull v18.8h, v2.8b, v2.8b
+
+ ld1 {v28.8b}, [x0], x1
+ ld1 {v29.8b}, [x2], x3
+ umull v20.8h, v28.8b, v28.8b
+ umull v21.8h, v28.8b, v29.8b
+ umull v22.8h, v29.8b, v29.8b
+
+ uaddlp v16.4s, v16.8h
+ uaddlp v17.4s, v17.8h
+ uaddl v0.8h, v0.8b, v28.8b
+ uadalp v16.4s, v18.8h
+ uaddl v1.8h, v2.8b, v29.8b
+
+ ld1 {v26.8b}, [x0], x1
+ ld1 {v27.8b}, [x2], x3
+ umull v23.8h, v26.8b, v26.8b
+ umull v24.8h, v26.8b, v27.8b
+ umull v25.8h, v27.8b, v27.8b
+
+ uadalp v16.4s, v20.8h
+ uaddw v0.8h, v0.8h, v26.8b
+ uadalp v17.4s, v21.8h
+ uaddw v1.8h, v1.8h, v27.8b
+ uadalp v16.4s, v22.8h
+
+ ld1 {v28.8b}, [x0], x1
+ ld1 {v29.8b}, [x2], x3
+ umull v20.8h, v28.8b, v28.8b
+ umull v21.8h, v28.8b, v29.8b
+ umull v22.8h, v29.8b, v29.8b
+
+ uadalp v16.4s, v23.8h
+ uaddw v0.8h, v0.8h, v28.8b
+ uadalp v17.4s, v24.8h
+ uaddw v1.8h, v1.8h, v29.8b
+ uadalp v16.4s, v25.8h
+
+ uadalp v16.4s, v20.8h
+ uadalp v17.4s, v21.8h
+ uadalp v16.4s, v22.8h
+
+ uaddlp v0.4s, v0.8h
+ uaddlp v1.4s, v1.8h
+
+ addp v0.4s, v0.4s, v0.4s
+ addp v1.4s, v1.4s, v1.4s
+ addp v2.4s, v16.4s, v16.4s
+ addp v3.4s, v17.4s, v17.4s
+
+ st4 {v0.2s,v1.2s,v2.2s,v3.2s}, [x4]
+ ret
+endfunc
+
+function x264_pixel_ssim_end4_neon, export=1
+ mov x5, #4
+ ld1 {v16.4s,v17.4s}, [x0], #32
+ ld1 {v18.4s,v19.4s}, [x1], #32
+ mov w4, #0x99bb
+ subs x2, x5, w2, uxtw
+ mov w3, #416 // ssim_c1 = .01*.01*255*255*64
+ movk w4, #0x03, lsl #16 // ssim_c2 = .03*.03*255*255*64*63
+ add v0.4s, v16.4s, v18.4s
+ add v1.4s, v17.4s, v19.4s
+ add v0.4s, v0.4s, v1.4s
+ ld1 {v20.4s,v21.4s}, [x0], #32
+ ld1 {v22.4s,v23.4s}, [x1], #32
+ add v2.4s, v20.4s, v22.4s
+ add v3.4s, v21.4s, v23.4s
+ add v1.4s, v1.4s, v2.4s
+ ld1 {v16.4s}, [x0], #16
+ ld1 {v18.4s}, [x1], #16
+ add v16.4s, v16.4s, v18.4s
+ add v2.4s, v2.4s, v3.4s
+ add v3.4s, v3.4s, v16.4s
+
+ dup v30.4s, w3
+ dup v31.4s, w4
+
+ transpose v4.4s, v5.4s, v0.4s, v1.4s
+ transpose v6.4s, v7.4s, v2.4s, v3.4s
+ transpose v0.2d, v2.2d, v4.2d, v6.2d
+ transpose v1.2d, v3.2d, v5.2d, v7.2d
+
+ mul v16.4s, v0.4s, v1.4s // s1*s2
+ mul v0.4s, v0.4s, v0.4s
+ mla v0.4s, v1.4s, v1.4s // s1*s1 + s2*s2
+
+ shl v3.4s, v3.4s, #7
+ shl v2.4s, v2.4s, #6
+ add v1.4s, v16.4s, v16.4s
+
+ sub v2.4s, v2.4s, v0.4s // vars
+ sub v3.4s, v3.4s, v1.4s // covar*2
+ add v0.4s, v0.4s, v30.4s
+ add v2.4s, v2.4s, v31.4s
+ add v1.4s, v1.4s, v30.4s
+ add v3.4s, v3.4s, v31.4s
+
+ scvtf v0.4s, v0.4s
+ scvtf v2.4s, v2.4s
+ scvtf v1.4s, v1.4s
+ scvtf v3.4s, v3.4s
+
+ fmul v0.4s, v0.4s, v2.4s
+ fmul v1.4s, v1.4s, v3.4s
+
+ fdiv v0.4s, v1.4s, v0.4s
+
+ b.eq 1f
+ movrel x3, mask
+ add x3, x3, x2, lsl #2
+ ld1 {v29.4s}, [x3]
+ and v0.16b, v0.16b, v29.16b
+1:
+ faddp v0.4s, v0.4s, v0.4s
+ faddp s0, v0.2s
+ ret
+endfunc
diff --git a/common/aarch64/pixel.h b/common/aarch64/pixel.h
new file mode 100644
index 0000000..8f562f6
--- /dev/null
+++ b/common/aarch64/pixel.h
@@ -0,0 +1,72 @@
+/*****************************************************************************
+ * pixel.h: aarch64 pixel metrics
+ *****************************************************************************
+ * Copyright (C) 2009-2014 x264 project
+ *
+ * Authors: David Conrad <lessen42 at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+#ifndef X264_AARCH64_PIXEL_H
+#define X264_AARCH64_PIXEL_H
+
+#define DECL_PIXELS( ret, name, suffix, args ) \
+ ret x264_pixel_##name##_16x16_##suffix args;\
+ ret x264_pixel_##name##_16x8_##suffix args;\
+ ret x264_pixel_##name##_8x16_##suffix args;\
+ ret x264_pixel_##name##_8x8_##suffix args;\
+ ret x264_pixel_##name##_8x4_##suffix args;\
+ ret x264_pixel_##name##_4x8_##suffix args;\
+ ret x264_pixel_##name##_4x4_##suffix args;\
+
+#define DECL_X1( name, suffix ) \
+ DECL_PIXELS( int, name, suffix, ( uint8_t *, intptr_t, uint8_t *, intptr_t ) )
+
+#define DECL_X4( name, suffix ) \
+ DECL_PIXELS( void, name##_x3, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )\
+ DECL_PIXELS( void, name##_x4, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )
+
+int x264_pixel_sad_4x4_armv6( uint8_t *, intptr_t, uint8_t *, intptr_t );
+int x264_pixel_sad_4x8_armv6( uint8_t *, intptr_t, uint8_t *, intptr_t );
+
+DECL_X1( sad, neon )
+DECL_X4( sad, neon )
+DECL_X1( satd, neon )
+DECL_X1( ssd, neon )
+
+int x264_pixel_sa8d_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t );
+int x264_pixel_sa8d_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
+
+uint64_t x264_pixel_var_8x8_neon ( uint8_t *, intptr_t );
+uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t );
+uint64_t x264_pixel_var_16x16_neon( uint8_t *, intptr_t );
+int x264_pixel_var2_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
+int x264_pixel_var2_8x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
+
+uint64_t x264_pixel_hadamard_ac_8x8_neon ( uint8_t *, intptr_t );
+uint64_t x264_pixel_hadamard_ac_8x16_neon ( uint8_t *, intptr_t );
+uint64_t x264_pixel_hadamard_ac_16x8_neon ( uint8_t *, intptr_t );
+uint64_t x264_pixel_hadamard_ac_16x16_neon( uint8_t *, intptr_t );
+
+void x264_pixel_ssim_4x4x2_core_neon( const uint8_t *, intptr_t,
+ const uint8_t *, intptr_t,
+ int sums[2][4] );
+float x264_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width );
+
+#endif
diff --git a/common/pixel.c b/common/pixel.c
index a06f5db..3a8333d 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -38,6 +38,9 @@
# include "arm/pixel.h"
# include "arm/predict.h"
#endif
+#if ARCH_AARCH64
+# include "aarch64/pixel.h"
+#endif
/****************************************************************************
@@ -496,7 +499,7 @@ SATD_X_DECL7( _xop )
#endif
#if !HIGH_BIT_DEPTH
-#if HAVE_ARMV6
+#if HAVE_ARMV6 || ARCH_AARCH64
SATD_X_DECL7( _neon )
#endif
#endif // !HIGH_BIT_DEPTH
@@ -524,6 +527,10 @@ INTRA_MBCMP_8x8(sa8d, _sse2, _sse2 )
INTRA_MBCMP_8x8( sad, _neon, _neon )
INTRA_MBCMP_8x8(sa8d, _neon, _neon )
#endif
+#if !HIGH_BIT_DEPTH && ARCH_AARCH64
+INTRA_MBCMP_8x8( sad, _neon, _c )
+INTRA_MBCMP_8x8(sa8d, _neon, _c )
+#endif
#define INTRA_MBCMP( mbcmp, size, pred1, pred2, pred3, chroma, cpu, cpu2 )\
void x264_intra_##mbcmp##_x3_##size##chroma##cpu( pixel *fenc, pixel *fdec, int res[3] )\
@@ -589,6 +596,16 @@ INTRA_MBCMP(satd, 8x16, dc, h, v, c, _neon, _c )
INTRA_MBCMP( sad, 16x16, v, h, dc, , _neon, _neon )
INTRA_MBCMP(satd, 16x16, v, h, dc, , _neon, _neon )
#endif
+#if !HIGH_BIT_DEPTH && ARCH_AARCH64
+INTRA_MBCMP( sad, 4x4, v, h, dc, , _neon, _c )
+INTRA_MBCMP(satd, 4x4, v, h, dc, , _neon, _c )
+INTRA_MBCMP( sad, 8x8, dc, h, v, c, _neon, _c )
+INTRA_MBCMP(satd, 8x8, dc, h, v, c, _neon, _c )
+INTRA_MBCMP( sad, 8x16, dc, h, v, c, _neon, _c )
+INTRA_MBCMP(satd, 8x16, dc, h, v, c, _neon, _c )
+INTRA_MBCMP( sad, 16x16, v, h, dc, , _neon, _c )
+INTRA_MBCMP(satd, 16x16, v, h, dc, , _neon, _c )
+#endif
// No C implementation of intra_satd_x9. See checkasm for its behavior,
// or see x264_mb_analyse_intra for the entirely different algorithm we
@@ -1390,6 +1407,46 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
}
}
#endif
+
+#if ARCH_AARCH64
+ if( cpu&X264_CPU_NEON )
+ {
+ INIT7( sad, _neon );
+ // AArch64 has no distinct instructions for aligned load/store
+ INIT7_NAME( sad_aligned, sad, _neon );
+ INIT7( sad_x3, _neon );
+ INIT7( sad_x4, _neon );
+ INIT7( ssd, _neon );
+ INIT7( satd, _neon );
+ INIT7( satd_x3, _neon );
+ INIT7( satd_x4, _neon );
+ INIT4( hadamard_ac, _neon );
+
+ pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_neon;
+ pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_neon;
+
+ pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_neon;
+ pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_neon;
+ pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_neon;
+ pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_neon;
+ pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_neon;
+
+ pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_neon;
+ pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_neon;
+ pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_neon;
+ pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_neon;
+ pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_neon;
+ pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_neon;
+ pixf->intra_sad_x3_8x16c = x264_intra_sad_x3_8x16c_neon;
+ pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_neon;
+ pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_neon;
+ pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_neon;
+
+ pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_neon;
+ pixf->ssim_end4 = x264_pixel_ssim_end4_neon;
+ }
+#endif // ARCH_AARCH64
+
#endif // HIGH_BIT_DEPTH
#if HAVE_ALTIVEC
if( cpu&X264_CPU_ALTIVEC )
--
2.0.0