[x264-devel] [PATCH 7/9] aarch64: motion compensation NEON asm
Janne Grunau
janne-x264 at jannau.net
Sat Jul 19 20:57:53 CEST 2014
Ported from the ARM NEON asm.
---
Makefile | 3 +-
common/aarch64/mc-a.S | 1390 +++++++++++++++++++++++++++++++++++++++++++++++++
common/aarch64/mc-c.c | 253 +++++++++
common/aarch64/mc.h | 29 ++
common/mc.c | 6 +
5 files changed, 1680 insertions(+), 1 deletion(-)
create mode 100644 common/aarch64/mc-a.S
create mode 100644 common/aarch64/mc-c.c
create mode 100644 common/aarch64/mc.h
diff --git a/Makefile b/Makefile
index d903393..b0d4a14 100644
--- a/Makefile
+++ b/Makefile
@@ -127,9 +127,10 @@ endif
ifeq ($(ARCH),AARCH64)
ifneq ($(AS),)
ASMSRC += common/aarch64/dct-a.S \
+ common/aarch64/mc-a.S \
common/aarch64/pixel-a.S \
common/aarch64/quant-a.S
-SRCS +=
+SRCS += common/aarch64/mc-c.c
OBJASM = $(ASMSRC:%.S=%.o)
endif
endif
diff --git a/common/aarch64/mc-a.S b/common/aarch64/mc-a.S
new file mode 100644
index 0000000..4a99a64
--- /dev/null
+++ b/common/aarch64/mc-a.S
@@ -0,0 +1,1390 @@
+/*****************************************************************************
+ * mc.S: aarch64 motion compensation
+ *****************************************************************************
+ * Copyright (C) 2009-2014 x264 project
+ *
+ * Authors: David Conrad <lessen42 at gmail.com>
+ * Janne Grunau <janne-x264 at jannau.net>
+ * Mans Rullgard <mans at mansr.com>
+ * Stefan Groenroos <stefan.gronroos at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+// note: prefetch stuff assumes 64-byte cacheline
+
+// void prefetch_ref( uint8_t *pix, intptr_t stride, int parity )
+function x264_prefetch_ref_aarch64, export=1
+ cmp w2, #1
+ csel x2, xzr, x1, eq
+ add x0, x0, #64
+ add x0, x0, x2, lsl #3
+
+ lsl x2, x1, #1
+ add x3, x1, x1, lsl #1
+ add x4, x0, x1, lsl #2
+
+ prfm pldl1strm, [x0]
+ prfm pldl1strm, [x0, x1]
+ prfm pldl1strm, [x0, x2]
+ prfm pldl1strm, [x0, x3]
+ prfm pldl1strm, [x4]
+ prfm pldl1strm, [x4, x1]
+ prfm pldl1strm, [x4, x2]
+ prfm pldl1strm, [x4, x3]
+ ret
+endfunc
+
+// void prefetch_fenc( uint8_t *pix_y, intptr_t stride_y,
+// uint8_t *pix_uv, intptr_t stride_uv, int mb_x )
+.macro x264_prefetch_fenc sub
+function x264_prefetch_fenc_\sub\()_aarch64, export=1
+ and w6, w5, #3
+ and w7, w5, #3
+ mul x6, x6, x1
+ mul x7, x7, x3
+ add x0, x0, #64
+ add x2, x2, #64
+
+ add x0, x0, x6, lsl #2
+ add x6, x0, x1, lsl #1
+ prfm pldl1strm, [x0]
+ prfm pldl1strm, [x0, x1]
+ prfm pldl1strm, [x6]
+ prfm pldl1strm, [x6, x1]
+
+ add x2, x2, x7, lsl #1
+ prfm pldl1strm, [x2]
+ prfm pldl1strm, [x2, x3]
+.ifc \sub, 422
+ add x7, x2, x3, lsl #1
+ prfm pldl1strm, [x7]
+ prfm pldl1strm, [x7, x3]
+.endif
+ ret
+endfunc
+.endm
+
+x264_prefetch_fenc 420
+x264_prefetch_fenc 422
+
+// void pixel_avg( uint8_t *dst, intptr_t dst_stride,
+// uint8_t *src1, intptr_t src1_stride,
+// uint8_t *src2, intptr_t src2_stride, int weight );
+.macro AVGH w h
+function x264_pixel_avg_\w\()x\h\()_neon, export=1
+ mov w10, #64
+ cmp w6, #32
+ mov w9, #\h
+ b.eq pixel_avg_w\w\()_neon
+ subs w7, w10, w6
+ b.lt pixel_avg_weight_w\w\()_add_sub_neon // weight > 64
+ cmp w6, #0
+ b.ge pixel_avg_weight_w\w\()_add_add_neon
+ b pixel_avg_weight_w\w\()_sub_add_neon // weight < 0
+endfunc
+.endm
+
+AVGH 4, 2
+AVGH 4, 4
+AVGH 4, 8
+AVGH 4, 16
+AVGH 8, 4
+AVGH 8, 8
+AVGH 8, 16
+AVGH 16, 8
+AVGH 16, 16
+
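+// Rough reference for the weighted average below (a sketch, not the exact
+// instruction schedule):
+//   dst = clip(( src1*weight + src2*(64-weight) + 32 ) >> 6 )
+// Three entry points keep the multiplies unsigned: 0 < weight < 64 adds both
+// terms, weight > 64 subtracts the (weight-64) term, and weight < 0 negates
+// the weight and subtracts the src1 term instead.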
+// 0 < weight < 64
+.macro load_weights_add_add
+ mov w6, w6
+.endm
+.macro weight_add_add dst, s1, s2, h=
+.ifc \h, 2
+ umull2 \dst, \s1, v30.16b
+ umlal2 \dst, \s2, v31.16b
+.else
+ umull \dst, \s1, v30.8b
+ umlal \dst, \s2, v31.8b
+.endif
+.endm
+
+// weight > 64
+.macro load_weights_add_sub
+ neg w7, w7
+.endm
+.macro weight_add_sub dst, s1, s2, h=
+.ifc \h, 2
+ umull2 \dst, \s1, v30.16b
+ umlsl2 \dst, \s2, v31.16b
+.else
+ umull \dst, \s1, v30.8b
+ umlsl \dst, \s2, v31.8b
+.endif
+.endm
+
+// weight < 0
+.macro load_weights_sub_add
+ neg w6, w6
+.endm
+.macro weight_sub_add dst, s1, s2, h=
+.ifc \h, 2
+ umull2 \dst, \s2, v31.16b
+ umlsl2 \dst, \s1, v30.16b
+.else
+ umull \dst, \s2, v31.8b
+ umlsl \dst, \s1, v30.8b
+.endif
+.endm
+
+.macro AVG_WEIGHT ext
+function pixel_avg_weight_w4_\ext\()_neon
+ load_weights_\ext
+ dup v30.8b, w6
+ dup v31.8b, w7
+1: // height loop
+ subs w9, w9, #2
+ ld1 {v0.s}[0], [x2], x3
+ ld1 {v1.s}[0], [x4], x5
+ weight_\ext v4.8h, v0.8b, v1.8b
+ ld1 {v2.s}[0], [x2], x3
+ ld1 {v3.s}[0], [x4], x5
+ sqrshrun v0.8b, v4.8h, #6
+ weight_\ext v5.8h, v2.8b, v3.8b
+ st1 {v0.s}[0], [x0], x1
+ sqrshrun v1.8b, v5.8h, #6
+ st1 {v1.s}[0], [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function pixel_avg_weight_w8_\ext\()_neon
+ load_weights_\ext
+ dup v30.8b, w6
+ dup v31.8b, w7
+1: // height loop
+ subs w9, w9, #4
+ ld1 {v0.8b}, [x2], x3
+ ld1 {v1.8b}, [x4], x5
+ weight_\ext v16.8h, v0.8b, v1.8b
+ ld1 {v2.8b}, [x2], x3
+ ld1 {v3.8b}, [x4], x5
+ weight_\ext v17.8h, v2.8b, v3.8b
+ ld1 {v4.8b}, [x2], x3
+ ld1 {v5.8b}, [x4], x5
+ weight_\ext v18.8h, v4.8b, v5.8b
+ ld1 {v6.8b}, [x2], x3
+ ld1 {v7.8b}, [x4], x5
+ weight_\ext v19.8h, v6.8b, v7.8b
+ sqrshrun v0.8b, v16.8h, #6
+ sqrshrun v1.8b, v17.8h, #6
+ sqrshrun v2.8b, v18.8h, #6
+ sqrshrun v3.8b, v19.8h, #6
+ st1 {v0.8b}, [x0], x1
+ st1 {v1.8b}, [x0], x1
+ st1 {v2.8b}, [x0], x1
+ st1 {v3.8b}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function pixel_avg_weight_w16_\ext\()_neon
+ load_weights_\ext
+ dup v30.16b, w6
+ dup v31.16b, w7
+1: // height loop
+ subs w9, w9, #2
+ ld1 {v0.16b}, [x2], x3
+ ld1 {v1.16b}, [x4], x5
+ weight_\ext v16.8h, v0.8b, v1.8b
+ weight_\ext v17.8h, v0.16b, v1.16b, 2
+ ld1 {v2.16b}, [x2], x3
+ ld1 {v3.16b}, [x4], x5
+ weight_\ext v18.8h, v2.8b, v3.8b
+ weight_\ext v19.8h, v2.16b, v3.16b, 2
+ sqrshrun v0.8b, v16.8h, #6
+ sqrshrun v1.8b, v18.8h, #6
+ sqrshrun2 v0.16b, v17.8h, #6
+ sqrshrun2 v1.16b, v19.8h, #6
+ st1 {v0.16b}, [x0], x1
+ st1 {v1.16b}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+.endm
+
+AVG_WEIGHT add_add
+AVG_WEIGHT add_sub
+AVG_WEIGHT sub_add
+
+function pixel_avg_w4_neon
+1: subs w9, w9, #2
+ ld1 {v0.s}[0], [x2], x3
+ ld1 {v2.s}[0], [x4], x5
+ urhadd v0.8b, v0.8b, v2.8b
+ ld1 {v1.s}[0], [x2], x3
+ ld1 {v3.s}[0], [x4], x5
+ urhadd v1.8b, v1.8b, v3.8b
+ st1 {v0.s}[0], [x0], x1
+ st1 {v1.s}[0], [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function pixel_avg_w8_neon
+1: subs w9, w9, #4
+ ld1 {v0.8b}, [x2], x3
+ ld1 {v1.8b}, [x4], x5
+ ld1 {v2.8b}, [x2], x3
+ urhadd v0.8b, v0.8b, v1.8b
+ ld1 {v3.8b}, [x4], x5
+ st1 {v0.8b}, [x0], x1
+ ld1 {v4.8b}, [x2], x3
+ urhadd v1.8b, v2.8b, v3.8b
+ ld1 {v5.8b}, [x4], x5
+ st1 {v1.8b}, [x0], x1
+ ld1 {v6.8b}, [x2], x3
+ ld1 {v7.8b}, [x4], x5
+ urhadd v2.8b, v4.8b, v5.8b
+ urhadd v3.8b, v6.8b, v7.8b
+ st1 {v2.8b}, [x0], x1
+ st1 {v3.8b}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function pixel_avg_w16_neon
+1: subs w9, w9, #4
+ ld1 {v0.16b}, [x2], x3
+ ld1 {v1.16b}, [x4], x5
+ ld1 {v2.16b}, [x2], x3
+ urhadd v0.16b, v0.16b, v1.16b
+ ld1 {v3.16b}, [x4], x5
+ st1 {v0.16b}, [x0], x1
+ ld1 {v4.16b}, [x2], x3
+ urhadd v1.16b, v2.16b, v3.16b
+ ld1 {v5.16b}, [x4], x5
+ st1 {v1.16b}, [x0], x1
+ ld1 {v6.16b}, [x2], x3
+ ld1 {v7.16b}, [x4], x5
+ urhadd v2.16b, v4.16b, v5.16b
+ urhadd v3.16b, v6.16b, v7.16b
+ st1 {v2.16b}, [x0], x1
+ st1 {v3.16b}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
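+// pixel_avg2: plain rounding average of two references that share src_stride,
+// roughly dst[x] = ( src1[x] + src2[x] + 1 ) >> 1 (one urhadd per vector).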
+function x264_pixel_avg2_w4_neon, export=1
+1:
+ subs w5, w5, #2
+ ld1 {v0.s}[0], [x2], x3
+ ld1 {v2.s}[0], [x4], x3
+ urhadd v0.8b, v0.8b, v2.8b
+ ld1 {v1.s}[0], [x2], x3
+ ld1 {v3.s}[0], [x4], x3
+ urhadd v1.8b, v1.8b, v3.8b
+ st1 {v0.s}[0], [x0], x1
+ st1 {v1.s}[0], [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function x264_pixel_avg2_w8_neon, export=1
+1:
+ subs w5, w5, #2
+ ld1 {v0.8b}, [x2], x3
+ ld1 {v2.8b}, [x4], x3
+ urhadd v0.8b, v0.8b, v2.8b
+ ld1 {v1.8b}, [x2], x3
+ ld1 {v3.8b}, [x4], x3
+ urhadd v1.8b, v1.8b, v3.8b
+ st1 {v0.8b}, [x0], x1
+ st1 {v1.8b}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function x264_pixel_avg2_w16_neon, export=1
+1:
+ subs w5, w5, #2
+ ld1 {v0.16b}, [x2], x3
+ ld1 {v2.16b}, [x4], x3
+ urhadd v0.16b, v0.16b, v2.16b
+ ld1 {v1.16b}, [x2], x3
+ ld1 {v3.16b}, [x4], x3
+ urhadd v1.16b, v1.16b, v3.16b
+ st1 {v0.16b}, [x0], x1
+ st1 {v1.16b}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function x264_pixel_avg2_w20_neon, export=1
+ sub x1, x1, #16
+1:
+ subs w5, w5, #2
+ ld1 {v0.16b,v1.16b}, [x2], x3
+ ld1 {v2.16b,v3.16b}, [x4], x3
+ urhadd v0.16b, v0.16b, v2.16b
+ urhadd v1.8b, v1.8b, v3.8b
+ ld1 {v4.16b,v5.16b}, [x2], x3
+ ld1 {v6.16b,v7.16b}, [x4], x3
+ urhadd v4.16b, v4.16b, v6.16b
+ urhadd v5.8b, v5.8b, v7.8b
+ st1 {v0.16b}, [x0], #16
+ st1 {v1.s}[0], [x0], x1
+ st1 {v4.16b}, [x0], #16
+ st1 {v5.s}[0], [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+.macro weight_prologue type
+ mov w9, w5 // height
+.ifc \type, full
+ ldr w12, [x4, #32] // denom
+.endif
+ ldp w4, w5, [x4, #32+4] // scale, offset
+ dup v0.8h, w4
+ dup v1.8h, w5
+.ifc \type, full
+ neg w12, w12
+ dup v2.8h, w12
+.endif
+.endm
+
+// void mc_weight( uint8_t *src, intptr_t src_stride, uint8_t *dst,
+// intptr_t dst_stride, const x264_weight_t *weight, int h )
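+// Rough reference for the weighted prediction below:
+//   full:    dst = clip( ((src*scale + (1 << (denom-1))) >> denom) + offset )
+//   nodenom: dst = clip( src*scale + offset )
+// srshl by -denom supplies the rounding shift, sqxtun the final clip to 8 bit.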
+function x264_mc_weight_w20_neon, export=1
+ weight_prologue full
+ sub x1, x1, #16
+1:
+ subs w9, w9, #2
+ ld1 {v16.8b,v17.8b,v18.8b}, [x2], x3
+ ld1 {v19.8b,v20.8b,v21.8b}, [x2], x3
+ uxtl v22.8h, v16.8b
+ uxtl v23.8h, v17.8b
+ zip1 v18.2s, v18.2s, v21.2s
+ uxtl v24.8h, v19.8b
+ uxtl v25.8h, v20.8b
+ uxtl v26.8h, v18.8b
+ mul v22.8h, v22.8h, v0.8h
+ mul v23.8h, v23.8h, v0.8h
+ mul v24.8h, v24.8h, v0.8h
+ mul v25.8h, v25.8h, v0.8h
+ mul v26.8h, v26.8h, v0.8h
+ srshl v22.8h, v22.8h, v2.8h
+ srshl v23.8h, v23.8h, v2.8h
+ srshl v24.8h, v24.8h, v2.8h
+ srshl v25.8h, v25.8h, v2.8h
+ srshl v26.8h, v26.8h, v2.8h
+ add v22.8h, v22.8h, v1.8h
+ add v23.8h, v23.8h, v1.8h
+ add v24.8h, v24.8h, v1.8h
+ add v25.8h, v25.8h, v1.8h
+ add v26.8h, v26.8h, v1.8h
+ sqxtun v4.8b, v22.8h
+ sqxtun2 v4.16b, v23.8h
+ sqxtun v5.8b, v24.8h
+ sqxtun2 v5.16b, v25.8h
+ sqxtun v6.8b, v26.8h
+ st1 {v4.16b}, [x0], #16
+ st1 {v6.s}[0], [x0], x1
+ st1 {v5.16b}, [x0], #16
+ st1 {v6.s}[1], [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function x264_mc_weight_w16_neon, export=1
+ weight_prologue full
+weight16_loop:
+1:
+ subs w9, w9, #2
+ ld1 {v4.16b}, [x2], x3
+ ld1 {v5.16b}, [x2], x3
+ uxtl v22.8h, v4.8b
+ uxtl2 v23.8h, v4.16b
+ uxtl v24.8h, v5.8b
+ uxtl2 v25.8h, v5.16b
+ mul v22.8h, v22.8h, v0.8h
+ mul v23.8h, v23.8h, v0.8h
+ mul v24.8h, v24.8h, v0.8h
+ mul v25.8h, v25.8h, v0.8h
+ srshl v22.8h, v22.8h, v2.8h
+ srshl v23.8h, v23.8h, v2.8h
+ srshl v24.8h, v24.8h, v2.8h
+ srshl v25.8h, v25.8h, v2.8h
+ add v22.8h, v22.8h, v1.8h
+ add v23.8h, v23.8h, v1.8h
+ add v24.8h, v24.8h, v1.8h
+ add v25.8h, v25.8h, v1.8h
+ sqxtun v4.8b, v22.8h
+ sqxtun2 v4.16b, v23.8h
+ sqxtun v5.8b, v24.8h
+ sqxtun2 v5.16b, v25.8h
+ st1 {v4.16b}, [x0], x1
+ st1 {v5.16b}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function x264_mc_weight_w8_neon, export=1
+ weight_prologue full
+1:
+ subs w9, w9, #2
+ ld1 {v16.8b}, [x2], x3
+ ld1 {v17.8b}, [x2], x3
+ uxtl v4.8h, v16.8b
+ uxtl v5.8h, v17.8b
+ mul v4.8h, v4.8h, v0.8h
+ mul v5.8h, v5.8h, v0.8h
+ srshl v4.8h, v4.8h, v2.8h
+ srshl v5.8h, v5.8h, v2.8h
+ add v4.8h, v4.8h, v1.8h
+ add v5.8h, v5.8h, v1.8h
+ sqxtun v16.8b, v4.8h
+ sqxtun v17.8b, v5.8h
+ st1 {v16.8b}, [x0], x1
+ st1 {v17.8b}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function x264_mc_weight_w4_neon, export=1
+ weight_prologue full
+1:
+ subs w9, w9, #2
+ ld1 {v16.s}[0], [x2], x3
+ ld1 {v16.s}[1], [x2], x3
+ uxtl v4.8h, v16.8b
+ mul v4.8h, v4.8h, v0.8h
+ srshl v4.8h, v4.8h, v2.8h
+ add v4.8h, v4.8h, v1.8h
+ sqxtun v16.8b, v4.8h
+ st1 {v16.s}[0], [x0], x1
+ st1 {v16.s}[1], [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function x264_mc_weight_w20_nodenom_neon, export=1
+ weight_prologue nodenom
+ sub x1, x1, #16
+1:
+ subs w9, w9, #2
+ ld1 {v16.8b,v17.8b,v18.8b}, [x2], x3
+ mov v27.16b, v1.16b
+ mov v28.16b, v1.16b
+ ld1 {v19.8b,v20.8b,v21.8b}, [x2], x3
+ mov v29.16b, v1.16b
+ mov v30.16b, v1.16b
+ uxtl v22.8h, v16.8b
+ uxtl v23.8h, v17.8b
+ zip1 v18.2s, v18.2s, v21.2s
+ mov v31.16b, v1.16b
+ uxtl v24.8h, v19.8b
+ uxtl v25.8h, v20.8b
+ uxtl v26.8h, v18.8b
+ mla v27.8h, v22.8h, v0.8h
+ mla v28.8h, v23.8h, v0.8h
+ mla v29.8h, v24.8h, v0.8h
+ mla v30.8h, v25.8h, v0.8h
+ mla v31.8h, v26.8h, v0.8h
+ sqxtun v4.8b, v27.8h
+ sqxtun2 v4.16b, v28.8h
+ sqxtun v5.8b, v29.8h
+ sqxtun2 v5.16b, v30.8h
+ sqxtun v6.8b, v31.8h
+ st1 {v4.16b}, [x0], #16
+ st1 {v6.s}[0], [x0], x1
+ st1 {v5.16b}, [x0], #16
+ st1 {v6.s}[1], [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function x264_mc_weight_w16_nodenom_neon, export=1
+ weight_prologue nodenom
+1:
+ subs w9, w9, #2
+ ld1 {v16.16b}, [x2], x3
+ mov v27.16b, v1.16b
+ mov v28.16b, v1.16b
+ ld1 {v17.16b}, [x2], x3
+ mov v29.16b, v1.16b
+ mov v30.16b, v1.16b
+ uxtl v22.8h, v16.8b
+ uxtl2 v23.8h, v16.16b
+ uxtl v24.8h, v17.8b
+ uxtl2 v25.8h, v17.16b
+ mla v27.8h, v22.8h, v0.8h
+ mla v28.8h, v23.8h, v0.8h
+ mla v29.8h, v24.8h, v0.8h
+ mla v30.8h, v25.8h, v0.8h
+ sqxtun v4.8b, v27.8h
+ sqxtun2 v4.16b, v28.8h
+ sqxtun v5.8b, v29.8h
+ sqxtun2 v5.16b, v30.8h
+ st1 {v4.16b}, [x0], x1
+ st1 {v5.16b}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function x264_mc_weight_w8_nodenom_neon, export=1
+ weight_prologue nodenom
+1:
+ subs w9, w9, #2
+ ld1 {v16.8b}, [x2], x3
+ mov v27.16b, v1.16b
+ ld1 {v17.8b}, [x2], x3
+ mov v29.16b, v1.16b
+ uxtl v22.8h, v16.8b
+ uxtl v24.8h, v17.8b
+ mla v27.8h, v22.8h, v0.8h
+ mla v29.8h, v24.8h, v0.8h
+ sqxtun v4.8b, v27.8h
+ sqxtun v5.8b, v29.8h
+ st1 {v4.8b}, [x0], x1
+ st1 {v5.8b}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function x264_mc_weight_w4_nodenom_neon, export=1
+ weight_prologue nodenom
+1:
+ subs w9, w9, #2
+ ld1 {v16.s}[0], [x2], x3
+ ld1 {v16.s}[1], [x2], x3
+ mov v27.16b, v1.16b
+ uxtl v22.8h, v16.8b
+ mla v27.8h, v22.8h, v0.8h
+ sqxtun v4.8b, v27.8h
+ st1 {v4.s}[0], [x0], x1
+ st1 {v4.s}[1], [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
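+// When scale == 1<<denom (see x264_weight_cache_neon in mc-c.c) the weight
+// reduces to a plain offset, roughly dst = clip( src +/- offset ), so these
+// variants only need a saturating byte add/sub of the offset magnitude.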
+.macro weight_simple_prologue
+ ldr w6, [x4] // offset
+ dup v1.16b, w6
+.endm
+
+.macro weight_simple name op
+function x264_mc_weight_w20_\name\()_neon, export=1
+ weight_simple_prologue
+1:
+ subs w5, w5, #2
+ ldr s18, [x2, #16]
+ ld1 {v16.16b}, [x2], x3
+ ldr s19, [x2, #16]
+ ld1 {v17.16b}, [x2], x3
+ \op v18.8b, v18.8b, v1.8b
+ \op v16.16b, v16.16b, v1.16b
+ \op v19.8b, v19.8b, v1.8b
+ \op v17.16b, v17.16b, v1.16b
+ str s18, [x0, #16]
+ st1 {v16.16b}, [x0], x1
+ str s19, [x0, #16]
+ st1 {v17.16b}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function x264_mc_weight_w16_\name\()_neon, export=1
+ weight_simple_prologue
+1:
+ subs w5, w5, #2
+ ld1 {v16.16b}, [x2], x3
+ ld1 {v17.16b}, [x2], x3
+ \op v16.16b, v16.16b, v1.16b
+ \op v17.16b, v17.16b, v1.16b
+ st1 {v16.16b}, [x0], x1
+ st1 {v17.16b}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function x264_mc_weight_w8_\name\()_neon, export=1
+ weight_simple_prologue
+1:
+ subs w5, w5, #2
+ ld1 {v16.8b}, [x2], x3
+ ld1 {v17.8b}, [x2], x3
+ \op v16.8b, v16.8b, v1.8b
+ \op v17.8b, v17.8b, v1.8b
+ st1 {v16.8b}, [x0], x1
+ st1 {v17.8b}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function x264_mc_weight_w4_\name\()_neon, export=1
+ weight_simple_prologue
+1:
+ subs w5, w5, #2
+ ld1 {v16.s}[0], [x2], x3
+ ld1 {v16.s}[1], [x2], x3
+ \op v16.8b, v16.8b, v1.8b
+ st1 {v16.s}[0], [x0], x1
+ st1 {v16.s}[1], [x0], x1
+ b.gt 1b
+ ret
+endfunc
+.endm
+
+weight_simple offsetadd, uqadd
+weight_simple offsetsub, uqsub
+
+
+// void mc_copy( uint8_t *dst, intptr_t dst_stride, uint8_t *src, intptr_t src_stride, int height )
+function x264_mc_copy_w4_neon, export=1
+1:
+ subs w4, w4, #4
+ ld1 {v0.s}[0], [x2], x3
+ ld1 {v1.s}[0], [x2], x3
+ ld1 {v2.s}[0], [x2], x3
+ ld1 {v3.s}[0], [x2], x3
+ st1 {v0.s}[0], [x0], x1
+ st1 {v1.s}[0], [x0], x1
+ st1 {v2.s}[0], [x0], x1
+ st1 {v3.s}[0], [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function x264_mc_copy_w8_neon, export=1
+1: subs w4, w4, #4
+ ld1 {v0.8b}, [x2], x3
+ ld1 {v1.8b}, [x2], x3
+ ld1 {v2.8b}, [x2], x3
+ ld1 {v3.8b}, [x2], x3
+ st1 {v0.8b}, [x0], x1
+ st1 {v1.8b}, [x0], x1
+ st1 {v2.8b}, [x0], x1
+ st1 {v3.8b}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function x264_mc_copy_w16_neon, export=1
+1: subs w4, w4, #4
+ ld1 {v0.16b}, [x2], x3
+ ld1 {v1.16b}, [x2], x3
+ ld1 {v2.16b}, [x2], x3
+ ld1 {v3.16b}, [x2], x3
+ st1 {v0.16b}, [x0], x1
+ st1 {v1.16b}, [x0], x1
+ st1 {v2.16b}, [x0], x1
+ st1 {v3.16b}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+// void x264_mc_chroma_neon( uint8_t *dst_u, uint8_t *dst_v,
+// intptr_t i_dst_stride,
+// uint8_t *src, intptr_t i_src_stride,
+// int dx, int dy, int i_width, int i_height );
+function x264_mc_chroma_neon, export=1
+ ldr w15, [sp] // height
+ sbfx x12, x6, #3, #29 // asr(3) and sign extend
+ sbfx x11, x5, #3, #29 // asr(3) and sign extend
+ cmp w7, #4
+ mul x12, x12, x4
+ add x3, x3, x11, lsl #1
+
+ and w5, w5, #7
+ and w6, w6, #7
+
+ add x3, x3, x12
+
+ //pld [x3]
+ //pld [x3, x4]
+
+ b.gt mc_chroma_w8_neon
+ b.eq mc_chroma_w4_neon
+endfunc
+
+.macro CHROMA_MC_START r00, r01, r10, r11
+ mul w12, w5, w6 // cD = d8x *d8y
+ lsl w13, w5, #3
+ add w9, w12, #64
+ lsl w14, w6, #3
+ tst w12, w12
+ sub w9, w9, w13
+ sub w10, w13, w12 // cB = d8x *(8-d8y);
+ sub w11, w14, w12 // cC = (8-d8x)*d8y
+ sub w9, w9, w14 // cA = (8-d8x)*(8-d8y);
+.endm
+
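+// Bilinear chroma interpolation; per output pixel this is roughly
+//   dst = ( cA*src[0] + cB*src[1] + cC*src[stride] + cD*src[stride+1] + 32 ) >> 6
+// with cA+cB+cC+cD == 64, e.g. d8x=3, d8y=5 gives cA=15 cB=9 cC=25 cD=15.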
+.macro CHROMA_MC width, vsize
+function mc_chroma_w\width\()_neon
+// since the element size varies, there's a different index for the 2nd store
+.if \width == 4
+ .set st2, 1
+.else
+ .set st2, 2
+.endif
+ CHROMA_MC_START
+ b.eq 2f
+
+ ld2 {v28.8b,v29.8b}, [x3], x4
+ dup v0.8b, w9 // cA
+ dup v1.8b, w10 // cB
+
+ ext v6.8b, v28.8b, v6.8b, #1
+ ext v7.8b, v29.8b, v7.8b, #1
+
+ ld2 {v30.8b,v31.8b}, [x3], x4
+ dup v2.8b, w11 // cC
+ dup v3.8b, w12 // cD
+
+ ext v22.8b, v30.8b, v22.8b, #1
+ ext v23.8b, v31.8b, v23.8b, #1
+
+ trn1 v0.2s, v0.2s, v1.2s
+ trn1 v2.2s, v2.2s, v3.2s
+
+ trn1 v4.2s, v28.2s, v6.2s
+ trn1 v5.2s, v29.2s, v7.2s
+ trn1 v20.2s, v30.2s, v22.2s
+ trn1 v21.2s, v31.2s, v23.2s
+1: // height loop, interpolate xy
+ subs w15, w15, #2
+ umull v16.8h, v4.8b, v0.8b
+ umlal v16.8h, v20.8b, v2.8b
+ umull v17.8h, v5.8b, v0.8b
+ umlal v17.8h, v21.8b, v2.8b
+
+ ld2 {v28.8b,v29.8b}, [x3], x4
+ transpose v24.2d, v25.2d, v16.2d, v17.2d
+
+ ext v6.8b, v28.8b, v6.8b, #1
+ ext v7.8b, v29.8b, v7.8b, #1
+
+ trn1 v4.2s, v28.2s, v6.2s
+ trn1 v5.2s, v29.2s, v7.2s
+
+ add v16.8h, v24.8h, v25.8h
+
+ umull v18.8h, v20.8b, v0.8b
+ umlal v18.8h, v4.8b, v2.8b
+ umull v19.8h, v21.8b, v0.8b
+ umlal v19.8h, v5.8b, v2.8b
+
+ ld2 {v30.8b,v31.8b}, [x3], x4
+ transpose v26.2d, v27.2d, v18.2d, v19.2d
+
+ ext v22.8b, v30.8b, v22.8b, #1
+ ext v23.8b, v31.8b, v23.8b, #1
+ trn1 v20.2s, v30.2s, v22.2s
+ trn1 v21.2s, v31.2s, v23.2s
+
+ add v17.8h, v26.8h, v27.8h
+
+ rshrn v16.8b, v16.8h, #6
+ rshrn v17.8b, v17.8h, #6
+
+ //pld [x3]
+ //pld [x3, x4]
+
+ st1 {v16.\vsize}[0], [x0], x2
+ st1 {v16.\vsize}[st2], [x1], x2
+ st1 {v17.\vsize}[0], [x0], x2
+ st1 {v17.\vsize}[st2], [x1], x2
+ b.gt 1b
+
+ ret
+2: // dx or dy are 0
+ tst w11, w11
+ add w10, w10, w11
+ dup v0.8b, w9
+ dup v1.8b, w10
+
+ b.eq 4f
+
+ ld1 {v4.8b}, [x3], x4
+ ld1 {v6.8b}, [x3], x4
+3: // vertical interpolation loop
+ subs w15, w15, #2
+ umull v16.8h, v4.8b, v0.8b
+ ld1 {v4.8b}, [x3], x4
+ umlal v16.8h, v6.8b, v1.8b
+ umull v17.8h, v6.8b, v0.8b
+ ld1 {v6.8b}, [x3], x4
+ umlal v17.8h, v4.8b, v1.8b
+
+ rshrn v20.8b, v16.8h, #6 // uvuvuvuv
+ rshrn v21.8b, v17.8h, #6 // uvuvuvuv
+
+ uzp1 v16.8b, v20.8b, v21.8b // d16=uuuu|uuuu, d17=vvvv|vvvv
+ uzp2 v17.8b, v20.8b, v21.8b // d16=uuuu|uuuu, d17=vvvv|vvvv
+
+ //pld [x3]
+ //pld [x3, x4]
+
+ st1 {v16.\vsize}[0], [x0], x2
+ st1 {v16.\vsize}[st2], [x0], x2
+ st1 {v17.\vsize}[0], [x1], x2
+ st1 {v17.\vsize}[st2], [x1], x2
+ b.gt 3b
+
+ ret
+
+4: // dy is 0
+ ld1 {v4.8b,v5.8b}, [x3], x4
+ ld1 {v6.8b,v7.8b}, [x3], x4
+
+ ext v5.8b, v4.8b, v5.8b, #2
+ ext v7.8b, v6.8b, v7.8b, #2
+5: // horizontal interpolation loop
+ subs w15, w15, #2
+ umull v16.8h, v4.8b, v0.8b
+ umlal v16.8h, v5.8b, v1.8b
+ umull v17.8h, v6.8b, v0.8b
+ umlal v17.8h, v7.8b, v1.8b
+
+ ld1 {v4.8b,v5.8b}, [x3], x4
+ ld1 {v6.8b,v7.8b}, [x3], x4
+ rshrn v20.8b, v16.8h, #6
+ rshrn v21.8b, v17.8h, #6
+ ext v5.8b, v4.8b, v5.8b, #2
+ ext v7.8b, v6.8b, v7.8b, #2
+ uzp1 v16.8b, v20.8b, v21.8b // d16=uuuu|uuuu, d17=vvvv|vvvv
+ uzp2 v17.8b, v20.8b, v21.8b // d16=uuuu|uuuu, d17=vvvv|vvvv
+
+ //pld [x3]
+ //pld [x3, x4]
+
+ st1 {v16.\vsize}[0], [x0], x2
+ st1 {v16.\vsize}[st2], [x0], x2
+ st1 {v17.\vsize}[0], [x1], x2
+ st1 {v17.\vsize}[st2], [x1], x2
+ b.gt 5b
+
+ ret
+endfunc
+.endm
+
+ CHROMA_MC 2, h
+ CHROMA_MC 4, s
+
+function mc_chroma_w8_neon
+ CHROMA_MC_START
+ b.eq 2f
+ ld2 {v4.16b,v5.16b}, [x3], x4
+ ld2 {v20.16b,v21.16b}, [x3], x4
+ dup v0.8b, w9 // cA
+ dup v1.8b, w10 // cB
+
+ ext v6.16b, v4.16b, v4.16b, #1
+ ext v7.16b, v5.16b, v5.16b, #1
+
+ dup v2.8b, w11 // cC
+ dup v3.8b, w12 // cD
+
+ ext v22.16b, v20.16b, v20.16b, #1
+ ext v23.16b, v21.16b, v21.16b, #1
+
+1: // height loop, interpolate xy
+ subs w15, w15, #2
+ umull v16.8h, v4.8b, v0.8b
+ umlal v16.8h, v6.8b, v1.8b
+ umlal v16.8h, v20.8b, v2.8b
+ umlal v16.8h, v22.8b, v3.8b
+
+ umull v17.8h, v5.8b, v0.8b
+ umlal v17.8h, v7.8b, v1.8b
+ umlal v17.8h, v21.8b, v2.8b
+ umlal v17.8h, v23.8b, v3.8b
+
+ ld2 {v4.16b,v5.16b}, [x3], x4
+
+ ext v6.16b, v4.16b, v4.16b, #1
+ ext v7.16b, v5.16b, v5.16b, #1
+
+ umull v18.8h, v20.8b, v0.8b
+ umlal v18.8h, v22.8b, v1.8b
+ umlal v18.8h, v4.8b, v2.8b
+ umlal v18.8h, v6.8b, v3.8b
+
+ umull v19.8h, v21.8b, v0.8b
+ umlal v19.8h, v23.8b, v1.8b
+ umlal v19.8h, v5.8b, v2.8b
+ umlal v19.8h, v7.8b, v3.8b
+
+ ld2 {v20.16b,v21.16b}, [x3], x4
+
+ rshrn v16.8b, v16.8h, #6
+ rshrn v17.8b, v17.8h, #6
+ rshrn v18.8b, v18.8h, #6
+ rshrn v19.8b, v19.8h, #6
+
+ ext v22.16b, v20.16b, v20.16b, #1
+ ext v23.16b, v21.16b, v21.16b, #1
+
+ //pld [x3]
+ //pld [x3, x4]
+
+ st1 {v16.8b}, [x0], x2
+ st1 {v17.8b}, [x1], x2
+ st1 {v18.8b}, [x0], x2
+ st1 {v19.8b}, [x1], x2
+ b.gt 1b
+
+ ret
+2: // dx or dy are 0
+ tst w11, w11
+ add w10, w10, w11
+ dup v0.8b, w9
+ dup v1.8b, w10
+
+ b.eq 4f
+
+ ld2 {v4.8b,v5.8b}, [x3], x4
+ ld2 {v6.8b,v7.8b}, [x3], x4
+3: // vertical interpolation loop
+ subs w15, w15, #2
+ umull v16.8h, v4.8b, v0.8b //U
+ umlal v16.8h, v6.8b, v1.8b
+ umull v17.8h, v5.8b, v0.8b //V
+ umlal v17.8h, v7.8b, v1.8b
+
+ ld2 {v4.8b,v5.8b}, [x3], x4
+
+ umull v18.8h, v6.8b, v0.8b
+ umlal v18.8h, v4.8b, v1.8b
+ umull v19.8h, v7.8b, v0.8b
+ umlal v19.8h, v5.8b, v1.8b
+
+ ld2 {v6.8b,v7.8b}, [x3], x4
+
+ rshrn v16.8b, v16.8h, #6
+ rshrn v17.8b, v17.8h, #6
+ rshrn v18.8b, v18.8h, #6
+ rshrn v19.8b, v19.8h, #6
+
+ //pld [x3]
+ //pld [x3, x4]
+
+ st1 {v16.8b}, [x0], x2
+ st1 {v17.8b}, [x1], x2
+ st1 {v18.8b}, [x0], x2
+ st1 {v19.8b}, [x1], x2
+ b.gt 3b
+
+ ret
+4: // dy is 0
+ ld2 {v4.16b,v5.16b}, [x3], x4
+ ext v6.16b, v4.16b, v4.16b, #1
+ ext v7.16b, v5.16b, v5.16b, #1
+ ld2 {v20.16b,v21.16b}, [x3], x4
+ ext v22.16b, v20.16b, v20.16b, #1
+ ext v23.16b, v21.16b, v21.16b, #1
+5: // horizontal interpolation loop
+ subs w15, w15, #2
+ umull v16.8h, v4.8b, v0.8b //U
+ umlal v16.8h, v6.8b, v1.8b
+ umull v17.8h, v5.8b, v0.8b //V
+ umlal v17.8h, v7.8b, v1.8b
+
+ ld2 {v4.16b,v5.16b}, [x3], x4
+
+ umull v18.8h, v20.8b, v0.8b
+ umlal v18.8h, v22.8b, v1.8b
+ umull v19.8h, v21.8b, v0.8b
+ umlal v19.8h, v23.8b, v1.8b
+
+ ld2 {v20.16b,v21.16b}, [x3], x4
+
+ rshrn v16.8b, v16.8h, #6
+ rshrn v17.8b, v17.8h, #6
+ rshrn v18.8b, v18.8h, #6
+ rshrn v19.8b, v19.8h, #6
+
+ ext v6.16b, v4.16b, v4.16b, #1
+ ext v7.16b, v5.16b, v5.16b, #1
+ ext v22.16b, v20.16b, v20.16b, #1
+ ext v23.16b, v21.16b, v21.16b, #1
+
+ //pld [x3]
+ //pld [x3, x4]
+
+ st1 {v16.8b}, [x0], x2
+ st1 {v17.8b}, [x1], x2
+ st1 {v18.8b}, [x0], x2
+ st1 {v19.8b}, [x1], x2
+ b.gt 5b
+
+ ret
+endfunc
+
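+// Half-pel filter: the 6-tap kernel (1,-5,20,20,-5,1) in each direction,
+// roughly h[x] = clip(( s[x-2] - 5*s[x-1] + 20*s[x] + 20*s[x+1] - 5*s[x+2]
+// + s[x+3] + 16 ) >> 5) and likewise vertically for v; the centre plane
+// applies the same kernel to the 16-bit vertical sums via the staged
+// (a-b)/4 trick commented in the inner loop.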
+function x264_hpel_filter_neon, export=1
+ ubfm x9, x3, #0, #4
+ add w15, w5, w9
+ sub x13, x3, x9
+ sub x10, x0, x9
+ sub x11, x1, x9
+ sub x12, x2, x9
+1:
+ sub x3, x13, #16
+ mov x2, x12
+ mov x1, x11
+ ld1 {v7.16b}, [x3], #16
+ mov x0, x10
+ add x7, x3, #16 // src pointer next 16b for horiz filter
+ mov x5, x15
+ sub x3, x3, x4, lsl #1
+ movi v30.16b, #5
+ ld1 {v28.16b}, [x7], #16
+ movi v31.16b, #20
+ add x9, x3, w5, uxtw
+
+ ld1 {v16.16b}, [x3], x4
+ ld1 {v17.16b}, [x3], x4
+ ld1 {v18.16b}, [x3], x4
+ ld1 {v19.16b}, [x3], x4
+ ld1 {v20.16b}, [x3], x4
+ ld1 {v21.16b}, [x3], x4
+
+ ext v22.16b, v7.16b, v18.16b, #14
+ uaddl v1.8h, v16.8b, v21.8b
+ ext v26.16b, v18.16b, v28.16b, #3
+ umlsl v1.8h, v17.8b, v30.8b
+ ext v23.16b, v7.16b, v18.16b, #15
+ umlal v1.8h, v18.8b, v31.8b
+ ext v24.16b, v18.16b, v28.16b, #1
+ umlal v1.8h, v19.8b, v31.8b
+ ext v25.16b, v18.16b, v28.16b, #2
+ umlsl v1.8h, v20.8b, v30.8b
+2:
+ subs w5, w5, #16
+ sub x3, x9, w5, sxtw
+
+ uaddl v4.8h, v22.8b, v26.8b
+ uaddl2 v5.8h, v22.16b, v26.16b
+ sqrshrun v6.8b, v1.8h, #5
+ umlsl v4.8h, v23.8b, v30.8b
+ umlsl2 v5.8h, v23.16b, v30.16b
+ umlal v4.8h, v18.8b, v31.8b
+ umlal2 v5.8h, v18.16b, v31.16b
+ umlal v4.8h, v24.8b, v31.8b
+ umlal2 v5.8h, v24.16b, v31.16b
+ umlsl v4.8h, v25.8b, v30.8b
+ umlsl2 v5.8h, v25.16b, v30.16b
+
+ uaddl2 v2.8h, v16.16b, v21.16b
+ sqrshrun v4.8b, v4.8h, #5
+ mov v7.16b, v18.16b
+ sqrshrun2 v4.16b, v5.8h, #5
+
+ umlsl2 v2.8h, v17.16b, v30.16b
+ ld1 {v16.16b}, [x3], x4
+ umlal2 v2.8h, v18.16b, v31.16b
+ ld1 {v17.16b}, [x3], x4
+ umlal2 v2.8h, v19.16b, v31.16b
+ ld1 {v18.16b}, [x3], x4
+ umlsl2 v2.8h, v20.16b, v30.16b
+ ld1 {v19.16b}, [x3], x4
+ st1 {v4.16b}, [x0], #16
+ sqrshrun2 v6.16b, v2.8h, #5
+ ld1 {v20.16b}, [x3], x4
+ ld1 {v21.16b}, [x3], x4
+
+ ext v22.16b, v0.16b, v1.16b, #12
+ ext v26.16b, v1.16b, v2.16b, #6
+ ext v23.16b, v0.16b, v1.16b, #14
+ st1 {v6.16b}, [x1], #16
+ uaddl v3.8h, v16.8b, v21.8b
+ ext v25.16b, v1.16b, v2.16b, #4
+ umlsl v3.8h, v17.8b, v30.8b
+ ext v24.16b, v1.16b, v2.16b, #2
+
+ umlal v3.8h, v18.8b, v31.8b
+ add v4.8h, v22.8h, v26.8h
+ umlal v3.8h, v19.8b, v31.8b
+ add v5.8h, v23.8h, v25.8h
+ umlsl v3.8h, v20.8b, v30.8b
+ add v6.8h, v24.8h, v1.8h
+
+ ext v22.16b, v1.16b, v2.16b, #12
+ ext v26.16b, v2.16b, v3.16b, #6
+ ext v23.16b, v1.16b, v2.16b, #14
+ ext v25.16b, v2.16b, v3.16b, #4
+ ext v24.16b, v2.16b, v3.16b, #2
+
+ add v22.8h, v22.8h, v26.8h
+ add v23.8h, v23.8h, v25.8h
+ add v24.8h, v24.8h, v2.8h
+
+ sub v4.8h, v4.8h, v5.8h // a-b
+ sub v5.8h, v5.8h, v6.8h // b-c
+
+ sub v22.8h, v22.8h, v23.8h // a-b
+ sub v23.8h, v23.8h, v24.8h // b-c
+
+ sshr v4.8h, v4.8h, #2 // (a-b)/4
+ sshr v22.8h, v22.8h, #2 // (a-b)/4
+ sub v4.8h, v4.8h, v5.8h // (a-b)/4-b+c
+ sub v22.8h, v22.8h, v23.8h // (a-b)/4-b+c
+ sshr v4.8h, v4.8h, #2 // ((a-b)/4-b+c)/4
+ sshr v22.8h, v22.8h, #2 // ((a-b)/4-b+c)/4
+ add v4.8h, v4.8h, v6.8h // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+ add v22.8h, v22.8h, v24.8h // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+
+ sqrshrun v4.8b, v4.8h, #6
+ ld1 {v28.16b}, [x7], #16
+ mov v0.16b, v2.16b
+ ext v23.16b, v7.16b, v18.16b, #15
+ sqrshrun2 v4.16b, v22.8h, #6
+ mov v1.16b, v3.16b
+ ext v22.16b, v7.16b, v18.16b, #14
+ ext v24.16b, v18.16b, v28.16b, #1
+ ext v25.16b, v18.16b, v28.16b, #2
+ ext v26.16b, v18.16b, v28.16b, #3
+
+ st1 {v4.16b}, [x2], #16
+ b.gt 2b
+
+ subs w6, w6, #1
+ add x10, x10, x4
+ add x11, x11, x4
+ add x12, x12, x4
+ add x13, x13, x4
+ b.gt 1b
+
+ ret
+endfunc
+
+// frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth,
+// uint8_t *dstv, uint8_t *dstc, intptr_t src_stride,
+// intptr_t dst_stride, int width, int height )
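+// Each lowres plane is a 2x downsample built from rounding averages, roughly
+//   dst0[x] = avg( avg(src0[2x], src1[2x]), avg(src0[2x+1], src1[2x+1]) )
+// with dsth/dstv/dstc shifting the 2x2 window right by one column, down by
+// one row, or both (urhadd supplies the (a+b+1)>>1 steps).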
+function x264_frame_init_lowres_core_neon, export=1
+ ldr w8, [sp]
+ sub x10, x6, w7, uxtw // dst_stride - width
+ and x10, x10, #~15
+
+1:
+ mov w9, w7 // width
+ mov x11, x0 // src0
+ add x12, x0, x5 // src1 = src0 + src_stride
+ add x13, x0, x5, lsl #1 // src2 = src1 + src_stride
+
+ ld2 {v0.16b,v1.16b}, [x11], #32
+ ld2 {v2.16b,v3.16b}, [x12], #32
+ ld2 {v4.16b,v5.16b}, [x13], #32
+
+ urhadd v20.16b, v0.16b, v2.16b // s0[2x] + s1[2x]
+ urhadd v22.16b, v2.16b, v4.16b // s1[2x] + s2[2x]
+2:
+ subs w9, w9, #16
+ urhadd v21.16b, v1.16b, v3.16b // s0[2x+1] + s1[2x+1]
+ urhadd v23.16b, v3.16b, v5.16b // s1[2x+1] + s2[2x+1]
+
+ ld2 {v0.16b,v1.16b}, [x11], #32
+ ld2 {v2.16b,v3.16b}, [x12], #32
+ ld2 {v4.16b,v5.16b}, [x13], #32
+ urhadd v30.16b, v0.16b, v2.16b // loop: s0[2x] + s1[2x]
+ urhadd v31.16b, v2.16b, v4.16b // loop: s1[2x] + s2[2x]
+ ext v24.16b, v20.16b, v30.16b, #1 // s0[2x+2] + s1[2x+2]
+ ext v25.16b, v22.16b, v31.16b, #1 // s1[2x+2] + s2[2x+2]
+
+ urhadd v16.16b, v20.16b, v21.16b
+ urhadd v18.16b, v22.16b, v23.16b
+ urhadd v17.16b, v21.16b, v24.16b
+ urhadd v19.16b, v23.16b, v25.16b
+
+ st1 {v16.16b}, [x1], #16
+ st1 {v18.16b}, [x3], #16
+ st1 {v17.16b}, [x2], #16
+ st1 {v19.16b}, [x4], #16
+ b.le 3f
+
+ subs w9, w9, #16
+ urhadd v21.16b, v1.16b, v3.16b // s0[2x+1] + s1[2x+1]
+ urhadd v23.16b, v3.16b, v5.16b // s1[2x+1] + s2[2x+1]
+
+ ld2 {v0.16b,v1.16b}, [x11], #32
+ ld2 {v2.16b,v3.16b}, [x12], #32
+ ld2 {v4.16b,v5.16b}, [x13], #32
+ urhadd v20.16b, v0.16b, v2.16b // loop: s0[2x] + s1[2x]
+ urhadd v22.16b, v2.16b, v4.16b // loop: s1[2x] + s2[2x]
+ ext v24.16b, v30.16b, v20.16b, #1 // s0[2x+2] + s1[2x+2]
+ ext v25.16b, v31.16b, v22.16b, #1 // s1[2x+2] + s2[2x+2]
+
+ urhadd v16.16b, v30.16b, v21.16b
+ urhadd v18.16b, v31.16b, v23.16b
+ urhadd v17.16b, v21.16b, v24.16b
+ urhadd v19.16b, v23.16b, v25.16b
+
+ st1 {v16.16b}, [x1], #16
+ st1 {v18.16b}, [x3], #16
+ st1 {v17.16b}, [x2], #16
+ st1 {v19.16b}, [x4], #16
+ b.gt 2b
+3:
+ subs w8, w8, #1
+ add x0, x0, x5, lsl #1
+ add x1, x1, x10
+ add x2, x2, x10
+ add x3, x3, x10
+ add x4, x4, x10
+ b.gt 1b
+
+ ret
+endfunc
+
+function x264_load_deinterleave_chroma_fenc_neon, export=1
+ mov x4, #FENC_STRIDE/2
+ b load_deinterleave_chroma
+endfunc
+
+function x264_load_deinterleave_chroma_fdec_neon, export=1
+ mov x4, #FDEC_STRIDE/2
+load_deinterleave_chroma:
+ ld2 {v0.8b,v1.8b}, [x1], x2
+ ld2 {v2.8b,v3.8b}, [x1], x2
+ subs w3, w3, #2
+ st1 {v0.8b}, [x0], x4
+ st1 {v1.8b}, [x0], x4
+ st1 {v2.8b}, [x0], x4
+ st1 {v3.8b}, [x0], x4
+ b.gt load_deinterleave_chroma
+
+ ret
+endfunc
+
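+// The plane_copy helpers round the width up to a multiple of 16 and
+// pre-adjust the strides accordingly, so each row is handled in whole
+// 16-byte chunks (the overshoot is assumed to land in the frame padding).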
+function x264_plane_copy_deinterleave_neon, export=1
+ add w9, w6, #15
+ and w9, w9, #0xfffffff0
+ sub x1, x1, x9
+ sub x3, x3, x9
+ sub x5, x5, x9, lsl #1
+1:
+ ld2 {v0.16b,v1.16b}, [x4], #32
+ subs w9, w9, #16
+ st1 {v0.16b}, [x0], #16
+ st1 {v1.16b}, [x2], #16
+ b.gt 1b
+
+ add x4, x4, x5
+ subs w7, w7, #1
+ add x0, x0, x1
+ add x2, x2, x3
+ mov w9, w6
+ b.gt 1b
+
+ ret
+endfunc
+
+function x264_plane_copy_deinterleave_rgb_neon, export=1
+ ldr x8, [sp]
+ ldp x9, x10, [sp, #8]
+ cmp w8, #3
+ uxtw x9, w9
+ add x11, x9, #7
+ and x11, x11, #~7
+ sub x1, x1, x11
+ sub x3, x3, x11
+ sub x5, x5, x11
+ b.ne 4f
+ sub x7, x7, x11, lsl #1
+ sub x7, x7, x11
+block3:
+ ld3 {v0.8b,v1.8b,v2.8b}, [x6], #24
+ subs x11, x11, #8
+ st1 {v0.8b}, [x0], #8
+ st1 {v1.8b}, [x2], #8
+ st1 {v2.8b}, [x4], #8
+ b.gt block3
+
+ subs w10, w10, #1
+ add x0, x0, x1
+ add x2, x2, x3
+ add x4, x4, x5
+ add x6, x6, x7
+ mov x11, x9
+ b.gt block3
+
+ ret
+4:
+ sub x7, x7, x11, lsl #2
+block4:
+ ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [x6], #32
+ subs x11, x11, #8
+ st1 {v0.8b}, [x0], #8
+ st1 {v1.8b}, [x2], #8
+ st1 {v2.8b}, [x4], #8
+ b.gt block4
+
+ subs w10, w10, #1
+ add x0, x0, x1
+ add x2, x2, x3
+ add x4, x4, x5
+ add x6, x6, x7
+ mov x11, x9
+ b.gt block4
+
+ ret
+endfunc
+
+function x264_plane_copy_interleave_neon, export=1
+ add w9, w6, #15
+ and w9, w9, #0xfffffff0
+ sub x1, x1, x9, lsl #1
+ sub x3, x3, x9
+ sub x5, x5, x9
+1:
+ ld1 {v0.16b}, [x2], #16
+ ld1 {v1.16b}, [x4], #16
+ subs w9, w9, #16
+ st2 {v0.16b,v1.16b}, [x0], #32
+ b.gt 1b
+
+ subs w7, w7, #1
+ add x0, x0, x1
+ add x2, x2, x3
+ add x4, x4, x5
+ mov w9, w6
+ b.gt 1b
+
+ ret
+endfunc
+
+function x264_store_interleave_chroma_neon, export=1
+ mov x5, #FDEC_STRIDE
+1:
+ ld1 {v0.8b}, [x2], x5
+ ld1 {v1.8b}, [x3], x5
+ ld1 {v2.8b}, [x2], x5
+ ld1 {v3.8b}, [x3], x5
+ subs w4, w4, #2
+ zip1 v4.16b, v0.16b, v1.16b
+ zip1 v5.16b, v2.16b, v3.16b
+ st1 {v4.16b}, [x0], x1
+ st1 {v5.16b}, [x0], x1
+ b.gt 1b
+
+ ret
+endfunc
diff --git a/common/aarch64/mc-c.c b/common/aarch64/mc-c.c
new file mode 100644
index 0000000..5554268
--- /dev/null
+++ b/common/aarch64/mc-c.c
@@ -0,0 +1,253 @@
+/*****************************************************************************
+ * mc-c.c: aarch64 motion compensation
+ *****************************************************************************
+ * Copyright (C) 2009-2014 x264 project
+ *
+ * Authors: David Conrad <lessen42 at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+#include "common/common.h"
+#include "mc.h"
+
+void x264_prefetch_ref_aarch64( uint8_t *, intptr_t, int );
+void x264_prefetch_fenc_420_aarch64( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_prefetch_fenc_422_aarch64( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+
+void *x264_memcpy_aligned_neon( void *dst, const void *src, size_t n );
+void x264_memzero_aligned_neon( void *dst, size_t n );
+
+void x264_pixel_avg_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_16x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_8x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_8x4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_4x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_4x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_4x4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_4x2_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+
+void x264_pixel_avg2_w4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
+void x264_pixel_avg2_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
+void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
+void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
+
+void x264_plane_copy_deinterleave_neon( pixel *dstu, intptr_t i_dstu,
+ pixel *dstv, intptr_t i_dstv,
+ pixel *src, intptr_t i_src, int w, int h );
+void x264_plane_copy_deinterleave_rgb_neon( pixel *dsta, intptr_t i_dsta,
+ pixel *dstb, intptr_t i_dstb,
+ pixel *dstc, intptr_t i_dstc,
+ pixel *src, intptr_t i_src, int pw, int w, int h );
+void x264_plane_copy_interleave_neon( pixel *dst, intptr_t i_dst,
+ pixel *srcu, intptr_t i_srcu,
+ pixel *srcv, intptr_t i_srcv, int w, int h );
+
+void x264_store_interleave_chroma_neon( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
+void x264_load_deinterleave_chroma_fdec_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
+void x264_load_deinterleave_chroma_fenc_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
+
+#define MC_WEIGHT(func)\
+void x264_mc_weight_w20##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
+void x264_mc_weight_w16##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
+void x264_mc_weight_w8##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
+void x264_mc_weight_w4##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
+\
+static void (* x264_mc##func##_wtab_neon[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int ) =\
+{\
+ x264_mc_weight_w4##func##_neon,\
+ x264_mc_weight_w4##func##_neon,\
+ x264_mc_weight_w8##func##_neon,\
+ x264_mc_weight_w16##func##_neon,\
+ x264_mc_weight_w16##func##_neon,\
+ x264_mc_weight_w20##func##_neon,\
+};
+
+MC_WEIGHT()
+MC_WEIGHT(_nodenom)
+MC_WEIGHT(_offsetadd)
+MC_WEIGHT(_offsetsub)
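+/* The tables above are indexed by width>>2 in mc_luma/get_ref below, so
+ * widths 4/8/12/16/20 map to entries 1..5; width 12 simply reuses the w16
+ * kernel. */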
+
+void x264_mc_copy_w4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_mc_copy_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_mc_copy_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+
+void x264_mc_chroma_neon( uint8_t *, uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, int, int );
+void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, intptr_t, int, int );
+
+void x264_hpel_filter_v_neon( uint8_t *, uint8_t *, int16_t *, intptr_t, int );
+void x264_hpel_filter_c_neon( uint8_t *, int16_t *, int );
+void x264_hpel_filter_h_neon( uint8_t *, uint8_t *, int );
+
+#if !HIGH_BIT_DEPTH
+static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w )
+{
+ if( w->i_scale == 1<<w->i_denom )
+ {
+ if( w->i_offset < 0 )
+ {
+ w->weightfn = x264_mc_offsetsub_wtab_neon;
+ w->cachea[0] = -w->i_offset;
+ }
+ else
+ {
+ w->weightfn = x264_mc_offsetadd_wtab_neon;
+ w->cachea[0] = w->i_offset;
+ }
+ }
+ else if( !w->i_denom )
+ w->weightfn = x264_mc_nodenom_wtab_neon;
+ else
+ w->weightfn = x264_mc_wtab_neon;
+}
+
+static void (* const x264_pixel_avg_wtab_neon[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ) =
+{
+ NULL,
+ x264_pixel_avg2_w4_neon,
+ x264_pixel_avg2_w8_neon,
+ x264_pixel_avg2_w16_neon, // no slower than w12, so no point in a separate function
+ x264_pixel_avg2_w16_neon,
+ x264_pixel_avg2_w20_neon,
+};
+
+static void (* const x264_mc_copy_wtab_neon[5])( uint8_t *, intptr_t, uint8_t *, intptr_t, int ) =
+{
+ NULL,
+ x264_mc_copy_w4_neon,
+ x264_mc_copy_w8_neon,
+ NULL,
+ x264_mc_copy_w16_neon,
+};
+
+static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
+static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
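+/* qpel_idx = ((mvy&3)<<2) + (mvx&3) enumerates the 16 quarter-pel positions;
+ * hpel_ref0/1 select which of the four half-pel planes (0 = full, 1 = h,
+ * 2 = v, 3 = c) are averaged for each position, e.g. qpel_idx 1 averages
+ * planes 1 and 0 to produce a horizontal quarter-pel sample. */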
+
+static void mc_luma_neon( uint8_t *dst, intptr_t i_dst_stride,
+ uint8_t *src[4], intptr_t i_src_stride,
+ int mvx, int mvy,
+ int i_width, int i_height, const x264_weight_t *weight )
+{
+ int qpel_idx = ((mvy&3)<<2) + (mvx&3);
+ intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
+ uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset;
+ if ( (mvy&3) == 3 ) // explicit if() to force conditional add
+ src1 += i_src_stride;
+
+ if( qpel_idx & 5 ) /* qpel interpolation needed */
+ {
+ uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
+ x264_pixel_avg_wtab_neon[i_width>>2](
+ dst, i_dst_stride, src1, i_src_stride,
+ src2, i_height );
+ if( weight->weightfn )
+ weight->weightfn[i_width>>2]( dst, i_dst_stride, dst, i_dst_stride, weight, i_height );
+ }
+ else if( weight->weightfn )
+ weight->weightfn[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, weight, i_height );
+ else
+ x264_mc_copy_wtab_neon[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, i_height );
+}
+
+static uint8_t *get_ref_neon( uint8_t *dst, intptr_t *i_dst_stride,
+ uint8_t *src[4], intptr_t i_src_stride,
+ int mvx, int mvy,
+ int i_width, int i_height, const x264_weight_t *weight )
+{
+ int qpel_idx = ((mvy&3)<<2) + (mvx&3);
+ intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
+ uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset;
+ if ( (mvy&3) == 3 ) // explicit if() to force conditional add
+ src1 += i_src_stride;
+
+ if( qpel_idx & 5 ) /* qpel interpolation needed */
+ {
+ uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
+ x264_pixel_avg_wtab_neon[i_width>>2](
+ dst, *i_dst_stride, src1, i_src_stride,
+ src2, i_height );
+ if( weight->weightfn )
+ weight->weightfn[i_width>>2]( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_height );
+ return dst;
+ }
+ else if( weight->weightfn )
+ {
+ weight->weightfn[i_width>>2]( dst, *i_dst_stride, src1, i_src_stride, weight, i_height );
+ return dst;
+ }
+ else
+ {
+ *i_dst_stride = i_src_stride;
+ return src1;
+ }
+}
+
+void x264_hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
+ uint8_t *src, intptr_t stride, int width,
+ int height, int16_t *buf );
+#endif // !HIGH_BIT_DEPTH
+
+void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf )
+{
+#if !HIGH_BIT_DEPTH
+ if( cpu&X264_CPU_ARMV8 )
+ {
+ pf->prefetch_fenc_420 = x264_prefetch_fenc_420_aarch64;
+ pf->prefetch_fenc_422 = x264_prefetch_fenc_422_aarch64;
+ pf->prefetch_ref = x264_prefetch_ref_aarch64;
+ }
+
+ if( !(cpu&X264_CPU_NEON) )
+ return;
+
+ pf->copy_16x16_unaligned = x264_mc_copy_w16_neon;
+ pf->copy[PIXEL_16x16] = x264_mc_copy_w16_neon;
+ pf->copy[PIXEL_8x8] = x264_mc_copy_w8_neon;
+ pf->copy[PIXEL_4x4] = x264_mc_copy_w4_neon;
+
+ pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_neon;
+ pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon;
+ pf->plane_copy_interleave = x264_plane_copy_interleave_neon;
+
+ pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_neon;
+ pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_neon;
+ pf->store_interleave_chroma = x264_store_interleave_chroma_neon;
+
+ pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_neon;
+ pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_neon;
+ pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_neon;
+ pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_neon;
+ pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_neon;
+ pf->avg[PIXEL_4x16] = x264_pixel_avg_4x16_neon;
+ pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_neon;
+ pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_neon;
+ pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_neon;
+
+ pf->weight = x264_mc_wtab_neon;
+ pf->offsetadd = x264_mc_offsetadd_wtab_neon;
+ pf->offsetsub = x264_mc_offsetsub_wtab_neon;
+ pf->weight_cache = x264_weight_cache_neon;
+
+ pf->mc_chroma = x264_mc_chroma_neon;
+ pf->mc_luma = mc_luma_neon;
+ pf->get_ref = get_ref_neon;
+ pf->hpel_filter = x264_hpel_filter_neon;
+ pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon;
+#endif // !HIGH_BIT_DEPTH
+}
diff --git a/common/aarch64/mc.h b/common/aarch64/mc.h
new file mode 100644
index 0000000..feba321
--- /dev/null
+++ b/common/aarch64/mc.h
@@ -0,0 +1,29 @@
+/*****************************************************************************
+ * mc.h: aarch64 motion compensation
+ *****************************************************************************
+ * Copyright (C) 2014 x264 project
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+#ifndef X264_AARCH64_MC_H
+#define X264_AARCH64_MC_H
+
+void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf );
+
+#endif
diff --git a/common/mc.c b/common/mc.c
index 6797f0a..6a8b1b8 100644
--- a/common/mc.c
+++ b/common/mc.c
@@ -35,6 +35,9 @@
#if ARCH_ARM
#include "arm/mc.h"
#endif
+#if ARCH_AARCH64
+#include "aarch64/mc.h"
+#endif
static inline void pixel_avg( pixel *dst, intptr_t i_dst_stride,
@@ -641,6 +644,9 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent )
#if HAVE_ARMV6
x264_mc_init_arm( cpu, pf );
#endif
+#if ARCH_AARCH64
+ x264_mc_init_aarch64( cpu, pf );
+#endif
if( cpu_independent )
{
--
2.0.0