[x265] [PATCH] Add aarch64 support - Part 1
Xiyuan Wang
wangxiyuan1007 at gmail.com
Thu Feb 27 03:31:39 CET 2020
From: wangxiyuan <wangxiyuan at huawei.com>
This patch adds some common assembly-optimized functions for the aarch64
platform. These functions won't work until the patch Part 2 is merged.
---
source/common/aarch64/asm-primitives.cpp | 219 ++++++++++++
source/common/aarch64/asm.S | 69 ++++
source/common/aarch64/ipfilter8.S | 414 ++++++++++++++++++++++
source/common/aarch64/ipfilter8.h | 55 +++
source/common/aarch64/mc-a.S | 63 ++++
source/common/aarch64/pixel-util.S | 419 +++++++++++++++++++++++
source/common/aarch64/pixel-util.h | 40 +++
source/common/aarch64/pixel.h | 105 ++++++
source/common/aarch64/sad-a.S | 105 ++++++
9 files changed, 1489 insertions(+)
create mode 100644 source/common/aarch64/asm-primitives.cpp
create mode 100644 source/common/aarch64/asm.S
create mode 100644 source/common/aarch64/ipfilter8.S
create mode 100644 source/common/aarch64/ipfilter8.h
create mode 100644 source/common/aarch64/mc-a.S
create mode 100644 source/common/aarch64/pixel-util.S
create mode 100644 source/common/aarch64/pixel-util.h
create mode 100644 source/common/aarch64/pixel.h
create mode 100644 source/common/aarch64/sad-a.S
diff --git a/source/common/aarch64/asm-primitives.cpp
b/source/common/aarch64/asm-primitives.cpp
new file mode 100644
index 000000000..6fe8c968c
--- /dev/null
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -0,0 +1,219 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Hongbin Liu <liuhongbin1 at huawei.com>
+ * Yimeng Su <yimeng.su at huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111,
USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+
*****************************************************************************/
+
+#include "common.h"
+#include "primitives.h"
+#include "x265.h"
+#include "cpu.h"
+
+
+#if defined(__GNUC__)
+#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 +
__GNUC_PATCHLEVEL__)
+#endif
+
+#define GCC_4_9_0 40900
+#define GCC_5_1_0 50100
+
+extern "C" {
+#include "pixel.h"
+#include "pixel-util.h"
+#include "ipfilter8.h"
+}
+
+namespace X265_NS {
+// private x265 namespace
+
+
+template<int size>
+void interp_8tap_hv_pp_cpu(const pixel* src, intptr_t srcStride, pixel*
dst, intptr_t dstStride, int idxX, int idxY)
+{
+ ALIGN_VAR_32(int16_t, immed[MAX_CU_SIZE * (MAX_CU_SIZE + NTAPS_LUMA -
1)]);
+ const int halfFilterSize = NTAPS_LUMA >> 1;
+ const int immedStride = MAX_CU_SIZE;
+
+ primitives.pu[size].luma_hps(src, srcStride, immed, immedStride, idxX,
1);
+ primitives.pu[size].luma_vsp(immed + (halfFilterSize - 1) *
immedStride, immedStride, dst, dstStride, idxY);
+}
+
+
+/* Temporary workaround because luma_vsp assembly primitive has not been
completed
+ * but interp_8tap_hv_pp_cpu uses mixed C primitive and assembly primitive.
+ * Otherwise, segment fault occurs. */
+void setupAliasCPrimitives(EncoderPrimitives &cp, EncoderPrimitives &asmp,
int cpuMask)
+{
+ if (cpuMask & X265_CPU_NEON)
+ {
+ asmp.pu[LUMA_8x4].luma_vsp = cp.pu[LUMA_8x4].luma_vsp;
+ asmp.pu[LUMA_8x8].luma_vsp = cp.pu[LUMA_8x8].luma_vsp;
+ asmp.pu[LUMA_8x16].luma_vsp = cp.pu[LUMA_8x16].luma_vsp;
+ asmp.pu[LUMA_8x32].luma_vsp = cp.pu[LUMA_8x32].luma_vsp;
+ asmp.pu[LUMA_12x16].luma_vsp = cp.pu[LUMA_12x16].luma_vsp;
+#if !AUTO_VECTORIZE || GCC_VERSION < GCC_5_1_0 /* gcc_version < gcc-5.1.0
*/
+ asmp.pu[LUMA_16x4].luma_vsp = cp.pu[LUMA_16x4].luma_vsp;
+ asmp.pu[LUMA_16x8].luma_vsp = cp.pu[LUMA_16x8].luma_vsp;
+ asmp.pu[LUMA_16x12].luma_vsp = cp.pu[LUMA_16x12].luma_vsp;
+ asmp.pu[LUMA_16x16].luma_vsp = cp.pu[LUMA_16x16].luma_vsp;
+ asmp.pu[LUMA_16x32].luma_vsp = cp.pu[LUMA_16x32].luma_vsp;
+ asmp.pu[LUMA_16x64].luma_vsp = cp.pu[LUMA_16x64].luma_vsp;
+ asmp.pu[LUMA_32x16].luma_vsp = cp.pu[LUMA_32x16].luma_vsp;
+ asmp.pu[LUMA_32x24].luma_vsp = cp.pu[LUMA_32x24].luma_vsp;
+ asmp.pu[LUMA_32x32].luma_vsp = cp.pu[LUMA_32x32].luma_vsp;
+ asmp.pu[LUMA_32x64].luma_vsp = cp.pu[LUMA_32x64].luma_vsp;
+ asmp.pu[LUMA_48x64].luma_vsp = cp.pu[LUMA_48x64].luma_vsp;
+ asmp.pu[LUMA_64x16].luma_vsp = cp.pu[LUMA_64x16].luma_vsp;
+ asmp.pu[LUMA_64x32].luma_vsp = cp.pu[LUMA_64x32].luma_vsp;
+ asmp.pu[LUMA_64x48].luma_vsp = cp.pu[LUMA_64x48].luma_vsp;
+ asmp.pu[LUMA_64x64].luma_vsp = cp.pu[LUMA_64x64].luma_vsp;
+#if !AUTO_VECTORIZE || GCC_VERSION < GCC_4_9_0 /* gcc_version < gcc-4.9.0
*/
+ asmp.pu[LUMA_4x4].luma_vsp = cp.pu[LUMA_4x4].luma_vsp;
+ asmp.pu[LUMA_4x8].luma_vsp = cp.pu[LUMA_4x8].luma_vsp;
+ asmp.pu[LUMA_4x16].luma_vsp = cp.pu[LUMA_4x16].luma_vsp;
+ asmp.pu[LUMA_24x32].luma_vsp = cp.pu[LUMA_24x32].luma_vsp;
+ asmp.pu[LUMA_32x8].luma_vsp = cp.pu[LUMA_32x8].luma_vsp;
+#endif
+#endif
+ }
+}
+
+
+void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask)
+{
+ if (cpuMask & X265_CPU_NEON)
+ {
+ p.pu[LUMA_4x4].satd = PFX(pixel_satd_4x4_neon);
+ p.pu[LUMA_4x8].satd = PFX(pixel_satd_4x8_neon);
+ p.pu[LUMA_4x16].satd = PFX(pixel_satd_4x16_neon);
+ p.pu[LUMA_8x4].satd = PFX(pixel_satd_8x4_neon);
+ p.pu[LUMA_8x8].satd = PFX(pixel_satd_8x8_neon);
+ p.pu[LUMA_12x16].satd = PFX(pixel_satd_12x16_neon);
+
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd =
PFX(pixel_satd_4x4_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd =
PFX(pixel_satd_4x8_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd =
PFX(pixel_satd_4x16_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd =
PFX(pixel_satd_8x4_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd =
PFX(pixel_satd_8x8_neon);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd =
PFX(pixel_satd_12x16_neon);
+
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd =
PFX(pixel_satd_4x4_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd =
PFX(pixel_satd_4x8_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd =
PFX(pixel_satd_4x16_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd =
PFX(pixel_satd_4x32_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd =
PFX(pixel_satd_8x4_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd =
PFX(pixel_satd_8x8_neon);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd =
PFX(pixel_satd_12x32_neon);
+
+ p.pu[LUMA_4x4].pixelavg_pp[NONALIGNED] =
PFX(pixel_avg_pp_4x4_neon);
+ p.pu[LUMA_4x8].pixelavg_pp[NONALIGNED] =
PFX(pixel_avg_pp_4x8_neon);
+ p.pu[LUMA_4x16].pixelavg_pp[NONALIGNED] =
PFX(pixel_avg_pp_4x16_neon);
+ p.pu[LUMA_8x4].pixelavg_pp[NONALIGNED] =
PFX(pixel_avg_pp_8x4_neon);
+ p.pu[LUMA_8x8].pixelavg_pp[NONALIGNED] =
PFX(pixel_avg_pp_8x8_neon);
+ p.pu[LUMA_8x16].pixelavg_pp[NONALIGNED] =
PFX(pixel_avg_pp_8x16_neon);
+ p.pu[LUMA_8x32].pixelavg_pp[NONALIGNED] =
PFX(pixel_avg_pp_8x32_neon);
+
+ p.pu[LUMA_4x4].pixelavg_pp[ALIGNED] = PFX(pixel_avg_pp_4x4_neon);
+ p.pu[LUMA_4x8].pixelavg_pp[ALIGNED] = PFX(pixel_avg_pp_4x8_neon);
+ p.pu[LUMA_4x16].pixelavg_pp[ALIGNED] =
PFX(pixel_avg_pp_4x16_neon);
+ p.pu[LUMA_8x4].pixelavg_pp[ALIGNED] = PFX(pixel_avg_pp_8x4_neon);
+ p.pu[LUMA_8x8].pixelavg_pp[ALIGNED] = PFX(pixel_avg_pp_8x8_neon);
+ p.pu[LUMA_8x16].pixelavg_pp[ALIGNED] =
PFX(pixel_avg_pp_8x16_neon);
+ p.pu[LUMA_8x32].pixelavg_pp[ALIGNED] =
PFX(pixel_avg_pp_8x32_neon);
+
+ p.pu[LUMA_8x4].sad_x3 = PFX(sad_x3_8x4_neon);
+ p.pu[LUMA_8x8].sad_x3 = PFX(sad_x3_8x8_neon);
+ p.pu[LUMA_8x16].sad_x3 = PFX(sad_x3_8x16_neon);
+ p.pu[LUMA_8x32].sad_x3 = PFX(sad_x3_8x32_neon);
+
+ p.pu[LUMA_8x4].sad_x4 = PFX(sad_x4_8x4_neon);
+ p.pu[LUMA_8x8].sad_x4 = PFX(sad_x4_8x8_neon);
+ p.pu[LUMA_8x16].sad_x4 = PFX(sad_x4_8x16_neon);
+ p.pu[LUMA_8x32].sad_x4 = PFX(sad_x4_8x32_neon);
+
+ // quant
+ p.quant = PFX(quant_neon);
+ // luma_hps
+ p.pu[LUMA_4x4].luma_hps = PFX(interp_8tap_horiz_ps_4x4_neon);
+ p.pu[LUMA_4x8].luma_hps = PFX(interp_8tap_horiz_ps_4x8_neon);
+ p.pu[LUMA_4x16].luma_hps = PFX(interp_8tap_horiz_ps_4x16_neon);
+ p.pu[LUMA_8x4].luma_hps = PFX(interp_8tap_horiz_ps_8x4_neon);
+ p.pu[LUMA_8x8].luma_hps = PFX(interp_8tap_horiz_ps_8x8_neon);
+ p.pu[LUMA_8x16].luma_hps = PFX(interp_8tap_horiz_ps_8x16_neon);
+ p.pu[LUMA_8x32].luma_hps = PFX(interp_8tap_horiz_ps_8x32_neon);
+ p.pu[LUMA_12x16].luma_hps = PFX(interp_8tap_horiz_ps_12x16_neon);
+ p.pu[LUMA_24x32].luma_hps = PFX(interp_8tap_horiz_ps_24x32_neon);
+#if !AUTO_VECTORIZE || GCC_VERSION < GCC_5_1_0 /* gcc_version < gcc-5.1.0
*/
+ p.pu[LUMA_16x4].luma_hps = PFX(interp_8tap_horiz_ps_16x4_neon);
+ p.pu[LUMA_16x8].luma_hps = PFX(interp_8tap_horiz_ps_16x8_neon);
+ p.pu[LUMA_16x12].luma_hps = PFX(interp_8tap_horiz_ps_16x12_neon);
+ p.pu[LUMA_16x16].luma_hps = PFX(interp_8tap_horiz_ps_16x16_neon);
+ p.pu[LUMA_16x32].luma_hps = PFX(interp_8tap_horiz_ps_16x32_neon);
+ p.pu[LUMA_16x64].luma_hps = PFX(interp_8tap_horiz_ps_16x64_neon);
+ p.pu[LUMA_32x8].luma_hps = PFX(interp_8tap_horiz_ps_32x8_neon);
+ p.pu[LUMA_32x16].luma_hps = PFX(interp_8tap_horiz_ps_32x16_neon);
+ p.pu[LUMA_32x24].luma_hps = PFX(interp_8tap_horiz_ps_32x24_neon);
+ p.pu[LUMA_32x32].luma_hps = PFX(interp_8tap_horiz_ps_32x32_neon);
+ p.pu[LUMA_32x64].luma_hps = PFX(interp_8tap_horiz_ps_32x64_neon);
+ p.pu[LUMA_48x64].luma_hps = PFX(interp_8tap_horiz_ps_48x64_neon);
+ p.pu[LUMA_64x16].luma_hps = PFX(interp_8tap_horiz_ps_64x16_neon);
+ p.pu[LUMA_64x32].luma_hps = PFX(interp_8tap_horiz_ps_64x32_neon);
+ p.pu[LUMA_64x48].luma_hps = PFX(interp_8tap_horiz_ps_64x48_neon);
+ p.pu[LUMA_64x64].luma_hps = PFX(interp_8tap_horiz_ps_64x64_neon);
+#endif
+
+ p.pu[LUMA_8x4].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_8x4>;
+ p.pu[LUMA_8x8].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_8x8>;
+ p.pu[LUMA_8x16].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_8x16>;
+ p.pu[LUMA_8x32].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_8x32>;
+ p.pu[LUMA_12x16].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_12x16>;
+#if !AUTO_VECTORIZE || GCC_VERSION < GCC_5_1_0 /* gcc_version < gcc-5.1.0
*/
+ p.pu[LUMA_16x4].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x4>;
+ p.pu[LUMA_16x8].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x8>;
+ p.pu[LUMA_16x12].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x12>;
+ p.pu[LUMA_16x16].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x16>;
+ p.pu[LUMA_16x32].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x32>;
+ p.pu[LUMA_16x64].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x64>;
+ p.pu[LUMA_32x16].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_32x16>;
+ p.pu[LUMA_32x24].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_32x24>;
+ p.pu[LUMA_32x32].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_32x32>;
+ p.pu[LUMA_32x64].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_32x64>;
+ p.pu[LUMA_48x64].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_48x64>;
+ p.pu[LUMA_64x16].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_64x16>;
+ p.pu[LUMA_64x32].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_64x32>;
+ p.pu[LUMA_64x48].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_64x48>;
+ p.pu[LUMA_64x64].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_64x64>;
+#if !AUTO_VECTORIZE || GCC_VERSION < GCC_4_9_0 /* gcc_version < gcc-4.9.0
*/
+ p.pu[LUMA_4x4].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_4x4>;
+ p.pu[LUMA_4x8].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_4x8>;
+ p.pu[LUMA_4x16].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_4x16>;
+ p.pu[LUMA_24x32].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_24x32>;
+ p.pu[LUMA_32x8].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_32x8>;
+#endif
+#endif
+
+#if !HIGH_BIT_DEPTH
+ p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_4x4_neon);
+#endif // !HIGH_BIT_DEPTH
+
+ }
+}
+} // namespace X265_NS
diff --git a/source/common/aarch64/asm.S b/source/common/aarch64/asm.S
new file mode 100644
index 000000000..5f020a11a
--- /dev/null
+++ b/source/common/aarch64/asm.S
@@ -0,0 +1,69 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Hongbin Liu <liuhongbin1 at huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111,
USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+
*****************************************************************************/
+
+.arch armv8-a
+
+#ifdef PREFIX
+#define EXTERN_ASM _
+#else
+#define EXTERN_ASM
+#endif
+
+#ifdef __ELF__
+#define ELF
+#else
+#define ELF @
+#endif
+
+#define HAVE_AS_FUNC 1
+
+#if HAVE_AS_FUNC
+#define FUNC
+#else
+#define FUNC @
+#endif
+
+.macro function name, export=1
+ .macro endfunc
+ELF .size \name, . - \name
+FUNC .endfunc
+ .purgem endfunc
+ .endm
+ .align 2
+.if \export == 1
+ .global EXTERN_ASM\name
+ELF .hidden EXTERN_ASM\name
+ELF .type EXTERN_ASM\name, %function
+FUNC .func EXTERN_ASM\name
+EXTERN_ASM\name:
+.else
+ELF .hidden \name
+ELF .type \name, %function
+FUNC .func \name
+\name:
+.endif
+.endm
+
+
+#define FENC_STRIDE 64
+#define FDEC_STRIDE 32
diff --git a/source/common/aarch64/ipfilter8.S
b/source/common/aarch64/ipfilter8.S
new file mode 100644
index 000000000..908c7db46
--- /dev/null
+++ b/source/common/aarch64/ipfilter8.S
@@ -0,0 +1,414 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Yimeng Su <yimeng.su at huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111,
USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+
*****************************************************************************/
+
+#include "asm.S"
+
+.section .rodata
+
+.align 4
+
+.text
+
+
+
+.macro qpel_filter_0_32b
+ movi v24.8h, #64
+ uxtl v19.8h, v5.8b
+ smull v17.4s, v19.4h, v24.4h
+ smull2 v18.4s, v19.8h, v24.8h
+.endm
+
+.macro qpel_filter_1_32b
+ movi v16.8h, #58
+ uxtl v19.8h, v5.8b
+ smull v17.4s, v19.4h, v16.4h
+ smull2 v18.4s, v19.8h, v16.8h
+
+ movi v24.8h, #10
+ uxtl v21.8h, v1.8b
+ smull v19.4s, v21.4h, v24.4h
+ smull2 v20.4s, v21.8h, v24.8h
+
+ movi v16.8h, #17
+ uxtl v23.8h, v2.8b
+ smull v21.4s, v23.4h, v16.4h
+ smull2 v22.4s, v23.8h, v16.8h
+
+ movi v24.8h, #5
+ uxtl v1.8h, v6.8b
+ smull v23.4s, v1.4h, v24.4h
+ smull2 v16.4s, v1.8h, v24.8h
+
+ sub v17.4s, v17.4s, v19.4s
+ sub v18.4s, v18.4s, v20.4s
+
+ uxtl v1.8h, v4.8b
+ sshll v19.4s, v1.4h, #2
+ sshll2 v20.4s, v1.8h, #2
+
+ add v17.4s, v17.4s, v21.4s
+ add v18.4s, v18.4s, v22.4s
+
+ uxtl v1.8h, v0.8b
+ uxtl v2.8h, v3.8b
+ ssubl v21.4s, v2.4h, v1.4h
+ ssubl2 v22.4s, v2.8h, v1.8h
+
+ add v17.4s, v17.4s, v19.4s
+ add v18.4s, v18.4s, v20.4s
+ sub v21.4s, v21.4s, v23.4s
+ sub v22.4s, v22.4s, v16.4s
+ add v17.4s, v17.4s, v21.4s
+ add v18.4s, v18.4s, v22.4s
+.endm
+
+.macro qpel_filter_2_32b
+ movi v16.4s, #11
+ uxtl v19.8h, v5.8b
+ uxtl v20.8h, v2.8b
+ saddl v17.4s, v19.4h, v20.4h
+ saddl2 v18.4s, v19.8h, v20.8h
+
+ uxtl v21.8h, v1.8b
+ uxtl v22.8h, v6.8b
+ saddl v19.4s, v21.4h, v22.4h
+ saddl2 v20.4s, v21.8h, v22.8h
+
+ mul v19.4s, v19.4s, v16.4s
+ mul v20.4s, v20.4s, v16.4s
+
+ movi v16.4s, #40
+ mul v17.4s, v17.4s, v16.4s
+ mul v18.4s, v18.4s, v16.4s
+
+ uxtl v21.8h, v4.8b
+ uxtl v22.8h, v3.8b
+ saddl v23.4s, v21.4h, v22.4h
+ saddl2 v16.4s, v21.8h, v22.8h
+
+ uxtl v1.8h, v0.8b
+ uxtl v2.8h, v7.8b
+ saddl v21.4s, v1.4h, v2.4h
+ saddl2 v22.4s, v1.8h, v2.8h
+
+ shl v23.4s, v23.4s, #2
+ shl v16.4s, v16.4s, #2
+
+ add v19.4s, v19.4s, v21.4s
+ add v20.4s, v20.4s, v22.4s
+ add v17.4s, v17.4s, v23.4s
+ add v18.4s, v18.4s, v16.4s
+ sub v17.4s, v17.4s, v19.4s
+ sub v18.4s, v18.4s, v20.4s
+.endm
+
+.macro qpel_filter_3_32b
+ movi v16.8h, #17
+ movi v24.8h, #5
+
+ uxtl v19.8h, v5.8b
+ smull v17.4s, v19.4h, v16.4h
+ smull2 v18.4s, v19.8h, v16.8h
+
+ uxtl v21.8h, v1.8b
+ smull v19.4s, v21.4h, v24.4h
+ smull2 v20.4s, v21.8h, v24.8h
+
+ movi v16.8h, #58
+ uxtl v23.8h, v2.8b
+ smull v21.4s, v23.4h, v16.4h
+ smull2 v22.4s, v23.8h, v16.8h
+
+ movi v24.8h, #10
+ uxtl v1.8h, v6.8b
+ smull v23.4s, v1.4h, v24.4h
+ smull2 v16.4s, v1.8h, v24.8h
+
+ sub v17.4s, v17.4s, v19.4s
+ sub v18.4s, v18.4s, v20.4s
+
+ uxtl v1.8h, v3.8b
+ sshll v19.4s, v1.4h, #2
+ sshll2 v20.4s, v1.8h, #2
+
+ add v17.4s, v17.4s, v21.4s
+ add v18.4s, v18.4s, v22.4s
+
+ uxtl v1.8h, v4.8b
+ uxtl v2.8h, v7.8b
+ ssubl v21.4s, v1.4h, v2.4h
+ ssubl2 v22.4s, v1.8h, v2.8h
+
+ add v17.4s, v17.4s, v19.4s
+ add v18.4s, v18.4s, v20.4s
+ sub v21.4s, v21.4s, v23.4s
+ sub v22.4s, v22.4s, v16.4s
+ add v17.4s, v17.4s, v21.4s
+ add v18.4s, v18.4s, v22.4s
+.endm
+
+
+
+
+.macro vextin8
+ ld1 {v3.16b}, [x11], #16
+ mov v7.d[0], v3.d[1]
+ ext v0.8b, v3.8b, v7.8b, #1
+ ext v4.8b, v3.8b, v7.8b, #2
+ ext v1.8b, v3.8b, v7.8b, #3
+ ext v5.8b, v3.8b, v7.8b, #4
+ ext v2.8b, v3.8b, v7.8b, #5
+ ext v6.8b, v3.8b, v7.8b, #6
+ ext v3.8b, v3.8b, v7.8b, #7
+.endm
+
+
+
+// void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t*
dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+.macro HPS_FILTER a b filterhps
+ mov w12, #8192
+ mov w6, w10
+ sub x3, x3, #\a
+ lsl x3, x3, #1
+ mov w9, #\a
+ cmp w9, #4
+ b.eq 14f
+ cmp w9, #12
+ b.eq 15f
+ b 7f
+14:
+ HPS_FILTER_4 \a \b \filterhps
+ b 10f
+15:
+ HPS_FILTER_12 \a \b \filterhps
+ b 10f
+7:
+ cmp w5, #0
+ b.eq 8f
+ cmp w5, #1
+ b.eq 9f
+8:
+loop1_hps_\filterhps\()_\a\()x\b\()_rowext0:
+ mov w7, #\a
+ lsr w7, w7, #3
+ mov x11, x0
+ sub x11, x11, #4
+loop2_hps_\filterhps\()_\a\()x\b\()_rowext0:
+ vextin8
+ \filterhps
+ dup v16.4s, w12
+ sub v17.4s, v17.4s, v16.4s
+ sub v18.4s, v18.4s, v16.4s
+ xtn v0.4h, v17.4s
+ xtn2 v0.8h, v18.4s
+ st1 {v0.8h}, [x2], #16
+ subs w7, w7, #1
+ sub x11, x11, #8
+ b.ne loop2_hps_\filterhps\()_\a\()x\b\()_rowext0
+ subs w6, w6, #1
+ add x0, x0, x1
+ add x2, x2, x3
+ b.ne loop1_hps_\filterhps\()_\a\()x\b\()_rowext0
+ b 10f
+9:
+loop3_hps_\filterhps\()_\a\()x\b\()_rowext1:
+ mov w7, #\a
+ lsr w7, w7, #3
+ mov x11, x0
+ sub x11, x11, #4
+loop4_hps_\filterhps\()_\a\()x\b\()_rowext1:
+ vextin8
+ \filterhps
+ dup v16.4s, w12
+ sub v17.4s, v17.4s, v16.4s
+ sub v18.4s, v18.4s, v16.4s
+ xtn v0.4h, v17.4s
+ xtn2 v0.8h, v18.4s
+ st1 {v0.8h}, [x2], #16
+ subs w7, w7, #1
+ sub x11, x11, #8
+ b.ne loop4_hps_\filterhps\()_\a\()x\b\()_rowext1
+ subs w6, w6, #1
+ add x0, x0, x1
+ add x2, x2, x3
+ b.ne loop3_hps_\filterhps\()_\a\()x\b\()_rowext1
+10:
+.endm
+
+.macro HPS_FILTER_4 w h filterhps
+ cmp w5, #0
+ b.eq 11f
+ cmp w5, #1
+ b.eq 12f
+11:
+loop4_hps_\filterhps\()_\w\()x\h\()_rowext0:
+ mov x11, x0
+ sub x11, x11, #4
+ vextin8
+ \filterhps
+ dup v16.4s, w12
+ sub v17.4s, v17.4s, v16.4s
+ xtn v0.4h, v17.4s
+ st1 {v0.4h}, [x2], #8
+ sub x11, x11, #8
+ subs w6, w6, #1
+ add x0, x0, x1
+ add x2, x2, x3
+ b.ne loop4_hps_\filterhps\()_\w\()x\h\()_rowext0
+ b 13f
+12:
+loop5_hps_\filterhps\()_\w\()x\h\()_rowext1:
+ mov x11, x0
+ sub x11, x11, #4
+ vextin8
+ \filterhps
+ dup v16.4s, w12
+ sub v17.4s, v17.4s, v16.4s
+ xtn v0.4h, v17.4s
+ st1 {v0.4h}, [x2], #8
+ sub x11, x11, #8
+ subs w6, w6, #1
+ add x0, x0, x1
+ add x2, x2, x3
+ b.ne loop5_hps_\filterhps\()_\w\()x\h\()_rowext1
+13:
+.endm
+
+.macro HPS_FILTER_12 w h filterhps
+ cmp w5, #0
+ b.eq 14f
+ cmp w5, #1
+ b.eq 15f
+14:
+loop12_hps_\filterhps\()_\w\()x\h\()_rowext0:
+ mov x11, x0
+ sub x11, x11, #4
+ vextin8
+ \filterhps
+ dup v16.4s, w12
+ sub v17.4s, v17.4s, v16.4s
+ sub v18.4s, v18.4s, v16.4s
+ xtn v0.4h, v17.4s
+ xtn2 v0.8h, v18.4s
+ st1 {v0.8h}, [x2], #16
+ sub x11, x11, #8
+
+ vextin8
+ \filterhps
+ dup v16.4s, w12
+ sub v17.4s, v17.4s, v16.4s
+ xtn v0.4h, v17.4s
+ st1 {v0.4h}, [x2], #8
+ add x2, x2, x3
+ subs w6, w6, #1
+ add x0, x0, x1
+ b.ne loop12_hps_\filterhps\()_\w\()x\h\()_rowext0
+ b 16f
+15:
+loop12_hps_\filterhps\()_\w\()x\h\()_rowext1:
+ mov x11, x0
+ sub x11, x11, #4
+ vextin8
+ \filterhps
+ dup v16.4s, w12
+ sub v17.4s, v17.4s, v16.4s
+ sub v18.4s, v18.4s, v16.4s
+ xtn v0.4h, v17.4s
+ xtn2 v0.8h, v18.4s
+ st1 {v0.8h}, [x2], #16
+ sub x11, x11, #8
+
+ vextin8
+ \filterhps
+ dup v16.4s, w12
+ sub v17.4s, v17.4s, v16.4s
+ xtn v0.4h, v17.4s
+ st1 {v0.4h}, [x2], #8
+ add x2, x2, x3
+ subs w6, w6, #1
+ add x0, x0, x1
+ b.ne loop12_hps_\filterhps\()_\w\()x\h\()_rowext1
+16:
+.endm
+
+// void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t*
dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+.macro LUMA_HPS w h
+function x265_interp_8tap_horiz_ps_\w\()x\h\()_neon
+ mov w10, #\h
+ cmp w5, #0
+ b.eq 6f
+ sub x0, x0, x1, lsl #2
+
+ add x0, x0, x1
+ add w10, w10, #7
+6:
+ cmp w4, #0
+ b.eq 0f
+ cmp w4, #1
+ b.eq 1f
+ cmp w4, #2
+ b.eq 2f
+ cmp w4, #3
+ b.eq 3f
+0:
+ HPS_FILTER \w \h qpel_filter_0_32b
+ b 5f
+1:
+ HPS_FILTER \w \h qpel_filter_1_32b
+ b 5f
+2:
+ HPS_FILTER \w \h qpel_filter_2_32b
+ b 5f
+3:
+ HPS_FILTER \w \h qpel_filter_3_32b
+ b 5f
+5:
+ ret
+endfunc
+.endm
+
+LUMA_HPS 4 4
+LUMA_HPS 4 8
+LUMA_HPS 4 16
+LUMA_HPS 8 4
+LUMA_HPS 8 8
+LUMA_HPS 8 16
+LUMA_HPS 8 32
+LUMA_HPS 12 16
+LUMA_HPS 16 4
+LUMA_HPS 16 8
+LUMA_HPS 16 12
+LUMA_HPS 16 16
+LUMA_HPS 16 32
+LUMA_HPS 16 64
+LUMA_HPS 24 32
+LUMA_HPS 32 8
+LUMA_HPS 32 16
+LUMA_HPS 32 24
+LUMA_HPS 32 32
+LUMA_HPS 32 64
+LUMA_HPS 48 64
+LUMA_HPS 64 16
+LUMA_HPS 64 32
+LUMA_HPS 64 48
+LUMA_HPS 64 64
diff --git a/source/common/aarch64/ipfilter8.h
b/source/common/aarch64/ipfilter8.h
new file mode 100644
index 000000000..f9ed91e2e
--- /dev/null
+++ b/source/common/aarch64/ipfilter8.h
@@ -0,0 +1,55 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Yimeng Su <yimeng.su at huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111,
USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+
*****************************************************************************/
+
+#ifndef X265_IPFILTER8_AARCH64_H
+#define X265_IPFILTER8_AARCH64_H
+
+
+void x265_interp_8tap_horiz_ps_4x4_neon(const pixel* src, intptr_t
srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_4x8_neon(const pixel* src, intptr_t
srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_4x16_neon(const pixel* src, intptr_t
srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_8x4_neon(const pixel* src, intptr_t
srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_8x8_neon(const pixel* src, intptr_t
srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_8x16_neon(const pixel* src, intptr_t
srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_8x32_neon(const pixel* src, intptr_t
srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_12x16_neon(const pixel* src, intptr_t
srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_16x4_neon(const pixel* src, intptr_t
srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_16x8_neon(const pixel* src, intptr_t
srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_16x12_neon(const pixel* src, intptr_t
srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_16x16_neon(const pixel* src, intptr_t
srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_16x32_neon(const pixel* src, intptr_t
srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_16x64_neon(const pixel* src, intptr_t
srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_24x32_neon(const pixel* src, intptr_t
srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_32x8_neon(const pixel* src, intptr_t
srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_32x16_neon(const pixel* src, intptr_t
srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_32x24_neon(const pixel* src, intptr_t
srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_32x32_neon(const pixel* src, intptr_t
srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_32x64_neon(const pixel* src, intptr_t
srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_48x64_neon(const pixel* src, intptr_t
srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_64x16_neon(const pixel* src, intptr_t
srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_64x32_neon(const pixel* src, intptr_t
srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_64x48_neon(const pixel* src, intptr_t
srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_horiz_ps_64x64_neon(const pixel* src, intptr_t
srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+
+
+#endif // ifndef X265_IPFILTER8_AARCH64_H
diff --git a/source/common/aarch64/mc-a.S b/source/common/aarch64/mc-a.S
new file mode 100644
index 000000000..cbaf9b501
--- /dev/null
+++ b/source/common/aarch64/mc-a.S
@@ -0,0 +1,63 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Hongbin Liu <liuhongbin1 at huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111,
USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+
*****************************************************************************/
+
+#include "asm.S"
+
+.section .rodata
+
+.align 4
+
+.text
+
+.macro pixel_avg_pp_4xN_neon h
+function x265_pixel_avg_pp_4x\h\()_neon
+.rept \h
+ ld1 {v0.s}[0], [x2], x3
+ ld1 {v1.s}[0], [x4], x5
+ urhadd v2.8b, v0.8b, v1.8b
+ st1 {v2.s}[0], [x0], x1
+.endr
+ ret
+endfunc
+.endm
+
+pixel_avg_pp_4xN_neon 4
+pixel_avg_pp_4xN_neon 8
+pixel_avg_pp_4xN_neon 16
+
+.macro pixel_avg_pp_8xN_neon h
+function x265_pixel_avg_pp_8x\h\()_neon
+.rept \h
+ ld1 {v0.8b}, [x2], x3
+ ld1 {v1.8b}, [x4], x5
+ urhadd v2.8b, v0.8b, v1.8b
+ st1 {v2.8b}, [x0], x1
+.endr
+ ret
+endfunc
+.endm
+
+pixel_avg_pp_8xN_neon 4
+pixel_avg_pp_8xN_neon 8
+pixel_avg_pp_8xN_neon 16
+pixel_avg_pp_8xN_neon 32
diff --git a/source/common/aarch64/pixel-util.S
b/source/common/aarch64/pixel-util.S
new file mode 100644
index 000000000..a085ebdfa
--- /dev/null
+++ b/source/common/aarch64/pixel-util.S
@@ -0,0 +1,419 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Yimeng Su <yimeng.su at huawei.com>
+ * Hongbin Liu <liuhongbin1 at huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.section .rodata
+
+.align 4
+
+.text
+
+// Final stage shared by the 4x8 SATD kernels: on entry v4-v7 hold the
+// first (vertical) butterfly results; this macro performs the remaining
+// Hadamard stages, takes absolute values and leaves the scalar SATD
+// accumulated in s0.
+.macro x265_satd_4x8_8x4_end_neon
+// Second vertical butterfly stage.
+ add v0.8h, v4.8h, v6.8h
+ add v1.8h, v5.8h, v7.8h
+ sub v2.8h, v4.8h, v6.8h
+ sub v3.8h, v5.8h, v7.8h
+
+// Interleave 16-bit lanes, then butterfly: horizontal transform stage 1.
+ trn1 v16.8h, v0.8h, v1.8h
+ trn2 v17.8h, v0.8h, v1.8h
+ add v4.8h, v16.8h, v17.8h
+ trn1 v18.8h, v2.8h, v3.8h
+ trn2 v19.8h, v2.8h, v3.8h
+ sub v5.8h, v16.8h, v17.8h
+ add v6.8h, v18.8h, v19.8h
+ sub v7.8h, v18.8h, v19.8h
+// Interleave 32-bit pairs (stage 2). The umax of the two abs results
+// uses the identity max(|a+b|, |a-b|) = |a| + |b|, saving a pass.
+ trn1 v0.4s, v4.4s, v6.4s
+ trn2 v2.4s, v4.4s, v6.4s
+ abs v0.8h, v0.8h
+ trn1 v1.4s, v5.4s, v7.4s
+ trn2 v3.4s, v5.4s, v7.4s
+ abs v2.8h, v2.8h
+ abs v1.8h, v1.8h
+ abs v3.8h, v3.8h
+ umax v0.8h, v0.8h, v2.8h
+ umax v1.8h, v1.8h, v3.8h
+ add v0.8h, v0.8h, v1.8h
+ uaddlv s0, v0.8h // widen and sum all 8 lanes into s0
+.endm
+
+// Core 4x8 SATD kernel. Loads eight 4-pixel rows from pix1 (x0,
+// stride x1) and pix2 (x2, stride x3), packing two rows per 8-byte
+// vector (ld1r fills both halves, the following ld1 ...[1] overwrites
+// the upper half with the next row). Computes the row differences and
+// the first butterfly stage, then finishes with
+// x265_satd_4x8_8x4_end_neon; result ends up in s0.
+// Note: x0/x2 are advanced past the 8 rows, which the 4x16/4x32
+// wrappers rely on.
+.macro pixel_satd_4x8_neon
+ ld1r {v1.2s}, [x2], x3
+ ld1r {v0.2s}, [x0], x1
+ ld1r {v3.2s}, [x2], x3
+ ld1r {v2.2s}, [x0], x1
+ ld1r {v5.2s}, [x2], x3
+ ld1r {v4.2s}, [x0], x1
+ ld1r {v7.2s}, [x2], x3
+ ld1r {v6.2s}, [x0], x1
+
+// Second row of each pair goes into lane 1; usubl produces the
+// widened 16-bit differences for two rows at once.
+ ld1 {v1.s}[1], [x2], x3
+ ld1 {v0.s}[1], [x0], x1
+ usubl v0.8h, v0.8b, v1.8b
+ ld1 {v3.s}[1], [x2], x3
+ ld1 {v2.s}[1], [x0], x1
+ usubl v1.8h, v2.8b, v3.8b
+ ld1 {v5.s}[1], [x2], x3
+ ld1 {v4.s}[1], [x0], x1
+ usubl v2.8h, v4.8b, v5.8b
+ ld1 {v7.s}[1], [x2], x3
+ add v4.8h, v0.8h, v1.8h // first butterfly stage, interleaved
+ sub v5.8h, v0.8h, v1.8h // with the remaining loads
+ ld1 {v6.s}[1], [x0], x1
+ usubl v3.8h, v6.8b, v7.8b
+ add v6.8h, v2.8h, v3.8h
+ sub v7.8h, v2.8h, v3.8h
+ x265_satd_4x8_8x4_end_neon
+.endm
+
+// template<int w, int h>
+// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+// 4x8 SATD: single invocation of the core kernel.
+function x265_pixel_satd_4x8_neon
+ pixel_satd_4x8_neon
+ mov w0, v0.s[0] // return accumulated SATD
+ ret
+endfunc
+
+// template<int w, int h>
+// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+// 4x16 SATD: two stacked 4x8 blocks (the kernel leaves x0/x2 pointing
+// at the next 8 rows), partial sums accumulated in w4.
+function x265_pixel_satd_4x16_neon
+ eor w4, w4, w4 // running total = 0
+ pixel_satd_4x8_neon
+ mov w5, v0.s[0]
+ add w4, w4, w5
+ pixel_satd_4x8_neon
+ mov w5, v0.s[0]
+ add w0, w5, w4 // return sum of both halves
+ ret
+endfunc
+
+// template<int w, int h>
+// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+// 4x32 SATD: four stacked 4x8 blocks, accumulated in w4.
+function x265_pixel_satd_4x32_neon
+ eor w4, w4, w4 // running total = 0
+.rept 4
+ pixel_satd_4x8_neon
+ mov w5, v0.s[0]
+ add w4, w4, w5
+.endr
+ mov w0, w4
+ ret
+endfunc
+
+// template<int w, int h>
+// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+// 12x16 SATD: three 4-pixel-wide columns, each processed as two
+// stacked 4x8 blocks. x4/x5 keep the original base pointers so each
+// column restarts from the top at offset +4 / +8.
+function x265_pixel_satd_12x16_neon
+ mov x4, x0 // save pix1 base
+ mov x5, x2 // save pix2 base
+ eor w7, w7, w7 // running total = 0
+ pixel_satd_4x8_neon
+ mov w6, v0.s[0]
+ add w7, w7, w6
+ pixel_satd_4x8_neon
+ mov w6, v0.s[0]
+ add w7, w7, w6
+
+ add x0, x4, #4 // column 1 (pixels 4-7)
+ add x2, x5, #4
+ pixel_satd_4x8_neon
+ mov w6, v0.s[0]
+ add w7, w7, w6
+ pixel_satd_4x8_neon
+ mov w6, v0.s[0]
+ add w7, w7, w6
+
+ add x0, x4, #8 // column 2 (pixels 8-11)
+ add x2, x5, #8
+ pixel_satd_4x8_neon
+ mov w6, v0.s[0]
+ add w7, w7, w6
+ pixel_satd_4x8_neon
+ mov w6, v0.s[0]
+ add w0, w7, w6
+ ret
+endfunc
+
+// template<int w, int h>
+// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+// 12x32 SATD: same column layout as 12x16 but four 4x8 blocks per
+// column; totals accumulated in w7.
+function x265_pixel_satd_12x32_neon
+ mov x4, x0 // save pix1 base
+ mov x5, x2 // save pix2 base
+ eor w7, w7, w7 // running total = 0
+.rept 4
+ pixel_satd_4x8_neon
+ mov w6, v0.s[0]
+ add w7, w7, w6
+.endr
+
+ add x0, x4, #4 // column 1 (pixels 4-7)
+ add x2, x5, #4
+.rept 4
+ pixel_satd_4x8_neon
+ mov w6, v0.s[0]
+ add w7, w7, w6
+.endr
+
+ add x0, x4, #8 // column 2 (pixels 8-11)
+ add x2, x5, #8
+.rept 4
+ pixel_satd_4x8_neon
+ mov w6, v0.s[0]
+ add w7, w7, w6
+.endr
+
+ mov w0, w7
+ ret
+endfunc
+
+// template<int w, int h>
+// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+// 8x8 SATD: two side-by-side 4x8 columns; x6/x7 preserve the base
+// pointers so the second column restarts from the top at offset +4.
+function x265_pixel_satd_8x8_neon
+ eor w4, w4, w4 // running total = 0
+ mov x6, x0 // save pix1 base
+ mov x7, x2 // save pix2 base
+ pixel_satd_4x8_neon
+ mov w5, v0.s[0]
+ add w4, w4, w5
+ add x0, x6, #4 // right column (pixels 4-7)
+ add x2, x7, #4
+ pixel_satd_4x8_neon
+ mov w5, v0.s[0]
+ add w0, w4, w5
+ ret
+endfunc
+
+// int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)
+// Psycho-visual cost for a 4x4 block: computes a per-block "energy"
+// (4x4 Hadamard/SATD sum with the DC term removed via pixel-sum >> 2)
+// for source and recon independently, then returns |E(src) - E(rec)|.
+function x265_psyCost_4x4_neon
+// Load 4 rows of source into v4/v5 (two rows per d-register half)
+// and 4 rows of recon into v6/v7.
+ ld1r {v4.2s}, [x0], x1
+ ld1r {v5.2s}, [x0], x1
+ ld1 {v4.s}[1], [x0], x1
+ ld1 {v5.s}[1], [x0], x1
+
+ ld1r {v6.2s}, [x2], x3
+ ld1r {v7.2s}, [x2], x3
+ ld1 {v6.s}[1], [x2], x3
+ ld1 {v7.s}[1], [x2], x3
+
+// Vertical butterfly stage 1 for source (v2/v3) and recon (v18/v19).
+ uaddl v2.8h, v4.8b, v5.8b
+ usubl v3.8h, v4.8b, v5.8b
+ uaddl v18.8h, v6.8b, v7.8b
+ usubl v19.8h, v6.8b, v7.8b
+
+// Vertical butterfly stage 2 (combine the two row-pairs held in the
+// low/high d halves of each register).
+ mov v20.d[0], v2.d[1]
+ add v0.4h, v2.4h, v20.4h
+ sub v1.4h, v2.4h, v20.4h
+ mov v21.d[0], v3.d[1]
+ add v22.4h, v3.4h, v21.4h
+ sub v23.4h, v3.4h, v21.4h
+
+ mov v24.d[0], v18.d[1]
+ add v16.4h, v18.4h, v24.4h
+ sub v17.4h, v18.4h, v24.4h
+ mov v25.d[0], v19.d[1]
+ add v26.4h, v19.4h, v25.4h
+ sub v27.4h, v19.4h, v25.4h
+
+// Transpose 16-bit lanes: horizontal butterfly stage 1.
+ mov v0.d[1], v22.d[0]
+ mov v1.d[1], v23.d[0]
+ trn1 v22.8h, v0.8h, v1.8h
+ trn2 v23.8h, v0.8h, v1.8h
+ mov v16.d[1], v26.d[0]
+ mov v17.d[1], v27.d[0]
+ trn1 v26.8h, v16.8h, v17.8h
+ trn2 v27.8h, v16.8h, v17.8h
+
+ add v2.8h, v22.8h, v23.8h
+ sub v3.8h, v22.8h, v23.8h
+ add v18.8h, v26.8h, v27.8h
+ sub v19.8h, v26.8h, v27.8h
+
+// Pixel sums for the DC correction (source in v20, recon in v21).
+ uaddl v20.8h, v4.8b, v5.8b
+ uaddl v21.8h, v6.8b, v7.8b
+
+// Horizontal butterfly stage 2 and absolute values.
+ trn1 v0.4s, v2.4s, v3.4s
+ trn2 v1.4s, v2.4s, v3.4s
+ trn1 v16.4s, v18.4s, v19.4s
+ trn2 v17.4s, v18.4s, v19.4s
+ abs v0.8h, v0.8h
+ abs v16.8h, v16.8h
+ abs v1.8h, v1.8h
+ abs v17.8h, v17.8h
+
+// v20 = {sum(src), sum(rec)} packed into lanes 0/1.
+ uaddlv s20, v20.8h
+ uaddlv s21, v21.8h
+ mov v20.s[1], v21.s[0]
+
+// max(|a+b|, |a-b|) = |a| + |b| — same trick as the SATD kernels.
+ smax v0.8h, v0.8h, v1.8h
+ smax v16.8h, v16.8h, v17.8h
+
+// Reduce both blocks' SATDs to scalars: v0.s[0] = src, v4 -> rec.
+ trn1 v4.2d, v0.2d, v16.2d
+ trn2 v5.2d, v0.2d, v16.2d
+ add v0.8h, v4.8h, v5.8h
+ mov v4.d[0], v0.d[1]
+ uaddlv s0, v0.4h
+ uaddlv s4, v4.4h
+
+// energy = satd - (pixel sum >> 2); return |E(src) - E(rec)|.
+ ushr v20.2s, v20.2s, #2
+ mov v0.s[1], v4.s[0]
+ sub v0.2s, v0.2s, v20.2s
+ mov w0, v0.s[0]
+ mov w1, v0.s[1]
+ subs w0, w0, w1
+ cneg w0, w0, mi // absolute difference
+
+ ret
+endfunc
+
+// uint32_t quant_c(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)
+// Quantizes numCoeff coefficients, 4 per iteration (assumes numCoeff
+// is a multiple of 4 — the count is simply lsr'd by 2). For each coef:
+//   level  = (abs(coef) * quantCoeff + add) >> qBits
+//   deltaU = (abs(coef) * quantCoeff - (level << qBits)) scaled by
+//            2^(8 - qBits)  (sshl by the 8-qBits constant in v2)
+//   qCoef  = level restored to coef's sign, saturated to int16
+// Returns the number of non-zero quantized levels.
+function x265_quant_neon
+ mov w9, #1
+ lsl w9, w9, w4 // v0 = 1 << qBits (for the mls below)
+ dup v0.2s, w9
+ neg w9, w4
+ dup v1.4s, w9 // v1 = -qBits (sshl right shift)
+ add w9, w9, #8
+ dup v2.4s, w9 // v2 = 8 - qBits (deltaU scale)
+ dup v3.4s, w5 // v3 = rounding add
+
+ lsr w6, w6, #2 // iterations = numCoeff / 4
+ eor v4.16b, v4.16b, v4.16b // v4 = zero-count accumulator
+ eor w10, w10, w10 // w10 = processed-coeff counter
+ eor v17.16b, v17.16b, v17.16b // v17 = 0, for the cmeq
+
+.loop_quant:
+
+ ld1 {v18.4h}, [x0], #8
+ ld1 {v7.4s}, [x1], #16
+ sxtl v6.4s, v18.4h // widen coefs to 32 bits
+
+ cmlt v5.4s, v6.4s, #0 // v5 = sign mask (all-ones if negative)
+
+ abs v6.4s, v6.4s
+
+
+ mul v6.4s, v6.4s, v7.4s // abs(coef) * quantCoeff
+
+ add v7.4s, v6.4s, v3.4s
+ sshl v7.4s, v7.4s, v1.4s // v7 = level = (tmp + add) >> qBits
+
+ mls v6.4s, v7.4s, v0.s[0] // remainder = tmp - (level << qBits)
+ sshl v16.4s, v6.4s, v2.4s // scale remainder by 2^(8 - qBits)
+ st1 {v16.4s}, [x2], #16 // store deltaU
+
+ // numsig: each zero level contributes -1 to v4; w10 counts all 4.
+ cmeq v16.4s, v7.4s, v17.4s
+ add v4.4s, v4.4s, v16.4s
+ add w10, w10, #4
+
+ // level *= sign (two's-complement negate via xor/sub with mask)
+ eor v16.16b, v7.16b, v5.16b
+ sub v16.4s, v16.4s, v5.4s
+ sqxtn v5.4h, v16.4s // saturating narrow to int16
+ st1 {v5.4h}, [x3], #8 // store qCoef
+
+ subs w6, w6, #1
+ b.ne .loop_quant
+
+// return numCoeff - zeroCount = number of significant coefficients
+ addv s4, v4.4s
+ mov w9, v4.s[0]
+ add w0, w10, w9
+ ret
+endfunc
+
+// Core 4x4 SATD kernel: loads four 4-pixel rows from pix1 (x0, stride
+// x1) and pix2 (x2, stride x3), two rows per vector, and runs the full
+// 4x4 Hadamard transform on the differences. Leaves the sum widened in
+// d0 (lane 0); advances x0/x2 past the 4 rows, which the 8x4 wrapper
+// relies on.
+.macro satd_4x4_neon
+ ld1 {v1.s}[0], [x2], x3
+ ld1 {v0.s}[0], [x0], x1
+ ld1 {v3.s}[0], [x2], x3
+ ld1 {v2.s}[0], [x0], x1
+
+ ld1 {v1.s}[1], [x2], x3
+ ld1 {v0.s}[1], [x0], x1
+ ld1 {v3.s}[1], [x2], x3
+ ld1 {v2.s}[1], [x0], x1
+
+// Widened differences: v4 = rows 0/2, v5 = rows 1/3.
+ usubl v4.8h, v0.8b, v1.8b
+ usubl v5.8h, v2.8b, v3.8b
+
+// Vertical butterfly stage 1.
+ add v6.8h, v4.8h, v5.8h
+ sub v7.8h, v4.8h, v5.8h
+
+// Vertical butterfly stage 2 (combine low/high d halves).
+ mov v4.d[0], v6.d[1]
+ add v0.8h, v6.8h, v4.8h
+ sub v2.8h, v6.8h, v4.8h
+
+ mov v5.d[0], v7.d[1]
+ add v1.8h, v7.8h, v5.8h
+ sub v3.8h, v7.8h, v5.8h
+
+// Transpose 16-bit lanes: horizontal stage 1.
+ trn1 v4.4h, v0.4h, v1.4h
+ trn2 v5.4h, v0.4h, v1.4h
+
+ trn1 v6.4h, v2.4h, v3.4h
+ trn2 v7.4h, v2.4h, v3.4h
+
+ add v0.4h, v4.4h, v5.4h
+ sub v1.4h, v4.4h, v5.4h
+
+ add v2.4h, v6.4h, v7.4h
+ sub v3.4h, v6.4h, v7.4h
+
+// Transpose 32-bit pairs: horizontal stage 2.
+ trn1 v4.2s, v0.2s, v1.2s
+ trn2 v5.2s, v0.2s, v1.2s
+
+ trn1 v6.2s, v2.2s, v3.2s
+ trn2 v7.2s, v2.2s, v3.2s
+
+ abs v4.4h, v4.4h
+ abs v5.4h, v5.4h
+ abs v6.4h, v6.4h
+ abs v7.4h, v7.4h
+
+// max(|a+b|, |a-b|) = |a| + |b|, as in the 4x8 kernel.
+ smax v1.4h, v4.4h, v5.4h
+ smax v2.4h, v6.4h, v7.4h
+
+ add v0.4h, v1.4h, v2.4h
+ uaddlp v0.2s, v0.4h // pairwise widen: 4x16 -> 2x32
+ uaddlp v0.1d, v0.2s // -> 1x64 total in d0
+.endm
+
+// int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+// 4x4 SATD: single invocation of the core kernel.
+function x265_pixel_satd_4x4_neon
+ satd_4x4_neon
+ umov x0, v0.d[0] // return the 64-bit accumulated sum
+ ret
+endfunc
+
+// int satd_8x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
+// 8x4 SATD: two side-by-side 4x4 blocks; x4/x5 preserve the base
+// pointers so the right half restarts from the top at offset +4.
+function x265_pixel_satd_8x4_neon
+ mov x4, x0 // save pix1 base
+ mov x5, x2 // save pix2 base
+ satd_4x4_neon
+ add x0, x4, #4 // right column (pixels 4-7)
+ add x2, x5, #4
+ umov x6, v0.d[0] // stash left-half sum before the kernel reruns
+ satd_4x4_neon
+ umov x0, v0.d[0]
+ add x0, x0, x6 // total = left + right
+ ret
+endfunc
diff --git a/source/common/aarch64/pixel-util.h
b/source/common/aarch64/pixel-util.h
new file mode 100644
index 000000000..043488468
--- /dev/null
+++ b/source/common/aarch64/pixel-util.h
@@ -0,0 +1,40 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Yimeng Su <yimeng.su at huawei.com>
+ * Hongbin Liu <liuhongbin1 at huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_PIXEL_UTIL_AARCH64_H
+#define X265_PIXEL_UTIL_AARCH64_H
+
+// SATD (sum of absolute transformed differences) over an NxM block of
+// pixel differences between pix1 and pix2.
+int x265_pixel_satd_4x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_4x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_4x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_4x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_8x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_12x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int x265_pixel_satd_12x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+
+// Quantizes numCoeff coefficients; returns the count of non-zero levels.
+uint32_t x265_quant_neon(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
+// NOTE(review): this declaration uses the PFX() wrapper while the rest
+// of the header spells out the x265_ prefix; consider making it uniform.
+int PFX(psyCost_4x4_neon)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
+
+#endif // ifndef X265_PIXEL_UTIL_AARCH64_H
diff --git a/source/common/aarch64/pixel.h b/source/common/aarch64/pixel.h
new file mode 100644
index 000000000..179c2f4ec
--- /dev/null
+++ b/source/common/aarch64/pixel.h
@@ -0,0 +1,105 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Hongbin Liu <liuhongbin1 at huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_I386_PIXEL_AARCH64_H
+#define X265_I386_PIXEL_AARCH64_H
+
+// NOTE(review): guard name says I386 but this is the aarch64 header —
+// likely a leftover from the x86 file this was modeled on.
+// pixel_avg_pp: dst = rounded average of src0 and src1. The trailing
+// int parameter is unused by the NEON implementations.
+void x265_pixel_avg_pp_4x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_4x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_4x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_8x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_8x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_8x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_8x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_12x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_16x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_16x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_16x12_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_16x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_16x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_16x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_24x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_32x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_32x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_32x24_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_32x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_32x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_48x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_64x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_64x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_64x48_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+void x265_pixel_avg_pp_64x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
+
+// sad_x3: SAD of fenc (fixed FENC_STRIDE) against three reference
+// blocks sharing one stride; the three sums are written to res[0..2].
+void x265_sad_x3_4x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_4x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_4x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_8x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_8x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_8x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_8x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_12x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_16x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_16x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_16x12_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_16x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_16x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_16x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_24x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_32x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_32x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_32x24_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_32x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_32x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_48x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_64x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_64x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_64x48_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+void x265_sad_x3_64x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
+
+// sad_x4: SAD of fenc (fixed FENC_STRIDE) against four reference
+// blocks sharing one stride; the four sums are written to res[0..3].
+void x265_sad_x4_4x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_4x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_4x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_8x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_8x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_8x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_8x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_12x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_16x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_16x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_16x12_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_16x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_16x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_16x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_24x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_32x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_32x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_32x24_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_32x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_32x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_48x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_64x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_64x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_64x48_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+void x265_sad_x4_64x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+
+#endif // ifndef X265_I386_PIXEL_AARCH64_H
diff --git a/source/common/aarch64/sad-a.S b/source/common/aarch64/sad-a.S
new file mode 100644
index 000000000..c27cce5ce
--- /dev/null
+++ b/source/common/aarch64/sad-a.S
@@ -0,0 +1,105 @@
+/*****************************************************************************
+ * Copyright (C) 2020 MulticoreWare, Inc
+ *
+ * Authors: Hongbin Liu <liuhongbin1 at huawei.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.section .rodata
+
+.align 4
+
+.text
+
+// First row of an 8-wide multi-SAD: initialize the per-reference
+// accumulators v16-v19 with widening absolute differences (uabdl).
+// x0 = fenc (advanced by x9 = FENC_STRIDE); references advance by the
+// shared stride (x4 for the x3 variant, x5 for x4).
+.macro SAD_X_START_8 x
+ ld1 {v0.8b}, [x0], x9
+.if \x == 3
+ ld1 {v1.8b}, [x1], x4
+ ld1 {v2.8b}, [x2], x4
+ ld1 {v3.8b}, [x3], x4
+.elseif \x == 4
+ ld1 {v1.8b}, [x1], x5
+ ld1 {v2.8b}, [x2], x5
+ ld1 {v3.8b}, [x3], x5
+ ld1 {v4.8b}, [x4], x5
+.endif
+ uabdl v16.8h, v0.8b, v1.8b
+ uabdl v17.8h, v0.8b, v2.8b
+ uabdl v18.8h, v0.8b, v3.8b
+.if \x == 4
+ uabdl v19.8h, v0.8b, v4.8b
+.endif
+.endm
+
+// Subsequent rows of an 8-wide multi-SAD: identical to SAD_X_START_8
+// but accumulates into v16-v19 (uabal = abs-diff and accumulate).
+.macro SAD_X_8 x
+ ld1 {v0.8b}, [x0], x9
+.if \x == 3
+ ld1 {v1.8b}, [x1], x4
+ ld1 {v2.8b}, [x2], x4
+ ld1 {v3.8b}, [x3], x4
+.elseif \x == 4
+ ld1 {v1.8b}, [x1], x5
+ ld1 {v2.8b}, [x2], x5
+ ld1 {v3.8b}, [x3], x5
+ ld1 {v4.8b}, [x4], x5
+.endif
+ uabal v16.8h, v0.8b, v1.8b
+ uabal v17.8h, v0.8b, v2.8b
+ uabal v18.8h, v0.8b, v3.8b
+.if \x == 4
+ uabal v19.8h, v0.8b, v4.8b
+.endif
+.endm
+
+// Instantiates x265_sad_x3_8xH / x265_sad_x4_8xH.
+// sad_x3: x0 = fenc, x1-x3 = fref0..2, x4 = frefstride, x5 = res
+// sad_x4: x0 = fenc, x1-x4 = fref0..3, x5 = frefstride, x6 = res
+// fenc rows use the fixed FENC_STRIDE (presumably defined in asm.S —
+// confirm). The 16-bit accumulators are safe for 8-wide blocks up to
+// h = 32: max sum is 8 * 32 * 255 = 65280 < 65536.
+.macro SAD_X_8xN x, h
+function x265_sad_x\x\()_8x\h\()_neon
+ mov x9, #FENC_STRIDE
+ SAD_X_START_8 \x
+.rept \h - 1
+ SAD_X_8 \x
+.endr
+ uaddlv s0, v16.8h // reduce each 8x16-bit accumulator to 32 bits
+ uaddlv s1, v17.8h
+ uaddlv s2, v18.8h
+.if \x == 4
+ uaddlv s3, v19.8h
+.endif
+
+.if \x == 3
+ stp s0, s1, [x5] // res[0], res[1]
+ str s2, [x5, #8] // res[2]
+.elseif \x == 4
+ stp s0, s1, [x6] // res[0], res[1]
+ stp s2, s3, [x6, #8] // res[2], res[3]
+.endif
+ ret
+endfunc
+.endm
+
+SAD_X_8xN 3 4
+SAD_X_8xN 3 8
+SAD_X_8xN 3 16
+SAD_X_8xN 3 32
+
+SAD_X_8xN 4 4
+SAD_X_8xN 4 8
+SAD_X_8xN 4 16
+SAD_X_8xN 4 32
--
2.21.0.windows.1
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20200227/14e2b34d/attachment-0001.html>
More information about the x265-devel
mailing list