<div dir="ltr"><div class="gmail_quote">From: wangxiyuan <<a href="mailto:wangxiyuan@huawei.com" target="_blank">wangxiyuan@huawei.com</a>><br>
<br>
This patch adds some common assembly-optimized functions for the aarch64<br>
platform. These functions won't work until Part 2 of the patch is merged.<br>
---<br>
source/common/aarch64/asm-primitives.cpp | 219 ++++++++++++<br>
source/common/aarch64/asm.S | 69 ++++<br>
source/common/aarch64/ipfilter8.S | 414 ++++++++++++++++++++++<br>
source/common/aarch64/ipfilter8.h | 55 +++<br>
source/common/aarch64/mc-a.S | 63 ++++<br>
source/common/aarch64/pixel-util.S | 419 +++++++++++++++++++++++<br>
source/common/aarch64/pixel-util.h | 40 +++<br>
source/common/aarch64/pixel.h | 105 ++++++<br>
source/common/aarch64/sad-a.S | 105 ++++++<br>
9 files changed, 1489 insertions(+)<br>
create mode 100644 source/common/aarch64/asm-primitives.cpp<br>
create mode 100644 source/common/aarch64/asm.S<br>
create mode 100644 source/common/aarch64/ipfilter8.S<br>
create mode 100644 source/common/aarch64/ipfilter8.h<br>
create mode 100644 source/common/aarch64/mc-a.S<br>
create mode 100644 source/common/aarch64/pixel-util.S<br>
create mode 100644 source/common/aarch64/pixel-util.h<br>
create mode 100644 source/common/aarch64/pixel.h<br>
create mode 100644 source/common/aarch64/sad-a.S<br>
<br>
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp<br>
new file mode 100644<br>
index 000000000..6fe8c968c<br>
--- /dev/null<br>
+++ b/source/common/aarch64/asm-primitives.cpp<br>
@@ -0,0 +1,219 @@<br>
+/*****************************************************************************<br>
+ * Copyright (C) 2020 MulticoreWare, Inc<br>
+ *<br>
+ * Authors: Hongbin Liu <<a href="mailto:liuhongbin1@huawei.com" target="_blank">liuhongbin1@huawei.com</a>><br>
+ * Yimeng Su <<a href="mailto:yimeng.su@huawei.com" target="_blank">yimeng.su@huawei.com</a>><br>
+ *<br>
+ * This program is free software; you can redistribute it and/or modify<br>
+ * it under the terms of the GNU General Public License as published by<br>
+ * the Free Software Foundation; either version 2 of the License, or<br>
+ * (at your option) any later version.<br>
+ *<br>
+ * This program is distributed in the hope that it will be useful,<br>
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of<br>
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the<br>
+ * GNU General Public License for more details.<br>
+ *<br>
+ * You should have received a copy of the GNU General Public License<br>
+ * along with this program; if not, write to the Free Software<br>
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.<br>
+ *<br>
+ * This program is also available under a commercial proprietary license.<br>
+ * For more information, contact us at license @ <a href="http://x265.com" rel="noreferrer" target="_blank">x265.com</a>.<br>
+ *****************************************************************************/<br>
+<br>
+#include "common.h"<br>
+#include "primitives.h"<br>
+#include "x265.h"<br>
+#include "cpu.h"<br>
+<br>
+<br>
+#if defined(__GNUC__)<br>
+#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)<br>
+#endif<br>
+<br>
+#define GCC_4_9_0 40900<br>
+#define GCC_5_1_0 50100<br>
+<br>
+extern "C" {<br>
+#include "pixel.h"<br>
+#include "pixel-util.h"<br>
+#include "ipfilter8.h"<br>
+}<br>
+<br>
+namespace X265_NS {<br>
+// private x265 namespace<br>
+<br>
+<br>
+template&lt;int size&gt;<br>
+void interp_8tap_hv_pp_cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY)<br>
+{<br>
+ ALIGN_VAR_32(int16_t, immed[MAX_CU_SIZE * (MAX_CU_SIZE + NTAPS_LUMA - 1)]);<br>
+ const int halfFilterSize = NTAPS_LUMA >> 1;<br>
+ const int immedStride = MAX_CU_SIZE;<br>
+<br>
+ primitives.pu[size].luma_hps(src, srcStride, immed, immedStride, idxX, 1);<br>
+ primitives.pu[size].luma_vsp(immed + (halfFilterSize - 1) * immedStride, immedStride, dst, dstStride, idxY);<br>
+}<br>
+<br>
+<br>
+/* Temporary workaround: the luma_vsp assembly primitive has not been completed,<br>
+ * but interp_8tap_hv_pp_cpu mixes C primitives and assembly primitives.<br>
+ * Without this aliasing, a segmentation fault occurs. */<br>
+void setupAliasCPrimitives(EncoderPrimitives &cp, EncoderPrimitives &asmp, int cpuMask)<br>
+{<br>
+ if (cpuMask & X265_CPU_NEON)<br>
+ {<br>
+ asmp.pu[LUMA_8x4].luma_vsp = cp.pu[LUMA_8x4].luma_vsp;<br>
+ asmp.pu[LUMA_8x8].luma_vsp = cp.pu[LUMA_8x8].luma_vsp;<br>
+ asmp.pu[LUMA_8x16].luma_vsp = cp.pu[LUMA_8x16].luma_vsp;<br>
+ asmp.pu[LUMA_8x32].luma_vsp = cp.pu[LUMA_8x32].luma_vsp;<br>
+ asmp.pu[LUMA_12x16].luma_vsp = cp.pu[LUMA_12x16].luma_vsp;<br>
+#if !AUTO_VECTORIZE || GCC_VERSION < GCC_5_1_0 /* gcc_version < gcc-5.1.0 */<br>
+ asmp.pu[LUMA_16x4].luma_vsp = cp.pu[LUMA_16x4].luma_vsp;<br>
+ asmp.pu[LUMA_16x8].luma_vsp = cp.pu[LUMA_16x8].luma_vsp;<br>
+ asmp.pu[LUMA_16x12].luma_vsp = cp.pu[LUMA_16x12].luma_vsp;<br>
+ asmp.pu[LUMA_16x16].luma_vsp = cp.pu[LUMA_16x16].luma_vsp;<br>
+ asmp.pu[LUMA_16x32].luma_vsp = cp.pu[LUMA_16x32].luma_vsp;<br>
+ asmp.pu[LUMA_16x64].luma_vsp = cp.pu[LUMA_16x64].luma_vsp;<br>
+ asmp.pu[LUMA_32x16].luma_vsp = cp.pu[LUMA_32x16].luma_vsp;<br>
+ asmp.pu[LUMA_32x24].luma_vsp = cp.pu[LUMA_32x24].luma_vsp;<br>
+ asmp.pu[LUMA_32x32].luma_vsp = cp.pu[LUMA_32x32].luma_vsp;<br>
+ asmp.pu[LUMA_32x64].luma_vsp = cp.pu[LUMA_32x64].luma_vsp;<br>
+ asmp.pu[LUMA_48x64].luma_vsp = cp.pu[LUMA_48x64].luma_vsp;<br>
+ asmp.pu[LUMA_64x16].luma_vsp = cp.pu[LUMA_64x16].luma_vsp;<br>
+ asmp.pu[LUMA_64x32].luma_vsp = cp.pu[LUMA_64x32].luma_vsp;<br>
+ asmp.pu[LUMA_64x48].luma_vsp = cp.pu[LUMA_64x48].luma_vsp;<br>
+ asmp.pu[LUMA_64x64].luma_vsp = cp.pu[LUMA_64x64].luma_vsp; <br>
+#if !AUTO_VECTORIZE || GCC_VERSION < GCC_4_9_0 /* gcc_version < gcc-4.9.0 */<br>
+ asmp.pu[LUMA_4x4].luma_vsp = cp.pu[LUMA_4x4].luma_vsp;<br>
+ asmp.pu[LUMA_4x8].luma_vsp = cp.pu[LUMA_4x8].luma_vsp;<br>
+ asmp.pu[LUMA_4x16].luma_vsp = cp.pu[LUMA_4x16].luma_vsp;<br>
+ asmp.pu[LUMA_24x32].luma_vsp = cp.pu[LUMA_24x32].luma_vsp;<br>
+ asmp.pu[LUMA_32x8].luma_vsp = cp.pu[LUMA_32x8].luma_vsp;<br>
+#endif<br>
+#endif<br>
+ }<br>
+}<br>
+<br>
+<br>
+void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) <br>
+{<br>
+ if (cpuMask & X265_CPU_NEON)<br>
+ {<br>
+ p.pu[LUMA_4x4].satd = PFX(pixel_satd_4x4_neon);<br>
+ p.pu[LUMA_4x8].satd = PFX(pixel_satd_4x8_neon);<br>
+ p.pu[LUMA_4x16].satd = PFX(pixel_satd_4x16_neon);<br>
+ p.pu[LUMA_8x4].satd = PFX(pixel_satd_8x4_neon);<br>
+ p.pu[LUMA_8x8].satd = PFX(pixel_satd_8x8_neon);<br>
+ p.pu[LUMA_12x16].satd = PFX(pixel_satd_12x16_neon);<br>
+ <br>
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd = PFX(pixel_satd_4x4_neon);<br>
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd = PFX(pixel_satd_4x8_neon);<br>
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd = PFX(pixel_satd_4x16_neon);<br>
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd = PFX(pixel_satd_8x4_neon);<br>
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd = PFX(pixel_satd_8x8_neon);<br>
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd = PFX(pixel_satd_12x16_neon);<br>
+ <br>
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd = PFX(pixel_satd_4x4_neon);<br>
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd = PFX(pixel_satd_4x8_neon);<br>
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd = PFX(pixel_satd_4x16_neon);<br>
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd = PFX(pixel_satd_4x32_neon);<br>
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd = PFX(pixel_satd_8x4_neon);<br>
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd = PFX(pixel_satd_8x8_neon);<br>
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd = PFX(pixel_satd_12x32_neon);<br>
+<br>
+ p.pu[LUMA_4x4].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_4x4_neon);<br>
+ p.pu[LUMA_4x8].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_4x8_neon);<br>
+ p.pu[LUMA_4x16].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_4x16_neon);<br>
+ p.pu[LUMA_8x4].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_8x4_neon);<br>
+ p.pu[LUMA_8x8].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_8x8_neon);<br>
+ p.pu[LUMA_8x16].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_8x16_neon);<br>
+ p.pu[LUMA_8x32].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_8x32_neon);<br>
+<br>
+ p.pu[LUMA_4x4].pixelavg_pp[ALIGNED] = PFX(pixel_avg_pp_4x4_neon);<br>
+ p.pu[LUMA_4x8].pixelavg_pp[ALIGNED] = PFX(pixel_avg_pp_4x8_neon);<br>
+ p.pu[LUMA_4x16].pixelavg_pp[ALIGNED] = PFX(pixel_avg_pp_4x16_neon);<br>
+ p.pu[LUMA_8x4].pixelavg_pp[ALIGNED] = PFX(pixel_avg_pp_8x4_neon);<br>
+ p.pu[LUMA_8x8].pixelavg_pp[ALIGNED] = PFX(pixel_avg_pp_8x8_neon);<br>
+ p.pu[LUMA_8x16].pixelavg_pp[ALIGNED] = PFX(pixel_avg_pp_8x16_neon);<br>
+ p.pu[LUMA_8x32].pixelavg_pp[ALIGNED] = PFX(pixel_avg_pp_8x32_neon);<br>
+<br>
+ p.pu[LUMA_8x4].sad_x3 = PFX(sad_x3_8x4_neon);<br>
+ p.pu[LUMA_8x8].sad_x3 = PFX(sad_x3_8x8_neon);<br>
+ p.pu[LUMA_8x16].sad_x3 = PFX(sad_x3_8x16_neon);<br>
+ p.pu[LUMA_8x32].sad_x3 = PFX(sad_x3_8x32_neon);<br>
+<br>
+ p.pu[LUMA_8x4].sad_x4 = PFX(sad_x4_8x4_neon);<br>
+ p.pu[LUMA_8x8].sad_x4 = PFX(sad_x4_8x8_neon);<br>
+ p.pu[LUMA_8x16].sad_x4 = PFX(sad_x4_8x16_neon);<br>
+ p.pu[LUMA_8x32].sad_x4 = PFX(sad_x4_8x32_neon);<br>
+<br>
+ // quant<br>
+ p.quant = PFX(quant_neon);<br>
+ // luma_hps<br>
+ p.pu[LUMA_4x4].luma_hps = PFX(interp_8tap_horiz_ps_4x4_neon);<br>
+ p.pu[LUMA_4x8].luma_hps = PFX(interp_8tap_horiz_ps_4x8_neon);<br>
+ p.pu[LUMA_4x16].luma_hps = PFX(interp_8tap_horiz_ps_4x16_neon);<br>
+ p.pu[LUMA_8x4].luma_hps = PFX(interp_8tap_horiz_ps_8x4_neon);<br>
+ p.pu[LUMA_8x8].luma_hps = PFX(interp_8tap_horiz_ps_8x8_neon);<br>
+ p.pu[LUMA_8x16].luma_hps = PFX(interp_8tap_horiz_ps_8x16_neon);<br>
+ p.pu[LUMA_8x32].luma_hps = PFX(interp_8tap_horiz_ps_8x32_neon);<br>
+ p.pu[LUMA_12x16].luma_hps = PFX(interp_8tap_horiz_ps_12x16_neon);<br>
+ p.pu[LUMA_24x32].luma_hps = PFX(interp_8tap_horiz_ps_24x32_neon);<br>
+#if !AUTO_VECTORIZE || GCC_VERSION < GCC_5_1_0 /* gcc_version < gcc-5.1.0 */<br>
+ p.pu[LUMA_16x4].luma_hps = PFX(interp_8tap_horiz_ps_16x4_neon);<br>
+ p.pu[LUMA_16x8].luma_hps = PFX(interp_8tap_horiz_ps_16x8_neon);<br>
+ p.pu[LUMA_16x12].luma_hps = PFX(interp_8tap_horiz_ps_16x12_neon);<br>
+ p.pu[LUMA_16x16].luma_hps = PFX(interp_8tap_horiz_ps_16x16_neon);<br>
+ p.pu[LUMA_16x32].luma_hps = PFX(interp_8tap_horiz_ps_16x32_neon);<br>
+ p.pu[LUMA_16x64].luma_hps = PFX(interp_8tap_horiz_ps_16x64_neon);<br>
+ p.pu[LUMA_32x8].luma_hps = PFX(interp_8tap_horiz_ps_32x8_neon);<br>
+ p.pu[LUMA_32x16].luma_hps = PFX(interp_8tap_horiz_ps_32x16_neon);<br>
+ p.pu[LUMA_32x24].luma_hps = PFX(interp_8tap_horiz_ps_32x24_neon);<br>
+ p.pu[LUMA_32x32].luma_hps = PFX(interp_8tap_horiz_ps_32x32_neon);<br>
+ p.pu[LUMA_32x64].luma_hps = PFX(interp_8tap_horiz_ps_32x64_neon);<br>
+ p.pu[LUMA_48x64].luma_hps = PFX(interp_8tap_horiz_ps_48x64_neon);<br>
+ p.pu[LUMA_64x16].luma_hps = PFX(interp_8tap_horiz_ps_64x16_neon);<br>
+ p.pu[LUMA_64x32].luma_hps = PFX(interp_8tap_horiz_ps_64x32_neon);<br>
+ p.pu[LUMA_64x48].luma_hps = PFX(interp_8tap_horiz_ps_64x48_neon);<br>
+ p.pu[LUMA_64x64].luma_hps = PFX(interp_8tap_horiz_ps_64x64_neon);<br>
+#endif<br>
+<br>
+ p.pu[LUMA_8x4].luma_hvpp = interp_8tap_hv_pp_cpu&lt;LUMA_8x4&gt;;<br>
+ p.pu[LUMA_8x8].luma_hvpp = interp_8tap_hv_pp_cpu&lt;LUMA_8x8&gt;;<br>
+ p.pu[LUMA_8x16].luma_hvpp = interp_8tap_hv_pp_cpu&lt;LUMA_8x16&gt;;<br>
+ p.pu[LUMA_8x32].luma_hvpp = interp_8tap_hv_pp_cpu&lt;LUMA_8x32&gt;;<br>
+ p.pu[LUMA_12x16].luma_hvpp = interp_8tap_hv_pp_cpu&lt;LUMA_12x16&gt;;<br>
+#if !AUTO_VECTORIZE || GCC_VERSION < GCC_5_1_0 /* gcc_version < gcc-5.1.0 */<br>
+ p.pu[LUMA_16x4].luma_hvpp = interp_8tap_hv_pp_cpu&lt;LUMA_16x4&gt;;<br>
+ p.pu[LUMA_16x8].luma_hvpp = interp_8tap_hv_pp_cpu&lt;LUMA_16x8&gt;;<br>
+ p.pu[LUMA_16x12].luma_hvpp = interp_8tap_hv_pp_cpu&lt;LUMA_16x12&gt;;<br>
+ p.pu[LUMA_16x16].luma_hvpp = interp_8tap_hv_pp_cpu&lt;LUMA_16x16&gt;;<br>
+ p.pu[LUMA_16x32].luma_hvpp = interp_8tap_hv_pp_cpu&lt;LUMA_16x32&gt;;<br>
+ p.pu[LUMA_16x64].luma_hvpp = interp_8tap_hv_pp_cpu&lt;LUMA_16x64&gt;;<br>
+ p.pu[LUMA_32x16].luma_hvpp = interp_8tap_hv_pp_cpu&lt;LUMA_32x16&gt;;<br>
+ p.pu[LUMA_32x24].luma_hvpp = interp_8tap_hv_pp_cpu&lt;LUMA_32x24&gt;;<br>
+ p.pu[LUMA_32x32].luma_hvpp = interp_8tap_hv_pp_cpu&lt;LUMA_32x32&gt;;<br>
+ p.pu[LUMA_32x64].luma_hvpp = interp_8tap_hv_pp_cpu&lt;LUMA_32x64&gt;;<br>
+ p.pu[LUMA_48x64].luma_hvpp = interp_8tap_hv_pp_cpu&lt;LUMA_48x64&gt;;<br>
+ p.pu[LUMA_64x16].luma_hvpp = interp_8tap_hv_pp_cpu&lt;LUMA_64x16&gt;;<br>
+ p.pu[LUMA_64x32].luma_hvpp = interp_8tap_hv_pp_cpu&lt;LUMA_64x32&gt;;<br>
+ p.pu[LUMA_64x48].luma_hvpp = interp_8tap_hv_pp_cpu&lt;LUMA_64x48&gt;;<br>
+ p.pu[LUMA_64x64].luma_hvpp = interp_8tap_hv_pp_cpu&lt;LUMA_64x64&gt;;<br>
+#if !AUTO_VECTORIZE || GCC_VERSION < GCC_4_9_0 /* gcc_version < gcc-4.9.0 */<br>
+ p.pu[LUMA_4x4].luma_hvpp = interp_8tap_hv_pp_cpu&lt;LUMA_4x4&gt;;<br>
+ p.pu[LUMA_4x8].luma_hvpp = interp_8tap_hv_pp_cpu&lt;LUMA_4x8&gt;;<br>
+ p.pu[LUMA_4x16].luma_hvpp = interp_8tap_hv_pp_cpu&lt;LUMA_4x16&gt;;<br>
+ p.pu[LUMA_24x32].luma_hvpp = interp_8tap_hv_pp_cpu&lt;LUMA_24x32&gt;;<br>
+ p.pu[LUMA_32x8].luma_hvpp = interp_8tap_hv_pp_cpu&lt;LUMA_32x8&gt;;<br>
+#endif<br>
+#endif<br>
+<br>
+#if !HIGH_BIT_DEPTH<br>
+ p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_4x4_neon);<br>
+#endif // !HIGH_BIT_DEPTH<br>
+<br>
+ }<br>
+}<br>
+} // namespace X265_NS<br>
diff --git a/source/common/aarch64/asm.S b/source/common/aarch64/asm.S<br>
new file mode 100644<br>
index 000000000..5f020a11a<br>
--- /dev/null<br>
+++ b/source/common/aarch64/asm.S<br>
@@ -0,0 +1,69 @@<br>
+/*****************************************************************************<br>
+ * Copyright (C) 2020 MulticoreWare, Inc<br>
+ *<br>
+ * Authors: Hongbin Liu <<a href="mailto:liuhongbin1@huawei.com" target="_blank">liuhongbin1@huawei.com</a>><br>
+ *<br>
+ * This program is free software; you can redistribute it and/or modify<br>
+ * it under the terms of the GNU General Public License as published by<br>
+ * the Free Software Foundation; either version 2 of the License, or<br>
+ * (at your option) any later version.<br>
+ *<br>
+ * This program is distributed in the hope that it will be useful,<br>
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of<br>
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the<br>
+ * GNU General Public License for more details.<br>
+ *<br>
+ * You should have received a copy of the GNU General Public License<br>
+ * along with this program; if not, write to the Free Software<br>
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.<br>
+ *<br>
+ * This program is also available under a commercial proprietary license.<br>
+ * For more information, contact us at license @ <a href="http://x265.com" rel="noreferrer" target="_blank">x265.com</a>.<br>
+ *****************************************************************************/<br>
+<br>
+.arch armv8-a<br>
+<br>
+#ifdef PREFIX<br>
+#define EXTERN_ASM _<br>
+#else<br>
+#define EXTERN_ASM<br>
+#endif<br>
+<br>
+#ifdef __ELF__<br>
+#define ELF<br>
+#else<br>
+#define ELF @<br>
+#endif<br>
+<br>
+#define HAVE_AS_FUNC 1<br>
+<br>
+#if HAVE_AS_FUNC<br>
+#define FUNC<br>
+#else<br>
+#define FUNC @<br>
+#endif<br>
+<br>
+.macro function name, export=1<br>
+ .macro endfunc<br>
+ELF .size \name, . - \name<br>
+FUNC .endfunc<br>
+ .purgem endfunc<br>
+ .endm<br>
+ .align 2<br>
+.if \export == 1<br>
+ .global EXTERN_ASM\name<br>
+ELF .hidden EXTERN_ASM\name<br>
+ELF .type EXTERN_ASM\name, %function<br>
+FUNC .func EXTERN_ASM\name<br>
+EXTERN_ASM\name:<br>
+.else<br>
+ELF .hidden \name<br>
+ELF .type \name, %function<br>
+FUNC .func \name<br>
+\name:<br>
+.endif<br>
+.endm<br>
+<br>
+<br>
+#define FENC_STRIDE 64<br>
+#define FDEC_STRIDE 32<br>
diff --git a/source/common/aarch64/ipfilter8.S b/source/common/aarch64/ipfilter8.S<br>
new file mode 100644<br>
index 000000000..908c7db46<br>
--- /dev/null<br>
+++ b/source/common/aarch64/ipfilter8.S<br>
@@ -0,0 +1,414 @@<br>
+/*****************************************************************************<br>
+ * Copyright (C) 2020 MulticoreWare, Inc<br>
+ *<br>
+ * Authors: Yimeng Su <<a href="mailto:yimeng.su@huawei.com" target="_blank">yimeng.su@huawei.com</a>><br>
+ *<br>
+ * This program is free software; you can redistribute it and/or modify<br>
+ * it under the terms of the GNU General Public License as published by<br>
+ * the Free Software Foundation; either version 2 of the License, or<br>
+ * (at your option) any later version.<br>
+ *<br>
+ * This program is distributed in the hope that it will be useful,<br>
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of<br>
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the<br>
+ * GNU General Public License for more details.<br>
+ *<br>
+ * You should have received a copy of the GNU General Public License<br>
+ * along with this program; if not, write to the Free Software<br>
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.<br>
+ *<br>
+ * This program is also available under a commercial proprietary license.<br>
+ * For more information, contact us at license @ <a href="http://x265.com" rel="noreferrer" target="_blank">x265.com</a>.<br>
+ *****************************************************************************/<br>
+<br>
+#include "asm.S"<br>
+<br>
+.section .rodata<br>
+<br>
+.align 4<br>
+<br>
+.text<br>
+<br>
+<br>
+<br>
+.macro qpel_filter_0_32b<br>
+ movi v24.8h, #64<br>
+ uxtl v19.8h, v5.8b<br>
+ smull v17.4s, v19.4h, v24.4h<br>
+ smull2 v18.4s, v19.8h, v24.8h<br>
+.endm<br>
+<br>
+.macro qpel_filter_1_32b<br>
+ movi v16.8h, #58<br>
+ uxtl v19.8h, v5.8b<br>
+ smull v17.4s, v19.4h, v16.4h<br>
+ smull2 v18.4s, v19.8h, v16.8h<br>
+<br>
+ movi v24.8h, #10<br>
+ uxtl v21.8h, v1.8b<br>
+ smull v19.4s, v21.4h, v24.4h<br>
+ smull2 v20.4s, v21.8h, v24.8h<br>
+<br>
+ movi v16.8h, #17<br>
+ uxtl v23.8h, v2.8b<br>
+ smull v21.4s, v23.4h, v16.4h<br>
+ smull2 v22.4s, v23.8h, v16.8h<br>
+<br>
+ movi v24.8h, #5<br>
+ uxtl v1.8h, v6.8b<br>
+ smull v23.4s, v1.4h, v24.4h<br>
+ smull2 v16.4s, v1.8h, v24.8h<br>
+<br>
+ sub v17.4s, v17.4s, v19.4s<br>
+ sub v18.4s, v18.4s, v20.4s<br>
+<br>
+ uxtl v1.8h, v4.8b<br>
+ sshll v19.4s, v1.4h, #2<br>
+ sshll2 v20.4s, v1.8h, #2<br>
+<br>
+ add v17.4s, v17.4s, v21.4s<br>
+ add v18.4s, v18.4s, v22.4s<br>
+<br>
+ uxtl v1.8h, v0.8b<br>
+ uxtl v2.8h, v3.8b<br>
+ ssubl v21.4s, v2.4h, v1.4h<br>
+ ssubl2 v22.4s, v2.8h, v1.8h<br>
+<br>
+ add v17.4s, v17.4s, v19.4s<br>
+ add v18.4s, v18.4s, v20.4s<br>
+ sub v21.4s, v21.4s, v23.4s<br>
+ sub v22.4s, v22.4s, v16.4s<br>
+ add v17.4s, v17.4s, v21.4s<br>
+ add v18.4s, v18.4s, v22.4s<br>
+.endm<br>
+<br>
+.macro qpel_filter_2_32b<br>
+ movi v16.4s, #11<br>
+ uxtl v19.8h, v5.8b<br>
+ uxtl v20.8h, v2.8b<br>
+ saddl v17.4s, v19.4h, v20.4h<br>
+ saddl2 v18.4s, v19.8h, v20.8h<br>
+<br>
+ uxtl v21.8h, v1.8b<br>
+ uxtl v22.8h, v6.8b<br>
+ saddl v19.4s, v21.4h, v22.4h<br>
+ saddl2 v20.4s, v21.8h, v22.8h<br>
+<br>
+ mul v19.4s, v19.4s, v16.4s<br>
+ mul v20.4s, v20.4s, v16.4s<br>
+<br>
+ movi v16.4s, #40<br>
+ mul v17.4s, v17.4s, v16.4s<br>
+ mul v18.4s, v18.4s, v16.4s<br>
+<br>
+ uxtl v21.8h, v4.8b<br>
+ uxtl v22.8h, v3.8b<br>
+ saddl v23.4s, v21.4h, v22.4h<br>
+ saddl2 v16.4s, v21.8h, v22.8h<br>
+<br>
+ uxtl v1.8h, v0.8b<br>
+ uxtl v2.8h, v7.8b<br>
+ saddl v21.4s, v1.4h, v2.4h<br>
+ saddl2 v22.4s, v1.8h, v2.8h<br>
+<br>
+ shl v23.4s, v23.4s, #2<br>
+ shl v16.4s, v16.4s, #2<br>
+<br>
+ add v19.4s, v19.4s, v21.4s<br>
+ add v20.4s, v20.4s, v22.4s<br>
+ add v17.4s, v17.4s, v23.4s<br>
+ add v18.4s, v18.4s, v16.4s<br>
+ sub v17.4s, v17.4s, v19.4s<br>
+ sub v18.4s, v18.4s, v20.4s<br>
+.endm<br>
+<br>
+.macro qpel_filter_3_32b<br>
+ movi v16.8h, #17<br>
+ movi v24.8h, #5<br>
+<br>
+ uxtl v19.8h, v5.8b<br>
+ smull v17.4s, v19.4h, v16.4h<br>
+ smull2 v18.4s, v19.8h, v16.8h<br>
+<br>
+ uxtl v21.8h, v1.8b<br>
+ smull v19.4s, v21.4h, v24.4h<br>
+ smull2 v20.4s, v21.8h, v24.8h<br>
+<br>
+ movi v16.8h, #58<br>
+ uxtl v23.8h, v2.8b<br>
+ smull v21.4s, v23.4h, v16.4h<br>
+ smull2 v22.4s, v23.8h, v16.8h<br>
+<br>
+ movi v24.8h, #10<br>
+ uxtl v1.8h, v6.8b<br>
+ smull v23.4s, v1.4h, v24.4h<br>
+ smull2 v16.4s, v1.8h, v24.8h<br>
+<br>
+ sub v17.4s, v17.4s, v19.4s<br>
+ sub v18.4s, v18.4s, v20.4s<br>
+<br>
+ uxtl v1.8h, v3.8b<br>
+ sshll v19.4s, v1.4h, #2<br>
+ sshll2 v20.4s, v1.8h, #2<br>
+<br>
+ add v17.4s, v17.4s, v21.4s<br>
+ add v18.4s, v18.4s, v22.4s<br>
+<br>
+ uxtl v1.8h, v4.8b<br>
+ uxtl v2.8h, v7.8b<br>
+ ssubl v21.4s, v1.4h, v2.4h<br>
+ ssubl2 v22.4s, v1.8h, v2.8h<br>
+<br>
+ add v17.4s, v17.4s, v19.4s<br>
+ add v18.4s, v18.4s, v20.4s<br>
+ sub v21.4s, v21.4s, v23.4s<br>
+ sub v22.4s, v22.4s, v16.4s<br>
+ add v17.4s, v17.4s, v21.4s<br>
+ add v18.4s, v18.4s, v22.4s<br>
+.endm<br>
+<br>
+<br>
+<br>
+<br>
+.macro vextin8<br>
+ ld1 {v3.16b}, [x11], #16<br>
+ mov v7.d[0], v3.d[1]<br>
+ ext v0.8b, v3.8b, v7.8b, #1<br>
+ ext v4.8b, v3.8b, v7.8b, #2<br>
+ ext v1.8b, v3.8b, v7.8b, #3<br>
+ ext v5.8b, v3.8b, v7.8b, #4<br>
+ ext v2.8b, v3.8b, v7.8b, #5<br>
+ ext v6.8b, v3.8b, v7.8b, #6<br>
+ ext v3.8b, v3.8b, v7.8b, #7<br>
+.endm<br>
+<br>
+<br>
+<br>
+// void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)<br>
+.macro HPS_FILTER a b filterhps<br>
+ mov w12, #8192<br>
+ mov w6, w10<br>
+ sub x3, x3, #\a<br>
+ lsl x3, x3, #1<br>
+ mov w9, #\a<br>
+ cmp w9, #4<br>
+ b.eq 14f<br>
+ cmp w9, #12<br>
+ b.eq 15f<br>
+ b 7f<br>
+14:<br>
+ HPS_FILTER_4 \a \b \filterhps<br>
+ b 10f<br>
+15:<br>
+ HPS_FILTER_12 \a \b \filterhps<br>
+ b 10f<br>
+7:<br>
+ cmp w5, #0<br>
+ b.eq 8f<br>
+ cmp w5, #1<br>
+ b.eq 9f<br>
+8:<br>
+loop1_hps_\filterhps\()_\a\()x\b\()_rowext0:<br>
+ mov w7, #\a<br>
+ lsr w7, w7, #3<br>
+ mov x11, x0<br>
+ sub x11, x11, #4<br>
+loop2_hps_\filterhps\()_\a\()x\b\()_rowext0:<br>
+ vextin8<br>
+ \filterhps<br>
+ dup v16.4s, w12<br>
+ sub v17.4s, v17.4s, v16.4s<br>
+ sub v18.4s, v18.4s, v16.4s<br>
+ xtn v0.4h, v17.4s<br>
+ xtn2 v0.8h, v18.4s<br>
+ st1 {v0.8h}, [x2], #16<br>
+ subs w7, w7, #1<br>
+ sub x11, x11, #8<br>
+ b.ne loop2_hps_\filterhps\()_\a\()x\b\()_rowext0<br>
+ subs w6, w6, #1<br>
+ add x0, x0, x1<br>
+ add x2, x2, x3<br>
+ b.ne loop1_hps_\filterhps\()_\a\()x\b\()_rowext0<br>
+ b 10f<br>
+9:<br>
+loop3_hps_\filterhps\()_\a\()x\b\()_rowext1:<br>
+ mov w7, #\a<br>
+ lsr w7, w7, #3<br>
+ mov x11, x0<br>
+ sub x11, x11, #4<br>
+loop4_hps_\filterhps\()_\a\()x\b\()_rowext1:<br>
+ vextin8<br>
+ \filterhps<br>
+ dup v16.4s, w12<br>
+ sub v17.4s, v17.4s, v16.4s<br>
+ sub v18.4s, v18.4s, v16.4s<br>
+ xtn v0.4h, v17.4s<br>
+ xtn2 v0.8h, v18.4s<br>
+ st1 {v0.8h}, [x2], #16<br>
+ subs w7, w7, #1<br>
+ sub x11, x11, #8<br>
+ b.ne loop4_hps_\filterhps\()_\a\()x\b\()_rowext1<br>
+ subs w6, w6, #1<br>
+ add x0, x0, x1<br>
+ add x2, x2, x3<br>
+ b.ne loop3_hps_\filterhps\()_\a\()x\b\()_rowext1<br>
+10:<br>
+.endm<br>
+<br>
+.macro HPS_FILTER_4 w h filterhps<br>
+ cmp w5, #0<br>
+ b.eq 11f<br>
+ cmp w5, #1<br>
+ b.eq 12f<br>
+11:<br>
+loop4_hps_\filterhps\()_\w\()x\h\()_rowext0:<br>
+ mov x11, x0<br>
+ sub x11, x11, #4<br>
+ vextin8<br>
+ \filterhps<br>
+ dup v16.4s, w12<br>
+ sub v17.4s, v17.4s, v16.4s<br>
+ xtn v0.4h, v17.4s<br>
+ st1 {v0.4h}, [x2], #8<br>
+ sub x11, x11, #8<br>
+ subs w6, w6, #1<br>
+ add x0, x0, x1<br>
+ add x2, x2, x3<br>
+ b.ne loop4_hps_\filterhps\()_\w\()x\h\()_rowext0<br>
+ b 13f<br>
+12:<br>
+loop5_hps_\filterhps\()_\w\()x\h\()_rowext1:<br>
+ mov x11, x0<br>
+ sub x11, x11, #4<br>
+ vextin8<br>
+ \filterhps<br>
+ dup v16.4s, w12<br>
+ sub v17.4s, v17.4s, v16.4s<br>
+ xtn v0.4h, v17.4s<br>
+ st1 {v0.4h}, [x2], #8<br>
+ sub x11, x11, #8<br>
+ subs w6, w6, #1<br>
+ add x0, x0, x1<br>
+ add x2, x2, x3<br>
+ b.ne loop5_hps_\filterhps\()_\w\()x\h\()_rowext1<br>
+13:<br>
+.endm<br>
+<br>
+.macro HPS_FILTER_12 w h filterhps<br>
+ cmp w5, #0<br>
+ b.eq 14f<br>
+ cmp w5, #1<br>
+ b.eq 15f<br>
+14:<br>
+loop12_hps_\filterhps\()_\w\()x\h\()_rowext0:<br>
+ mov x11, x0<br>
+ sub x11, x11, #4<br>
+ vextin8<br>
+ \filterhps<br>
+ dup v16.4s, w12<br>
+ sub v17.4s, v17.4s, v16.4s<br>
+ sub v18.4s, v18.4s, v16.4s<br>
+ xtn v0.4h, v17.4s<br>
+ xtn2 v0.8h, v18.4s<br>
+ st1 {v0.8h}, [x2], #16<br>
+ sub x11, x11, #8<br>
+<br>
+ vextin8<br>
+ \filterhps<br>
+ dup v16.4s, w12<br>
+ sub v17.4s, v17.4s, v16.4s<br>
+ xtn v0.4h, v17.4s<br>
+ st1 {v0.4h}, [x2], #8<br>
+ add x2, x2, x3<br>
+ subs w6, w6, #1<br>
+ add x0, x0, x1<br>
+ b.ne loop12_hps_\filterhps\()_\w\()x\h\()_rowext0<br>
+ b 16f<br>
+15:<br>
+loop12_hps_\filterhps\()_\w\()x\h\()_rowext1:<br>
+ mov x11, x0<br>
+ sub x11, x11, #4<br>
+ vextin8<br>
+ \filterhps<br>
+ dup v16.4s, w12<br>
+ sub v17.4s, v17.4s, v16.4s<br>
+ sub v18.4s, v18.4s, v16.4s<br>
+ xtn v0.4h, v17.4s<br>
+ xtn2 v0.8h, v18.4s<br>
+ st1 {v0.8h}, [x2], #16<br>
+ sub x11, x11, #8<br>
+<br>
+ vextin8<br>
+ \filterhps<br>
+ dup v16.4s, w12<br>
+ sub v17.4s, v17.4s, v16.4s<br>
+ xtn v0.4h, v17.4s<br>
+ st1 {v0.4h}, [x2], #8<br>
+ add x2, x2, x3<br>
+ subs w6, w6, #1<br>
+ add x0, x0, x1<br>
+ b.ne loop12_hps_\filterhps\()_\w\()x\h\()_rowext1<br>
+16:<br>
+.endm<br>
+<br>
+// void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)<br>
+.macro LUMA_HPS w h<br>
+function x265_interp_8tap_horiz_ps_\w\()x\h\()_neon<br>
+ mov w10, #\h<br>
+ cmp w5, #0<br>
+ b.eq 6f<br>
+ sub x0, x0, x1, lsl #2<br>
+<br>
+ add x0, x0, x1<br>
+ add w10, w10, #7<br>
+6:<br>
+ cmp w4, #0<br>
+ b.eq 0f<br>
+ cmp w4, #1<br>
+ b.eq 1f<br>
+ cmp w4, #2<br>
+ b.eq 2f<br>
+ cmp w4, #3<br>
+ b.eq 3f<br>
+0:<br>
+ HPS_FILTER \w \h qpel_filter_0_32b<br>
+ b 5f<br>
+1:<br>
+ HPS_FILTER \w \h qpel_filter_1_32b<br>
+ b 5f<br>
+2:<br>
+ HPS_FILTER \w \h qpel_filter_2_32b<br>
+ b 5f<br>
+3:<br>
+ HPS_FILTER \w \h qpel_filter_3_32b<br>
+ b 5f<br>
+5:<br>
+ ret<br>
+endfunc<br>
+.endm<br>
+<br>
+LUMA_HPS 4 4<br>
+LUMA_HPS 4 8<br>
+LUMA_HPS 4 16<br>
+LUMA_HPS 8 4<br>
+LUMA_HPS 8 8<br>
+LUMA_HPS 8 16<br>
+LUMA_HPS 8 32<br>
+LUMA_HPS 12 16<br>
+LUMA_HPS 16 4<br>
+LUMA_HPS 16 8<br>
+LUMA_HPS 16 12<br>
+LUMA_HPS 16 16<br>
+LUMA_HPS 16 32<br>
+LUMA_HPS 16 64<br>
+LUMA_HPS 24 32<br>
+LUMA_HPS 32 8<br>
+LUMA_HPS 32 16<br>
+LUMA_HPS 32 24<br>
+LUMA_HPS 32 32<br>
+LUMA_HPS 32 64<br>
+LUMA_HPS 48 64<br>
+LUMA_HPS 64 16<br>
+LUMA_HPS 64 32<br>
+LUMA_HPS 64 48<br>
+LUMA_HPS 64 64<br>
diff --git a/source/common/aarch64/ipfilter8.h b/source/common/aarch64/ipfilter8.h<br>
new file mode 100644<br>
index 000000000..f9ed91e2e<br>
--- /dev/null<br>
+++ b/source/common/aarch64/ipfilter8.h<br>
@@ -0,0 +1,55 @@<br>
+/*****************************************************************************<br>
+ * Copyright (C) 2020 MulticoreWare, Inc<br>
+ *<br>
+ * Authors: Yimeng Su <<a href="mailto:yimeng.su@huawei.com" target="_blank">yimeng.su@huawei.com</a>><br>
+ *<br>
+ * This program is free software; you can redistribute it and/or modify<br>
+ * it under the terms of the GNU General Public License as published by<br>
+ * the Free Software Foundation; either version 2 of the License, or<br>
+ * (at your option) any later version.<br>
+ *<br>
+ * This program is distributed in the hope that it will be useful,<br>
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of<br>
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the<br>
+ * GNU General Public License for more details.<br>
+ *<br>
+ * You should have received a copy of the GNU General Public License<br>
+ * along with this program; if not, write to the Free Software<br>
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.<br>
+ *<br>
+ * This program is also available under a commercial proprietary license.<br>
+ * For more information, contact us at license @ <a href="http://x265.com" rel="noreferrer" target="_blank">x265.com</a>.<br>
+ *****************************************************************************/<br>
+<br>
+#ifndef X265_IPFILTER8_AARCH64_H<br>
+#define X265_IPFILTER8_AARCH64_H<br>
+<br>
+<br>
+void x265_interp_8tap_horiz_ps_4x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_4x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_4x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_8x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_8x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_8x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_8x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_12x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_16x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_16x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_16x12_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_16x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_16x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_16x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_24x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_32x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_32x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_32x24_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_32x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_32x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_48x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_64x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_64x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_64x48_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_64x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+<br>
+<br>
+#endif // ifndef X265_IPFILTER8_AARCH64_H<br>
diff --git a/source/common/aarch64/mc-a.S b/source/common/aarch64/mc-a.S<br>
new file mode 100644<br>
index 000000000..cbaf9b501<br>
--- /dev/null<br>
+++ b/source/common/aarch64/mc-a.S<br>
@@ -0,0 +1,63 @@<br>
+/*****************************************************************************<br>
+ * Copyright (C) 2020 MulticoreWare, Inc<br>
+ *<br>
+ * Authors: Hongbin Liu <<a href="mailto:liuhongbin1@huawei.com" target="_blank">liuhongbin1@huawei.com</a>><br>
+ *<br>
+ * This program is free software; you can redistribute it and/or modify<br>
+ * it under the terms of the GNU General Public License as published by<br>
+ * the Free Software Foundation; either version 2 of the License, or<br>
+ * (at your option) any later version.<br>
+ *<br>
+ * This program is distributed in the hope that it will be useful,<br>
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of<br>
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the<br>
+ * GNU General Public License for more details.<br>
+ *<br>
+ * You should have received a copy of the GNU General Public License<br>
+ * along with this program; if not, write to the Free Software<br>
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.<br>
+ *<br>
+ * This program is also available under a commercial proprietary license.<br>
+ * For more information, contact us at license @ <a href="http://x265.com" rel="noreferrer" target="_blank">x265.com</a>.<br>
+ *****************************************************************************/<br>
+<br>
+#include "asm.S"<br>
+<br>
+.section .rodata<br>
+<br>
+.align 4<br>
+<br>
+.text<br>
+<br>
+// Generates x265_pixel_avg_pp_4x<h>_neon: per-row rounding average of two<br>
+// 4-byte-wide sources. Each row is read from [x2] (step x3) and [x4] (step x5)<br>
+// and the averaged row is written to [x0] (step x1).<br>
+// h: number of rows; the row body is fully unrolled with .rept, so there is<br>
+// no loop counter and no branch.<br>
+.macro pixel_avg_pp_4xN_neon h<br>
+function x265_pixel_avg_pp_4x\h\()_neon<br>
+.rept \h<br>
+ // One row: 4 bytes from each source into lane 0 of v0 / v1.<br>
+ ld1 {v0.s}[0], [x2], x3<br>
+ ld1 {v1.s}[0], [x4], x5<br>
+ // urhadd = unsigned rounding halving add, i.e. (a + b + 1) >> 1 per byte.<br>
+ urhadd v2.8b, v0.8b, v1.8b<br>
+ // Only the low 4 bytes are stored; the upper lanes of v2 are ignored.<br>
+ st1 {v2.s}[0], [x0], x1<br>
+.endr<br>
+ ret<br>
+endfunc<br>
+.endm<br>
+<br>
+// Instantiate the 4-wide variants for heights 4, 8 and 16.<br>
+pixel_avg_pp_4xN_neon 4<br>
+pixel_avg_pp_4xN_neon 8<br>
+pixel_avg_pp_4xN_neon 16<br>
+<br>
+// Generates x265_pixel_avg_pp_8x<h>_neon: per-row rounding average of two<br>
+// 8-byte-wide sources. Each row is read from [x2] (step x3) and [x4] (step x5)<br>
+// and the averaged row is written to [x0] (step x1).<br>
+// h: number of rows; the row body is fully unrolled with .rept.<br>
+.macro pixel_avg_pp_8xN_neon h<br>
+function x265_pixel_avg_pp_8x\h\()_neon<br>
+.rept \h<br>
+ // One row: 8 bytes from each source.<br>
+ ld1 {v0.8b}, [x2], x3<br>
+ ld1 {v1.8b}, [x4], x5<br>
+ // urhadd = unsigned rounding halving add, i.e. (a + b + 1) >> 1 per byte.<br>
+ urhadd v2.8b, v0.8b, v1.8b<br>
+ st1 {v2.8b}, [x0], x1<br>
+.endr<br>
+ ret<br>
+endfunc<br>
+.endm<br>
+<br>
+// Instantiate the 8-wide variants for heights 4, 8, 16 and 32.<br>
+pixel_avg_pp_8xN_neon 4<br>
+pixel_avg_pp_8xN_neon 8<br>
+pixel_avg_pp_8xN_neon 16<br>
+pixel_avg_pp_8xN_neon 32<br>
diff --git a/source/common/aarch64/pixel-util.S b/source/common/aarch64/pixel-util.S<br>
new file mode 100644<br>
index 000000000..a085ebdfa<br>
--- /dev/null<br>
+++ b/source/common/aarch64/pixel-util.S<br>
@@ -0,0 +1,419 @@<br>
+/*****************************************************************************<br>
+ * Copyright (C) 2020 MulticoreWare, Inc<br>
+ *<br>
+ * Authors: Yimeng Su <<a href="mailto:yimeng.su@huawei.com" target="_blank">yimeng.su@huawei.com</a>><br>
+ * Hongbin Liu <<a href="mailto:liuhongbin1@huawei.com" target="_blank">liuhongbin1@huawei.com</a>><br>
+ *<br>
+ * This program is free software; you can redistribute it and/or modify<br>
+ * it under the terms of the GNU General Public License as published by<br>
+ * the Free Software Foundation; either version 2 of the License, or<br>
+ * (at your option) any later version.<br>
+ *<br>
+ * This program is distributed in the hope that it will be useful,<br>
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of<br>
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the<br>
+ * GNU General Public License for more details.<br>
+ *<br>
+ * You should have received a copy of the GNU General Public License<br>
+ * along with this program; if not, write to the Free Software<br>
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.<br>
+ *<br>
+ * This program is also available under a commercial proprietary license.<br>
+ * For more information, contact us at license @ <a href="http://x265.com" rel="noreferrer" target="_blank">x265.com</a>.<br>
+ *****************************************************************************/<br>
+<br>
+#include "asm.S"<br>
+<br>
+.section .rodata<br>
+<br>
+.align 4<br>
+<br>
+.text<br>
+<br>
+// Tail of the 4x8 SATD computation (the name suggests it is meant to be<br>
+// shared with an 8x4 variant as well). Expects the first-stage sums and<br>
+// differences in v4-v7 (8 x 16-bit lanes each), finishes the transform with<br>
+// add/sub and trn1/trn2 stages, and leaves the 32-bit total in s0.<br>
+// Clobbers v0-v7 and v16-v19.<br>
+.macro x265_satd_4x8_8x4_end_neon<br>
+ // Second add/sub stage across the four input vectors.<br>
+ add v0.8h, v4.8h, v6.8h<br>
+ add v1.8h, v5.8h, v7.8h<br>
+ sub v2.8h, v4.8h, v6.8h<br>
+ sub v3.8h, v5.8h, v7.8h<br>
+<br>
+ // Interleave adjacent 16-bit lanes so the next add/sub works within a row.<br>
+ trn1 v16.8h, v0.8h, v1.8h<br>
+ trn2 v17.8h, v0.8h, v1.8h<br>
+ add v4.8h, v16.8h, v17.8h<br>
+ trn1 v18.8h, v2.8h, v3.8h<br>
+ trn2 v19.8h, v2.8h, v3.8h<br>
+ sub v5.8h, v16.8h, v17.8h<br>
+ add v6.8h, v18.8h, v19.8h<br>
+ sub v7.8h, v18.8h, v19.8h<br>
+ // Pair up 32-bit lanes, then take absolute values and keep the larger of<br>
+ // each pair — presumably the usual max(|a|,|b|) shortcut that replaces the<br>
+ // final butterfly stage; confirm against the C reference.<br>
+ trn1 v0.4s, v4.4s, v6.4s<br>
+ trn2 v2.4s, v4.4s, v6.4s<br>
+ abs v0.8h, v0.8h<br>
+ trn1 v1.4s, v5.4s, v7.4s<br>
+ trn2 v3.4s, v5.4s, v7.4s<br>
+ abs v2.8h, v2.8h<br>
+ abs v1.8h, v1.8h<br>
+ abs v3.8h, v3.8h<br>
+ umax v0.8h, v0.8h, v2.8h<br>
+ umax v1.8h, v1.8h, v3.8h<br>
+ // Horizontal reduction: widen-and-sum all eight 16-bit lanes into s0.<br>
+ add v0.8h, v0.8h, v1.8h<br>
+ uaddlv s0, v0.8h<br>
+.endm<br>
+<br>
+// SATD of one 4x8 block. Reads 8 rows of 4 bytes from [x0] (step x1) and from<br>
+// [x2] (step x3); both pointers are left advanced by 8 rows, so consecutive<br>
+// expansions walk down a column. Rows r and r+4 share one 64-bit register<br>
+// (lane 0 / lane 1), so each usubl produces two rows of 16-bit differences.<br>
+// Ends by expanding x265_satd_4x8_8x4_end_neon; the result is in s0.<br>
+// Clobbers v0-v7 and v16-v19.<br>
+.macro pixel_satd_4x8_neon<br>
+ // Rows 0-3: ld1r replicates the 4 bytes into both 32-bit lanes, so lane 0<br>
+ // holds the row and the register needs no prior initialisation.<br>
+ ld1r {v1.2s}, [x2], x3<br>
+ ld1r {v0.2s}, [x0], x1<br>
+ ld1r {v3.2s}, [x2], x3<br>
+ ld1r {v2.2s}, [x0], x1<br>
+ ld1r {v5.2s}, [x2], x3<br>
+ ld1r {v4.2s}, [x0], x1<br>
+ ld1r {v7.2s}, [x2], x3<br>
+ ld1r {v6.2s}, [x0], x1<br>
+<br>
+ // Rows 4-7 go into lane 1; differences and the first add/sub stage are<br>
+ // interleaved with the remaining loads.<br>
+ ld1 {v1.s}[1], [x2], x3<br>
+ ld1 {v0.s}[1], [x0], x1<br>
+ usubl v0.8h, v0.8b, v1.8b<br>
+ ld1 {v3.s}[1], [x2], x3<br>
+ ld1 {v2.s}[1], [x0], x1<br>
+ usubl v1.8h, v2.8b, v3.8b<br>
+ ld1 {v5.s}[1], [x2], x3<br>
+ ld1 {v4.s}[1], [x0], x1<br>
+ usubl v2.8h, v4.8b, v5.8b<br>
+ ld1 {v7.s}[1], [x2], x3<br>
+ // v4/v5 were already consumed by the usubl above, so they can be reused<br>
+ // as outputs of the first add/sub stage.<br>
+ add v4.8h, v0.8h, v1.8h<br>
+ sub v5.8h, v0.8h, v1.8h<br>
+ ld1 {v6.s}[1], [x0], x1<br>
+ usubl v3.8h, v6.8b, v7.8b<br>
+ add v6.8h, v2.8h, v3.8h<br>
+ sub v7.8h, v2.8h, v3.8h<br>
+ x265_satd_4x8_8x4_end_neon<br>
+.endm<br>
+<br>
+// template<int w, int h><br>
+// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)<br>
+// 4x8 case: a single pixel_satd_4x8_neon expansion; its 32-bit sum (lane 0 of<br>
+// v0) is returned in w0.<br>
+function x265_pixel_satd_4x8_neon<br>
+ pixel_satd_4x8_neon<br>
+ mov w0, v0.s[0]<br>
+ ret<br>
+endfunc<br>
+<br>
+// template<int w, int h><br>
+// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)<br>
+// 4x16 case: two stacked 4x8 blocks. The macro leaves x0/x2 advanced by 8<br>
+// rows, so the second expansion continues at row 8. w4 accumulates, w5 is<br>
+// scratch; the total is returned in w0.<br>
+function x265_pixel_satd_4x16_neon<br>
+ // Clear the accumulator.<br>
+ eor w4, w4, w4<br>
+ pixel_satd_4x8_neon<br>
+ mov w5, v0.s[0]<br>
+ add w4, w4, w5<br>
+ pixel_satd_4x8_neon<br>
+ mov w5, v0.s[0]<br>
+ add w0, w5, w4<br>
+ ret<br>
+endfunc<br>
+<br>
+// template<int w, int h><br>
+// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)<br>
+// 4x32 case: four stacked 4x8 blocks, unrolled with .rept. Each expansion<br>
+// leaves x0/x2 advanced by 8 rows. w4 accumulates, w5 is scratch; the total<br>
+// is returned in w0.<br>
+function x265_pixel_satd_4x32_neon<br>
+ // Clear the accumulator.<br>
+ eor w4, w4, w4<br>
+.rept 4<br>
+ pixel_satd_4x8_neon<br>
+ mov w5, v0.s[0]<br>
+ add w4, w4, w5<br>
+.endr<br>
+ mov w0, w4<br>
+ ret<br>
+endfunc<br>
+<br>
+// template<int w, int h><br>
+// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)<br>
+// 12x16 case: three 4-byte-wide columns (byte offsets 0, 4, 8), each made of<br>
+// two stacked 4x8 blocks. x4/x5 keep the original pix1/pix2 pointers because<br>
+// the macro advances x0/x2. w7 accumulates, w6 is scratch; total in w0.<br>
+function x265_pixel_satd_12x16_neon<br>
+ // Save the block origins and clear the accumulator.<br>
+ mov x4, x0<br>
+ mov x5, x2<br>
+ eor w7, w7, w7<br>
+ // Column 0: rows 0-7, then rows 8-15.<br>
+ pixel_satd_4x8_neon<br>
+ mov w6, v0.s[0]<br>
+ add w7, w7, w6<br>
+ pixel_satd_4x8_neon<br>
+ mov w6, v0.s[0]<br>
+ add w7, w7, w6<br>
+<br>
+ // Column 1: restart from the saved origins plus 4 bytes.<br>
+ add x0, x4, #4<br>
+ add x2, x5, #4<br>
+ pixel_satd_4x8_neon<br>
+ mov w6, v0.s[0]<br>
+ add w7, w7, w6<br>
+ pixel_satd_4x8_neon<br>
+ mov w6, v0.s[0]<br>
+ add w7, w7, w6<br>
+<br>
+ // Column 2: restart from the saved origins plus 8 bytes.<br>
+ add x0, x4, #8<br>
+ add x2, x5, #8<br>
+ pixel_satd_4x8_neon<br>
+ mov w6, v0.s[0]<br>
+ add w7, w7, w6<br>
+ pixel_satd_4x8_neon<br>
+ mov w6, v0.s[0]<br>
+ add w0, w7, w6<br>
+ ret<br>
+endfunc<br>
+<br>
+// template<int w, int h><br>
+// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)<br>
+// 12x32 case: three 4-byte-wide columns (byte offsets 0, 4, 8), each made of<br>
+// four stacked 4x8 blocks. x4/x5 keep the original pix1/pix2 pointers because<br>
+// the macro advances x0/x2. w7 accumulates, w6 is scratch; total in w0.<br>
+function x265_pixel_satd_12x32_neon<br>
+ // Save the block origins and clear the accumulator.<br>
+ mov x4, x0<br>
+ mov x5, x2<br>
+ eor w7, w7, w7<br>
+ // Column 0.<br>
+.rept 4<br>
+ pixel_satd_4x8_neon<br>
+ mov w6, v0.s[0]<br>
+ add w7, w7, w6<br>
+.endr<br>
+<br>
+ // Column 1: restart from the saved origins plus 4 bytes.<br>
+ add x0, x4, #4<br>
+ add x2, x5, #4<br>
+.rept 4<br>
+ pixel_satd_4x8_neon<br>
+ mov w6, v0.s[0]<br>
+ add w7, w7, w6<br>
+.endr<br>
+<br>
+ // Column 2: restart from the saved origins plus 8 bytes.<br>
+ add x0, x4, #8<br>
+ add x2, x5, #8<br>
+.rept 4<br>
+ pixel_satd_4x8_neon<br>
+ mov w6, v0.s[0]<br>
+ add w7, w7, w6<br>
+.endr<br>
+<br>
+ mov w0, w7<br>
+ ret<br>
+endfunc<br>
+<br>
+// template<int w, int h><br>
+// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)<br>
+// 8x8 case: two side-by-side 4x8 blocks (byte offsets 0 and 4). x6/x7 keep<br>
+// the original pix1/pix2 pointers because the macro advances x0/x2.<br>
+// w4 accumulates, w5 is scratch; the total is returned in w0.<br>
+function x265_pixel_satd_8x8_neon<br>
+ // Clear the accumulator and save the block origins.<br>
+ eor w4, w4, w4<br>
+ mov x6, x0<br>
+ mov x7, x2<br>
+ // Left 4x8 half.<br>
+ pixel_satd_4x8_neon<br>
+ mov w5, v0.s[0]<br>
+ add w4, w4, w5<br>
+ // Right 4x8 half: restart from the saved origins plus 4 bytes.<br>
+ add x0, x6, #4<br>
+ add x2, x7, #4<br>
+ pixel_satd_4x8_neon<br>
+ mov w5, v0.s[0]<br>
+ add w0, w4, w5<br>
+ ret<br>
+endfunc<br>
+<br>
+// int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)<br>
+// 4x4 case. For each of the two blocks (source at [x0] step x1, recon at [x2]<br>
+// step x3) this computes a transform-domain sum of absolute values minus<br>
+// (sum of the 16 pixels >> 2), then returns the absolute difference of the two<br>
+// results in w0. Source and recon are processed in parallel register sets<br>
+// (v0-v5/v20-v23 for source, v6-v7/v16-v19/v24-v27 for recon).<br>
+// NOTE(review): the transform looks like the 4x4 Hadamard used by SATD —<br>
+// confirm against the C psyCost_pp reference.<br>
+function x265_psyCost_4x4_neon<br>
+ // Source rows: v4 = {row0, row2}, v5 = {row1, row3}. ld1r fills both lanes,<br>
+ // then lane 1 is overwritten by the later row.<br>
+ ld1r {v4.2s}, [x0], x1<br>
+ ld1r {v5.2s}, [x0], x1<br>
+ ld1 {v4.s}[1], [x0], x1<br>
+ ld1 {v5.s}[1], [x0], x1<br>
+<br>
+ // Recon rows, same layout: v6 = {row0, row2}, v7 = {row1, row3}.<br>
+ ld1r {v6.2s}, [x2], x3<br>
+ ld1r {v7.2s}, [x2], x3<br>
+ ld1 {v6.s}[1], [x2], x3<br>
+ ld1 {v7.s}[1], [x2], x3<br>
+<br>
+ // First vertical stage, widened to 16 bits: row sums and row differences.<br>
+ uaddl v2.8h, v4.8b, v5.8b<br>
+ usubl v3.8h, v4.8b, v5.8b<br>
+ uaddl v18.8h, v6.8b, v7.8b<br>
+ usubl v19.8h, v6.8b, v7.8b<br>
+<br>
+ // Second vertical stage (source): combine the low and high halves.<br>
+ mov v20.d[0], v2.d[1]<br>
+ add v0.4h, v2.4h, v20.4h<br>
+ sub v1.4h, v2.4h, v20.4h<br>
+ mov v21.d[0], v3.d[1]<br>
+ add v22.4h, v3.4h, v21.4h<br>
+ sub v23.4h, v3.4h, v21.4h<br>
+<br>
+ // Second vertical stage (recon).<br>
+ mov v24.d[0], v18.d[1]<br>
+ add v16.4h, v18.4h, v24.4h<br>
+ sub v17.4h, v18.4h, v24.4h<br>
+ mov v25.d[0], v19.d[1]<br>
+ add v26.4h, v19.4h, v25.4h<br>
+ sub v27.4h, v19.4h, v25.4h<br>
+<br>
+ // Pack the four 4h results of each block into two 8h vectors and<br>
+ // interleave adjacent lanes for the horizontal stages.<br>
+ mov v0.d[1], v22.d[0]<br>
+ mov v1.d[1], v23.d[0]<br>
+ trn1 v22.8h, v0.8h, v1.8h<br>
+ trn2 v23.8h, v0.8h, v1.8h<br>
+ mov v16.d[1], v26.d[0]<br>
+ mov v17.d[1], v27.d[0]<br>
+ trn1 v26.8h, v16.8h, v17.8h<br>
+ trn2 v27.8h, v16.8h, v17.8h<br>
+<br>
+ // First horizontal stage.<br>
+ add v2.8h, v22.8h, v23.8h<br>
+ sub v3.8h, v22.8h, v23.8h<br>
+ add v18.8h, v26.8h, v27.8h<br>
+ sub v19.8h, v26.8h, v27.8h<br>
+<br>
+ // Recompute the widened row sums (v2/v18 were overwritten above); they<br>
+ // feed the plain pixel sums below.<br>
+ uaddl v20.8h, v4.8b, v5.8b<br>
+ uaddl v21.8h, v6.8b, v7.8b<br>
+<br>
+ // Pair 32-bit lanes and take absolute values.<br>
+ trn1 v0.4s, v2.4s, v3.4s<br>
+ trn2 v1.4s, v2.4s, v3.4s<br>
+ trn1 v16.4s, v18.4s, v19.4s<br>
+ trn2 v17.4s, v18.4s, v19.4s<br>
+ abs v0.8h, v0.8h<br>
+ abs v16.8h, v16.8h<br>
+ abs v1.8h, v1.8h<br>
+ abs v17.8h, v17.8h<br>
+<br>
+ // v20.s[0] = sum of the 16 source pixels, v20.s[1] = same for recon.<br>
+ uaddlv s20, v20.8h<br>
+ uaddlv s21, v21.8h<br>
+ mov v20.s[1], v21.s[0]<br>
+<br>
+ // Keep the larger magnitude of each lane pair (inputs are non-negative<br>
+ // after abs).<br>
+ smax v0.8h, v0.8h, v1.8h<br>
+ smax v16.8h, v16.8h, v17.8h<br>
+<br>
+ // Regroup so the low half of v0 carries the source terms and the high<br>
+ // half the recon terms, then reduce each half to a 32-bit sum.<br>
+ trn1 v4.2d, v0.2d, v16.2d<br>
+ trn2 v5.2d, v0.2d, v16.2d<br>
+ add v0.8h, v4.8h, v5.8h<br>
+ mov v4.d[0], v0.d[1]<br>
+ uaddlv s0, v0.4h<br>
+ uaddlv s4, v4.4h<br>
+<br>
+ // v0.2s = {source, recon} transform sums minus (pixel sum >> 2).<br>
+ ushr v20.2s, v20.2s, #2<br>
+ mov v0.s[1], v4.s[0]<br>
+ sub v0.2s, v0.2s, v20.2s<br>
+ // Return |source value - recon value|: cneg negates when the subtraction<br>
+ // result is negative.<br>
+ mov w0, v0.s[0]<br>
+ mov w1, v0.s[1]<br>
+ subs w0, w0, w1<br>
+ cneg w0, w0, mi<br>
+<br>
+ ret<br>
+endfunc<br>
+<br>
+// uint32_t quant_c(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)<br>
+// Quantises numCoeff coefficients, four per iteration.<br>
+// x0 = coef (read), x1 = quantCoeff (read), x2 = deltaU (written),<br>
+// x3 = qCoef (written), w4 = qBits, w5 = add, w6 = numCoeff.<br>
+// Returns in w0 the number of non-zero quantised levels.<br>
+// numCoeff is divided by 4 and the loop body runs before the counter is<br>
+// tested, so it must be a non-zero multiple of 4.<br>
+// Per coefficient:<br>
+// tmp = abs(coef) * quantCoeff<br>
+// level = (tmp + add) >> qBits<br>
+// deltaU = (tmp - (level << qBits)) >> (qBits - 8)<br>
+// qCoef = saturate_int16(sign(coef) * level)<br>
+function x265_quant_neon<br>
+ // v0 = 1 << qBits (only lane 0 is used, by the mls below).<br>
+ mov w9, #1<br>
+ lsl w9, w9, w4<br>
+ dup v0.2s, w9<br>
+ // v1 = -qBits and v2 = 8 - qBits: sshl with a negative count shifts right.<br>
+ neg w9, w4<br>
+ dup v1.4s, w9<br>
+ add w9, w9, #8<br>
+ dup v2.4s, w9<br>
+ // v3 = rounding offset.<br>
+ dup v3.4s, w5<br>
+<br>
+ // w6 = iteration count; v4 = per-lane zero-level tally (kept negative);<br>
+ // w10 = coefficients processed; v17 = all-zero comparand.<br>
+ lsr w6, w6, #2<br>
+ eor v4.16b, v4.16b, v4.16b<br>
+ eor w10, w10, w10<br>
+ eor v17.16b, v17.16b, v17.16b<br>
+<br>
+.loop_quant:<br>
+<br>
+ // Load 4 coefficients (int16) and 4 quantiser scales (int32).<br>
+ ld1 {v18.4h}, [x0], #8<br>
+ ld1 {v7.4s}, [x1], #16<br>
+ sxtl v6.4s, v18.4h<br>
+<br>
+ // v5 = sign mask: all ones where the coefficient is negative.<br>
+ cmlt v5.4s, v6.4s, #0<br>
+<br>
+ abs v6.4s, v6.4s<br>
+<br>
+<br>
+ // v6 = tmp = abs(coef) * quantCoeff.<br>
+ mul v6.4s, v6.4s, v7.4s<br>
+<br>
+ // v7 = level = (tmp + add) >> qBits.<br>
+ add v7.4s, v6.4s, v3.4s<br>
+ sshl v7.4s, v7.4s, v1.4s<br>
+<br>
+ // v6 = tmp - level * (1 << qBits); deltaU = v6 >> (qBits - 8).<br>
+ mls v6.4s, v7.4s, v0.s[0]<br>
+ sshl v16.4s, v6.4s, v2.4s<br>
+ st1 {v16.4s}, [x2], #16<br>
+<br>
+ // numsig: cmeq yields -1 for every zero level, so v4 accumulates minus<br>
+ // the number of zeros while w10 counts all coefficients.<br>
+ cmeq v16.4s, v7.4s, v17.4s<br>
+ add v4.4s, v4.4s, v16.4s<br>
+ add w10, w10, #4<br>
+<br>
+ // level *= sign, computed as (level ^ mask) - mask, then narrowed to<br>
+ // int16 with saturation.<br>
+ eor v16.16b, v7.16b, v5.16b<br>
+ sub v16.4s, v16.4s, v5.4s<br>
+ sqxtn v5.4h, v16.4s<br>
+ st1 {v5.4h}, [x3], #8<br>
+<br>
+ subs w6, w6, #1<br>
+ b.ne .loop_quant<br>
+<br>
+ // numSig = coefficients processed + (negative) zero tally.<br>
+ addv s4, v4.4s<br>
+ mov w9, v4.s[0]<br>
+ add w0, w10, w9<br>
+ ret<br>
+endfunc<br>
+<br>
+// SATD of one 4x4 block. Reads 4 rows of 4 bytes from [x0] (step x1) and from<br>
+// [x2] (step x3), leaving both pointers advanced by 4 rows. The result is<br>
+// left in the low 64 bits of v0 (d0). Clobbers v0-v7.<br>
+.macro satd_4x4_neon<br>
+ // Rows 0 and 1 go into lane 0 of v0/v1 and v2/v3 ...<br>
+ ld1 {v1.s}[0], [x2], x3<br>
+ ld1 {v0.s}[0], [x0], x1<br>
+ ld1 {v3.s}[0], [x2], x3<br>
+ ld1 {v2.s}[0], [x0], x1<br>
+<br>
+ // ... and rows 2 and 3 into lane 1, so v0/v1 = {row0, row2} and<br>
+ // v2/v3 = {row1, row3}.<br>
+ ld1 {v1.s}[1], [x2], x3<br>
+ ld1 {v0.s}[1], [x0], x1<br>
+ ld1 {v3.s}[1], [x2], x3<br>
+ ld1 {v2.s}[1], [x0], x1<br>
+<br>
+ // 16-bit differences: v4 = rows 0 and 2, v5 = rows 1 and 3.<br>
+ usubl v4.8h, v0.8b, v1.8b<br>
+ usubl v5.8h, v2.8b, v3.8b<br>
+<br>
+ // First vertical add/sub stage.<br>
+ add v6.8h, v4.8h, v5.8h<br>
+ sub v7.8h, v4.8h, v5.8h<br>
+<br>
+ // Second vertical stage: fold the high half onto the low half. Only the<br>
+ // low four lanes of v0-v3 are used from here on.<br>
+ mov v4.d[0], v6.d[1]<br>
+ add v0.8h, v6.8h, v4.8h<br>
+ sub v2.8h, v6.8h, v4.8h<br>
+<br>
+ mov v5.d[0], v7.d[1]<br>
+ add v1.8h, v7.8h, v5.8h<br>
+ sub v3.8h, v7.8h, v5.8h<br>
+<br>
+ // Interleave adjacent 16-bit lanes for the horizontal stage.<br>
+ trn1 v4.4h, v0.4h, v1.4h<br>
+ trn2 v5.4h, v0.4h, v1.4h<br>
+<br>
+ trn1 v6.4h, v2.4h, v3.4h<br>
+ trn2 v7.4h, v2.4h, v3.4h<br>
+<br>
+ add v0.4h, v4.4h, v5.4h<br>
+ sub v1.4h, v4.4h, v5.4h<br>
+<br>
+ add v2.4h, v6.4h, v7.4h<br>
+ sub v3.4h, v6.4h, v7.4h<br>
+<br>
+ // Pair 32-bit lanes, take absolute values and keep the larger of each<br>
+ // pair — presumably the max(|a|,|b|) shortcut for the last butterfly<br>
+ // stage; confirm against the C reference.<br>
+ trn1 v4.2s, v0.2s, v1.2s<br>
+ trn2 v5.2s, v0.2s, v1.2s<br>
+<br>
+ trn1 v6.2s, v2.2s, v3.2s<br>
+ trn2 v7.2s, v2.2s, v3.2s<br>
+<br>
+ abs v4.4h, v4.4h<br>
+ abs v5.4h, v5.4h<br>
+ abs v6.4h, v6.4h<br>
+ abs v7.4h, v7.4h<br>
+<br>
+ smax v1.4h, v4.4h, v5.4h<br>
+ smax v2.4h, v6.4h, v7.4h<br>
+<br>
+ // Horizontal reduction: 4 x 16-bit -> 2 x 32-bit -> 1 x 64-bit.<br>
+ add v0.4h, v1.4h, v2.4h<br>
+ uaddlp v0.2s, v0.4h<br>
+ uaddlp v0.1d, v0.2s<br>
+.endm<br>
+<br>
+// int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)<br>
+// A single satd_4x4_neon expansion; the 64-bit lane holding the sum is moved<br>
+// to x0 as the return value.<br>
+function x265_pixel_satd_4x4_neon<br>
+ satd_4x4_neon<br>
+ umov x0, v0.d[0]<br>
+ ret<br>
+endfunc<br>
+<br>
+// int satd_8x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)<br>
+// Two side-by-side 4x4 blocks (byte offsets 0 and 4). x4/x5 keep the original<br>
+// pix1/pix2 pointers because the macro advances x0/x2; x6 holds the left<br>
+// block's sum. The total is returned in x0.<br>
+function x265_pixel_satd_8x4_neon<br>
+ // Save the block origins.<br>
+ mov x4, x0<br>
+ mov x5, x2<br>
+ // Left 4x4 half.<br>
+ satd_4x4_neon<br>
+ // Point at the right half, then save the left result before v0 is<br>
+ // overwritten by the second expansion.<br>
+ add x0, x4, #4<br>
+ add x2, x5, #4<br>
+ umov x6, v0.d[0]<br>
+ // Right 4x4 half.<br>
+ satd_4x4_neon<br>
+ umov x0, v0.d[0]<br>
+ add x0, x0, x6<br>
+ ret<br>
+endfunc<br>
diff --git a/source/common/aarch64/pixel-util.h b/source/common/aarch64/pixel-util.h<br>
new file mode 100644<br>
index 000000000..043488468<br>
--- /dev/null<br>
+++ b/source/common/aarch64/pixel-util.h<br>
@@ -0,0 +1,40 @@<br>
+/*****************************************************************************<br>
+ * Copyright (C) 2020 MulticoreWare, Inc<br>
+ *<br>
+ * Authors: Yimeng Su <<a href="mailto:yimeng.su@huawei.com" target="_blank">yimeng.su@huawei.com</a>><br>
+ * Hongbin Liu <<a href="mailto:liuhongbin1@huawei.com" target="_blank">liuhongbin1@huawei.com</a>><br>
+ *<br>
+ * This program is free software; you can redistribute it and/or modify<br>
+ * it under the terms of the GNU General Public License as published by<br>
+ * the Free Software Foundation; either version 2 of the License, or<br>
+ * (at your option) any later version.<br>
+ *<br>
+ * This program is distributed in the hope that it will be useful,<br>
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of<br>
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the<br>
+ * GNU General Public License for more details.<br>
+ *<br>
+ * You should have received a copy of the GNU General Public License<br>
+ * along with this program; if not, write to the Free Software<br>
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.<br>
+ *<br>
+ * This program is also available under a commercial proprietary license.<br>
+ * For more information, contact us at license @ <a href="http://x265.com" rel="noreferrer" target="_blank">x265.com</a>.<br>
+ *****************************************************************************/<br>
+<br>
+#ifndef X265_PIXEL_UTIL_AARCH64_H<br>
+#define X265_PIXEL_UTIL_AARCH64_H<br>
+<br>
+// Prototypes for the NEON routines implemented in pixel-util.S. The names are<br>
+// spelled with the literal x265_ prefix because that is the symbol name each<br>
+// `function` directive in the assembly file defines.<br>
+<br>
+// SATD between two blocks of the given size; pix1/pix2 are the block origins<br>
+// and stride_pix1/stride_pix2 the distance between consecutive rows.<br>
+int x265_pixel_satd_4x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);<br>
+int x265_pixel_satd_4x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);<br>
+int x265_pixel_satd_4x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);<br>
+int x265_pixel_satd_4x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);<br>
+int x265_pixel_satd_8x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);<br>
+int x265_pixel_satd_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);<br>
+int x265_pixel_satd_12x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);<br>
+int x265_pixel_satd_12x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);<br>
+<br>
+// Quantises numCoeff coefficients and returns the number of non-zero levels.<br>
+uint32_t x265_quant_neon(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);<br>
+// 4x4 psy cost between a source block and its reconstruction. Declared with<br>
+// the literal prefix, like its siblings, to match the symbol defined in<br>
+// pixel-util.S.<br>
+int x265_psyCost_4x4_neon(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);<br>
+<br>
+#endif // ifndef X265_PIXEL_UTIL_AARCH64_H<br>
diff --git a/source/common/aarch64/pixel.h b/source/common/aarch64/pixel.h<br>
new file mode 100644<br>
index 000000000..179c2f4ec<br>
--- /dev/null<br>
+++ b/source/common/aarch64/pixel.h<br>
@@ -0,0 +1,105 @@<br>
+/*****************************************************************************<br>
+ * Copyright (C) 2020 MulticoreWare, Inc<br>
+ *<br>
+ * Authors: Hongbin Liu <<a href="mailto:liuhongbin1@huawei.com" target="_blank">liuhongbin1@huawei.com</a>><br>
+ *<br>
+ * This program is free software; you can redistribute it and/or modify<br>
+ * it under the terms of the GNU General Public License as published by<br>
+ * the Free Software Foundation; either version 2 of the License, or<br>
+ * (at your option) any later version.<br>
+ *<br>
+ * This program is distributed in the hope that it will be useful,<br>
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of<br>
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the<br>
+ * GNU General Public License for more details.<br>
+ *<br>
+ * You should have received a copy of the GNU General Public License<br>
+ * along with this program; if not, write to the Free Software<br>
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.<br>
+ *<br>
+ * This program is also available under a commercial proprietary license.<br>
+ * For more information, contact us at license @ <a href="http://x265.com" rel="noreferrer" target="_blank">x265.com</a>.<br>
+ *****************************************************************************/<br>
+<br>
+#ifndef X265_PIXEL_AARCH64_H<br>
+#define X265_PIXEL_AARCH64_H<br>
+<br>
+// pixel_avg_pp: writes to dst the average of the src0 and src1 blocks of the<br>
+// given size. The trailing unnamed int is not named in this header.<br>
+void x265_pixel_avg_pp_4x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_4x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_4x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_8x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_8x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_8x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_8x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_12x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_16x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_16x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_16x12_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_16x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_16x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_16x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_24x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_32x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_32x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_32x24_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_32x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_32x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_48x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_64x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_64x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_64x48_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_64x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+<br>
+// sad_x3: SAD of fenc against three reference blocks that share one stride;<br>
+// results are written through res.<br>
+void x265_sad_x3_4x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_4x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_4x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_8x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_8x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_8x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_8x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_12x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_16x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_16x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_16x12_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_16x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_16x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_16x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_24x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_32x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_32x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_32x24_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_32x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_32x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_48x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_64x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_64x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_64x48_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_64x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+<br>
+// sad_x4: same as sad_x3 but against four reference blocks.<br>
+void x265_sad_x4_4x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_4x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_4x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_8x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_8x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_8x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_8x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_12x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_16x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_16x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_16x12_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_16x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_16x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_16x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_24x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_32x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_32x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_32x24_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_32x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_32x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_48x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_64x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_64x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_64x48_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_64x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+<br>
+#endif // ifndef X265_PIXEL_AARCH64_H<br>
diff --git a/source/common/aarch64/sad-a.S b/source/common/aarch64/sad-a.S<br>
new file mode 100644<br>
index 000000000..c27cce5ce<br>
--- /dev/null<br>
+++ b/source/common/aarch64/sad-a.S<br>
@@ -0,0 +1,105 @@<br>
+/*****************************************************************************<br>
+ * Copyright (C) 2020 MulticoreWare, Inc<br>
+ *<br>
+ * Authors: Hongbin Liu <<a href="mailto:liuhongbin1@huawei.com" target="_blank">liuhongbin1@huawei.com</a>><br>
+ *<br>
+ * This program is free software; you can redistribute it and/or modify<br>
+ * it under the terms of the GNU General Public License as published by<br>
+ * the Free Software Foundation; either version 2 of the License, or<br>
+ * (at your option) any later version.<br>
+ *<br>
+ * This program is distributed in the hope that it will be useful,<br>
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of<br>
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the<br>
+ * GNU General Public License for more details.<br>
+ *<br>
+ * You should have received a copy of the GNU General Public License<br>
+ * along with this program; if not, write to the Free Software<br>
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.<br>
+ *<br>
+ * This program is also available under a commercial proprietary license.<br>
+ * For more information, contact us at license @ <a href="http://x265.com" rel="noreferrer" target="_blank">x265.com</a>.<br>
+ *****************************************************************************/<br>
+<br>
+#include "asm.S"<br>
+<br>
+.section .rodata /* switch to the read-only data section; nothing is emitted into it before the .text directive below */<br>
+<br>
+.align 4<br>
+<br>
+.text /* everything that follows in this file is assembled into the code section */<br>
+<br>
+.macro SAD_X_START_8 x /* first row of an 8-byte-wide multi-reference SAD: loads one row from each pointer and initialises the accumulators v16-v18 (and v19 when x is 4); x is the number of references, 3 or 4 */<br>
+ ld1 {v0.8b}, [x0], x9 /* v0 = 8 bytes at [x0], then x0 += x9 (SAD_X_8xN sets x9 to FENC_STRIDE) */<br>
+.if \x == 3 /* three references: pointers in x1-x3, each post-incremented by x4 */<br>
+ ld1 {v1.8b}, [x1], x4<br>
+ ld1 {v2.8b}, [x2], x4<br>
+ ld1 {v3.8b}, [x3], x4<br>
+.elseif \x == 4 /* four references: pointers in x1-x4, each post-incremented by x5 (fref0-fref3 and frefstride in the sad_x4 C prototypes) */<br>
+ ld1 {v1.8b}, [x1], x5<br>
+ ld1 {v2.8b}, [x2], x5<br>
+ ld1 {v3.8b}, [x3], x5<br>
+ ld1 {v4.8b}, [x4], x5<br>
+.endif<br>
+ uabdl v16.8h, v0.8b, v1.8b /* v16 = per-byte absolute difference of v0 and v1, widened to eight 16-bit lanes; overwrites v16, so no separate zeroing is needed */<br>
+ uabdl v17.8h, v0.8b, v2.8b /* same for the second reference */<br>
+ uabdl v18.8h, v0.8b, v3.8b /* same for the third reference */<br>
+.if \x == 4<br>
+ uabdl v19.8h, v0.8b, v4.8b /* fourth reference, only in the x == 4 variant */<br>
+.endif<br>
+.endm<br>
+<br>
+.macro SAD_X_8 x /* one further row of an 8-byte-wide multi-reference SAD: same loads as SAD_X_START_8, but adds into the existing accumulators v16-v19 instead of overwriting them; x is the number of references, 3 or 4 */<br>
+ ld1 {v0.8b}, [x0], x9 /* v0 = next 8 bytes at [x0], then x0 += x9 */<br>
+.if \x == 3 /* three references: pointers in x1-x3, each post-incremented by x4 */<br>
+ ld1 {v1.8b}, [x1], x4<br>
+ ld1 {v2.8b}, [x2], x4<br>
+ ld1 {v3.8b}, [x3], x4<br>
+.elseif \x == 4 /* four references: pointers in x1-x4, each post-incremented by x5 */<br>
+ ld1 {v1.8b}, [x1], x5<br>
+ ld1 {v2.8b}, [x2], x5<br>
+ ld1 {v3.8b}, [x3], x5<br>
+ ld1 {v4.8b}, [x4], x5<br>
+.endif<br>
+ uabal v16.8h, v0.8b, v1.8b /* v16 += per-byte absolute difference of v0 and v1, widened to 16-bit lanes */<br>
+ uabal v17.8h, v0.8b, v2.8b /* same for the second reference */<br>
+ uabal v18.8h, v0.8b, v3.8b /* same for the third reference */<br>
+.if \x == 4<br>
+ uabal v19.8h, v0.8b, v4.8b /* fourth reference, only in the x == 4 variant */<br>
+.endif<br>
+.endm<br>
+<br>
+.macro SAD_X_8xN x, h /* defines the function x265_sad_x<x>_8x<h>_neon: sums of absolute differences of one 8-byte-wide, h-row block against x references (x is 3 or 4), one 32-bit result per reference */<br>
+function x265_sad_x\x\()_8x\h\()_neon /* function/endfunc are helper macros not defined in this file; presumably provided by the included asm.S */<br>
+ mov x9, #FENC_STRIDE /* x9 = row stride applied to the x0 pointer by the row macros */<br>
+ SAD_X_START_8 \x /* row 0: initialises the accumulators */<br>
+.rept \h - 1 /* rows 1 .. h-1, unrolled at assembly time (no runtime loop counter) */<br>
+ SAD_X_8 \x<br>
+.endr<br>
+ uaddlv s0, v16.8h /* s0 = sum of the eight 16-bit lanes of v16 as a 32-bit value; each lane holds at most h*255, which fits 16 bits for the heights instantiated below (h <= 32) */<br>
+ uaddlv s1, v17.8h /* total for the second reference */<br>
+ uaddlv s2, v18.8h /* total for the third reference */<br>
+.if \x == 4<br>
+ uaddlv s3, v19.8h /* total for the fourth reference */<br>
+.endif<br>
+<br>
+.if \x == 3 /* three results stored as consecutive 32-bit words at [x5]; presumably x5 is the res argument of the sad_x3 prototype, which is not visible here */<br>
+ stp s0, s1, [x5]<br>
+ str s2, [x5, #8]<br>
+.elseif \x == 4 /* four results stored as consecutive 32-bit words at [x6] (res is the seventh argument in the sad_x4 C prototypes) */<br>
+ stp s0, s1, [x6]<br>
+ stp s2, s3, [x6, #8]<br>
+.endif<br>
+ ret<br>
+endfunc<br>
+.endm<br>
+<br>
+SAD_X_8xN 3 4 /* emits x265_sad_x3_8x4_neon */<br>
+SAD_X_8xN 3 8 /* emits x265_sad_x3_8x8_neon */<br>
+SAD_X_8xN 3 16 /* emits x265_sad_x3_8x16_neon */<br>
+SAD_X_8xN 3 32 /* emits x265_sad_x3_8x32_neon */<br>
+<br>
+SAD_X_8xN 4 4 /* emits x265_sad_x4_8x4_neon */<br>
+SAD_X_8xN 4 8 /* emits x265_sad_x4_8x8_neon */<br>
+SAD_X_8xN 4 16 /* emits x265_sad_x4_8x16_neon */<br>
+SAD_X_8xN 4 32 /* emits x265_sad_x4_8x32_neon */<br>
-- <br>
2.21.0.windows.1<br>
<br>
</div></div>