<div dir="ltr"><div class="gmail_quote">From: wangxiyuan <<a href="mailto:wangxiyuan@huawei.com" target="_blank">wangxiyuan@huawei.com</a>><br>
<br>
This patch adds some common assembly optimization functions for the aarch64<br>
platform. These functions won't work until patch Part 2 is merged.<br>
---<br>
 source/common/aarch64/asm-primitives.cpp | 219 ++++++++++++<br>
 source/common/aarch64/asm.S       | 69 ++++<br>
 source/common/aarch64/ipfilter8.S    | 414 ++++++++++++++++++++++<br>
 source/common/aarch64/ipfilter8.h    | 55 +++<br>
 source/common/aarch64/mc-a.S       | 63 ++++<br>
 source/common/aarch64/pixel-util.S    | 419 +++++++++++++++++++++++<br>
 source/common/aarch64/pixel-util.h    | 40 +++<br>
 source/common/aarch64/pixel.h      | 105 ++++++<br>
 source/common/aarch64/sad-a.S      | 105 ++++++<br>
 9 files changed, 1489 insertions(+)<br>
 create mode 100644 source/common/aarch64/asm-primitives.cpp<br>
 create mode 100644 source/common/aarch64/asm.S<br>
 create mode 100644 source/common/aarch64/ipfilter8.S<br>
 create mode 100644 source/common/aarch64/ipfilter8.h<br>
 create mode 100644 source/common/aarch64/mc-a.S<br>
 create mode 100644 source/common/aarch64/pixel-util.S<br>
 create mode 100644 source/common/aarch64/pixel-util.h<br>
 create mode 100644 source/common/aarch64/pixel.h<br>
 create mode 100644 source/common/aarch64/sad-a.S<br>
<br>
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp<br>
new file mode 100644<br>
index 000000000..6fe8c968c<br>
--- /dev/null<br>
+++ b/source/common/aarch64/asm-primitives.cpp<br>
@@ -0,0 +1,219 @@<br>
+/*****************************************************************************<br>
+ * Copyright (C) 2020 MulticoreWare, Inc<br>
+ *<br>
+ * Authors: Hongbin Liu <<a href="mailto:liuhongbin1@huawei.com" target="_blank">liuhongbin1@huawei.com</a>><br>
+ *          Yimeng Su <<a href="mailto:yimeng.su@huawei.com" target="_blank">yimeng.su@huawei.com</a>><br>
+ *<br>
+ * This program is free software; you can redistribute it and/or modify<br>
+ * it under the terms of the GNU General Public License as published by<br>
+ * the Free Software Foundation; either version 2 of the License, or<br>
+ * (at your option) any later version.<br>
+ *<br>
+ * This program is distributed in the hope that it will be useful,<br>
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of<br>
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the<br>
+ * GNU General Public License for more details.<br>
+ *<br>
+ * You should have received a copy of the GNU General Public License<br>
+ * along with this program; if not, write to the Free Software<br>
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.<br>
+ *<br>
+ * This program is also available under a commercial proprietary license.<br>
+ * For more information, contact us at license @ <a href="http://x265.com" rel="noreferrer" target="_blank">x265.com</a>.<br>
+ *****************************************************************************/<br>
+<br>
+#include "common.h"<br>
+#include "primitives.h"<br>
+#include "x265.h"<br>
+#include "cpu.h"<br>
+<br>
+<br>
+#if defined(__GNUC__)<br>
+#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)<br>
+#endif<br>
+<br>
+#define GCC_4_9_0 40900<br>
+#define GCC_5_1_0 50100<br>
+<br>
+extern "C" {<br>
+#include "pixel.h"<br>
+#include "pixel-util.h"<br>
+#include "ipfilter8.h"<br>
+}<br>
+<br>
+namespace X265_NS {<br>
+// private x265 namespace<br>
+<br>
+<br>
+template<int size><br>
+void interp_8tap_hv_pp_cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY)<br>
+{<br>
+Â Â ALIGN_VAR_32(int16_t, immed[MAX_CU_SIZE * (MAX_CU_SIZE + NTAPS_LUMA - 1)]);<br>
+Â Â const int halfFilterSize = NTAPS_LUMA >> 1;<br>
+Â Â const int immedStride = MAX_CU_SIZE;<br>
+<br>
+Â Â primitives.pu[size].luma_hps(src, srcStride, immed, immedStride, idxX, 1);<br>
+Â Â primitives.pu[size].luma_vsp(immed + (halfFilterSize - 1) * immedStride, immedStride, dst, dstStride, idxY);<br>
+}<br>
+<br>
+<br>
+/* Temporary workaround: the luma_vsp assembly primitive has not been completed yet,<br>
+ * but interp_8tap_hv_pp_cpu mixes C and assembly primitives (luma_hps and luma_vsp).<br>
+ * Without aliasing luma_vsp to the C primitive, a segmentation fault occurs. */<br>
+void setupAliasCPrimitives(EncoderPrimitives &cp, EncoderPrimitives &asmp, int cpuMask)<br>
+{<br>
+Â Â if (cpuMask & X265_CPU_NEON)<br>
+Â Â {<br>
+    asmp.pu[LUMA_8x4].luma_vsp  = cp.pu[LUMA_8x4].luma_vsp;<br>
+    asmp.pu[LUMA_8x8].luma_vsp  = cp.pu[LUMA_8x8].luma_vsp;<br>
+    asmp.pu[LUMA_8x16].luma_vsp = cp.pu[LUMA_8x16].luma_vsp;<br>
+    asmp.pu[LUMA_8x32].luma_vsp = cp.pu[LUMA_8x32].luma_vsp;<br>
+Â Â Â Â asmp.pu[LUMA_12x16].luma_vsp = cp.pu[LUMA_12x16].luma_vsp;<br>
+#if !AUTO_VECTORIZE || GCC_VERSION < GCC_5_1_0 /* gcc_version < gcc-5.1.0 */<br>
+    asmp.pu[LUMA_16x4].luma_vsp = cp.pu[LUMA_16x4].luma_vsp;<br>
+    asmp.pu[LUMA_16x8].luma_vsp = cp.pu[LUMA_16x8].luma_vsp;<br>
+Â Â Â Â asmp.pu[LUMA_16x12].luma_vsp = cp.pu[LUMA_16x12].luma_vsp;<br>
+Â Â Â Â asmp.pu[LUMA_16x16].luma_vsp = cp.pu[LUMA_16x16].luma_vsp;<br>
+Â Â Â Â asmp.pu[LUMA_16x32].luma_vsp = cp.pu[LUMA_16x32].luma_vsp;<br>
+Â Â Â Â asmp.pu[LUMA_16x64].luma_vsp = cp.pu[LUMA_16x64].luma_vsp;<br>
+Â Â Â Â asmp.pu[LUMA_32x16].luma_vsp = cp.pu[LUMA_32x16].luma_vsp;<br>
+Â Â Â Â asmp.pu[LUMA_32x24].luma_vsp = cp.pu[LUMA_32x24].luma_vsp;<br>
+Â Â Â Â asmp.pu[LUMA_32x32].luma_vsp = cp.pu[LUMA_32x32].luma_vsp;<br>
+Â Â Â Â asmp.pu[LUMA_32x64].luma_vsp = cp.pu[LUMA_32x64].luma_vsp;<br>
+Â Â Â Â asmp.pu[LUMA_48x64].luma_vsp = cp.pu[LUMA_48x64].luma_vsp;<br>
+Â Â Â Â asmp.pu[LUMA_64x16].luma_vsp = cp.pu[LUMA_64x16].luma_vsp;<br>
+Â Â Â Â asmp.pu[LUMA_64x32].luma_vsp = cp.pu[LUMA_64x32].luma_vsp;<br>
+Â Â Â Â asmp.pu[LUMA_64x48].luma_vsp = cp.pu[LUMA_64x48].luma_vsp;<br>
+Â Â Â Â asmp.pu[LUMA_64x64].luma_vsp = cp.pu[LUMA_64x64].luma_vsp;Â Â <br>
+#if !AUTO_VECTORIZE || GCC_VERSION < GCC_4_9_0 /* gcc_version < gcc-4.9.0 */<br>
+    asmp.pu[LUMA_4x4].luma_vsp  = cp.pu[LUMA_4x4].luma_vsp;<br>
+    asmp.pu[LUMA_4x8].luma_vsp  = cp.pu[LUMA_4x8].luma_vsp;<br>
+    asmp.pu[LUMA_4x16].luma_vsp = cp.pu[LUMA_4x16].luma_vsp;<br>
+Â Â Â Â asmp.pu[LUMA_24x32].luma_vsp = cp.pu[LUMA_24x32].luma_vsp;<br>
+    asmp.pu[LUMA_32x8].luma_vsp = cp.pu[LUMA_32x8].luma_vsp;<br>
+#endif<br>
+#endif<br>
+Â Â }<br>
+}<br>
+<br>
+<br>
+void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) <br>
+{<br>
+Â Â if (cpuMask & X265_CPU_NEON)<br>
+Â Â {<br>
+    p.pu[LUMA_4x4].satd  = PFX(pixel_satd_4x4_neon);<br>
+    p.pu[LUMA_4x8].satd  = PFX(pixel_satd_4x8_neon);<br>
+    p.pu[LUMA_4x16].satd = PFX(pixel_satd_4x16_neon);<br>
+    p.pu[LUMA_8x4].satd  = PFX(pixel_satd_8x4_neon);<br>
+    p.pu[LUMA_8x8].satd  = PFX(pixel_satd_8x8_neon);<br>
+Â Â Â Â p.pu[LUMA_12x16].satd = PFX(pixel_satd_12x16_neon);<br>
+Â Â Â Â <br>
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd  = PFX(pixel_satd_4x4_neon);<br>
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd  = PFX(pixel_satd_4x8_neon);<br>
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd  = PFX(pixel_satd_4x16_neon);<br>
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd  = PFX(pixel_satd_8x4_neon);<br>
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd  = PFX(pixel_satd_8x8_neon);<br>
+    p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd = PFX(pixel_satd_12x16_neon);<br>
+Â Â Â Â <br>
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd  = PFX(pixel_satd_4x4_neon);<br>
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd  = PFX(pixel_satd_4x8_neon);<br>
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd  = PFX(pixel_satd_4x16_neon);<br>
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd  = PFX(pixel_satd_4x32_neon);<br>
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd  = PFX(pixel_satd_8x4_neon);<br>
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd  = PFX(pixel_satd_8x8_neon);<br>
+    p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd = PFX(pixel_satd_12x32_neon);<br>
+<br>
+Â Â Â Â p.pu[LUMA_4x4].pixelavg_pp[NONALIGNED]Â Â = PFX(pixel_avg_pp_4x4_neon);<br>
+Â Â Â Â p.pu[LUMA_4x8].pixelavg_pp[NONALIGNED]Â Â = PFX(pixel_avg_pp_4x8_neon);<br>
+Â Â Â Â p.pu[LUMA_4x16].pixelavg_pp[NONALIGNED]Â = PFX(pixel_avg_pp_4x16_neon);<br>
+Â Â Â Â p.pu[LUMA_8x4].pixelavg_pp[NONALIGNED]Â Â = PFX(pixel_avg_pp_8x4_neon);<br>
+Â Â Â Â p.pu[LUMA_8x8].pixelavg_pp[NONALIGNED]Â Â = PFX(pixel_avg_pp_8x8_neon);<br>
+Â Â Â Â p.pu[LUMA_8x16].pixelavg_pp[NONALIGNED]Â = PFX(pixel_avg_pp_8x16_neon);<br>
+Â Â Â Â p.pu[LUMA_8x32].pixelavg_pp[NONALIGNED]Â = PFX(pixel_avg_pp_8x32_neon);<br>
+<br>
+Â Â Â Â p.pu[LUMA_4x4].pixelavg_pp[ALIGNED]Â Â = PFX(pixel_avg_pp_4x4_neon);<br>
+Â Â Â Â p.pu[LUMA_4x8].pixelavg_pp[ALIGNED]Â Â = PFX(pixel_avg_pp_4x8_neon);<br>
+Â Â Â Â p.pu[LUMA_4x16].pixelavg_pp[ALIGNED]Â = PFX(pixel_avg_pp_4x16_neon);<br>
+Â Â Â Â p.pu[LUMA_8x4].pixelavg_pp[ALIGNED]Â Â = PFX(pixel_avg_pp_8x4_neon);<br>
+Â Â Â Â p.pu[LUMA_8x8].pixelavg_pp[ALIGNED]Â Â = PFX(pixel_avg_pp_8x8_neon);<br>
+Â Â Â Â p.pu[LUMA_8x16].pixelavg_pp[ALIGNED]Â = PFX(pixel_avg_pp_8x16_neon);<br>
+Â Â Â Â p.pu[LUMA_8x32].pixelavg_pp[ALIGNED]Â = PFX(pixel_avg_pp_8x32_neon);<br>
+<br>
+Â Â Â Â p.pu[LUMA_8x4].sad_x3Â Â = PFX(sad_x3_8x4_neon);<br>
+Â Â Â Â p.pu[LUMA_8x8].sad_x3Â Â = PFX(sad_x3_8x8_neon);<br>
+Â Â Â Â p.pu[LUMA_8x16].sad_x3Â = PFX(sad_x3_8x16_neon);<br>
+Â Â Â Â p.pu[LUMA_8x32].sad_x3Â = PFX(sad_x3_8x32_neon);<br>
+<br>
+Â Â Â Â p.pu[LUMA_8x4].sad_x4Â Â = PFX(sad_x4_8x4_neon);<br>
+Â Â Â Â p.pu[LUMA_8x8].sad_x4Â Â = PFX(sad_x4_8x8_neon);<br>
+Â Â Â Â p.pu[LUMA_8x16].sad_x4Â = PFX(sad_x4_8x16_neon);<br>
+Â Â Â Â p.pu[LUMA_8x32].sad_x4Â = PFX(sad_x4_8x32_neon);<br>
+<br>
+Â Â Â Â // quant<br>
+Â Â Â Â p.quant = PFX(quant_neon);<br>
+Â Â Â Â // luma_hps<br>
+    p.pu[LUMA_4x4].luma_hps  = PFX(interp_8tap_horiz_ps_4x4_neon);<br>
+    p.pu[LUMA_4x8].luma_hps  = PFX(interp_8tap_horiz_ps_4x8_neon);<br>
+    p.pu[LUMA_4x16].luma_hps = PFX(interp_8tap_horiz_ps_4x16_neon);<br>
+    p.pu[LUMA_8x4].luma_hps  = PFX(interp_8tap_horiz_ps_8x4_neon);<br>
+    p.pu[LUMA_8x8].luma_hps  = PFX(interp_8tap_horiz_ps_8x8_neon);<br>
+    p.pu[LUMA_8x16].luma_hps = PFX(interp_8tap_horiz_ps_8x16_neon);<br>
+    p.pu[LUMA_8x32].luma_hps = PFX(interp_8tap_horiz_ps_8x32_neon);<br>
+Â Â Â Â p.pu[LUMA_12x16].luma_hps = PFX(interp_8tap_horiz_ps_12x16_neon);<br>
+Â Â Â Â p.pu[LUMA_24x32].luma_hps = PFX(interp_8tap_horiz_ps_24x32_neon);<br>
+#if !AUTO_VECTORIZE || GCC_VERSION < GCC_5_1_0 /* gcc_version < gcc-5.1.0 */<br>
+    p.pu[LUMA_16x4].luma_hps = PFX(interp_8tap_horiz_ps_16x4_neon);<br>
+    p.pu[LUMA_16x8].luma_hps = PFX(interp_8tap_horiz_ps_16x8_neon);<br>
+Â Â Â Â p.pu[LUMA_16x12].luma_hps = PFX(interp_8tap_horiz_ps_16x12_neon);<br>
+Â Â Â Â p.pu[LUMA_16x16].luma_hps = PFX(interp_8tap_horiz_ps_16x16_neon);<br>
+Â Â Â Â p.pu[LUMA_16x32].luma_hps = PFX(interp_8tap_horiz_ps_16x32_neon);<br>
+Â Â Â Â p.pu[LUMA_16x64].luma_hps = PFX(interp_8tap_horiz_ps_16x64_neon);<br>
+    p.pu[LUMA_32x8].luma_hps = PFX(interp_8tap_horiz_ps_32x8_neon);<br>
+Â Â Â Â p.pu[LUMA_32x16].luma_hps = PFX(interp_8tap_horiz_ps_32x16_neon);<br>
+Â Â Â Â p.pu[LUMA_32x24].luma_hps = PFX(interp_8tap_horiz_ps_32x24_neon);<br>
+Â Â Â Â p.pu[LUMA_32x32].luma_hps = PFX(interp_8tap_horiz_ps_32x32_neon);<br>
+Â Â Â Â p.pu[LUMA_32x64].luma_hps = PFX(interp_8tap_horiz_ps_32x64_neon);<br>
+Â Â Â Â p.pu[LUMA_48x64].luma_hps = PFX(interp_8tap_horiz_ps_48x64_neon);<br>
+Â Â Â Â p.pu[LUMA_64x16].luma_hps = PFX(interp_8tap_horiz_ps_64x16_neon);<br>
+Â Â Â Â p.pu[LUMA_64x32].luma_hps = PFX(interp_8tap_horiz_ps_64x32_neon);<br>
+Â Â Â Â p.pu[LUMA_64x48].luma_hps = PFX(interp_8tap_horiz_ps_64x48_neon);<br>
+Â Â Â Â p.pu[LUMA_64x64].luma_hps = PFX(interp_8tap_horiz_ps_64x64_neon);<br>
+#endif<br>
+<br>
+    p.pu[LUMA_8x4].luma_hvpp  = interp_8tap_hv_pp_cpu<LUMA_8x4>;<br>
+    p.pu[LUMA_8x8].luma_hvpp  = interp_8tap_hv_pp_cpu<LUMA_8x8>;<br>
+    p.pu[LUMA_8x16].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_8x16>;<br>
+    p.pu[LUMA_8x32].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_8x32>;<br>
+Â Â Â Â p.pu[LUMA_12x16].luma_hvpp =Â interp_8tap_hv_pp_cpu<LUMA_12x16>;<br>
+#if !AUTO_VECTORIZE || GCC_VERSION < GCC_5_1_0 /* gcc_version < gcc-5.1.0 */<br>
+    p.pu[LUMA_16x4].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x4>;<br>
+    p.pu[LUMA_16x8].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x8>;<br>
+Â Â Â Â p.pu[LUMA_16x12].luma_hvpp =Â interp_8tap_hv_pp_cpu<LUMA_16x12>;<br>
+Â Â Â Â p.pu[LUMA_16x16].luma_hvpp =Â interp_8tap_hv_pp_cpu<LUMA_16x16>;<br>
+Â Â Â Â p.pu[LUMA_16x32].luma_hvpp =Â interp_8tap_hv_pp_cpu<LUMA_16x32>;<br>
+Â Â Â Â p.pu[LUMA_16x64].luma_hvpp =Â interp_8tap_hv_pp_cpu<LUMA_16x64>;<br>
+Â Â Â Â p.pu[LUMA_32x16].luma_hvpp =Â interp_8tap_hv_pp_cpu<LUMA_32x16>;<br>
+Â Â Â Â p.pu[LUMA_32x24].luma_hvpp =Â interp_8tap_hv_pp_cpu<LUMA_32x24>;<br>
+Â Â Â Â p.pu[LUMA_32x32].luma_hvpp =Â interp_8tap_hv_pp_cpu<LUMA_32x32>;<br>
+Â Â Â Â p.pu[LUMA_32x64].luma_hvpp =Â interp_8tap_hv_pp_cpu<LUMA_32x64>;<br>
+Â Â Â Â p.pu[LUMA_48x64].luma_hvpp =Â interp_8tap_hv_pp_cpu<LUMA_48x64>;<br>
+Â Â Â Â p.pu[LUMA_64x16].luma_hvpp =Â interp_8tap_hv_pp_cpu<LUMA_64x16>;<br>
+Â Â Â Â p.pu[LUMA_64x32].luma_hvpp =Â interp_8tap_hv_pp_cpu<LUMA_64x32>;<br>
+Â Â Â Â p.pu[LUMA_64x48].luma_hvpp =Â interp_8tap_hv_pp_cpu<LUMA_64x48>;<br>
+Â Â Â Â p.pu[LUMA_64x64].luma_hvpp =Â interp_8tap_hv_pp_cpu<LUMA_64x64>;<br>
+#if !AUTO_VECTORIZE || GCC_VERSION < GCC_4_9_0 /* gcc_version < gcc-4.9.0 */<br>
+    p.pu[LUMA_4x4].luma_hvpp  = interp_8tap_hv_pp_cpu<LUMA_4x4>;<br>
+    p.pu[LUMA_4x8].luma_hvpp  = interp_8tap_hv_pp_cpu<LUMA_4x8>;<br>
+    p.pu[LUMA_4x16].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_4x16>;<br>
+Â Â Â Â p.pu[LUMA_24x32].luma_hvpp =Â interp_8tap_hv_pp_cpu<LUMA_24x32>;<br>
+    p.pu[LUMA_32x8].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_32x8>;<br>
+#endif<br>
+#endif<br>
+<br>
+#if !HIGH_BIT_DEPTH<br>
+    p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_4x4_neon);<br>
+#endif // !HIGH_BIT_DEPTH<br>
+<br>
+Â Â }<br>
+}<br>
+} // namespace X265_NS<br>
diff --git a/source/common/aarch64/asm.S b/source/common/aarch64/asm.S<br>
new file mode 100644<br>
index 000000000..5f020a11a<br>
--- /dev/null<br>
+++ b/source/common/aarch64/asm.S<br>
@@ -0,0 +1,69 @@<br>
+/*****************************************************************************<br>
+ * Copyright (C) 2020 MulticoreWare, Inc<br>
+ *<br>
+ * Authors: Hongbin Liu <<a href="mailto:liuhongbin1@huawei.com" target="_blank">liuhongbin1@huawei.com</a>><br>
+ *<br>
+ * This program is free software; you can redistribute it and/or modify<br>
+ * it under the terms of the GNU General Public License as published by<br>
+ * the Free Software Foundation; either version 2 of the License, or<br>
+ * (at your option) any later version.<br>
+ *<br>
+ * This program is distributed in the hope that it will be useful,<br>
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of<br>
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the<br>
+ * GNU General Public License for more details.<br>
+ *<br>
+ * You should have received a copy of the GNU General Public License<br>
+ * along with this program; if not, write to the Free Software<br>
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.<br>
+ *<br>
+ * This program is also available under a commercial proprietary license.<br>
+ * For more information, contact us at license @ <a href="http://x265.com" rel="noreferrer" target="_blank">x265.com</a>.<br>
+ *****************************************************************************/<br>
+<br>
+.arch      armv8-a<br>
+<br>
+#ifdef PREFIX<br>
+#define EXTERN_ASM _<br>
+#else<br>
+#define EXTERN_ASM<br>
+#endif<br>
+<br>
+#ifdef __ELF__<br>
+#define ELF<br>
+#else<br>
+#define ELF @<br>
+#endif<br>
+<br>
+#define HAVE_AS_FUNC 1<br>
+<br>
+#if HAVE_AS_FUNC<br>
+#define FUNC<br>
+#else<br>
+#define FUNC @<br>
+#endif<br>
+<br>
+.macro function name, export=1<br>
+Â Â .macro endfunc<br>
+ELF   .size  \name, . - \name<br>
+FUNCÂ Â .endfunc<br>
+Â Â Â Â .purgem endfunc<br>
+Â Â .endm<br>
+    .align 2<br>
+.if \export == 1<br>
+Â Â Â Â .global EXTERN_ASM\name<br>
+ELFÂ Â Â .hidden EXTERN_ASM\name<br>
+ELF   .type  EXTERN_ASM\name, %function<br>
+FUNC  .func  EXTERN_ASM\name<br>
+EXTERN_ASM\name:<br>
+.else<br>
+ELFÂ Â Â .hidden \name<br>
+ELF   .type  \name, %function<br>
+FUNC  .func  \name<br>
+\name:<br>
+.endif<br>
+.endm<br>
+<br>
+<br>
+#define FENC_STRIDE 64<br>
+#define FDEC_STRIDE 32<br>
diff --git a/source/common/aarch64/ipfilter8.S b/source/common/aarch64/ipfilter8.S<br>
new file mode 100644<br>
index 000000000..908c7db46<br>
--- /dev/null<br>
+++ b/source/common/aarch64/ipfilter8.S<br>
@@ -0,0 +1,414 @@<br>
+/*****************************************************************************<br>
+ * Copyright (C) 2020 MulticoreWare, Inc<br>
+ *<br>
+ * Authors: Yimeng Su <<a href="mailto:yimeng.su@huawei.com" target="_blank">yimeng.su@huawei.com</a>><br>
+ *<br>
+ * This program is free software; you can redistribute it and/or modify<br>
+ * it under the terms of the GNU General Public License as published by<br>
+ * the Free Software Foundation; either version 2 of the License, or<br>
+ * (at your option) any later version.<br>
+ *<br>
+ * This program is distributed in the hope that it will be useful,<br>
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of<br>
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the<br>
+ * GNU General Public License for more details.<br>
+ *<br>
+ * You should have received a copy of the GNU General Public License<br>
+ * along with this program; if not, write to the Free Software<br>
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.<br>
+ *<br>
+ * This program is also available under a commercial proprietary license.<br>
+ * For more information, contact us at license @ <a href="http://x265.com" rel="noreferrer" target="_blank">x265.com</a>.<br>
+ *****************************************************************************/<br>
+<br>
+#include "asm.S"<br>
+<br>
+.section .rodata<br>
+<br>
+.align 4<br>
+<br>
+.text<br>
+<br>
+<br>
+<br>
+.macro qpel_filter_0_32b<br>
+  movi      v24.8h, #64<br>
+  uxtl      v19.8h, v5.8b<br>
+  smull      v17.4s, v19.4h, v24.4h<br>
+Â Â smull2Â Â Â Â Â v18.4s, v19.8h, v24.8h<br>
+.endm<br>
+<br>
+.macro qpel_filter_1_32b<br>
+  movi      v16.8h, #58<br>
+  uxtl      v19.8h, v5.8b<br>
+  smull      v17.4s, v19.4h, v16.4h<br>
+Â Â smull2Â Â Â Â Â v18.4s, v19.8h, v16.8h<br>
+<br>
+  movi      v24.8h, #10<br>
+  uxtl      v21.8h, v1.8b<br>
+  smull      v19.4s, v21.4h, v24.4h<br>
+Â Â smull2Â Â Â Â Â v20.4s, v21.8h, v24.8h<br>
+<br>
+  movi      v16.8h, #17<br>
+  uxtl      v23.8h, v2.8b<br>
+  smull      v21.4s, v23.4h, v16.4h<br>
+Â Â smull2Â Â Â Â Â v22.4s, v23.8h, v16.8h<br>
+<br>
+  movi      v24.8h, #5<br>
+  uxtl      v1.8h, v6.8b<br>
+  smull      v23.4s, v1.4h, v24.4h<br>
+Â Â smull2Â Â Â Â Â v16.4s, v1.8h, v24.8h<br>
+<br>
+  sub       v17.4s, v17.4s, v19.4s<br>
+  sub       v18.4s, v18.4s, v20.4s<br>
+<br>
+  uxtl      v1.8h, v4.8b<br>
+  sshll      v19.4s, v1.4h, #2<br>
+Â Â sshll2Â Â Â Â Â v20.4s, v1.8h, #2<br>
+<br>
+  add       v17.4s, v17.4s, v21.4s<br>
+  add       v18.4s, v18.4s, v22.4s<br>
+<br>
+  uxtl      v1.8h, v0.8b<br>
+  uxtl      v2.8h, v3.8b<br>
+  ssubl      v21.4s, v2.4h, v1.4h<br>
+Â Â ssubl2Â Â Â Â Â v22.4s, v2.8h, v1.8h<br>
+<br>
+  add       v17.4s, v17.4s, v19.4s<br>
+  add       v18.4s, v18.4s, v20.4s<br>
+  sub       v21.4s, v21.4s, v23.4s<br>
+  sub       v22.4s, v22.4s, v16.4s<br>
+  add       v17.4s, v17.4s, v21.4s<br>
+  add       v18.4s, v18.4s, v22.4s<br>
+.endm<br>
+<br>
+.macro qpel_filter_2_32b<br>
+  movi      v16.4s, #11<br>
+  uxtl      v19.8h, v5.8b<br>
+  uxtl      v20.8h, v2.8b<br>
+  saddl      v17.4s, v19.4h, v20.4h<br>
+Â Â saddl2Â Â Â Â Â v18.4s, v19.8h, v20.8h<br>
+<br>
+  uxtl      v21.8h, v1.8b<br>
+  uxtl      v22.8h, v6.8b<br>
+  saddl      v19.4s, v21.4h, v22.4h<br>
+Â Â saddl2Â Â Â Â Â v20.4s, v21.8h, v22.8h<br>
+<br>
+  mul       v19.4s, v19.4s, v16.4s<br>
+  mul       v20.4s, v20.4s, v16.4s<br>
+<br>
+  movi      v16.4s, #40<br>
+  mul       v17.4s, v17.4s, v16.4s<br>
+  mul       v18.4s, v18.4s, v16.4s<br>
+<br>
+  uxtl      v21.8h, v4.8b<br>
+  uxtl      v22.8h, v3.8b<br>
+  saddl      v23.4s, v21.4h, v22.4h<br>
+Â Â saddl2Â Â Â Â Â v16.4s, v21.8h, v22.8h<br>
+<br>
+  uxtl      v1.8h, v0.8b<br>
+  uxtl      v2.8h, v7.8b<br>
+  saddl      v21.4s, v1.4h, v2.4h<br>
+Â Â saddl2Â Â Â Â Â v22.4s, v1.8h, v2.8h<br>
+<br>
+  shl       v23.4s, v23.4s, #2<br>
+  shl       v16.4s, v16.4s, #2<br>
+<br>
+  add       v19.4s, v19.4s, v21.4s<br>
+  add       v20.4s, v20.4s, v22.4s<br>
+  add       v17.4s, v17.4s, v23.4s<br>
+  add       v18.4s, v18.4s, v16.4s<br>
+  sub       v17.4s, v17.4s, v19.4s<br>
+  sub       v18.4s, v18.4s, v20.4s<br>
+.endm<br>
+<br>
+.macro qpel_filter_3_32b<br>
+  movi      v16.8h, #17<br>
+  movi      v24.8h, #5<br>
+<br>
+  uxtl      v19.8h, v5.8b<br>
+  smull      v17.4s, v19.4h, v16.4h<br>
+Â Â smull2Â Â Â Â Â v18.4s, v19.8h, v16.8h<br>
+<br>
+  uxtl      v21.8h, v1.8b<br>
+  smull      v19.4s, v21.4h, v24.4h<br>
+Â Â smull2Â Â Â Â Â v20.4s, v21.8h, v24.8h<br>
+<br>
+  movi      v16.8h, #58<br>
+  uxtl      v23.8h, v2.8b<br>
+  smull      v21.4s, v23.4h, v16.4h<br>
+Â Â smull2Â Â Â Â Â v22.4s, v23.8h, v16.8h<br>
+<br>
+  movi      v24.8h, #10<br>
+  uxtl      v1.8h, v6.8b<br>
+  smull      v23.4s, v1.4h, v24.4h<br>
+Â Â smull2Â Â Â Â Â v16.4s, v1.8h, v24.8h<br>
+<br>
+  sub       v17.4s, v17.4s, v19.4s<br>
+  sub       v18.4s, v18.4s, v20.4s<br>
+<br>
+  uxtl      v1.8h, v3.8b<br>
+  sshll      v19.4s, v1.4h, #2<br>
+Â Â sshll2Â Â Â Â Â v20.4s, v1.8h, #2<br>
+<br>
+  add       v17.4s, v17.4s, v21.4s<br>
+  add       v18.4s, v18.4s, v22.4s<br>
+<br>
+  uxtl      v1.8h, v4.8b<br>
+  uxtl      v2.8h, v7.8b<br>
+  ssubl      v21.4s, v1.4h, v2.4h<br>
+Â Â ssubl2Â Â Â Â Â v22.4s, v1.8h, v2.8h<br>
+<br>
+  add       v17.4s, v17.4s, v19.4s<br>
+  add       v18.4s, v18.4s, v20.4s<br>
+  sub       v21.4s, v21.4s, v23.4s<br>
+  sub       v22.4s, v22.4s, v16.4s<br>
+  add       v17.4s, v17.4s, v21.4s<br>
+  add       v18.4s, v18.4s, v22.4s<br>
+.endm<br>
+<br>
+<br>
+<br>
+<br>
+.macro vextin8<br>
+Â Â ld1Â Â Â Â Â Â Â {v3.16b}, [x11], #16<br>
+  mov       v7.d[0], v3.d[1]<br>
+  ext       v0.8b, v3.8b, v7.8b, #1<br>
+  ext       v4.8b, v3.8b, v7.8b, #2<br>
+  ext       v1.8b, v3.8b, v7.8b, #3<br>
+  ext       v5.8b, v3.8b, v7.8b, #4<br>
+  ext       v2.8b, v3.8b, v7.8b, #5<br>
+  ext       v6.8b, v3.8b, v7.8b, #6<br>
+  ext       v3.8b, v3.8b, v7.8b, #7<br>
+.endm<br>
+<br>
+<br>
+<br>
+// void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)<br>
+.macro HPS_FILTER a b filterhps<br>
+  mov       w12, #8192<br>
+  mov       w6, w10<br>
+  sub       x3, x3, #\a<br>
+  lsl       x3, x3, #1<br>
+  mov       w9, #\a<br>
+  cmp       w9, #4<br>
+  b.eq      14f<br>
+  cmp       w9, #12<br>
+  b.eq      15f<br>
+  b        7f<br>
+14:<br>
+Â Â HPS_FILTER_4 \a \b \filterhps<br>
+  b        10f<br>
+15:<br>
+Â Â HPS_FILTER_12 \a \b \filterhps<br>
+  b        10f<br>
+7:<br>
+  cmp       w5, #0<br>
+  b.eq      8f<br>
+  cmp       w5, #1<br>
+  b.eq      9f<br>
+8:<br>
+loop1_hps_\filterhps\()_\a\()x\b\()_rowext0:<br>
+  mov       w7, #\a<br>
+  lsr       w7, w7, #3<br>
+  mov       x11, x0<br>
+  sub       x11, x11, #4<br>
+loop2_hps_\filterhps\()_\a\()x\b\()_rowext0:<br>
+Â Â vextin8<br>
+Â Â \filterhps<br>
+  dup       v16.4s, w12<br>
+  sub       v17.4s, v17.4s, v16.4s<br>
+  sub       v18.4s, v18.4s, v16.4s<br>
+  xtn       v0.4h, v17.4s<br>
+Â Â xtn2Â Â Â Â Â Â v0.8h, v18.4s<br>
+Â Â st1Â Â Â Â Â Â Â {v0.8h}, [x2], #16<br>
+  subs      w7, w7, #1<br>
+  sub       x11, x11, #8<br>
+  b.ne      loop2_hps_\filterhps\()_\a\()x\b\()_rowext0<br>
+  subs      w6, w6, #1<br>
+  add       x0, x0, x1<br>
+  add       x2, x2, x3<br>
+  b.ne      loop1_hps_\filterhps\()_\a\()x\b\()_rowext0<br>
+  b        10f<br>
+9:<br>
+loop3_hps_\filterhps\()_\a\()x\b\()_rowext1:<br>
+  mov       w7, #\a<br>
+  lsr       w7, w7, #3<br>
+  mov       x11, x0<br>
+  sub       x11, x11, #4<br>
+loop4_hps_\filterhps\()_\a\()x\b\()_rowext1:<br>
+Â Â vextin8<br>
+Â Â \filterhps<br>
+  dup       v16.4s, w12<br>
+  sub       v17.4s, v17.4s, v16.4s<br>
+  sub       v18.4s, v18.4s, v16.4s<br>
+  xtn       v0.4h, v17.4s<br>
+Â Â xtn2Â Â Â Â Â Â v0.8h, v18.4s<br>
+Â Â st1Â Â Â Â Â Â Â {v0.8h}, [x2], #16<br>
+  subs      w7, w7, #1<br>
+  sub       x11, x11, #8<br>
+  b.ne      loop4_hps_\filterhps\()_\a\()x\b\()_rowext1<br>
+  subs      w6, w6, #1<br>
+  add       x0, x0, x1<br>
+  add       x2, x2, x3<br>
+  b.ne      loop3_hps_\filterhps\()_\a\()x\b\()_rowext1<br>
+10:<br>
+.endm<br>
+<br>
+.macro HPS_FILTER_4 w h filterhps<br>
+  cmp       w5, #0<br>
+  b.eq      11f<br>
+  cmp       w5, #1<br>
+  b.eq      12f<br>
+11:<br>
+loop4_hps_\filterhps\()_\w\()x\h\()_rowext0:<br>
+  mov       x11, x0<br>
+  sub       x11, x11, #4<br>
+Â Â vextin8<br>
+Â Â \filterhps<br>
+  dup       v16.4s, w12<br>
+  sub       v17.4s, v17.4s, v16.4s<br>
+  xtn       v0.4h, v17.4s<br>
+Â Â st1Â Â Â Â Â Â Â {v0.4h}, [x2], #8<br>
+  sub       x11, x11, #8<br>
+  subs      w6, w6, #1<br>
+  add       x0, x0, x1<br>
+  add       x2, x2, x3<br>
+  b.ne      loop4_hps_\filterhps\()_\w\()x\h\()_rowext0<br>
+  b        13f<br>
+12:<br>
+loop5_hps_\filterhps\()_\w\()x\h\()_rowext1:<br>
+  mov       x11, x0<br>
+  sub       x11, x11, #4<br>
+Â Â vextin8<br>
+Â Â \filterhps<br>
+  dup       v16.4s, w12<br>
+  sub       v17.4s, v17.4s, v16.4s<br>
+  xtn       v0.4h, v17.4s<br>
+Â Â st1Â Â Â Â Â Â Â {v0.4h}, [x2], #8<br>
+  sub       x11, x11, #8<br>
+  subs      w6, w6, #1<br>
+  add       x0, x0, x1<br>
+  add       x2, x2, x3<br>
+  b.ne      loop5_hps_\filterhps\()_\w\()x\h\()_rowext1<br>
+13:<br>
+.endm<br>
+<br>
+.macro HPS_FILTER_12 w h filterhps<br>
+  cmp       w5, #0<br>
+  b.eq      14f<br>
+  cmp       w5, #1<br>
+  b.eq      15f<br>
+14:<br>
+loop12_hps_\filterhps\()_\w\()x\h\()_rowext0:<br>
+  mov       x11, x0<br>
+  sub       x11, x11, #4<br>
+Â Â vextin8<br>
+Â Â \filterhps<br>
+  dup       v16.4s, w12<br>
+  sub       v17.4s, v17.4s, v16.4s<br>
+  sub       v18.4s, v18.4s, v16.4s<br>
+  xtn       v0.4h, v17.4s<br>
+Â Â xtn2Â Â Â Â Â Â v0.8h, v18.4s<br>
+Â Â st1Â Â Â Â Â Â Â {v0.8h}, [x2], #16<br>
+  sub       x11, x11, #8<br>
+<br>
+Â Â vextin8<br>
+Â Â \filterhps<br>
+  dup       v16.4s, w12<br>
+  sub       v17.4s, v17.4s, v16.4s<br>
+  xtn       v0.4h, v17.4s<br>
+Â Â st1Â Â Â Â Â Â Â {v0.4h}, [x2], #8<br>
+  add       x2, x2, x3<br>
+  subs      w6, w6, #1<br>
+  add       x0, x0, x1<br>
+  b.ne      loop12_hps_\filterhps\()_\w\()x\h\()_rowext0<br>
+  b        16f<br>
+15:<br>
+loop12_hps_\filterhps\()_\w\()x\h\()_rowext1:<br>
+  mov       x11, x0<br>
+  sub       x11, x11, #4<br>
+Â Â vextin8<br>
+Â Â \filterhps<br>
+  dup       v16.4s, w12<br>
+  sub       v17.4s, v17.4s, v16.4s<br>
+  sub       v18.4s, v18.4s, v16.4s<br>
+  xtn       v0.4h, v17.4s<br>
+Â Â xtn2Â Â Â Â Â Â v0.8h, v18.4s<br>
+Â Â st1Â Â Â Â Â Â Â {v0.8h}, [x2], #16<br>
+  sub       x11, x11, #8<br>
+<br>
+Â Â vextin8<br>
+Â Â \filterhps<br>
+  dup       v16.4s, w12<br>
+  sub       v17.4s, v17.4s, v16.4s<br>
+  xtn       v0.4h, v17.4s<br>
+Â Â st1Â Â Â Â Â Â Â {v0.4h}, [x2], #8<br>
+  add       x2, x2, x3<br>
+  subs      w6, w6, #1<br>
+  add       x0, x0, x1<br>
+  b.ne      loop12_hps_\filterhps\()_\w\()x\h\()_rowext1<br>
+16:<br>
+.endm<br>
+<br>
+// void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)<br>
+.macro LUMA_HPS w h<br>
+function x265_interp_8tap_horiz_ps_\w\()x\h\()_neon<br>
+  mov       w10, #\h<br>
+  cmp       w5, #0<br>
+  b.eq      6f<br>
+  sub       x0, x0, x1, lsl #2<br>
+<br>
+  add       x0, x0, x1<br>
+  add       w10, w10, #7<br>
+6:<br>
+  cmp       w4, #0<br>
+  b.eq      0f<br>
+  cmp       w4, #1<br>
+  b.eq      1f<br>
+  cmp       w4, #2<br>
+  b.eq      2f<br>
+  cmp       w4, #3<br>
+  b.eq      3f<br>
+0:<br>
+Â Â HPS_FILTERÂ \w \h qpel_filter_0_32b<br>
+  b        5f<br>
+1:<br>
+Â Â HPS_FILTERÂ \w \h qpel_filter_1_32b<br>
+  b        5f<br>
+2:<br>
+Â Â HPS_FILTERÂ \w \h qpel_filter_2_32b<br>
+  b        5f<br>
+3:<br>
+Â Â HPS_FILTERÂ \w \h qpel_filter_3_32b<br>
+  b        5f<br>
+5:<br>
+Â Â ret<br>
+endfunc<br>
+.endm<br>
+<br>
+LUMA_HPSÂ Â 4 4<br>
+LUMA_HPSÂ Â 4 8<br>
+LUMA_HPSÂ Â 4 16<br>
+LUMA_HPSÂ Â 8 4<br>
+LUMA_HPSÂ Â 8 8<br>
+LUMA_HPSÂ Â 8 16<br>
+LUMA_HPSÂ Â 8 32<br>
+LUMA_HPSÂ Â 12 16<br>
+LUMA_HPSÂ Â 16 4<br>
+LUMA_HPSÂ Â 16 8<br>
+LUMA_HPSÂ Â 16 12<br>
+LUMA_HPSÂ Â 16 16<br>
+LUMA_HPSÂ Â 16 32<br>
+LUMA_HPSÂ Â 16 64<br>
+LUMA_HPSÂ Â 24 32<br>
+LUMA_HPSÂ Â 32 8<br>
+LUMA_HPSÂ Â 32 16<br>
+LUMA_HPSÂ Â 32 24<br>
+LUMA_HPSÂ Â 32 32<br>
+LUMA_HPSÂ Â 32 64<br>
+LUMA_HPSÂ Â 48 64<br>
+LUMA_HPSÂ Â 64 16<br>
+LUMA_HPSÂ Â 64 32<br>
+LUMA_HPSÂ Â 64 48<br>
+LUMA_HPSÂ Â 64 64<br>
diff --git a/source/common/aarch64/ipfilter8.h b/source/common/aarch64/ipfilter8.h<br>
new file mode 100644<br>
index 000000000..f9ed91e2e<br>
--- /dev/null<br>
+++ b/source/common/aarch64/ipfilter8.h<br>
@@ -0,0 +1,55 @@<br>
+/*****************************************************************************<br>
+ * Copyright (C) 2020 MulticoreWare, Inc<br>
+ *<br>
+ * Authors: Yimeng Su <<a href="mailto:yimeng.su@huawei.com" target="_blank">yimeng.su@huawei.com</a>><br>
+ *<br>
+ * This program is free software; you can redistribute it and/or modify<br>
+ * it under the terms of the GNU General Public License as published by<br>
+ * the Free Software Foundation; either version 2 of the License, or<br>
+ * (at your option) any later version.<br>
+ *<br>
+ * This program is distributed in the hope that it will be useful,<br>
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of<br>
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the<br>
+ * GNU General Public License for more details.<br>
+ *<br>
+ * You should have received a copy of the GNU General Public License<br>
+ * along with this program; if not, write to the Free Software<br>
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.<br>
+ *<br>
+ * This program is also available under a commercial proprietary license.<br>
+ * For more information, contact us at license @ <a href="http://x265.com" rel="noreferrer" target="_blank">x265.com</a>.<br>
+ *****************************************************************************/<br>
+<br>
+#ifndef X265_IPFILTER8_AARCH64_H<br>
+#define X265_IPFILTER8_AARCH64_H<br>
+<br>
+<br>
+void x265_interp_8tap_horiz_ps_4x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_4x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_4x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_8x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_8x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_8x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_8x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_12x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_16x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_16x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_16x12_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_16x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_16x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_16x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_24x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_32x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_32x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_32x24_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_32x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_32x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_48x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_64x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_64x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_64x48_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+void x265_interp_8tap_horiz_ps_64x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>
+<br>
+<br>
+#endif // ifndef X265_IPFILTER8_AARCH64_H<br>
diff --git a/source/common/aarch64/mc-a.S b/source/common/aarch64/mc-a.S<br>
new file mode 100644<br>
index 000000000..cbaf9b501<br>
--- /dev/null<br>
+++ b/source/common/aarch64/mc-a.S<br>
@@ -0,0 +1,63 @@<br>
+/*****************************************************************************<br>
+ * Copyright (C) 2020 MulticoreWare, Inc<br>
+ *<br>
+ * Authors: Hongbin Liu <<a href="mailto:liuhongbin1@huawei.com" target="_blank">liuhongbin1@huawei.com</a>><br>
+ *<br>
+ * This program is free software; you can redistribute it and/or modify<br>
+ * it under the terms of the GNU General Public License as published by<br>
+ * the Free Software Foundation; either version 2 of the License, or<br>
+ * (at your option) any later version.<br>
+ *<br>
+ * This program is distributed in the hope that it will be useful,<br>
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of<br>
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the<br>
+ * GNU General Public License for more details.<br>
+ *<br>
+ * You should have received a copy of the GNU General Public License<br>
+ * along with this program; if not, write to the Free Software<br>
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.<br>
+ *<br>
+ * This program is also available under a commercial proprietary license.<br>
+ * For more information, contact us at license @ <a href="http://x265.com" rel="noreferrer" target="_blank">x265.com</a>.<br>
+ *****************************************************************************/<br>
+<br>
+#include "asm.S"<br>
+<br>
+.section .rodata<br>
+<br>
+.align 4<br>
+<br>
+.text<br>
+<br>
+.macro pixel_avg_pp_4xN_neon h<br>
+function x265_pixel_avg_pp_4x\h\()_neon<br>
+.rept \h<br>
+Â Â ld1Â Â Â Â Â Â Â {v0.s}[0], [x2], x3<br>
+Â Â ld1Â Â Â Â Â Â Â {v1.s}[0], [x4], x5<br>
+  urhadd     v2.8b, v0.8b, v1.8b<br>
+Â Â st1Â Â Â Â Â Â Â {v2.s}[0], [x0], x1<br>
+.endr<br>
+Â Â ret<br>
+endfunc<br>
+.endm<br>
+<br>
+pixel_avg_pp_4xN_neon 4<br>
+pixel_avg_pp_4xN_neon 8<br>
+pixel_avg_pp_4xN_neon 16<br>
+<br>
+.macro pixel_avg_pp_8xN_neon h<br>
+function x265_pixel_avg_pp_8x\h\()_neon<br>
+.rept \h<br>
+Â Â ld1Â Â Â Â Â Â Â {v0.8b}, [x2], x3<br>
+Â Â ld1Â Â Â Â Â Â Â {v1.8b}, [x4], x5<br>
+  urhadd     v2.8b, v0.8b, v1.8b<br>
+Â Â st1Â Â Â Â Â Â Â {v2.8b}, [x0], x1<br>
+.endr<br>
+Â Â ret<br>
+endfunc<br>
+.endm<br>
+<br>
+pixel_avg_pp_8xN_neon 4<br>
+pixel_avg_pp_8xN_neon 8<br>
+pixel_avg_pp_8xN_neon 16<br>
+pixel_avg_pp_8xN_neon 32<br>
diff --git a/source/common/aarch64/pixel-util.S b/source/common/aarch64/pixel-util.S<br>
new file mode 100644<br>
index 000000000..a085ebdfa<br>
--- /dev/null<br>
+++ b/source/common/aarch64/pixel-util.S<br>
@@ -0,0 +1,419 @@<br>
+/*****************************************************************************<br>
+ * Copyright (C) 2020 MulticoreWare, Inc<br>
+ *<br>
+ * Authors: Yimeng Su <<a href="mailto:yimeng.su@huawei.com" target="_blank">yimeng.su@huawei.com</a>><br>
+ *          Hongbin Liu <<a href="mailto:liuhongbin1@huawei.com" target="_blank">liuhongbin1@huawei.com</a>><br>
+ *<br>
+ * This program is free software; you can redistribute it and/or modify<br>
+ * it under the terms of the GNU General Public License as published by<br>
+ * the Free Software Foundation; either version 2 of the License, or<br>
+ * (at your option) any later version.<br>
+ *<br>
+ * This program is distributed in the hope that it will be useful,<br>
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of<br>
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the<br>
+ * GNU General Public License for more details.<br>
+ *<br>
+ * You should have received a copy of the GNU General Public License<br>
+ * along with this program; if not, write to the Free Software<br>
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.<br>
+ *<br>
+ * This program is also available under a commercial proprietary license.<br>
+ * For more information, contact us at license @ <a href="http://x265.com" rel="noreferrer" target="_blank">x265.com</a>.<br>
+ *****************************************************************************/<br>
+<br>
+#include "asm.S"<br>
+<br>
+.section .rodata<br>
+<br>
+.align 4<br>
+<br>
+.text<br>
+<br>
+.macro x265_satd_4x8_8x4_end_neon<br>
+  add       v0.8h, v4.8h, v6.8h<br>
+  add       v1.8h, v5.8h, v7.8h<br>
+  sub       v2.8h, v4.8h, v6.8h<br>
+  sub       v3.8h, v5.8h, v7.8h<br>
+<br>
+Â Â trn1Â Â Â Â Â Â v16.8h, v0.8h, v1.8h<br>
+Â Â trn2Â Â Â Â Â Â v17.8h, v0.8h, v1.8h<br>
+  add       v4.8h, v16.8h, v17.8h<br>
+Â Â trn1Â Â Â Â Â Â v18.8h, v2.8h, v3.8h<br>
+Â Â trn2Â Â Â Â Â Â v19.8h, v2.8h, v3.8h<br>
+  sub       v5.8h, v16.8h, v17.8h<br>
+  add       v6.8h, v18.8h, v19.8h<br>
+  sub       v7.8h, v18.8h, v19.8h<br>
+Â Â trn1Â Â Â Â Â Â v0.4s, v4.4s, v6.4s<br>
+Â Â trn2Â Â Â Â Â Â v2.4s, v4.4s, v6.4s<br>
+  abs       v0.8h, v0.8h<br>
+Â Â trn1Â Â Â Â Â Â v1.4s, v5.4s, v7.4s<br>
+Â Â trn2Â Â Â Â Â Â v3.4s, v5.4s, v7.4s<br>
+  abs       v2.8h, v2.8h<br>
+  abs       v1.8h, v1.8h<br>
+  abs       v3.8h, v3.8h<br>
+  umax      v0.8h, v0.8h, v2.8h<br>
+  umax      v1.8h, v1.8h, v3.8h<br>
+  add       v0.8h, v0.8h, v1.8h<br>
+  uaddlv     s0, v0.8h<br>
+.endm<br>
+<br>
+.macro pixel_satd_4x8_neon<br>
+  ld1r       {v1.2s}, [x2], x3<br>
+  ld1r      {v0.2s}, [x0], x1<br>
+  ld1r      {v3.2s}, [x2], x3<br>
+  ld1r      {v2.2s}, [x0], x1<br>
+  ld1r      {v5.2s}, [x2], x3<br>
+  ld1r      {v4.2s}, [x0], x1<br>
+  ld1r      {v7.2s}, [x2], x3<br>
+  ld1r      {v6.2s}, [x0], x1<br>
+<br>
+Â Â ld1Â Â Â Â Â Â Â {v1.s}[1], [x2], x3<br>
+Â Â ld1Â Â Â Â Â Â Â {v0.s}[1], [x0], x1<br>
+  usubl      v0.8h, v0.8b, v1.8b<br>
+Â Â ld1Â Â Â Â Â Â Â {v3.s}[1], [x2], x3<br>
+Â Â ld1Â Â Â Â Â Â Â {v2.s}[1], [x0], x1<br>
+  usubl      v1.8h, v2.8b, v3.8b<br>
+Â Â ld1Â Â Â Â Â Â Â {v5.s}[1], [x2], x3<br>
+Â Â ld1Â Â Â Â Â Â Â {v4.s}[1], [x0], x1<br>
+  usubl      v2.8h, v4.8b, v5.8b<br>
+Â Â ld1Â Â Â Â Â Â Â {v7.s}[1], [x2], x3<br>
+  add       v4.8h, v0.8h, v1.8h<br>
+  sub       v5.8h, v0.8h, v1.8h<br>
+Â Â ld1Â Â Â Â Â Â Â {v6.s}[1], [x0], x1<br>
+  usubl      v3.8h, v6.8b, v7.8b<br>
+  add     v6.8h, v2.8h, v3.8h<br>
+  sub     v7.8h, v2.8h, v3.8h<br>
+Â Â x265_satd_4x8_8x4_end_neon<br>
+.endm<br>
+<br>
+// template<int w, int h><br>
+// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)<br>
+function x265_pixel_satd_4x8_neon<br>
+Â Â pixel_satd_4x8_neon<br>
+  mov        w0, v0.s[0]<br>
+Â Â ret<br>
+endfunc<br>
+<br>
+// template<int w, int h><br>
+// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)<br>
+function x265_pixel_satd_4x16_neon<br>
+  eor       w4, w4, w4<br>
+Â Â pixel_satd_4x8_neon<br>
+  mov        w5, v0.s[0]<br>
+  add       w4, w4, w5<br>
+Â Â pixel_satd_4x8_neon<br>
+  mov        w5, v0.s[0]<br>
+  add       w0, w5, w4<br>
+Â Â ret<br>
+endfunc<br>
+<br>
+// template<int w, int h><br>
+// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)<br>
+function x265_pixel_satd_4x32_neon<br>
+  eor       w4, w4, w4<br>
+.rept 4<br>
+Â Â pixel_satd_4x8_neon<br>
+  mov       w5, v0.s[0]<br>
+  add       w4, w4, w5<br>
+.endr<br>
+  mov       w0, w4<br>
+Â Â ret<br>
+endfunc<br>
+<br>
+// template<int w, int h><br>
+// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)<br>
+function x265_pixel_satd_12x16_neon<br>
+  mov       x4, x0<br>
+  mov       x5, x2<br>
+  eor       w7, w7, w7<br>
+Â Â pixel_satd_4x8_neon<br>
+  mov       w6, v0.s[0]<br>
+  add       w7, w7, w6<br>
+Â Â pixel_satd_4x8_neon<br>
+  mov       w6, v0.s[0]<br>
+  add       w7, w7, w6<br>
+<br>
+  add       x0, x4, #4<br>
+  add       x2, x5, #4<br>
+Â Â pixel_satd_4x8_neon<br>
+  mov       w6, v0.s[0]<br>
+  add       w7, w7, w6<br>
+Â Â pixel_satd_4x8_neon<br>
+  mov       w6, v0.s[0]<br>
+  add       w7, w7, w6<br>
+<br>
+  add       x0, x4, #8<br>
+  add       x2, x5, #8<br>
+Â Â pixel_satd_4x8_neon<br>
+  mov       w6, v0.s[0]<br>
+  add       w7, w7, w6<br>
+Â Â pixel_satd_4x8_neon<br>
+  mov       w6, v0.s[0]<br>
+  add       w0, w7, w6<br>
+Â Â ret<br>
+endfunc<br>
+<br>
+// template<int w, int h><br>
+// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)<br>
+function x265_pixel_satd_12x32_neon<br>
+  mov       x4, x0<br>
+  mov       x5, x2<br>
+  eor       w7, w7, w7<br>
+.rept 4<br>
+Â Â pixel_satd_4x8_neon<br>
+  mov       w6, v0.s[0]<br>
+  add       w7, w7, w6<br>
+.endr<br>
+<br>
+  add       x0, x4, #4<br>
+  add       x2, x5, #4<br>
+.rept 4<br>
+Â Â pixel_satd_4x8_neon<br>
+  mov       w6, v0.s[0]<br>
+  add       w7, w7, w6<br>
+.endr<br>
+<br>
+  add       x0, x4, #8<br>
+  add       x2, x5, #8<br>
+.rept 4<br>
+Â Â pixel_satd_4x8_neon<br>
+  mov       w6, v0.s[0]<br>
+  add       w7, w7, w6<br>
+.endr<br>
+<br>
+  mov       w0, w7<br>
+Â Â ret<br>
+endfunc<br>
+<br>
+// template<int w, int h><br>
+// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)<br>
+function x265_pixel_satd_8x8_neon<br>
+  eor       w4, w4, w4<br>
+  mov       x6, x0<br>
+  mov       x7, x2<br>
+Â Â pixel_satd_4x8_neon<br>
+  mov       w5, v0.s[0]<br>
+  add       w4, w4, w5<br>
+  add       x0, x6, #4<br>
+  add       x2, x7, #4<br>
+Â Â pixel_satd_4x8_neon<br>
+  mov       w5, v0.s[0]<br>
+  add       w0, w4, w5<br>
+Â Â ret<br>
+endfunc<br>
+<br>
+// int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)<br>
+function x265_psyCost_4x4_neon<br>
+  ld1r      {v4.2s}, [x0], x1<br>
+  ld1r      {v5.2s}, [x0], x1<br>
+Â Â ld1Â Â Â Â Â Â Â {v4.s}[1], [x0], x1<br>
+Â Â ld1Â Â Â Â Â Â Â {v5.s}[1], [x0], x1<br>
+<br>
+  ld1r      {v6.2s}, [x2], x3<br>
+  ld1r      {v7.2s}, [x2], x3<br>
+Â Â ld1Â Â Â Â Â Â Â {v6.s}[1], [x2], x3<br>
+Â Â ld1Â Â Â Â Â Â Â {v7.s}[1], [x2], x3<br>
+<br>
+  uaddl      v2.8h, v4.8b, v5.8b<br>
+  usubl      v3.8h, v4.8b, v5.8b<br>
+  uaddl      v18.8h, v6.8b, v7.8b<br>
+  usubl      v19.8h, v6.8b, v7.8b<br>
+<br>
+  mov       v20.d[0], v2.d[1]<br>
+  add       v0.4h, v2.4h, v20.4h<br>
+  sub       v1.4h, v2.4h, v20.4h<br>
+  mov       v21.d[0], v3.d[1]<br>
+  add       v22.4h, v3.4h, v21.4h<br>
+  sub       v23.4h, v3.4h, v21.4h<br>
+<br>
+  mov       v24.d[0], v18.d[1]<br>
+  add       v16.4h, v18.4h, v24.4h<br>
+  sub       v17.4h, v18.4h, v24.4h<br>
+  mov       v25.d[0], v19.d[1]<br>
+  add       v26.4h, v19.4h, v25.4h<br>
+  sub       v27.4h, v19.4h, v25.4h<br>
+<br>
+  mov       v0.d[1], v22.d[0]<br>
+  mov       v1.d[1], v23.d[0]<br>
+Â Â trn1Â Â Â Â Â Â v22.8h, v0.8h, v1.8h<br>
+Â Â trn2Â Â Â Â Â Â v23.8h, v0.8h, v1.8h<br>
+  mov       v16.d[1], v26.d[0]<br>
+  mov       v17.d[1], v27.d[0]<br>
+Â Â trn1Â Â Â Â Â Â v26.8h, v16.8h, v17.8h<br>
+Â Â trn2Â Â Â Â Â Â v27.8h, v16.8h, v17.8h<br>
+<br>
+  add       v2.8h, v22.8h, v23.8h<br>
+  sub       v3.8h, v22.8h, v23.8h<br>
+  add       v18.8h, v26.8h, v27.8h<br>
+  sub       v19.8h, v26.8h, v27.8h<br>
+<br>
+  uaddl      v20.8h, v4.8b, v5.8b<br>
+  uaddl      v21.8h, v6.8b, v7.8b<br>
+<br>
+Â Â trn1Â Â Â Â Â Â v0.4s, v2.4s, v3.4s<br>
+Â Â trn2Â Â Â Â Â Â v1.4s, v2.4s, v3.4s<br>
+Â Â trn1Â Â Â Â Â Â v16.4s, v18.4s, v19.4s<br>
+Â Â trn2Â Â Â Â Â Â v17.4s, v18.4s, v19.4s<br>
+  abs       v0.8h, v0.8h<br>
+  abs       v16.8h, v16.8h<br>
+  abs       v1.8h, v1.8h<br>
+  abs       v17.8h, v17.8h<br>
+<br>
+  uaddlv     s20, v20.8h<br>
+  uaddlv     s21, v21.8h<br>
+  mov       v20.s[1], v21.s[0]<br>
+<br>
+  smax      v0.8h, v0.8h, v1.8h<br>
+  smax      v16.8h, v16.8h, v17.8h<br>
+<br>
+Â Â trn1Â Â Â Â Â Â v4.2d, v0.2d, v16.2d<br>
+Â Â trn2Â Â Â Â Â Â v5.2d, v0.2d, v16.2d<br>
+  add       v0.8h, v4.8h, v5.8h<br>
+  mov       v4.d[0], v0.d[1]<br>
+  uaddlv     s0, v0.4h<br>
+  uaddlv     s4, v4.4h<br>
+<br>
+  ushr      v20.2s, v20.2s, #2<br>
+  mov       v0.s[1], v4.s[0]<br>
+  sub       v0.2s, v0.2s, v20.2s<br>
+  mov       w0, v0.s[0]<br>
+  mov       w1, v0.s[1]<br>
+  subs      w0, w0, w1<br>
+  cneg      w0, w0, mi<br>
+<br>
+Â Â ret<br>
+endfunc<br>
+<br>
+// uint32_t quant_c(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)<br>
+function x265_quant_neon<br>
+  mov       w9, #1<br>
+  lsl       w9, w9, w4<br>
+  dup       v0.2s, w9<br>
+  neg       w9, w4<br>
+  dup       v1.4s, w9<br>
+  add       w9, w9, #8<br>
+  dup       v2.4s, w9<br>
+  dup       v3.4s, w5<br>
+<br>
+  lsr       w6, w6, #2<br>
+  eor       v4.16b, v4.16b, v4.16b<br>
+  eor       w10, w10, w10<br>
+  eor       v17.16b, v17.16b, v17.16b<br>
+<br>
+.loop_quant:<br>
+<br>
+Â Â ld1Â Â Â Â Â Â Â {v18.4h}, [x0], #8<br>
+Â Â ld1Â Â Â Â Â Â Â {v7.4s}, [x1], #16<br>
+  sxtl      v6.4s, v18.4h<br>
+<br>
+  cmlt      v5.4s, v6.4s, #0<br>
+<br>
+  abs       v6.4s, v6.4s<br>
+<br>
+<br>
+  mul       v6.4s, v6.4s, v7.4s<br>
+<br>
+  add       v7.4s, v6.4s, v3.4s<br>
+  sshl      v7.4s, v7.4s, v1.4s<br>
+<br>
+  mls       v6.4s, v7.4s, v0.s[0]<br>
+  sshl      v16.4s, v6.4s, v2.4s<br>
+Â Â st1Â Â Â Â Â Â Â {v16.4s}, [x2], #16<br>
+<br>
+  // numsig<br>
+  cmeq      v16.4s, v7.4s, v17.4s<br>
+  add       v4.4s, v4.4s, v16.4s<br>
+  add       w10, w10, #4<br>
+<br>
+  // level *= sign<br>
+  eor       v16.16b, v7.16b, v5.16b<br>
+  sub       v16.4s, v16.4s, v5.4s<br>
+  sqxtn      v5.4h, v16.4s<br>
+Â Â st1Â Â Â Â Â Â Â {v5.4h}, [x3], #8<br>
+<br>
+  subs      w6, w6, #1<br>
+  b.ne      .loop_quant<br>
+<br>
+  addv      s4, v4.4s<br>
+  mov       w9, v4.s[0]<br>
+  add       w0, w10, w9<br>
+Â Â ret<br>
+endfunc<br>
+<br>
+.macro satd_4x4_neon<br>
+Â Â ld1Â Â Â Â Â Â Â {v1.s}[0], [x2], x3<br>
+Â Â ld1Â Â Â Â Â Â Â {v0.s}[0], [x0], x1<br>
+Â Â ld1Â Â Â Â Â Â Â {v3.s}[0], [x2], x3<br>
+Â Â ld1Â Â Â Â Â Â Â {v2.s}[0], [x0], x1<br>
+<br>
+Â Â ld1Â Â Â Â Â Â Â {v1.s}[1], [x2], x3<br>
+Â Â ld1Â Â Â Â Â Â Â {v0.s}[1], [x0], x1<br>
+Â Â ld1Â Â Â Â Â Â Â {v3.s}[1], [x2], x3<br>
+Â Â ld1Â Â Â Â Â Â Â {v2.s}[1], [x0], x1<br>
+<br>
+  usubl      v4.8h, v0.8b, v1.8b<br>
+  usubl      v5.8h, v2.8b, v3.8b<br>
+<br>
+  add       v6.8h, v4.8h, v5.8h<br>
+  sub       v7.8h, v4.8h, v5.8h<br>
+<br>
+  mov       v4.d[0], v6.d[1]<br>
+  add       v0.8h, v6.8h, v4.8h<br>
+  sub       v2.8h, v6.8h, v4.8h<br>
+<br>
+  mov       v5.d[0], v7.d[1]<br>
+  add       v1.8h, v7.8h, v5.8h<br>
+  sub       v3.8h, v7.8h, v5.8h<br>
+<br>
+Â Â trn1Â Â Â Â Â Â v4.4h, v0.4h, v1.4h<br>
+Â Â trn2Â Â Â Â Â Â v5.4h, v0.4h, v1.4h<br>
+<br>
+Â Â trn1Â Â Â Â Â Â v6.4h, v2.4h, v3.4h<br>
+Â Â trn2Â Â Â Â Â Â v7.4h, v2.4h, v3.4h<br>
+<br>
+  add       v0.4h, v4.4h, v5.4h<br>
+  sub       v1.4h, v4.4h, v5.4h<br>
+<br>
+  add       v2.4h, v6.4h, v7.4h<br>
+  sub       v3.4h, v6.4h, v7.4h<br>
+<br>
+Â Â trn1Â Â Â Â Â Â v4.2s, v0.2s, v1.2s<br>
+Â Â trn2Â Â Â Â Â Â v5.2s, v0.2s, v1.2s<br>
+<br>
+Â Â trn1Â Â Â Â Â Â v6.2s, v2.2s, v3.2s<br>
+Â Â trn2Â Â Â Â Â Â v7.2s, v2.2s, v3.2s<br>
+<br>
+  abs       v4.4h, v4.4h<br>
+  abs       v5.4h, v5.4h<br>
+  abs       v6.4h, v6.4h<br>
+  abs       v7.4h, v7.4h<br>
+<br>
+  smax      v1.4h, v4.4h, v5.4h<br>
+  smax      v2.4h, v6.4h, v7.4h<br>
+<br>
+  add       v0.4h, v1.4h, v2.4h<br>
+  uaddlp     v0.2s, v0.4h<br>
+  uaddlp     v0.1d, v0.2s<br>
+.endm<br>
+<br>
+// int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)<br>
+function x265_pixel_satd_4x4_neon<br>
+Â Â satd_4x4_neon<br>
+  umov      x0, v0.d[0]<br>
+Â Â ret<br>
+endfunc<br>
+<br>
+// int satd_8x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)<br>
+function x265_pixel_satd_8x4_neon<br>
+  mov       x4, x0<br>
+  mov       x5, x2<br>
+Â Â satd_4x4_neon<br>
+  add       x0, x4, #4<br>
+  add       x2, x5, #4<br>
+  umov      x6, v0.d[0]<br>
+Â Â satd_4x4_neon<br>
+  umov      x0, v0.d[0]<br>
+  add       x0, x0, x6<br>
+Â Â ret<br>
+endfunc<br>
diff --git a/source/common/aarch64/pixel-util.h b/source/common/aarch64/pixel-util.h<br>
new file mode 100644<br>
index 000000000..043488468<br>
--- /dev/null<br>
+++ b/source/common/aarch64/pixel-util.h<br>
@@ -0,0 +1,40 @@<br>
+/*****************************************************************************<br>
+ * Copyright (C) 2020 MulticoreWare, Inc<br>
+ *<br>
+ * Authors: Yimeng Su <<a href="mailto:yimeng.su@huawei.com" target="_blank">yimeng.su@huawei.com</a>><br>
+ *          Hongbin Liu <<a href="mailto:liuhongbin1@huawei.com" target="_blank">liuhongbin1@huawei.com</a>><br>
+ *<br>
+ * This program is free software; you can redistribute it and/or modify<br>
+ * it under the terms of the GNU General Public License as published by<br>
+ * the Free Software Foundation; either version 2 of the License, or<br>
+ * (at your option) any later version.<br>
+ *<br>
+ * This program is distributed in the hope that it will be useful,<br>
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of<br>
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the<br>
+ * GNU General Public License for more details.<br>
+ *<br>
+ * You should have received a copy of the GNU General Public License<br>
+ * along with this program; if not, write to the Free Software<br>
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.<br>
+ *<br>
+ * This program is also available under a commercial proprietary license.<br>
+ * For more information, contact us at license @ <a href="http://x265.com" rel="noreferrer" target="_blank">x265.com</a>.<br>
+ *****************************************************************************/<br>
+<br>
+#ifndef X265_PIXEL_UTIL_AARCH64_H<br>
+#define X265_PIXEL_UTIL_AARCH64_H<br>
+<br>
+int x265_pixel_satd_4x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);<br>
+int x265_pixel_satd_4x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);<br>
+int x265_pixel_satd_4x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);<br>
+int x265_pixel_satd_4x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);<br>
+int x265_pixel_satd_8x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);<br>
+int x265_pixel_satd_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);<br>
+int x265_pixel_satd_12x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);<br>
+int x265_pixel_satd_12x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);<br>
+<br>
+uint32_t x265_quant_neon(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);<br>
+int PFX(psyCost_4x4_neon)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);<br>
+<br>
+#endif // ifndef X265_PIXEL_UTIL_AARCH64_H<br>
diff --git a/source/common/aarch64/pixel.h b/source/common/aarch64/pixel.h<br>
new file mode 100644<br>
index 000000000..179c2f4ec<br>
--- /dev/null<br>
+++ b/source/common/aarch64/pixel.h<br>
@@ -0,0 +1,105 @@<br>
+/*****************************************************************************<br>
+ * Copyright (C) 2020 MulticoreWare, Inc<br>
+ *<br>
+ * Authors: Hongbin Liu <<a href="mailto:liuhongbin1@huawei.com" target="_blank">liuhongbin1@huawei.com</a>><br>
+ *<br>
+ * This program is free software; you can redistribute it and/or modify<br>
+ * it under the terms of the GNU General Public License as published by<br>
+ * the Free Software Foundation; either version 2 of the License, or<br>
+ * (at your option) any later version.<br>
+ *<br>
+ * This program is distributed in the hope that it will be useful,<br>
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of<br>
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the<br>
+ * GNU General Public License for more details.<br>
+ *<br>
+ * You should have received a copy of the GNU General Public License<br>
+ * along with this program; if not, write to the Free Software<br>
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.<br>
+ *<br>
+ * This program is also available under a commercial proprietary license.<br>
+ * For more information, contact us at license @ <a href="http://x265.com" rel="noreferrer" target="_blank">x265.com</a>.<br>
+ *****************************************************************************/<br>
+<br>
+#ifndef X265_I386_PIXEL_AARCH64_H<br>
+#define X265_I386_PIXEL_AARCH64_H<br>
+<br>
+void x265_pixel_avg_pp_4x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_4x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_4x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_8x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_8x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_8x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_8x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_12x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_16x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_16x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_16x12_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_16x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_16x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_16x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_24x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_32x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_32x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_32x24_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_32x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_32x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_48x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_64x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_64x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_64x48_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+void x265_pixel_avg_pp_64x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>
+<br>
+void x265_sad_x3_4x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_4x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_4x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_8x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_8x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_8x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_8x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_12x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_16x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_16x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_16x12_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_16x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_16x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_16x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_24x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_32x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_32x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_32x24_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_32x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_32x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_48x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_64x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_64x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_64x48_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x3_64x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>
+<br>
+void x265_sad_x4_4x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_4x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_4x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_8x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_8x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_8x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_8x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_12x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_16x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_16x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_16x12_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_16x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_16x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_16x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_24x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_32x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_32x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_32x24_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_32x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_32x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_48x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_64x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_64x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_64x48_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+void x265_sad_x4_64x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>
+<br>
+#endif // ifndef X265_I386_PIXEL_AARCH64_H<br>
diff --git a/source/common/aarch64/sad-a.S b/source/common/aarch64/sad-a.S<br>
new file mode 100644<br>
index 000000000..c27cce5ce<br>
--- /dev/null<br>
+++ b/source/common/aarch64/sad-a.S<br>
@@ -0,0 +1,105 @@<br>
+/*****************************************************************************<br>
+ * Copyright (C) 2020 MulticoreWare, Inc<br>
+ *<br>
+ * Authors: Hongbin Liu <<a href="mailto:liuhongbin1@huawei.com" target="_blank">liuhongbin1@huawei.com</a>><br>
+ *<br>
+ * This program is free software; you can redistribute it and/or modify<br>
+ * it under the terms of the GNU General Public License as published by<br>
+ * the Free Software Foundation; either version 2 of the License, or<br>
+ * (at your option) any later version.<br>
+ *<br>
+ * This program is distributed in the hope that it will be useful,<br>
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of<br>
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the<br>
+ * GNU General Public License for more details.<br>
+ *<br>
+ * You should have received a copy of the GNU General Public License<br>
+ * along with this program; if not, write to the Free Software<br>
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.<br>
+ *<br>
+ * This program is also available under a commercial proprietary license.<br>
+ * For more information, contact us at license @ <a href="http://x265.com" rel="noreferrer" target="_blank">x265.com</a>.<br>
+ *****************************************************************************/<br>
+<br>
+#include "asm.S"<br>
+<br>
+.section .rodata  // switch to the read-only data section; nothing is emitted into it before .text below<br>
+<br>
+.align 4  // GNU as on AArch64 takes a power of two here, i.e. 16-byte alignment<br>
+<br>
+.text  // everything that follows is assembled into the code section<br>
+<br>
+.macro SAD_X_START_8 x  // first row: seed accumulators v16-v18 (plus v19 when x == 4) with widened absolute differences<br>
+  ld1        {v0.8b}, [x0], x9  // load 8 bytes from [x0] into v0, then post-increment x0 by x9<br>
+.if \x == 3  // three-candidate form: x1-x3 are read and each post-incremented by x4<br>
+  ld1        {v1.8b}, [x1], x4<br>
+  ld1        {v2.8b}, [x2], x4<br>
+  ld1        {v3.8b}, [x3], x4<br>
+.elseif \x == 4  // four-candidate form: x1-x4 are read and each post-incremented by x5<br>
+  ld1        {v1.8b}, [x1], x5<br>
+  ld1        {v2.8b}, [x2], x5<br>
+  ld1        {v3.8b}, [x3], x5<br>
+  ld1        {v4.8b}, [x4], x5<br>
+.endif<br>
+  uabdl      v16.8h, v0.8b, v1.8b  // v16 = |v0 - v1| per byte, widened to eight 16-bit lanes (overwrites v16)<br>
+  uabdl      v17.8h, v0.8b, v2.8b  // same for the second candidate row<br>
+  uabdl      v18.8h, v0.8b, v3.8b  // same for the third candidate row<br>
+.if \x == 4<br>
+  uabdl      v19.8h, v0.8b, v4.8b  // fourth candidate row, only in the x == 4 form<br>
+.endif<br>
+.endm<br>
+<br>
+.macro SAD_X_8 x  // one further row: add widened absolute differences into the running sums v16-v18 (plus v19 when x == 4)<br>
+  ld1        {v0.8b}, [x0], x9  // load 8 bytes from [x0] into v0, then post-increment x0 by x9<br>
+.if \x == 3  // three-candidate form: x1-x3 are read and each post-incremented by x4<br>
+  ld1        {v1.8b}, [x1], x4<br>
+  ld1        {v2.8b}, [x2], x4<br>
+  ld1        {v3.8b}, [x3], x4<br>
+.elseif \x == 4  // four-candidate form: x1-x4 are read and each post-incremented by x5<br>
+  ld1        {v1.8b}, [x1], x5<br>
+  ld1        {v2.8b}, [x2], x5<br>
+  ld1        {v3.8b}, [x3], x5<br>
+  ld1        {v4.8b}, [x4], x5<br>
+.endif<br>
+  uabal      v16.8h, v0.8b, v1.8b  // v16 += |v0 - v1| per byte, accumulated in eight 16-bit lanes<br>
+  uabal      v17.8h, v0.8b, v2.8b  // same for the second candidate row<br>
+  uabal      v18.8h, v0.8b, v3.8b  // same for the third candidate row<br>
+.if \x == 4<br>
+  uabal      v19.8h, v0.8b, v4.8b  // fourth candidate row, only in the x == 4 form<br>
+.endif<br>
+.endm<br>
+<br>
+.macro SAD_X_8xN x, h  // emits x265_sad_x[x]_8x[h]_neon: h rows of 8 bytes compared against x candidate streams, x = 3 or 4<br>
+function x265_sad_x\x\()_8x\h\()_neon<br>
+  mov        x9, #FENC_STRIDE  // x9 = per-row step applied to x0 by the row macros<br>
+  SAD_X_START_8 \x  // row 0 initialises the 16-bit lane accumulators<br>
+.rept \h - 1  // remaining rows are fully unrolled; with h at most 32 a lane holds at most 32 * 255<br>
+  SAD_X_8 \x<br>
+.endr<br>
+  uaddlv     s0, v16.8h  // horizontal add of the eight 16-bit lanes into one 32-bit total<br>
+  uaddlv     s1, v17.8h<br>
+  uaddlv     s2, v18.8h<br>
+.if \x == 4<br>
+  uaddlv     s3, v19.8h<br>
+.endif<br>
+<br>
+.if \x == 3  // three totals go to [x5]; presumably the res pointer of the sad_x3 signature, confirm against its prototype<br>
+  stp        s0, s1, [x5]  // totals 0 and 1 at byte offsets 0 and 4<br>
+  str        s2, [x5, #8]  // total 2 at byte offset 8<br>
+.elseif \x == 4  // four totals go to [x6], the seventh argument register<br>
+  stp        s0, s1, [x6]  // totals 0 and 1 at byte offsets 0 and 4<br>
+  stp        s2, s3, [x6, #8]  // totals 2 and 3 at byte offsets 8 and 12<br>
+.endif<br>
+  ret<br>
+endfunc<br>
+.endm<br>
+<br>
+SAD_X_8xN 3 4  // three-candidate functions: x265_sad_x3_8x4_neon and, below, the 8x8, 8x16 and 8x32 forms<br>
+SAD_X_8xN 3 8<br>
+SAD_X_8xN 3 16<br>
+SAD_X_8xN 3 32<br>
+<br>
+SAD_X_8xN 4 4  // four-candidate functions: x265_sad_x4_8x4_neon and, below, the 8x8, 8x16 and 8x32 forms<br>
+SAD_X_8xN 4 8<br>
+SAD_X_8xN 4 16<br>
+SAD_X_8xN 4 32<br>
-- <br>
2.21.0.windows.1<br>
<br>
</div></div>