<div dir="ltr"><div class="gmail_quote">From: wangxiyuan <<a href="mailto:wangxiyuan@huawei.com" target="_blank">wangxiyuan@huawei.com</a>><br>

<br>

This patch adds some common assembly optimization functions for the aarch64<br>

platform. These functions won't work until patch Part 2 is merged.<br>

---<br>

 source/common/aarch64/asm-primitives.cpp | 219 ++++++++++++<br>

 source/common/aarch64/asm.S              |  69 ++++<br>

 source/common/aarch64/ipfilter8.S        | 414 ++++++++++++++++++++++<br>

 source/common/aarch64/ipfilter8.h        |  55 +++<br>

 source/common/aarch64/mc-a.S             |  63 ++++<br>

 source/common/aarch64/pixel-util.S       | 419 +++++++++++++++++++++++<br>

 source/common/aarch64/pixel-util.h       |  40 +++<br>

 source/common/aarch64/pixel.h            | 105 ++++++<br>

 source/common/aarch64/sad-a.S            | 105 ++++++<br>

 9 files changed, 1489 insertions(+)<br>

 create mode 100644 source/common/aarch64/asm-primitives.cpp<br>

 create mode 100644 source/common/aarch64/asm.S<br>

 create mode 100644 source/common/aarch64/ipfilter8.S<br>

 create mode 100644 source/common/aarch64/ipfilter8.h<br>

 create mode 100644 source/common/aarch64/mc-a.S<br>

 create mode 100644 source/common/aarch64/pixel-util.S<br>

 create mode 100644 source/common/aarch64/pixel-util.h<br>

 create mode 100644 source/common/aarch64/pixel.h<br>

 create mode 100644 source/common/aarch64/sad-a.S<br>

<br>

diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp<br>

new file mode 100644<br>

index 000000000..6fe8c968c<br>

--- /dev/null<br>

+++ b/source/common/aarch64/asm-primitives.cpp<br>

@@ -0,0 +1,219 @@<br>

+/*****************************************************************************<br>

+ * Copyright (C) 2020 MulticoreWare, Inc<br>

+ *<br>

+ * Authors: Hongbin Liu <<a href="mailto:liuhongbin1@huawei.com" target="_blank">liuhongbin1@huawei.com</a>><br>

+ *          Yimeng Su <<a href="mailto:yimeng.su@huawei.com" target="_blank">yimeng.su@huawei.com</a>><br>

+ *<br>

+ * This program is free software; you can redistribute it and/or modify<br>

+ * it under the terms of the GNU General Public License as published by<br>

+ * the Free Software Foundation; either version 2 of the License, or<br>

+ * (at your option) any later version.<br>

+ *<br>

+ * This program is distributed in the hope that it will be useful,<br>

+ * but WITHOUT ANY WARRANTY; without even the implied warranty of<br>

+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the<br>

+ * GNU General Public License for more details.<br>

+ *<br>

+ * You should have received a copy of the GNU General Public License<br>

+ * along with this program; if not, write to the Free Software<br>

+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.<br>

+ *<br>

+ * This program is also available under a commercial proprietary license.<br>

+ * For more information, contact us at license @ <a href="http://x265.com" rel="noreferrer" target="_blank">x265.com</a>.<br>

+ *****************************************************************************/<br>

+<br>

+#include "common.h"<br>

+#include "primitives.h"<br>

+#include "x265.h"<br>

+#include "cpu.h"<br>

+<br>

+<br>

+#if defined(__GNUC__)<br>

+#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)<br>

+#endif<br>

+<br>

+#define GCC_4_9_0 40900<br>

+#define GCC_5_1_0 50100<br>

+<br>

+extern "C" {<br>

+#include "pixel.h"<br>

+#include "pixel-util.h"<br>

+#include "ipfilter8.h"<br>

+}<br>

+<br>

+namespace X265_NS {<br>

+// private x265 namespace<br>

+<br>

+<br>

+template<int size><br>

+void interp_8tap_hv_pp_cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY)<br>

+{<br>

+    ALIGN_VAR_32(int16_t, immed[MAX_CU_SIZE * (MAX_CU_SIZE + NTAPS_LUMA - 1)]);<br>

+    const int halfFilterSize = NTAPS_LUMA >> 1;<br>

+    const int immedStride = MAX_CU_SIZE;<br>

+<br>

+    primitives.pu[size].luma_hps(src, srcStride, immed, immedStride, idxX, 1);<br>

+    primitives.pu[size].luma_vsp(immed + (halfFilterSize - 1) * immedStride, immedStride, dst, dstStride, idxY);<br>

+}<br>

+<br>

+<br>

+/* Temporary workaround: the luma_vsp assembly primitive has not been implemented yet,<br>

+ * but interp_8tap_hv_pp_cpu mixes C and assembly primitives, so luma_vsp is aliased<br>

+ * to the C primitive here. Otherwise, a segmentation fault occurs. */<br>

+void setupAliasCPrimitives(EncoderPrimitives &cp, EncoderPrimitives &asmp, int cpuMask)<br>

+{<br>

+    if (cpuMask & X265_CPU_NEON)<br>

+    {<br>

+        asmp.pu[LUMA_8x4].luma_vsp   = cp.pu[LUMA_8x4].luma_vsp;<br>

+        asmp.pu[LUMA_8x8].luma_vsp   = cp.pu[LUMA_8x8].luma_vsp;<br>

+        asmp.pu[LUMA_8x16].luma_vsp  = cp.pu[LUMA_8x16].luma_vsp;<br>

+        asmp.pu[LUMA_8x32].luma_vsp  = cp.pu[LUMA_8x32].luma_vsp;<br>

+        asmp.pu[LUMA_12x16].luma_vsp = cp.pu[LUMA_12x16].luma_vsp;<br>

+#if !AUTO_VECTORIZE || GCC_VERSION < GCC_5_1_0 /* gcc_version < gcc-5.1.0 */<br>

+        asmp.pu[LUMA_16x4].luma_vsp  = cp.pu[LUMA_16x4].luma_vsp;<br>

+        asmp.pu[LUMA_16x8].luma_vsp  = cp.pu[LUMA_16x8].luma_vsp;<br>

+        asmp.pu[LUMA_16x12].luma_vsp = cp.pu[LUMA_16x12].luma_vsp;<br>

+        asmp.pu[LUMA_16x16].luma_vsp = cp.pu[LUMA_16x16].luma_vsp;<br>

+        asmp.pu[LUMA_16x32].luma_vsp = cp.pu[LUMA_16x32].luma_vsp;<br>

+        asmp.pu[LUMA_16x64].luma_vsp = cp.pu[LUMA_16x64].luma_vsp;<br>

+        asmp.pu[LUMA_32x16].luma_vsp = cp.pu[LUMA_32x16].luma_vsp;<br>

+        asmp.pu[LUMA_32x24].luma_vsp = cp.pu[LUMA_32x24].luma_vsp;<br>

+        asmp.pu[LUMA_32x32].luma_vsp = cp.pu[LUMA_32x32].luma_vsp;<br>

+        asmp.pu[LUMA_32x64].luma_vsp = cp.pu[LUMA_32x64].luma_vsp;<br>

+        asmp.pu[LUMA_48x64].luma_vsp = cp.pu[LUMA_48x64].luma_vsp;<br>

+        asmp.pu[LUMA_64x16].luma_vsp = cp.pu[LUMA_64x16].luma_vsp;<br>

+        asmp.pu[LUMA_64x32].luma_vsp = cp.pu[LUMA_64x32].luma_vsp;<br>

+        asmp.pu[LUMA_64x48].luma_vsp = cp.pu[LUMA_64x48].luma_vsp;<br>

+        asmp.pu[LUMA_64x64].luma_vsp = cp.pu[LUMA_64x64].luma_vsp;    <br>

+#if !AUTO_VECTORIZE || GCC_VERSION < GCC_4_9_0 /* gcc_version < gcc-4.9.0 */<br>

+        asmp.pu[LUMA_4x4].luma_vsp   = cp.pu[LUMA_4x4].luma_vsp;<br>

+        asmp.pu[LUMA_4x8].luma_vsp   = cp.pu[LUMA_4x8].luma_vsp;<br>

+        asmp.pu[LUMA_4x16].luma_vsp  = cp.pu[LUMA_4x16].luma_vsp;<br>

+        asmp.pu[LUMA_24x32].luma_vsp = cp.pu[LUMA_24x32].luma_vsp;<br>

+        asmp.pu[LUMA_32x8].luma_vsp  = cp.pu[LUMA_32x8].luma_vsp;<br>

+#endif<br>

+#endif<br>

+    }<br>

+}<br>

+<br>

+<br>

+void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) <br>

+{<br>

+    if (cpuMask & X265_CPU_NEON)<br>

+    {<br>

+        p.pu[LUMA_4x4].satd   = PFX(pixel_satd_4x4_neon);<br>

+        p.pu[LUMA_4x8].satd   = PFX(pixel_satd_4x8_neon);<br>

+        p.pu[LUMA_4x16].satd  = PFX(pixel_satd_4x16_neon);<br>

+        p.pu[LUMA_8x4].satd   = PFX(pixel_satd_8x4_neon);<br>

+        p.pu[LUMA_8x8].satd   = PFX(pixel_satd_8x8_neon);<br>

+        p.pu[LUMA_12x16].satd = PFX(pixel_satd_12x16_neon);<br>

+        <br>

+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd    = PFX(pixel_satd_4x4_neon);<br>

+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd    = PFX(pixel_satd_4x8_neon);<br>

+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd   = PFX(pixel_satd_4x16_neon);<br>

+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd    = PFX(pixel_satd_8x4_neon);<br>

+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd    = PFX(pixel_satd_8x8_neon);<br>

+        p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd  = PFX(pixel_satd_12x16_neon);<br>

+        <br>

+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd    = PFX(pixel_satd_4x4_neon);<br>

+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd    = PFX(pixel_satd_4x8_neon);<br>

+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd   = PFX(pixel_satd_4x16_neon);<br>

+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd   = PFX(pixel_satd_4x32_neon);<br>

+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd    = PFX(pixel_satd_8x4_neon);<br>

+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd    = PFX(pixel_satd_8x8_neon);<br>

+        p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd  = PFX(pixel_satd_12x32_neon);<br>

+<br>

+        p.pu[LUMA_4x4].pixelavg_pp[NONALIGNED]   = PFX(pixel_avg_pp_4x4_neon);<br>

+        p.pu[LUMA_4x8].pixelavg_pp[NONALIGNED]   = PFX(pixel_avg_pp_4x8_neon);<br>

+        p.pu[LUMA_4x16].pixelavg_pp[NONALIGNED]  = PFX(pixel_avg_pp_4x16_neon);<br>

+        p.pu[LUMA_8x4].pixelavg_pp[NONALIGNED]   = PFX(pixel_avg_pp_8x4_neon);<br>

+        p.pu[LUMA_8x8].pixelavg_pp[NONALIGNED]   = PFX(pixel_avg_pp_8x8_neon);<br>

+        p.pu[LUMA_8x16].pixelavg_pp[NONALIGNED]  = PFX(pixel_avg_pp_8x16_neon);<br>

+        p.pu[LUMA_8x32].pixelavg_pp[NONALIGNED]  = PFX(pixel_avg_pp_8x32_neon);<br>

+<br>

+        p.pu[LUMA_4x4].pixelavg_pp[ALIGNED]   = PFX(pixel_avg_pp_4x4_neon);<br>

+        p.pu[LUMA_4x8].pixelavg_pp[ALIGNED]   = PFX(pixel_avg_pp_4x8_neon);<br>

+        p.pu[LUMA_4x16].pixelavg_pp[ALIGNED]  = PFX(pixel_avg_pp_4x16_neon);<br>

+        p.pu[LUMA_8x4].pixelavg_pp[ALIGNED]   = PFX(pixel_avg_pp_8x4_neon);<br>

+        p.pu[LUMA_8x8].pixelavg_pp[ALIGNED]   = PFX(pixel_avg_pp_8x8_neon);<br>

+        p.pu[LUMA_8x16].pixelavg_pp[ALIGNED]  = PFX(pixel_avg_pp_8x16_neon);<br>

+        p.pu[LUMA_8x32].pixelavg_pp[ALIGNED]  = PFX(pixel_avg_pp_8x32_neon);<br>

+<br>

+        p.pu[LUMA_8x4].sad_x3   = PFX(sad_x3_8x4_neon);<br>

+        p.pu[LUMA_8x8].sad_x3   = PFX(sad_x3_8x8_neon);<br>

+        p.pu[LUMA_8x16].sad_x3  = PFX(sad_x3_8x16_neon);<br>

+        p.pu[LUMA_8x32].sad_x3  = PFX(sad_x3_8x32_neon);<br>

+<br>

+        p.pu[LUMA_8x4].sad_x4   = PFX(sad_x4_8x4_neon);<br>

+        p.pu[LUMA_8x8].sad_x4   = PFX(sad_x4_8x8_neon);<br>

+        p.pu[LUMA_8x16].sad_x4  = PFX(sad_x4_8x16_neon);<br>

+        p.pu[LUMA_8x32].sad_x4  = PFX(sad_x4_8x32_neon);<br>

+<br>

+        // quant<br>

+        p.quant = PFX(quant_neon);<br>

+        // luma_hps<br>

+        p.pu[LUMA_4x4].luma_hps   = PFX(interp_8tap_horiz_ps_4x4_neon);<br>

+        p.pu[LUMA_4x8].luma_hps   = PFX(interp_8tap_horiz_ps_4x8_neon);<br>

+        p.pu[LUMA_4x16].luma_hps  = PFX(interp_8tap_horiz_ps_4x16_neon);<br>

+        p.pu[LUMA_8x4].luma_hps   = PFX(interp_8tap_horiz_ps_8x4_neon);<br>

+        p.pu[LUMA_8x8].luma_hps   = PFX(interp_8tap_horiz_ps_8x8_neon);<br>

+        p.pu[LUMA_8x16].luma_hps  = PFX(interp_8tap_horiz_ps_8x16_neon);<br>

+        p.pu[LUMA_8x32].luma_hps  = PFX(interp_8tap_horiz_ps_8x32_neon);<br>

+        p.pu[LUMA_12x16].luma_hps = PFX(interp_8tap_horiz_ps_12x16_neon);<br>

+        p.pu[LUMA_24x32].luma_hps = PFX(interp_8tap_horiz_ps_24x32_neon);<br>

+#if !AUTO_VECTORIZE || GCC_VERSION < GCC_5_1_0 /* gcc_version < gcc-5.1.0 */<br>

+        p.pu[LUMA_16x4].luma_hps  = PFX(interp_8tap_horiz_ps_16x4_neon);<br>

+        p.pu[LUMA_16x8].luma_hps  = PFX(interp_8tap_horiz_ps_16x8_neon);<br>

+        p.pu[LUMA_16x12].luma_hps = PFX(interp_8tap_horiz_ps_16x12_neon);<br>

+        p.pu[LUMA_16x16].luma_hps = PFX(interp_8tap_horiz_ps_16x16_neon);<br>

+        p.pu[LUMA_16x32].luma_hps = PFX(interp_8tap_horiz_ps_16x32_neon);<br>

+        p.pu[LUMA_16x64].luma_hps = PFX(interp_8tap_horiz_ps_16x64_neon);<br>

+        p.pu[LUMA_32x8].luma_hps  = PFX(interp_8tap_horiz_ps_32x8_neon);<br>

+        p.pu[LUMA_32x16].luma_hps = PFX(interp_8tap_horiz_ps_32x16_neon);<br>

+        p.pu[LUMA_32x24].luma_hps = PFX(interp_8tap_horiz_ps_32x24_neon);<br>

+        p.pu[LUMA_32x32].luma_hps = PFX(interp_8tap_horiz_ps_32x32_neon);<br>

+        p.pu[LUMA_32x64].luma_hps = PFX(interp_8tap_horiz_ps_32x64_neon);<br>

+        p.pu[LUMA_48x64].luma_hps = PFX(interp_8tap_horiz_ps_48x64_neon);<br>

+        p.pu[LUMA_64x16].luma_hps = PFX(interp_8tap_horiz_ps_64x16_neon);<br>

+        p.pu[LUMA_64x32].luma_hps = PFX(interp_8tap_horiz_ps_64x32_neon);<br>

+        p.pu[LUMA_64x48].luma_hps = PFX(interp_8tap_horiz_ps_64x48_neon);<br>

+        p.pu[LUMA_64x64].luma_hps = PFX(interp_8tap_horiz_ps_64x64_neon);<br>

+#endif<br>

+<br>

+        p.pu[LUMA_8x4].luma_hvpp   =  interp_8tap_hv_pp_cpu<LUMA_8x4>;<br>

+        p.pu[LUMA_8x8].luma_hvpp   =  interp_8tap_hv_pp_cpu<LUMA_8x8>;<br>

+        p.pu[LUMA_8x16].luma_hvpp  =  interp_8tap_hv_pp_cpu<LUMA_8x16>;<br>

+        p.pu[LUMA_8x32].luma_hvpp  =  interp_8tap_hv_pp_cpu<LUMA_8x32>;<br>

+        p.pu[LUMA_12x16].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_12x16>;<br>

+#if !AUTO_VECTORIZE || GCC_VERSION < GCC_5_1_0 /* gcc_version < gcc-5.1.0 */<br>

+        p.pu[LUMA_16x4].luma_hvpp  =  interp_8tap_hv_pp_cpu<LUMA_16x4>;<br>

+        p.pu[LUMA_16x8].luma_hvpp  =  interp_8tap_hv_pp_cpu<LUMA_16x8>;<br>

+        p.pu[LUMA_16x12].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_16x12>;<br>

+        p.pu[LUMA_16x16].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_16x16>;<br>

+        p.pu[LUMA_16x32].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_16x32>;<br>

+        p.pu[LUMA_16x64].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_16x64>;<br>

+        p.pu[LUMA_32x16].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_32x16>;<br>

+        p.pu[LUMA_32x24].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_32x24>;<br>

+        p.pu[LUMA_32x32].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_32x32>;<br>

+        p.pu[LUMA_32x64].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_32x64>;<br>

+        p.pu[LUMA_48x64].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_48x64>;<br>

+        p.pu[LUMA_64x16].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_64x16>;<br>

+        p.pu[LUMA_64x32].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_64x32>;<br>

+        p.pu[LUMA_64x48].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_64x48>;<br>

+        p.pu[LUMA_64x64].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_64x64>;<br>

+#if !AUTO_VECTORIZE || GCC_VERSION < GCC_4_9_0 /* gcc_version < gcc-4.9.0 */<br>

+        p.pu[LUMA_4x4].luma_hvpp   =  interp_8tap_hv_pp_cpu<LUMA_4x4>;<br>

+        p.pu[LUMA_4x8].luma_hvpp   =  interp_8tap_hv_pp_cpu<LUMA_4x8>;<br>

+        p.pu[LUMA_4x16].luma_hvpp  =  interp_8tap_hv_pp_cpu<LUMA_4x16>;<br>

+        p.pu[LUMA_24x32].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_24x32>;<br>

+        p.pu[LUMA_32x8].luma_hvpp  =  interp_8tap_hv_pp_cpu<LUMA_32x8>;<br>

+#endif<br>

+#endif<br>

+<br>

+#if !HIGH_BIT_DEPTH<br>

+        p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_4x4_neon);<br>

+#endif // !HIGH_BIT_DEPTH<br>

+<br>

+    }<br>

+}<br>

+} // namespace X265_NS<br>

diff --git a/source/common/aarch64/asm.S b/source/common/aarch64/asm.S<br>

new file mode 100644<br>

index 000000000..5f020a11a<br>

--- /dev/null<br>

+++ b/source/common/aarch64/asm.S<br>

@@ -0,0 +1,69 @@<br>

+/*****************************************************************************<br>

+ * Copyright (C) 2020 MulticoreWare, Inc<br>

+ *<br>

+ * Authors: Hongbin Liu <<a href="mailto:liuhongbin1@huawei.com" target="_blank">liuhongbin1@huawei.com</a>><br>

+ *<br>

+ * This program is free software; you can redistribute it and/or modify<br>

+ * it under the terms of the GNU General Public License as published by<br>

+ * the Free Software Foundation; either version 2 of the License, or<br>

+ * (at your option) any later version.<br>

+ *<br>

+ * This program is distributed in the hope that it will be useful,<br>

+ * but WITHOUT ANY WARRANTY; without even the implied warranty of<br>

+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the<br>

+ * GNU General Public License for more details.<br>

+ *<br>

+ * You should have received a copy of the GNU General Public License<br>

+ * along with this program; if not, write to the Free Software<br>

+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.<br>

+ *<br>

+ * This program is also available under a commercial proprietary license.<br>

+ * For more information, contact us at license @ <a href="http://x265.com" rel="noreferrer" target="_blank">x265.com</a>.<br>

+ *****************************************************************************/<br>

+<br>

+.arch           armv8-a<br>

+<br>

+#ifdef PREFIX<br>

+#define EXTERN_ASM _<br>

+#else<br>

+#define EXTERN_ASM<br>

+#endif<br>

+<br>

+#ifdef __ELF__<br>

+#define ELF<br>

+#else<br>

+#define ELF @<br>

+#endif<br>

+<br>

+#define HAVE_AS_FUNC 1<br>

+<br>

+#if HAVE_AS_FUNC<br>

+#define FUNC<br>

+#else<br>

+#define FUNC @<br>

+#endif<br>

+<br>

+.macro function name, export=1<br>

+    .macro endfunc<br>

+ELF     .size   \name, . - \name<br>

+FUNC    .endfunc<br>

+        .purgem endfunc<br>

+    .endm<br>

+        .align  2<br>

+.if \export == 1<br>

+        .global EXTERN_ASM\name<br>

+ELF     .hidden EXTERN_ASM\name<br>

+ELF     .type   EXTERN_ASM\name, %function<br>

+FUNC    .func   EXTERN_ASM\name<br>

+EXTERN_ASM\name:<br>

+.else<br>

+ELF     .hidden \name<br>

+ELF     .type   \name, %function<br>

+FUNC    .func   \name<br>

+\name:<br>

+.endif<br>

+.endm<br>

+<br>

+<br>

+#define FENC_STRIDE 64<br>

+#define FDEC_STRIDE 32<br>

diff --git a/source/common/aarch64/ipfilter8.S b/source/common/aarch64/ipfilter8.S<br>

new file mode 100644<br>

index 000000000..908c7db46<br>

--- /dev/null<br>

+++ b/source/common/aarch64/ipfilter8.S<br>

@@ -0,0 +1,414 @@<br>

+/*****************************************************************************<br>

+ * Copyright (C) 2020 MulticoreWare, Inc<br>

+ *<br>

+ * Authors: Yimeng Su <<a href="mailto:yimeng.su@huawei.com" target="_blank">yimeng.su@huawei.com</a>><br>

+ *<br>

+ * This program is free software; you can redistribute it and/or modify<br>

+ * it under the terms of the GNU General Public License as published by<br>

+ * the Free Software Foundation; either version 2 of the License, or<br>

+ * (at your option) any later version.<br>

+ *<br>

+ * This program is distributed in the hope that it will be useful,<br>

+ * but WITHOUT ANY WARRANTY; without even the implied warranty of<br>

+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the<br>

+ * GNU General Public License for more details.<br>

+ *<br>

+ * You should have received a copy of the GNU General Public License<br>

+ * along with this program; if not, write to the Free Software<br>

+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.<br>

+ *<br>

+ * This program is also available under a commercial proprietary license.<br>

+ * For more information, contact us at license @ <a href="http://x265.com" rel="noreferrer" target="_blank">x265.com</a>.<br>

+ *****************************************************************************/<br>

+<br>

+#include "asm.S"<br>

+<br>

+.section .rodata<br>

+<br>

+.align 4<br>

+<br>

+.text<br>

+<br>

+<br>

+<br>

+.macro qpel_filter_0_32b<br>

+    movi            v24.8h, #64<br>

+    uxtl            v19.8h, v5.8b<br>

+    smull           v17.4s, v19.4h, v24.4h<br>

+    smull2          v18.4s, v19.8h, v24.8h<br>

+.endm<br>

+<br>

+.macro qpel_filter_1_32b<br>

+    movi            v16.8h, #58<br>

+    uxtl            v19.8h, v5.8b<br>

+    smull           v17.4s, v19.4h, v16.4h<br>

+    smull2          v18.4s, v19.8h, v16.8h<br>

+<br>

+    movi            v24.8h, #10<br>

+    uxtl            v21.8h, v1.8b<br>

+    smull           v19.4s, v21.4h, v24.4h<br>

+    smull2          v20.4s, v21.8h, v24.8h<br>

+<br>

+    movi            v16.8h, #17<br>

+    uxtl            v23.8h, v2.8b<br>

+    smull           v21.4s, v23.4h, v16.4h<br>

+    smull2          v22.4s, v23.8h, v16.8h<br>

+<br>

+    movi            v24.8h, #5<br>

+    uxtl            v1.8h, v6.8b<br>

+    smull           v23.4s, v1.4h, v24.4h<br>

+    smull2          v16.4s, v1.8h, v24.8h<br>

+<br>

+    sub             v17.4s, v17.4s, v19.4s<br>

+    sub             v18.4s, v18.4s, v20.4s<br>

+<br>

+    uxtl            v1.8h, v4.8b<br>

+    sshll           v19.4s, v1.4h, #2<br>

+    sshll2          v20.4s, v1.8h, #2<br>

+<br>

+    add             v17.4s, v17.4s, v21.4s<br>

+    add             v18.4s, v18.4s, v22.4s<br>

+<br>

+    uxtl            v1.8h, v0.8b<br>

+    uxtl            v2.8h, v3.8b<br>

+    ssubl           v21.4s, v2.4h, v1.4h<br>

+    ssubl2          v22.4s, v2.8h, v1.8h<br>

+<br>

+    add             v17.4s, v17.4s, v19.4s<br>

+    add             v18.4s, v18.4s, v20.4s<br>

+    sub             v21.4s, v21.4s, v23.4s<br>

+    sub             v22.4s, v22.4s, v16.4s<br>

+    add             v17.4s, v17.4s, v21.4s<br>

+    add             v18.4s, v18.4s, v22.4s<br>

+.endm<br>

+<br>

+.macro qpel_filter_2_32b<br>

+    movi            v16.4s, #11<br>

+    uxtl            v19.8h, v5.8b<br>

+    uxtl            v20.8h, v2.8b<br>

+    saddl           v17.4s, v19.4h, v20.4h<br>

+    saddl2          v18.4s, v19.8h, v20.8h<br>

+<br>

+    uxtl            v21.8h, v1.8b<br>

+    uxtl            v22.8h, v6.8b<br>

+    saddl           v19.4s, v21.4h, v22.4h<br>

+    saddl2          v20.4s, v21.8h, v22.8h<br>

+<br>

+    mul             v19.4s, v19.4s, v16.4s<br>

+    mul             v20.4s, v20.4s, v16.4s<br>

+<br>

+    movi            v16.4s, #40<br>

+    mul             v17.4s, v17.4s, v16.4s<br>

+    mul             v18.4s, v18.4s, v16.4s<br>

+<br>

+    uxtl            v21.8h, v4.8b<br>

+    uxtl            v22.8h, v3.8b<br>

+    saddl           v23.4s, v21.4h, v22.4h<br>

+    saddl2          v16.4s, v21.8h, v22.8h<br>

+<br>

+    uxtl            v1.8h, v0.8b<br>

+    uxtl            v2.8h, v7.8b<br>

+    saddl           v21.4s, v1.4h, v2.4h<br>

+    saddl2          v22.4s, v1.8h, v2.8h<br>

+<br>

+    shl             v23.4s, v23.4s, #2<br>

+    shl             v16.4s, v16.4s, #2<br>

+<br>

+    add             v19.4s, v19.4s, v21.4s<br>

+    add             v20.4s, v20.4s, v22.4s<br>

+    add             v17.4s, v17.4s, v23.4s<br>

+    add             v18.4s, v18.4s, v16.4s<br>

+    sub             v17.4s, v17.4s, v19.4s<br>

+    sub             v18.4s, v18.4s, v20.4s<br>

+.endm<br>

+<br>

+.macro qpel_filter_3_32b<br>

+    movi            v16.8h, #17<br>

+    movi            v24.8h, #5<br>

+<br>

+    uxtl            v19.8h, v5.8b<br>

+    smull           v17.4s, v19.4h, v16.4h<br>

+    smull2          v18.4s, v19.8h, v16.8h<br>

+<br>

+    uxtl            v21.8h, v1.8b<br>

+    smull           v19.4s, v21.4h, v24.4h<br>

+    smull2          v20.4s, v21.8h, v24.8h<br>

+<br>

+    movi            v16.8h, #58<br>

+    uxtl            v23.8h, v2.8b<br>

+    smull           v21.4s, v23.4h, v16.4h<br>

+    smull2          v22.4s, v23.8h, v16.8h<br>

+<br>

+    movi            v24.8h, #10<br>

+    uxtl            v1.8h, v6.8b<br>

+    smull           v23.4s, v1.4h, v24.4h<br>

+    smull2          v16.4s, v1.8h, v24.8h<br>

+<br>

+    sub             v17.4s, v17.4s, v19.4s<br>

+    sub             v18.4s, v18.4s, v20.4s<br>

+<br>

+    uxtl            v1.8h, v3.8b<br>

+    sshll           v19.4s, v1.4h, #2<br>

+    sshll2          v20.4s, v1.8h, #2<br>

+<br>

+    add             v17.4s, v17.4s, v21.4s<br>

+    add             v18.4s, v18.4s, v22.4s<br>

+<br>

+    uxtl            v1.8h, v4.8b<br>

+    uxtl            v2.8h, v7.8b<br>

+    ssubl           v21.4s, v1.4h, v2.4h<br>

+    ssubl2          v22.4s, v1.8h, v2.8h<br>

+<br>

+    add             v17.4s, v17.4s, v19.4s<br>

+    add             v18.4s, v18.4s, v20.4s<br>

+    sub             v21.4s, v21.4s, v23.4s<br>

+    sub             v22.4s, v22.4s, v16.4s<br>

+    add             v17.4s, v17.4s, v21.4s<br>

+    add             v18.4s, v18.4s, v22.4s<br>

+.endm<br>

+<br>

+<br>

+<br>

+<br>

+.macro vextin8<br>

+    ld1             {v3.16b}, [x11], #16<br>

+    mov             v7.d[0], v3.d[1]<br>

+    ext             v0.8b, v3.8b, v7.8b, #1<br>

+    ext             v4.8b, v3.8b, v7.8b, #2<br>

+    ext             v1.8b, v3.8b, v7.8b, #3<br>

+    ext             v5.8b, v3.8b, v7.8b, #4<br>

+    ext             v2.8b, v3.8b, v7.8b, #5<br>

+    ext             v6.8b, v3.8b, v7.8b, #6<br>

+    ext             v3.8b, v3.8b, v7.8b, #7<br>

+.endm<br>

+<br>

+<br>

+<br>

+// void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)<br>

+.macro HPS_FILTER a b filterhps<br>

+    mov             w12, #8192<br>

+    mov             w6, w10<br>

+    sub             x3, x3, #\a<br>

+    lsl             x3, x3, #1<br>

+    mov             w9, #\a<br>

+    cmp             w9, #4<br>

+    b.eq            14f<br>

+    cmp             w9, #12<br>

+    b.eq            15f<br>

+    b               7f<br>

+14:<br>

+    HPS_FILTER_4 \a \b \filterhps<br>

+    b               10f<br>

+15:<br>

+    HPS_FILTER_12 \a \b \filterhps<br>

+    b               10f<br>

+7:<br>

+    cmp             w5, #0<br>

+    b.eq            8f<br>

+    cmp             w5, #1<br>

+    b.eq            9f<br>

+8:<br>

+loop1_hps_\filterhps\()_\a\()x\b\()_rowext0:<br>

+    mov             w7, #\a<br>

+    lsr             w7, w7, #3<br>

+    mov             x11, x0<br>

+    sub             x11, x11, #4<br>

+loop2_hps_\filterhps\()_\a\()x\b\()_rowext0:<br>

+    vextin8<br>

+    \filterhps<br>

+    dup             v16.4s, w12<br>

+    sub             v17.4s, v17.4s, v16.4s<br>

+    sub             v18.4s, v18.4s, v16.4s<br>

+    xtn             v0.4h, v17.4s<br>

+    xtn2            v0.8h, v18.4s<br>

+    st1             {v0.8h}, [x2], #16<br>

+    subs            w7, w7, #1<br>

+    sub             x11, x11, #8<br>

+    b.ne            loop2_hps_\filterhps\()_\a\()x\b\()_rowext0<br>

+    subs            w6, w6, #1<br>

+    add             x0, x0, x1<br>

+    add             x2, x2, x3<br>

+    b.ne            loop1_hps_\filterhps\()_\a\()x\b\()_rowext0<br>

+    b               10f<br>

+9:<br>

+loop3_hps_\filterhps\()_\a\()x\b\()_rowext1:<br>

+    mov             w7, #\a<br>

+    lsr             w7, w7, #3<br>

+    mov             x11, x0<br>

+    sub             x11, x11, #4<br>

+loop4_hps_\filterhps\()_\a\()x\b\()_rowext1:<br>

+    vextin8<br>

+    \filterhps<br>

+    dup             v16.4s, w12<br>

+    sub             v17.4s, v17.4s, v16.4s<br>

+    sub             v18.4s, v18.4s, v16.4s<br>

+    xtn             v0.4h, v17.4s<br>

+    xtn2            v0.8h, v18.4s<br>

+    st1             {v0.8h}, [x2], #16<br>

+    subs            w7, w7, #1<br>

+    sub             x11, x11, #8<br>

+    b.ne            loop4_hps_\filterhps\()_\a\()x\b\()_rowext1<br>

+    subs            w6, w6, #1<br>

+    add             x0, x0, x1<br>

+    add             x2, x2, x3<br>

+    b.ne            loop3_hps_\filterhps\()_\a\()x\b\()_rowext1<br>

+10:<br>

+.endm<br>

+<br>

+.macro HPS_FILTER_4 w h filterhps<br>

+    cmp             w5, #0<br>

+    b.eq            11f<br>

+    cmp             w5, #1<br>

+    b.eq            12f<br>

+11:<br>

+loop4_hps_\filterhps\()_\w\()x\h\()_rowext0:<br>

+    mov             x11, x0<br>

+    sub             x11, x11, #4<br>

+    vextin8<br>

+    \filterhps<br>

+    dup             v16.4s, w12<br>

+    sub             v17.4s, v17.4s, v16.4s<br>

+    xtn             v0.4h, v17.4s<br>

+    st1             {v0.4h}, [x2], #8<br>

+    sub             x11, x11, #8<br>

+    subs            w6, w6, #1<br>

+    add             x0, x0, x1<br>

+    add             x2, x2, x3<br>

+    b.ne            loop4_hps_\filterhps\()_\w\()x\h\()_rowext0<br>

+    b               13f<br>

+12:<br>

+loop5_hps_\filterhps\()_\w\()x\h\()_rowext1:<br>

+    mov             x11, x0<br>

+    sub             x11, x11, #4<br>

+    vextin8<br>

+    \filterhps<br>

+    dup             v16.4s, w12<br>

+    sub             v17.4s, v17.4s, v16.4s<br>

+    xtn             v0.4h, v17.4s<br>

+    st1             {v0.4h}, [x2], #8<br>

+    sub             x11, x11, #8<br>

+    subs            w6, w6, #1<br>

+    add             x0, x0, x1<br>

+    add             x2, x2, x3<br>

+    b.ne            loop5_hps_\filterhps\()_\w\()x\h\()_rowext1<br>

+13:<br>

+.endm<br>

+<br>

+.macro HPS_FILTER_12 w h filterhps<br>

+    cmp             w5, #0<br>

+    b.eq            14f<br>

+    cmp             w5, #1<br>

+    b.eq            15f<br>

+14:<br>

+loop12_hps_\filterhps\()_\w\()x\h\()_rowext0:<br>

+    mov             x11, x0<br>

+    sub             x11, x11, #4<br>

+    vextin8<br>

+    \filterhps<br>

+    dup             v16.4s, w12<br>

+    sub             v17.4s, v17.4s, v16.4s<br>

+    sub             v18.4s, v18.4s, v16.4s<br>

+    xtn             v0.4h, v17.4s<br>

+    xtn2            v0.8h, v18.4s<br>

+    st1             {v0.8h}, [x2], #16<br>

+    sub             x11, x11, #8<br>

+<br>

+    vextin8<br>

+    \filterhps<br>

+    dup             v16.4s, w12<br>

+    sub             v17.4s, v17.4s, v16.4s<br>

+    xtn             v0.4h, v17.4s<br>

+    st1             {v0.4h}, [x2], #8<br>

+    add             x2, x2, x3<br>

+    subs            w6, w6, #1<br>

+    add             x0, x0, x1<br>

+    b.ne            loop12_hps_\filterhps\()_\w\()x\h\()_rowext0<br>

+    b               16f<br>

+15:<br>

+loop12_hps_\filterhps\()_\w\()x\h\()_rowext1:<br>

+    mov             x11, x0<br>

+    sub             x11, x11, #4<br>

+    vextin8<br>

+    \filterhps<br>

+    dup             v16.4s, w12<br>

+    sub             v17.4s, v17.4s, v16.4s<br>

+    sub             v18.4s, v18.4s, v16.4s<br>

+    xtn             v0.4h, v17.4s<br>

+    xtn2            v0.8h, v18.4s<br>

+    st1             {v0.8h}, [x2], #16<br>

+    sub             x11, x11, #8<br>

+<br>

+    vextin8<br>

+    \filterhps<br>

+    dup             v16.4s, w12<br>

+    sub             v17.4s, v17.4s, v16.4s<br>

+    xtn             v0.4h, v17.4s<br>

+    st1             {v0.4h}, [x2], #8<br>

+    add             x2, x2, x3<br>

+    subs            w6, w6, #1<br>

+    add             x0, x0, x1<br>

+    b.ne            loop12_hps_\filterhps\()_\w\()x\h\()_rowext1<br>

+16:<br>

+.endm<br>

+<br>

+// void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)<br>

+.macro LUMA_HPS w h<br>

+function x265_interp_8tap_horiz_ps_\w\()x\h\()_neon<br>

+    mov             w10, #\h<br>

+    cmp             w5, #0<br>

+    b.eq            6f<br>

+    sub             x0, x0, x1, lsl #2<br>

+<br>

+    add             x0, x0, x1<br>

+    add             w10, w10, #7<br>

+6:<br>

+    cmp             w4, #0<br>

+    b.eq            0f<br>

+    cmp             w4, #1<br>

+    b.eq            1f<br>

+    cmp             w4, #2<br>

+    b.eq            2f<br>

+    cmp             w4, #3<br>

+    b.eq            3f<br>

+0:<br>

+    HPS_FILTER  \w \h qpel_filter_0_32b<br>

+    b               5f<br>

+1:<br>

+    HPS_FILTER  \w \h qpel_filter_1_32b<br>

+    b               5f<br>

+2:<br>

+    HPS_FILTER  \w \h qpel_filter_2_32b<br>

+    b               5f<br>

+3:<br>

+    HPS_FILTER  \w \h qpel_filter_3_32b<br>

+    b               5f<br>

+5:<br>

+    ret<br>

+endfunc<br>

+.endm<br>

+<br>

+LUMA_HPS    4 4<br>

+LUMA_HPS    4 8<br>

+LUMA_HPS    4 16<br>

+LUMA_HPS    8 4<br>

+LUMA_HPS    8 8<br>

+LUMA_HPS    8 16<br>

+LUMA_HPS    8 32<br>

+LUMA_HPS    12 16<br>

+LUMA_HPS    16 4<br>

+LUMA_HPS    16 8<br>

+LUMA_HPS    16 12<br>

+LUMA_HPS    16 16<br>

+LUMA_HPS    16 32<br>

+LUMA_HPS    16 64<br>

+LUMA_HPS    24 32<br>

+LUMA_HPS    32 8<br>

+LUMA_HPS    32 16<br>

+LUMA_HPS    32 24<br>

+LUMA_HPS    32 32<br>

+LUMA_HPS    32 64<br>

+LUMA_HPS    48 64<br>

+LUMA_HPS    64 16<br>

+LUMA_HPS    64 32<br>

+LUMA_HPS    64 48<br>

+LUMA_HPS    64 64<br>

diff --git a/source/common/aarch64/ipfilter8.h b/source/common/aarch64/ipfilter8.h<br>

new file mode 100644<br>

index 000000000..f9ed91e2e<br>

--- /dev/null<br>

+++ b/source/common/aarch64/ipfilter8.h<br>

@@ -0,0 +1,55 @@<br>

+/*****************************************************************************<br>

+ * Copyright (C) 2020 MulticoreWare, Inc<br>

+ *<br>

+ * Authors: Yimeng Su <<a href="mailto:yimeng.su@huawei.com" target="_blank">yimeng.su@huawei.com</a>><br>

+ *<br>

+ * This program is free software; you can redistribute it and/or modify<br>

+ * it under the terms of the GNU General Public License as published by<br>

+ * the Free Software Foundation; either version 2 of the License, or<br>

+ * (at your option) any later version.<br>

+ *<br>

+ * This program is distributed in the hope that it will be useful,<br>

+ * but WITHOUT ANY WARRANTY; without even the implied warranty of<br>

+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the<br>

+ * GNU General Public License for more details.<br>

+ *<br>

+ * You should have received a copy of the GNU General Public License<br>

+ * along with this program; if not, write to the Free Software<br>

+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.<br>

+ *<br>

+ * This program is also available under a commercial proprietary license.<br>

+ * For more information, contact us at license @ <a href="http://x265.com" rel="noreferrer" target="_blank">x265.com</a>.<br>

+ *****************************************************************************/<br>

+<br>

+#ifndef X265_IPFILTER8_AARCH64_H<br>

+#define X265_IPFILTER8_AARCH64_H<br>

+<br>

+/* NEON 8-tap horizontal interpolation entry points, one per block size (WxH in the name). src is read as pixel, dst is written as int16_t; coeffIdx selects the filter and isRowExt requests extra rows. */<br>

+void x265_interp_8tap_horiz_ps_4x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>

+void x265_interp_8tap_horiz_ps_4x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>

+void x265_interp_8tap_horiz_ps_4x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>

+void x265_interp_8tap_horiz_ps_8x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>

+void x265_interp_8tap_horiz_ps_8x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>

+void x265_interp_8tap_horiz_ps_8x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>

+void x265_interp_8tap_horiz_ps_8x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>

+void x265_interp_8tap_horiz_ps_12x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>

+void x265_interp_8tap_horiz_ps_16x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>

+void x265_interp_8tap_horiz_ps_16x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>

+void x265_interp_8tap_horiz_ps_16x12_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>

+void x265_interp_8tap_horiz_ps_16x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>

+void x265_interp_8tap_horiz_ps_16x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>

+void x265_interp_8tap_horiz_ps_16x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>

+void x265_interp_8tap_horiz_ps_24x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>

+void x265_interp_8tap_horiz_ps_32x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>

+void x265_interp_8tap_horiz_ps_32x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>

+void x265_interp_8tap_horiz_ps_32x24_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>

+void x265_interp_8tap_horiz_ps_32x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>

+void x265_interp_8tap_horiz_ps_32x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>

+void x265_interp_8tap_horiz_ps_48x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>

+void x265_interp_8tap_horiz_ps_64x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>

+void x265_interp_8tap_horiz_ps_64x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>

+void x265_interp_8tap_horiz_ps_64x48_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>

+void x265_interp_8tap_horiz_ps_64x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);<br>

+<br>

+<br>

+#endif // ifndef X265_IPFILTER8_AARCH64_H<br>

diff --git a/source/common/aarch64/mc-a.S b/source/common/aarch64/mc-a.S<br>

new file mode 100644<br>

index 000000000..cbaf9b501<br>

--- /dev/null<br>

+++ b/source/common/aarch64/mc-a.S<br>

@@ -0,0 +1,63 @@<br>

+/*****************************************************************************<br>

+ * Copyright (C) 2020 MulticoreWare, Inc<br>

+ *<br>

+ * Authors: Hongbin Liu <<a href="mailto:liuhongbin1@huawei.com" target="_blank">liuhongbin1@huawei.com</a>><br>

+ *<br>

+ * This program is free software; you can redistribute it and/or modify<br>

+ * it under the terms of the GNU General Public License as published by<br>

+ * the Free Software Foundation; either version 2 of the License, or<br>

+ * (at your option) any later version.<br>

+ *<br>

+ * This program is distributed in the hope that it will be useful,<br>

+ * but WITHOUT ANY WARRANTY; without even the implied warranty of<br>

+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the<br>

+ * GNU General Public License for more details.<br>

+ *<br>

+ * You should have received a copy of the GNU General Public License<br>

+ * along with this program; if not, write to the Free Software<br>

+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.<br>

+ *<br>

+ * This program is also available under a commercial proprietary license.<br>

+ * For more information, contact us at license @ <a href="http://x265.com" rel="noreferrer" target="_blank">x265.com</a>.<br>

+ *****************************************************************************/<br>

+<br>

+#include "asm.S"<br>

+<br>

+.section .rodata                            /* opened, but no constants are emitted before the switch back to .text below */<br>

+<br>

+.align 4                                    /* presumably 2 to the 4th = 16 bytes, as gas treats .align on arm targets - confirm */<br>

+<br>

+.text<br>

+/* pixel_avg_pp_4xN_neon h: emits x265_pixel_avg_pp_4xH_neon, fully unrolled over h rows. Per the pixel.h prototype and AAPCS64: x0=dst, x1=dstride, x2=src0, x3=sstride0, x4=src1, x5=sstride1. */<br>

+.macro pixel_avg_pp_4xN_neon h<br>

+function x265_pixel_avg_pp_4x\h\()_neon<br>

+.rept \h<br>

+    ld1             {v0.s}[0], [x2], x3     /* 4 pixels of src0, then advance one row */<br>

+    ld1             {v1.s}[0], [x4], x5     /* 4 pixels of src1, then advance one row */<br>

+    urhadd          v2.8b, v0.8b, v1.8b     /* rounding halving add: (a + b + 1) / 2 per byte */<br>

+    st1             {v2.s}[0], [x0], x1     /* only the low 4 bytes are stored, so the unloaded upper lanes do not matter */<br>

+.endr<br>

+    ret<br>

+endfunc<br>

+.endm<br>

+/* Heights provided for width 4. */<br>

+pixel_avg_pp_4xN_neon 4<br>

+pixel_avg_pp_4xN_neon 8<br>

+pixel_avg_pp_4xN_neon 16<br>

+/* pixel_avg_pp_8xN_neon h: emits x265_pixel_avg_pp_8xH_neon, fully unrolled over h rows. Register use mirrors the 4xN variant: x0/x1 = dst and stride, x2/x3 = src0 and stride, x4/x5 = src1 and stride. */<br>

+.macro pixel_avg_pp_8xN_neon h<br>

+function x265_pixel_avg_pp_8x\h\()_neon<br>

+.rept \h<br>

+    ld1             {v0.8b}, [x2], x3       /* 8 pixels of src0, then advance one row */<br>

+    ld1             {v1.8b}, [x4], x5       /* 8 pixels of src1, then advance one row */<br>

+    urhadd          v2.8b, v0.8b, v1.8b     /* rounding halving add: (a + b + 1) / 2 per byte */<br>

+    st1             {v2.8b}, [x0], x1<br>

+.endr<br>

+    ret<br>

+endfunc<br>

+.endm<br>

+/* Heights provided for width 8. */<br>

+pixel_avg_pp_8xN_neon 4<br>

+pixel_avg_pp_8xN_neon 8<br>

+pixel_avg_pp_8xN_neon 16<br>

+pixel_avg_pp_8xN_neon 32<br>

diff --git a/source/common/aarch64/pixel-util.S b/source/common/aarch64/pixel-util.S<br>

new file mode 100644<br>

index 000000000..a085ebdfa<br>

--- /dev/null<br>

+++ b/source/common/aarch64/pixel-util.S<br>

@@ -0,0 +1,419 @@<br>

+/*****************************************************************************<br>

+ * Copyright (C) 2020 MulticoreWare, Inc<br>

+ *<br>

+ * Authors: Yimeng Su <<a href="mailto:yimeng.su@huawei.com" target="_blank">yimeng.su@huawei.com</a>><br>

+ *          Hongbin Liu <<a href="mailto:liuhongbin1@huawei.com" target="_blank">liuhongbin1@huawei.com</a>><br>

+ *<br>

+ * This program is free software; you can redistribute it and/or modify<br>

+ * it under the terms of the GNU General Public License as published by<br>

+ * the Free Software Foundation; either version 2 of the License, or<br>

+ * (at your option) any later version.<br>

+ *<br>

+ * This program is distributed in the hope that it will be useful,<br>

+ * but WITHOUT ANY WARRANTY; without even the implied warranty of<br>

+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the<br>

+ * GNU General Public License for more details.<br>

+ *<br>

+ * You should have received a copy of the GNU General Public License<br>

+ * along with this program; if not, write to the Free Software<br>

+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.<br>

+ *<br>

+ * This program is also available under a commercial proprietary license.<br>

+ * For more information, contact us at license @ <a href="http://x265.com" rel="noreferrer" target="_blank">x265.com</a>.<br>

+ *****************************************************************************/<br>

+<br>

+#include "asm.S"<br>

+<br>

+.section .rodata                            /* opened, but no constants are emitted before the switch back to .text below */<br>

+<br>

+.align 4                                    /* presumably 2 to the 4th = 16 bytes, as gas treats .align on arm targets - confirm */<br>

+<br>

+.text<br>

+/* Final stages of the 4x8 SATD: expects partial butterfly results in v4-v7, leaves the 32-bit total in s0. Overwrites v0-v7 and v16-v19. */<br>

+.macro x265_satd_4x8_8x4_end_neon<br>

+    add             v0.8h, v4.8h, v6.8h     /* butterfly between the v4/v5 pair and the v6/v7 pair */<br>

+    add             v1.8h, v5.8h, v7.8h<br>

+    sub             v2.8h, v4.8h, v6.8h<br>

+    sub             v3.8h, v5.8h, v7.8h<br>

+<br>

+    trn1            v16.8h, v0.8h, v1.8h    /* interleave 16-bit lanes so neighbours can be combined */<br>

+    trn2            v17.8h, v0.8h, v1.8h<br>

+    add             v4.8h, v16.8h, v17.8h<br>

+    trn1            v18.8h, v2.8h, v3.8h<br>

+    trn2            v19.8h, v2.8h, v3.8h<br>

+    sub             v5.8h, v16.8h, v17.8h<br>

+    add             v6.8h, v18.8h, v19.8h<br>

+    sub             v7.8h, v18.8h, v19.8h<br>

+    trn1            v0.4s, v4.4s, v6.4s     /* same again at 32-bit granularity */<br>

+    trn2            v2.4s, v4.4s, v6.4s<br>

+    abs             v0.8h, v0.8h<br>

+    trn1            v1.4s, v5.4s, v7.4s<br>

+    trn2            v3.4s, v5.4s, v7.4s<br>

+    abs             v2.8h, v2.8h<br>

+    abs             v1.8h, v1.8h<br>

+    abs             v3.8h, v3.8h<br>

+    umax            v0.8h, v0.8h, v2.8h     /* larger magnitude of each pair; presumably stands in for the last butterfly stage - confirm */<br>

+    umax            v1.8h, v1.8h, v3.8h<br>

+    add             v0.8h, v0.8h, v1.8h<br>

+    uaddlv          s0, v0.8h               /* widen and sum all 8 lanes into s0 */<br>

+.endm<br>

+/* SATD of one 4x8 block: x0/x1 = pix1 and its stride, x2/x3 = pix2 and its stride. Result is left in s0. Both pointers are advanced by 8 rows, which callers use to chain several invocations. */<br>

+.macro pixel_satd_4x8_neon<br>

+    ld1r             {v1.2s}, [x2], x3<br>

+    ld1r            {v0.2s}, [x0], x1<br>

+    ld1r            {v3.2s}, [x2], x3<br>

+    ld1r            {v2.2s}, [x0], x1<br>

+    ld1r            {v5.2s}, [x2], x3<br>

+    ld1r            {v4.2s}, [x0], x1<br>

+    ld1r            {v7.2s}, [x2], x3<br>

+    ld1r            {v6.2s}, [x0], x1<br>

+/* rows 0-3 are now replicated in both 32-bit lanes; the lane-1 loads below replace the copy with rows 4-7 */<br>

+    ld1             {v1.s}[1], [x2], x3<br>

+    ld1             {v0.s}[1], [x0], x1<br>

+    usubl           v0.8h, v0.8b, v1.8b     /* pix1 - pix2 for rows 0 and 4, widened to 16 bits */<br>

+    ld1             {v3.s}[1], [x2], x3<br>

+    ld1             {v2.s}[1], [x0], x1<br>

+    usubl           v1.8h, v2.8b, v3.8b     /* rows 1 and 5 */<br>

+    ld1             {v5.s}[1], [x2], x3<br>

+    ld1             {v4.s}[1], [x0], x1<br>

+    usubl           v2.8h, v4.8b, v5.8b     /* rows 2 and 6 */<br>

+    ld1             {v7.s}[1], [x2], x3<br>

+    add             v4.8h, v0.8h, v1.8h     /* first butterfly stage, interleaved with the remaining loads */<br>

+    sub             v5.8h, v0.8h, v1.8h<br>

+    ld1             {v6.s}[1], [x0], x1<br>

+    usubl           v3.8h, v6.8b, v7.8b     /* rows 3 and 7 */<br>

+    add         v6.8h, v2.8h, v3.8h<br>

+    sub         v7.8h, v2.8h, v3.8h<br>

+    x265_satd_4x8_8x4_end_neon<br>

+.endm<br>

+/* 4x8 SATD: a single pass of pixel_satd_4x8_neon, result returned in w0. */<br>

+// template<int w, int h><br>

+// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)<br>

+function x265_pixel_satd_4x8_neon<br>

+    pixel_satd_4x8_neon<br>

+    mov               w0, v0.s[0]           /* move the 32-bit sum into the return register */<br>

+    ret<br>

+endfunc<br>

+/* 4x16 SATD: two consecutive 4x8 passes; the macro leaves x0/x2 pointing at row 8 for the second pass. */<br>

+// template<int w, int h><br>

+// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)<br>

+function x265_pixel_satd_4x16_neon<br>

+    eor             w4, w4, w4              /* w4 = 0, running total */<br>

+    pixel_satd_4x8_neon<br>

+    mov               w5, v0.s[0]<br>

+    add             w4, w4, w5<br>

+    pixel_satd_4x8_neon<br>

+    mov               w5, v0.s[0]<br>

+    add             w0, w5, w4              /* return the sum of both halves */<br>

+    ret<br>

+endfunc<br>

+/* 4x32 SATD: four consecutive 4x8 passes accumulated in w4. */<br>

+// template<int w, int h><br>

+// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)<br>

+function x265_pixel_satd_4x32_neon<br>

+    eor             w4, w4, w4              /* w4 = 0, running total */<br>

+.rept 4<br>

+    pixel_satd_4x8_neon<br>

+    mov             w5, v0.s[0]<br>

+    add             w4, w4, w5<br>

+.endr<br>

+    mov             w0, w4<br>

+    ret<br>

+endfunc<br>

+/* 12x16 SATD: three 4-pixel-wide columns (byte offsets 0, 4, 8), each covered by two stacked 4x8 passes. */<br>

+// template<int w, int h><br>

+// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)<br>

+function x265_pixel_satd_12x16_neon<br>

+    mov             x4, x0                  /* keep the original pix1 pointer; the macro advances x0 */<br>

+    mov             x5, x2                  /* keep the original pix2 pointer; the macro advances x2 */<br>

+    eor             w7, w7, w7              /* w7 = 0, running total */<br>

+    pixel_satd_4x8_neon<br>

+    mov             w6, v0.s[0]<br>

+    add             w7, w7, w6<br>

+    pixel_satd_4x8_neon<br>

+    mov             w6, v0.s[0]<br>

+    add             w7, w7, w6<br>

+/* second column: restart from the top, 4 bytes to the right */<br>

+    add             x0, x4, #4<br>

+    add             x2, x5, #4<br>

+    pixel_satd_4x8_neon<br>

+    mov             w6, v0.s[0]<br>

+    add             w7, w7, w6<br>

+    pixel_satd_4x8_neon<br>

+    mov             w6, v0.s[0]<br>

+    add             w7, w7, w6<br>

+/* third column: 8 bytes to the right */<br>

+    add             x0, x4, #8<br>

+    add             x2, x5, #8<br>

+    pixel_satd_4x8_neon<br>

+    mov             w6, v0.s[0]<br>

+    add             w7, w7, w6<br>

+    pixel_satd_4x8_neon<br>

+    mov             w6, v0.s[0]<br>

+    add             w0, w7, w6              /* last partial sum goes straight into the return value */<br>

+    ret<br>

+endfunc<br>

+/* 12x32 SATD: three 4-pixel-wide columns (byte offsets 0, 4, 8), each covered by four stacked 4x8 passes. */<br>

+// template<int w, int h><br>

+// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)<br>

+function x265_pixel_satd_12x32_neon<br>

+    mov             x4, x0                  /* keep the original pix1 pointer; the macro advances x0 */<br>

+    mov             x5, x2                  /* keep the original pix2 pointer; the macro advances x2 */<br>

+    eor             w7, w7, w7              /* w7 = 0, running total */<br>

+.rept 4<br>

+    pixel_satd_4x8_neon<br>

+    mov             w6, v0.s[0]<br>

+    add             w7, w7, w6<br>

+.endr<br>

+/* second column: restart from the top, 4 bytes to the right */<br>

+    add             x0, x4, #4<br>

+    add             x2, x5, #4<br>

+.rept 4<br>

+    pixel_satd_4x8_neon<br>

+    mov             w6, v0.s[0]<br>

+    add             w7, w7, w6<br>

+.endr<br>

+/* third column: 8 bytes to the right */<br>

+    add             x0, x4, #8<br>

+    add             x2, x5, #8<br>

+.rept 4<br>

+    pixel_satd_4x8_neon<br>

+    mov             w6, v0.s[0]<br>

+    add             w7, w7, w6<br>

+.endr<br>

+<br>

+    mov             w0, w7<br>

+    ret<br>

+endfunc<br>

+/* 8x8 SATD: two side-by-side 4x8 passes (byte offsets 0 and 4). */<br>

+// template<int w, int h><br>

+// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)<br>

+function x265_pixel_satd_8x8_neon<br>

+    eor             w4, w4, w4              /* w4 = 0, running total */<br>

+    mov             x6, x0                  /* keep the original pix1 pointer; the macro advances x0 */<br>

+    mov             x7, x2                  /* keep the original pix2 pointer; the macro advances x2 */<br>

+    pixel_satd_4x8_neon<br>

+    mov             w5, v0.s[0]<br>

+    add             w4, w4, w5<br>

+    add             x0, x6, #4              /* right-hand 4x8 half */<br>

+    add             x2, x7, #4<br>

+    pixel_satd_4x8_neon<br>

+    mov             w5, v0.s[0]<br>

+    add             w0, w4, w5<br>

+    ret<br>

+endfunc<br>

+/* 4x4 psy cost: for source (x0, stride x1) and recon (x2, stride x3) computes a transform-domain magnitude sum minus a quarter of the plain pixel sum, then returns the absolute difference of the two values in w0. */<br>

+// int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)<br>

+function x265_psyCost_4x4_neon<br>

+    ld1r            {v4.2s}, [x0], x1       /* source row 0, replicated into both 32-bit lanes */<br>

+    ld1r            {v5.2s}, [x0], x1       /* source row 1 */<br>

+    ld1             {v4.s}[1], [x0], x1     /* v4 = source rows 0 and 2 */<br>

+    ld1             {v5.s}[1], [x0], x1     /* v5 = source rows 1 and 3 */<br>

+<br>

+    ld1r            {v6.2s}, [x2], x3       /* same layout for recon */<br>

+    ld1r            {v7.2s}, [x2], x3<br>

+    ld1             {v6.s}[1], [x2], x3     /* v6 = recon rows 0 and 2 */<br>

+    ld1             {v7.s}[1], [x2], x3     /* v7 = recon rows 1 and 3 */<br>

+<br>

+    uaddl           v2.8h, v4.8b, v5.8b     /* source: row0+row1 in the low half, row2+row3 in the high half */<br>

+    usubl           v3.8h, v4.8b, v5.8b     /* source: row0-row1 low, row2-row3 high */<br>

+    uaddl           v18.8h, v6.8b, v7.8b    /* recon, same arrangement */<br>

+    usubl           v19.8h, v6.8b, v7.8b<br>

+/* second vertical stage: combine the low and high halves of each register (source first) */<br>

+    mov             v20.d[0], v2.d[1]<br>

+    add             v0.4h, v2.4h, v20.4h<br>

+    sub             v1.4h, v2.4h, v20.4h<br>

+    mov             v21.d[0], v3.d[1]<br>

+    add             v22.4h, v3.4h, v21.4h<br>

+    sub             v23.4h, v3.4h, v21.4h<br>

+/* same stage for recon */<br>

+    mov             v24.d[0], v18.d[1]<br>

+    add             v16.4h, v18.4h, v24.4h<br>

+    sub             v17.4h, v18.4h, v24.4h<br>

+    mov             v25.d[0], v19.d[1]<br>

+    add             v26.4h, v19.4h, v25.4h<br>

+    sub             v27.4h, v19.4h, v25.4h<br>

+/* pack the four result rows into two 128-bit registers and interleave 16-bit lanes */<br>

+    mov             v0.d[1], v22.d[0]<br>

+    mov             v1.d[1], v23.d[0]<br>

+    trn1            v22.8h, v0.8h, v1.8h<br>

+    trn2            v23.8h, v0.8h, v1.8h<br>

+    mov             v16.d[1], v26.d[0]<br>

+    mov             v17.d[1], v27.d[0]<br>

+    trn1            v26.8h, v16.8h, v17.8h<br>

+    trn2            v27.8h, v16.8h, v17.8h<br>

+/* butterfly on the interleaved lanes */<br>

+    add             v2.8h, v22.8h, v23.8h<br>

+    sub             v3.8h, v22.8h, v23.8h<br>

+    add             v18.8h, v26.8h, v27.8h<br>

+    sub             v19.8h, v26.8h, v27.8h<br>

+<br>

+    uaddl           v20.8h, v4.8b, v5.8b    /* plain pixel sums: v4-v7 still hold the raw pixels; v20 = source */<br>

+    uaddl           v21.8h, v6.8b, v7.8b    /* v21 = recon */<br>

+<br>

+    trn1            v0.4s, v2.4s, v3.4s<br>

+    trn2            v1.4s, v2.4s, v3.4s<br>

+    trn1            v16.4s, v18.4s, v19.4s<br>

+    trn2            v17.4s, v18.4s, v19.4s<br>

+    abs             v0.8h, v0.8h<br>

+    abs             v16.8h, v16.8h<br>

+    abs             v1.8h, v1.8h<br>

+    abs             v17.8h, v17.8h<br>

+<br>

+    uaddlv          s20, v20.8h             /* s20 = sum of the 16 source pixels */<br>

+    uaddlv          s21, v21.8h             /* s21 = sum of the 16 recon pixels */<br>

+    mov             v20.s[1], v21.s[0]      /* v20.2s = source sum, recon sum */<br>

+<br>

+    smax            v0.8h, v0.8h, v1.8h     /* larger magnitude of each pair; presumably stands in for the last butterfly stage - confirm */<br>

+    smax            v16.8h, v16.8h, v17.8h<br>

+<br>

+    trn1            v4.2d, v0.2d, v16.2d<br>

+    trn2            v5.2d, v0.2d, v16.2d<br>

+    add             v0.8h, v4.8h, v5.8h     /* low 4 lanes: source partial sums, high 4 lanes: recon partial sums */<br>

+    mov             v4.d[0], v0.d[1]<br>

+    uaddlv          s0, v0.4h               /* source total */<br>

+    uaddlv          s4, v4.4h               /* recon total */<br>

+<br>

+    ushr            v20.2s, v20.2s, #2      /* pixel sums divided by 4 */<br>

+    mov             v0.s[1], v4.s[0]<br>

+    sub             v0.2s, v0.2s, v20.2s    /* total minus pixel sum / 4, for source (lane 0) and recon (lane 1) */<br>

+    mov             w0, v0.s[0]<br>

+    mov             w1, v0.s[1]<br>

+    subs            w0, w0, w1<br>

+    cneg            w0, w0, mi              /* negate when the difference is negative: absolute value */<br>

+<br>

+    ret<br>

+endfunc<br>

+/* Quantises numCoeff coefficients, 4 per loop iteration. Per the prototype below and AAPCS64: x0=coef, x1=quantCoeff, x2=deltaU, x3=qCoef, w4=qBits, w5=add, w6=numCoeff. Returns the number of non-zero levels in w0. */<br>

+// uint32_t quant_c(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)<br>

+function x265_quant_neon<br>

+    mov             w9, #1<br>

+    lsl             w9, w9, w4<br>

+    dup             v0.2s, w9               /* v0 = 1 shifted left by qBits */<br>

+    neg             w9, w4<br>

+    dup             v1.4s, w9               /* v1 = -qBits; sshl with a negative count shifts right */<br>

+    add             w9, w9, #8<br>

+    dup             v2.4s, w9               /* v2 = 8 - qBits, i.e. a right shift by qBits - 8 */<br>

+    dup             v3.4s, w5               /* v3 = add (rounding offset) */<br>

+<br>

+    lsr             w6, w6, #2              /* iterations = numCoeff / 4; NOTE(review): the loop tests at the bottom, so numCoeff below 4 would wrap the counter */<br>

+    eor             v4.16b, v4.16b, v4.16b  /* v4 collects -1 for every zero level */<br>

+    eor             w10, w10, w10           /* w10 counts coefficients processed */<br>

+    eor             v17.16b, v17.16b, v17.16b   /* constant zero for the cmeq below */<br>

+<br>

+.loop_quant:<br>

+<br>

+    ld1             {v18.4h}, [x0], #8<br>

+    ld1             {v7.4s}, [x1], #16<br>

+    sxtl            v6.4s, v18.4h<br>

+<br>

+    cmlt            v5.4s, v6.4s, #0        /* v5 = all ones where coef is negative */<br>

+<br>

+    abs             v6.4s, v6.4s<br>

+<br>

+<br>

+    mul             v6.4s, v6.4s, v7.4s     /* tmplevel = abs(coef) * quantCoeff */<br>

+<br>

+    add             v7.4s, v6.4s, v3.4s<br>

+    sshl            v7.4s, v7.4s, v1.4s     /* level = (tmplevel + add) shifted right by qBits */<br>

+<br>

+    mls             v6.4s, v7.4s, v0.s[0]   /* tmplevel - level * (1 shifted left by qBits) */<br>

+    sshl            v16.4s, v6.4s, v2.4s<br>

+    st1             {v16.4s}, [x2], #16     /* store deltaU */<br>

+<br>

+    // numsig<br>

+    cmeq            v16.4s, v7.4s, v17.4s<br>

+    add             v4.4s, v4.4s, v16.4s<br>

+    add             w10, w10, #4<br>

+<br>

+    // level *= sign<br>

+    eor             v16.16b, v7.16b, v5.16b<br>

+    sub             v16.4s, v16.4s, v5.4s   /* (level xor mask) - mask negates the lanes where mask is all ones */<br>

+    sqxtn           v5.4h, v16.4s           /* saturating narrow to int16 */<br>

+    st1             {v5.4h}, [x3], #8       /* store qCoef */<br>

+<br>

+    subs            w6, w6, #1<br>

+    b.ne            .loop_quant<br>

+<br>

+    addv            s4, v4.4s<br>

+    mov             w9, v4.s[0]<br>

+    add             w0, w10, w9             /* coefficients processed minus zero levels = number of significant levels */<br>

+    ret<br>

+endfunc<br>

+/* SATD of one 4x4 block: x0/x1 = pix1 and its stride, x2/x3 = pix2 and its stride. The total is left in v0.d[0]; both pointers are advanced by 4 rows. Overwrites v0-v7. */<br>

+.macro satd_4x4_neon<br>

+    ld1             {v1.s}[0], [x2], x3     /* pix2 row 0 */<br>

+    ld1             {v0.s}[0], [x0], x1     /* pix1 row 0 */<br>

+    ld1             {v3.s}[0], [x2], x3     /* pix2 row 1 */<br>

+    ld1             {v2.s}[0], [x0], x1     /* pix1 row 1 */<br>

+<br>

+    ld1             {v1.s}[1], [x2], x3     /* rows 2 and 3 go into lane 1 of the same registers */<br>

+    ld1             {v0.s}[1], [x0], x1<br>

+    ld1             {v3.s}[1], [x2], x3<br>

+    ld1             {v2.s}[1], [x0], x1<br>

+<br>

+    usubl           v4.8h, v0.8b, v1.8b     /* pix1 - pix2 for rows 0 and 2 */<br>

+    usubl           v5.8h, v2.8b, v3.8b     /* pix1 - pix2 for rows 1 and 3 */<br>

+<br>

+    add             v6.8h, v4.8h, v5.8h<br>

+    sub             v7.8h, v4.8h, v5.8h<br>

+/* combine low and high halves; only the low four lanes of the results are used from here on */<br>

+    mov             v4.d[0], v6.d[1]<br>

+    add             v0.8h, v6.8h, v4.8h<br>

+    sub             v2.8h, v6.8h, v4.8h<br>

+<br>

+    mov             v5.d[0], v7.d[1]<br>

+    add             v1.8h, v7.8h, v5.8h<br>

+    sub             v3.8h, v7.8h, v5.8h<br>

+<br>

+    trn1            v4.4h, v0.4h, v1.4h<br>

+    trn2            v5.4h, v0.4h, v1.4h<br>

+<br>

+    trn1            v6.4h, v2.4h, v3.4h<br>

+    trn2            v7.4h, v2.4h, v3.4h<br>

+<br>

+    add             v0.4h, v4.4h, v5.4h<br>

+    sub             v1.4h, v4.4h, v5.4h<br>

+<br>

+    add             v2.4h, v6.4h, v7.4h<br>

+    sub             v3.4h, v6.4h, v7.4h<br>

+<br>

+    trn1            v4.2s, v0.2s, v1.2s<br>

+    trn2            v5.2s, v0.2s, v1.2s<br>

+<br>

+    trn1            v6.2s, v2.2s, v3.2s<br>

+    trn2            v7.2s, v2.2s, v3.2s<br>

+<br>

+    abs             v4.4h, v4.4h<br>

+    abs             v5.4h, v5.4h<br>

+    abs             v6.4h, v6.4h<br>

+    abs             v7.4h, v7.4h<br>

+<br>

+    smax            v1.4h, v4.4h, v5.4h     /* larger magnitude of each pair; presumably stands in for the last butterfly stage - confirm */<br>

+    smax            v2.4h, v6.4h, v7.4h<br>

+<br>

+    add             v0.4h, v1.4h, v2.4h<br>

+    uaddlp          v0.2s, v0.4h            /* pairwise widening adds reduce the four lanes to one total */<br>

+    uaddlp          v0.1d, v0.2s<br>

+.endm<br>

+/* 4x4 SATD: a single pass of satd_4x4_neon, total returned in x0. */<br>

+// int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)<br>

+function x265_pixel_satd_4x4_neon<br>

+    satd_4x4_neon<br>

+    umov            x0, v0.d[0]             /* move the total into the return register */<br>

+    ret<br>

+endfunc<br>

+/* 8x4 SATD: two side-by-side 4x4 passes (byte offsets 0 and 4), totals added. */<br>

+// int satd_8x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)<br>

+function x265_pixel_satd_8x4_neon<br>

+    mov             x4, x0                  /* keep the original pix1 pointer; the macro advances x0 */<br>

+    mov             x5, x2                  /* keep the original pix2 pointer; the macro advances x2 */<br>

+    satd_4x4_neon<br>

+    add             x0, x4, #4              /* right-hand 4x4 half */<br>

+    add             x2, x5, #4<br>

+    umov            x6, v0.d[0]             /* save the left-half total before v0 is reused */<br>

+    satd_4x4_neon<br>

+    umov            x0, v0.d[0]<br>

+    add             x0, x0, x6<br>

+    ret<br>

+endfunc<br>

diff --git a/source/common/aarch64/pixel-util.h b/source/common/aarch64/pixel-util.h<br>

new file mode 100644<br>

index 000000000..043488468<br>

--- /dev/null<br>

+++ b/source/common/aarch64/pixel-util.h<br>

@@ -0,0 +1,40 @@<br>

+/*****************************************************************************<br>

+ * Copyright (C) 2020 MulticoreWare, Inc<br>

+ *<br>

+ * Authors: Yimeng Su <<a href="mailto:yimeng.su@huawei.com" target="_blank">yimeng.su@huawei.com</a>><br>

+ *          Hongbin Liu <<a href="mailto:liuhongbin1@huawei.com" target="_blank">liuhongbin1@huawei.com</a>><br>

+ *<br>

+ * This program is free software; you can redistribute it and/or modify<br>

+ * it under the terms of the GNU General Public License as published by<br>

+ * the Free Software Foundation; either version 2 of the License, or<br>

+ * (at your option) any later version.<br>

+ *<br>

+ * This program is distributed in the hope that it will be useful,<br>

+ * but WITHOUT ANY WARRANTY; without even the implied warranty of<br>

+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the<br>

+ * GNU General Public License for more details.<br>

+ *<br>

+ * You should have received a copy of the GNU General Public License<br>

+ * along with this program; if not, write to the Free Software<br>

+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.<br>

+ *<br>

+ * This program is also available under a commercial proprietary license.<br>

+ * For more information, contact us at license @ <a href="http://x265.com" rel="noreferrer" target="_blank">x265.com</a>.<br>

+ *****************************************************************************/<br>

+<br>

+#ifndef X265_PIXEL_UTIL_AARCH64_H<br>

+#define X265_PIXEL_UTIL_AARCH64_H<br>

+/* Prototypes for the NEON routines defined in pixel-util.S: SATD for the listed block sizes, coefficient quantisation and the 4x4 psy cost. */<br>

+int x265_pixel_satd_4x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);<br>

+int x265_pixel_satd_4x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);<br>

+int x265_pixel_satd_4x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);<br>

+int x265_pixel_satd_4x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);<br>

+int x265_pixel_satd_8x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);<br>

+int x265_pixel_satd_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);<br>

+int x265_pixel_satd_12x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);<br>

+int x265_pixel_satd_12x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);<br>

+/* NOTE(review): the psyCost declaration below is the only one using PFX(); the assembly symbol is spelled x265_psyCost_4x4_neon, so confirm PFX() expands to that same prefix in every build configuration. */<br>

+uint32_t x265_quant_neon(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);<br>

+int PFX(psyCost_4x4_neon)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);<br>

+<br>

+#endif // ifndef X265_PIXEL_UTIL_AARCH64_H<br>

diff --git a/source/common/aarch64/pixel.h b/source/common/aarch64/pixel.h<br>

new file mode 100644<br>

index 000000000..179c2f4ec<br>

--- /dev/null<br>

+++ b/source/common/aarch64/pixel.h<br>

@@ -0,0 +1,105 @@<br>

+/*****************************************************************************<br>

+ * Copyright (C) 2020 MulticoreWare, Inc<br>

+ *<br>

+ * Authors: Hongbin Liu <<a href="mailto:liuhongbin1@huawei.com" target="_blank">liuhongbin1@huawei.com</a>><br>

+ *<br>

+ * This program is free software; you can redistribute it and/or modify<br>

+ * it under the terms of the GNU General Public License as published by<br>

+ * the Free Software Foundation; either version 2 of the License, or<br>

+ * (at your option) any later version.<br>

+ *<br>

+ * This program is distributed in the hope that it will be useful,<br>

+ * but WITHOUT ANY WARRANTY; without even the implied warranty of<br>

+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the<br>

+ * GNU General Public License for more details.<br>

+ *<br>

+ * You should have received a copy of the GNU General Public License<br>

+ * along with this program; if not, write to the Free Software<br>

+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.<br>

+ *<br>

+ * This program is also available under a commercial proprietary license.<br>

+ * For more information, contact us at license @ <a href="http://x265.com" rel="noreferrer" target="_blank">x265.com</a>.<br>

+ *****************************************************************************/<br>

+<br>

+#ifndef X265_I386_PIXEL_AARCH64_H<br>

+#define X265_I386_PIXEL_AARCH64_H<br>

+// NEON pixel_avg_pp prototypes, one per block size from 4x4 to 64x64: dst plus two sources, each with its own stride. The trailing int is unnamed here; its meaning is not visible in this header.<br>

+void x265_pixel_avg_pp_4x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>

+void x265_pixel_avg_pp_4x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>

+void x265_pixel_avg_pp_4x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>

+void x265_pixel_avg_pp_8x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>

+void x265_pixel_avg_pp_8x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>

+void x265_pixel_avg_pp_8x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>

+void x265_pixel_avg_pp_8x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>

+void x265_pixel_avg_pp_12x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>

+void x265_pixel_avg_pp_16x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>

+void x265_pixel_avg_pp_16x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>

+void x265_pixel_avg_pp_16x12_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>

+void x265_pixel_avg_pp_16x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>

+void x265_pixel_avg_pp_16x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>

+void x265_pixel_avg_pp_16x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>

+void x265_pixel_avg_pp_24x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>

+void x265_pixel_avg_pp_32x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>

+void x265_pixel_avg_pp_32x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>

+void x265_pixel_avg_pp_32x24_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>

+void x265_pixel_avg_pp_32x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>

+void x265_pixel_avg_pp_32x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>

+void x265_pixel_avg_pp_48x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>

+void x265_pixel_avg_pp_64x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>

+void x265_pixel_avg_pp_64x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>

+void x265_pixel_avg_pp_64x48_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>

+void x265_pixel_avg_pp_64x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);<br>

+// NEON sad_x3 prototypes: one fenc block against three references that share frefstride, output through res. sad-a.S in this patch instantiates only the 8x4, 8x8, 8x16 and 8x32 sizes.<br>

+void x265_sad_x3_4x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x3_4x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x3_4x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x3_8x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x3_8x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x3_8x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x3_8x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x3_12x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x3_16x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x3_16x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x3_16x12_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x3_16x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x3_16x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x3_16x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x3_24x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x3_32x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x3_32x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x3_32x24_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x3_32x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x3_32x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x3_48x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x3_64x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x3_64x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x3_64x48_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x3_64x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);<br>

+// NEON sad_x4 prototypes: one fenc block against four references that share frefstride, output through res. sad-a.S in this patch instantiates only the 8x4, 8x8, 8x16 and 8x32 sizes.<br>

+void x265_sad_x4_4x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x4_4x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x4_4x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x4_8x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x4_8x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x4_8x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x4_8x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x4_12x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x4_16x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x4_16x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x4_16x12_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x4_16x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x4_16x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x4_16x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x4_24x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x4_32x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x4_32x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x4_32x24_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x4_32x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x4_32x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x4_48x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x4_64x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x4_64x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x4_64x48_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>

+void x265_sad_x4_64x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);<br>

+<br>

+#endif // ifndef X265_I386_PIXEL_AARCH64_H<br>

diff --git a/source/common/aarch64/sad-a.S b/source/common/aarch64/sad-a.S<br>

new file mode 100644<br>

index 000000000..c27cce5ce<br>

--- /dev/null<br>

+++ b/source/common/aarch64/sad-a.S<br>

@@ -0,0 +1,105 @@<br>

+/*****************************************************************************<br>

+ * Copyright (C) 2020 MulticoreWare, Inc<br>

+ *<br>

+ * Authors: Hongbin Liu <<a href="mailto:liuhongbin1@huawei.com" target="_blank">liuhongbin1@huawei.com</a>><br>

+ *<br>

+ * This program is free software; you can redistribute it and/or modify<br>

+ * it under the terms of the GNU General Public License as published by<br>

+ * the Free Software Foundation; either version 2 of the License, or<br>

+ * (at your option) any later version.<br>

+ *<br>

+ * This program is distributed in the hope that it will be useful,<br>

+ * but WITHOUT ANY WARRANTY; without even the implied warranty of<br>

+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the<br>

+ * GNU General Public License for more details.<br>

+ *<br>

+ * You should have received a copy of the GNU General Public License<br>

+ * along with this program; if not, write to the Free Software<br>

+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.<br>

+ *<br>

+ * This program is also available under a commercial proprietary license.<br>

+ * For more information, contact us at license @ <a href="http://x265.com" rel="noreferrer" target="_blank">x265.com</a>.<br>

+ *****************************************************************************/<br>

+<br>

+#include "asm.S"<br>

+<br>

+.section .rodata<br>

+<br>

+.align 4<br>

+<br>

+.text<br>

+<br>

+.macro SAD_X_START_8 x              // row 0 of an 8-wide SAD against x references: initialises v16..v18 (and v19 when x == 4)<br>

+    ld1             {v0.8b}, [x0], x9      // v0 = 8 bytes at [x0], then x0 += x9 (x9 is set by the macro that expands this one)<br>

+.if \x == 3<br>

+    ld1             {v1.8b}, [x1], x4      // three references: x4 is the post-index step for x1..x3<br>

+    ld1             {v2.8b}, [x2], x4<br>

+    ld1             {v3.8b}, [x3], x4<br>

+.elseif \x == 4<br>

+    ld1             {v1.8b}, [x1], x5      // four references: x4 is the fourth pointer, x5 is the step<br>

+    ld1             {v2.8b}, [x2], x5<br>

+    ld1             {v3.8b}, [x3], x5<br>

+    ld1             {v4.8b}, [x4], x5<br>

+.endif<br>

+    uabdl           v16.8h, v0.8b, v1.8b   // widening absolute difference; overwrites v16, so the accumulators need no zeroing<br>

+    uabdl           v17.8h, v0.8b, v2.8b<br>

+    uabdl           v18.8h, v0.8b, v3.8b<br>

+.if \x == 4<br>

+    uabdl           v19.8h, v0.8b, v4.8b<br>

+.endif<br>

+.endm<br>

+<br>

+.macro SAD_X_8 x                    // one further row: same loads as SAD_X_START_8, but adds into v16..v18 (and v19 when x == 4)<br>

+    ld1             {v0.8b}, [x0], x9      // v0 = 8 bytes at [x0], then x0 += x9<br>

+.if \x == 3<br>

+    ld1             {v1.8b}, [x1], x4      // three references: x4 is the post-index step for x1..x3<br>

+    ld1             {v2.8b}, [x2], x4<br>

+    ld1             {v3.8b}, [x3], x4<br>

+.elseif \x == 4<br>

+    ld1             {v1.8b}, [x1], x5      // four references: x4 is the fourth pointer, x5 is the step<br>

+    ld1             {v2.8b}, [x2], x5<br>

+    ld1             {v3.8b}, [x3], x5<br>

+    ld1             {v4.8b}, [x4], x5<br>

+.endif<br>

+    uabal           v16.8h, v0.8b, v1.8b   // widening absolute difference, accumulated into the eight 16-bit lanes<br>

+    uabal           v17.8h, v0.8b, v2.8b<br>

+    uabal           v18.8h, v0.8b, v3.8b<br>

+.if \x == 4<br>

+    uabal           v19.8h, v0.8b, v4.8b<br>

+.endif<br>

+.endm<br>

+<br>

+.macro SAD_X_8xN x, h               // emits x265_sad_x{x}_8x{h}_neon: row 0, then h - 1 unrolled rows, then reduce and store<br>

+function x265_sad_x\x\()_8x\h\()_neon      // function/endfunc are not defined in this file (asm.S is included above)<br>

+    mov             x9, #FENC_STRIDE       // step applied to x0 by the row macros; FENC_STRIDE is defined outside this file<br>

+    SAD_X_START_8 \x<br>

+.rept \h - 1<br>

+    SAD_X_8 \x<br>

+.endr<br>

+    uaddlv          s0, v16.8h             // sum the eight 16-bit lanes into one 32-bit total per reference<br>

+    uaddlv          s1, v17.8h             // lanes cannot wrap: at most h * 255 each, 8160 for the largest h instantiated in this file<br>

+    uaddlv          s2, v18.8h<br>

+.if \x == 4<br>

+    uaddlv          s3, v19.8h<br>

+.endif<br>

+// Output pointer: x5 when x == 3, x6 when x == 4, i.e. the register right after the stride register used by the row macros.<br>

+.if \x == 3<br>

+    stp             s0, s1, [x5]           // totals 0 and 1 at byte offsets 0 and 4<br>

+    str             s2, [x5, #8]           // total 2 at byte offset 8<br>

+.elseif \x == 4<br>

+    stp             s0, s1, [x6]           // totals 0 and 1 at byte offsets 0 and 4<br>

+    stp             s2, s3, [x6, #8]       // totals 2 and 3 at byte offsets 8 and 12<br>

+.endif<br>

+    ret<br>

+endfunc<br>

+.endm<br>

+// Instantiate the 8-wide kernels: arguments are the reference count (3 or 4) and the block height in rows.<br>

+SAD_X_8xN 3 4<br>

+SAD_X_8xN 3 8<br>

+SAD_X_8xN 3 16<br>

+SAD_X_8xN 3 32<br>

+<br>

+SAD_X_8xN 4 4<br>

+SAD_X_8xN 4 8<br>

+SAD_X_8xN 4 16<br>

+SAD_X_8xN 4 32<br>

-- <br>

2.21.0.windows.1<br>

<br>

</div></div>