[x265] [PATCH] Add aarch64 support - Part 1

Aruna Matheswaran aruna at multicorewareinc.com
Thu Mar 19 09:16:56 CET 2020


Pushed the patch series to default. Thanks!

On Thu, Feb 27, 2020 at 8:02 AM Xiyuan Wang <wangxiyuan1007 at gmail.com>
wrote:

> From: wangxiyuan <wangxiyuan at huawei.com>
>
> This patch adds some common assembly optimization functions for the aarch64
> platform. These functions won't work until Part 2 of the patch series is merged.
> ---
>  source/common/aarch64/asm-primitives.cpp | 219 ++++++++++++
>  source/common/aarch64/asm.S              |  69 ++++
>  source/common/aarch64/ipfilter8.S        | 414 ++++++++++++++++++++++
>  source/common/aarch64/ipfilter8.h        |  55 +++
>  source/common/aarch64/mc-a.S             |  63 ++++
>  source/common/aarch64/pixel-util.S       | 419 +++++++++++++++++++++++
>  source/common/aarch64/pixel-util.h       |  40 +++
>  source/common/aarch64/pixel.h            | 105 ++++++
>  source/common/aarch64/sad-a.S            | 105 ++++++
>  9 files changed, 1489 insertions(+)
>  create mode 100644 source/common/aarch64/asm-primitives.cpp
>  create mode 100644 source/common/aarch64/asm.S
>  create mode 100644 source/common/aarch64/ipfilter8.S
>  create mode 100644 source/common/aarch64/ipfilter8.h
>  create mode 100644 source/common/aarch64/mc-a.S
>  create mode 100644 source/common/aarch64/pixel-util.S
>  create mode 100644 source/common/aarch64/pixel-util.h
>  create mode 100644 source/common/aarch64/pixel.h
>  create mode 100644 source/common/aarch64/sad-a.S
>
> diff --git a/source/common/aarch64/asm-primitives.cpp
> b/source/common/aarch64/asm-primitives.cpp
> new file mode 100644
> index 000000000..6fe8c968c
> --- /dev/null
> +++ b/source/common/aarch64/asm-primitives.cpp
> @@ -0,0 +1,219 @@
>
> +/*****************************************************************************
> + * Copyright (C) 2020 MulticoreWare, Inc
> + *
> + * Authors: Hongbin Liu <liuhongbin1 at huawei.com>
> + *          Yimeng Su <yimeng.su at huawei.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111,
> USA.
> + *
> + * This program is also available under a commercial proprietary license.
> + * For more information, contact us at license @ x265.com.
> +
> *****************************************************************************/
> +
> +#include "common.h"
> +#include "primitives.h"
> +#include "x265.h"
> +#include "cpu.h"
> +
> +
> +#if defined(__GNUC__)
> +#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 +
> __GNUC_PATCHLEVEL__)
> +#endif
> +
> +#define GCC_4_9_0 40900
> +#define GCC_5_1_0 50100
> +
> +extern "C" {
> +#include "pixel.h"
> +#include "pixel-util.h"
> +#include "ipfilter8.h"
> +}
> +
> +namespace X265_NS {
> +// private x265 namespace
> +
> +
> +template<int size>
> +void interp_8tap_hv_pp_cpu(const pixel* src, intptr_t srcStride, pixel*
> dst, intptr_t dstStride, int idxX, int idxY)
> +{
> +    ALIGN_VAR_32(int16_t, immed[MAX_CU_SIZE * (MAX_CU_SIZE + NTAPS_LUMA -
> 1)]);
> +    const int halfFilterSize = NTAPS_LUMA >> 1;
> +    const int immedStride = MAX_CU_SIZE;
> +
> +    primitives.pu[size].luma_hps(src, srcStride, immed, immedStride,
> idxX, 1);
> +    primitives.pu[size].luma_vsp(immed + (halfFilterSize - 1) *
> immedStride, immedStride, dst, dstStride, idxY);
> +}
> +
> +
> +/* Temporary workaround because luma_vsp assembly primitive has not been
> completed
> + * but interp_8tap_hv_pp_cpu uses mixed C primitive and assembly
> primitive.
> + * Otherwise, segment fault occurs. */
> +void setupAliasCPrimitives(EncoderPrimitives &cp, EncoderPrimitives
> &asmp, int cpuMask)
> +{
> +    if (cpuMask & X265_CPU_NEON)
> +    {
> +        asmp.pu[LUMA_8x4].luma_vsp   = cp.pu[LUMA_8x4].luma_vsp;
> +        asmp.pu[LUMA_8x8].luma_vsp   = cp.pu[LUMA_8x8].luma_vsp;
> +        asmp.pu[LUMA_8x16].luma_vsp  = cp.pu[LUMA_8x16].luma_vsp;
> +        asmp.pu[LUMA_8x32].luma_vsp  = cp.pu[LUMA_8x32].luma_vsp;
> +        asmp.pu[LUMA_12x16].luma_vsp = cp.pu[LUMA_12x16].luma_vsp;
> +#if !AUTO_VECTORIZE || GCC_VERSION < GCC_5_1_0 /* gcc_version < gcc-5.1.0
> */
> +        asmp.pu[LUMA_16x4].luma_vsp  = cp.pu[LUMA_16x4].luma_vsp;
> +        asmp.pu[LUMA_16x8].luma_vsp  = cp.pu[LUMA_16x8].luma_vsp;
> +        asmp.pu[LUMA_16x12].luma_vsp = cp.pu[LUMA_16x12].luma_vsp;
> +        asmp.pu[LUMA_16x16].luma_vsp = cp.pu[LUMA_16x16].luma_vsp;
> +        asmp.pu[LUMA_16x32].luma_vsp = cp.pu[LUMA_16x32].luma_vsp;
> +        asmp.pu[LUMA_16x64].luma_vsp = cp.pu[LUMA_16x64].luma_vsp;
> +        asmp.pu[LUMA_32x16].luma_vsp = cp.pu[LUMA_32x16].luma_vsp;
> +        asmp.pu[LUMA_32x24].luma_vsp = cp.pu[LUMA_32x24].luma_vsp;
> +        asmp.pu[LUMA_32x32].luma_vsp = cp.pu[LUMA_32x32].luma_vsp;
> +        asmp.pu[LUMA_32x64].luma_vsp = cp.pu[LUMA_32x64].luma_vsp;
> +        asmp.pu[LUMA_48x64].luma_vsp = cp.pu[LUMA_48x64].luma_vsp;
> +        asmp.pu[LUMA_64x16].luma_vsp = cp.pu[LUMA_64x16].luma_vsp;
> +        asmp.pu[LUMA_64x32].luma_vsp = cp.pu[LUMA_64x32].luma_vsp;
> +        asmp.pu[LUMA_64x48].luma_vsp = cp.pu[LUMA_64x48].luma_vsp;
> +        asmp.pu[LUMA_64x64].luma_vsp = cp.pu[LUMA_64x64].luma_vsp;
> +#if !AUTO_VECTORIZE || GCC_VERSION < GCC_4_9_0 /* gcc_version < gcc-4.9.0
> */
> +        asmp.pu[LUMA_4x4].luma_vsp   = cp.pu[LUMA_4x4].luma_vsp;
> +        asmp.pu[LUMA_4x8].luma_vsp   = cp.pu[LUMA_4x8].luma_vsp;
> +        asmp.pu[LUMA_4x16].luma_vsp  = cp.pu[LUMA_4x16].luma_vsp;
> +        asmp.pu[LUMA_24x32].luma_vsp = cp.pu[LUMA_24x32].luma_vsp;
> +        asmp.pu[LUMA_32x8].luma_vsp  = cp.pu[LUMA_32x8].luma_vsp;
> +#endif
> +#endif
> +    }
> +}
> +
> +
> +void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask)
> +{
> +    if (cpuMask & X265_CPU_NEON)
> +    {
> +        p.pu[LUMA_4x4].satd   = PFX(pixel_satd_4x4_neon);
> +        p.pu[LUMA_4x8].satd   = PFX(pixel_satd_4x8_neon);
> +        p.pu[LUMA_4x16].satd  = PFX(pixel_satd_4x16_neon);
> +        p.pu[LUMA_8x4].satd   = PFX(pixel_satd_8x4_neon);
> +        p.pu[LUMA_8x8].satd   = PFX(pixel_satd_8x8_neon);
> +        p.pu[LUMA_12x16].satd = PFX(pixel_satd_12x16_neon);
> +
> +        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd    =
> PFX(pixel_satd_4x4_neon);
> +        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd    =
> PFX(pixel_satd_4x8_neon);
> +        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd   =
> PFX(pixel_satd_4x16_neon);
> +        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd    =
> PFX(pixel_satd_8x4_neon);
> +        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd    =
> PFX(pixel_satd_8x8_neon);
> +        p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd  =
> PFX(pixel_satd_12x16_neon);
> +
> +        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd    =
> PFX(pixel_satd_4x4_neon);
> +        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd    =
> PFX(pixel_satd_4x8_neon);
> +        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd   =
> PFX(pixel_satd_4x16_neon);
> +        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd   =
> PFX(pixel_satd_4x32_neon);
> +        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd    =
> PFX(pixel_satd_8x4_neon);
> +        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd    =
> PFX(pixel_satd_8x8_neon);
> +        p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd  =
> PFX(pixel_satd_12x32_neon);
> +
> +        p.pu[LUMA_4x4].pixelavg_pp[NONALIGNED]   =
> PFX(pixel_avg_pp_4x4_neon);
> +        p.pu[LUMA_4x8].pixelavg_pp[NONALIGNED]   =
> PFX(pixel_avg_pp_4x8_neon);
> +        p.pu[LUMA_4x16].pixelavg_pp[NONALIGNED]  =
> PFX(pixel_avg_pp_4x16_neon);
> +        p.pu[LUMA_8x4].pixelavg_pp[NONALIGNED]   =
> PFX(pixel_avg_pp_8x4_neon);
> +        p.pu[LUMA_8x8].pixelavg_pp[NONALIGNED]   =
> PFX(pixel_avg_pp_8x8_neon);
> +        p.pu[LUMA_8x16].pixelavg_pp[NONALIGNED]  =
> PFX(pixel_avg_pp_8x16_neon);
> +        p.pu[LUMA_8x32].pixelavg_pp[NONALIGNED]  =
> PFX(pixel_avg_pp_8x32_neon);
> +
> +        p.pu[LUMA_4x4].pixelavg_pp[ALIGNED]   =
> PFX(pixel_avg_pp_4x4_neon);
> +        p.pu[LUMA_4x8].pixelavg_pp[ALIGNED]   =
> PFX(pixel_avg_pp_4x8_neon);
> +        p.pu[LUMA_4x16].pixelavg_pp[ALIGNED]  =
> PFX(pixel_avg_pp_4x16_neon);
> +        p.pu[LUMA_8x4].pixelavg_pp[ALIGNED]   =
> PFX(pixel_avg_pp_8x4_neon);
> +        p.pu[LUMA_8x8].pixelavg_pp[ALIGNED]   =
> PFX(pixel_avg_pp_8x8_neon);
> +        p.pu[LUMA_8x16].pixelavg_pp[ALIGNED]  =
> PFX(pixel_avg_pp_8x16_neon);
> +        p.pu[LUMA_8x32].pixelavg_pp[ALIGNED]  =
> PFX(pixel_avg_pp_8x32_neon);
> +
> +        p.pu[LUMA_8x4].sad_x3   = PFX(sad_x3_8x4_neon);
> +        p.pu[LUMA_8x8].sad_x3   = PFX(sad_x3_8x8_neon);
> +        p.pu[LUMA_8x16].sad_x3  = PFX(sad_x3_8x16_neon);
> +        p.pu[LUMA_8x32].sad_x3  = PFX(sad_x3_8x32_neon);
> +
> +        p.pu[LUMA_8x4].sad_x4   = PFX(sad_x4_8x4_neon);
> +        p.pu[LUMA_8x8].sad_x4   = PFX(sad_x4_8x8_neon);
> +        p.pu[LUMA_8x16].sad_x4  = PFX(sad_x4_8x16_neon);
> +        p.pu[LUMA_8x32].sad_x4  = PFX(sad_x4_8x32_neon);
> +
> +        // quant
> +        p.quant = PFX(quant_neon);
> +        // luma_hps
> +        p.pu[LUMA_4x4].luma_hps   = PFX(interp_8tap_horiz_ps_4x4_neon);
> +        p.pu[LUMA_4x8].luma_hps   = PFX(interp_8tap_horiz_ps_4x8_neon);
> +        p.pu[LUMA_4x16].luma_hps  = PFX(interp_8tap_horiz_ps_4x16_neon);
> +        p.pu[LUMA_8x4].luma_hps   = PFX(interp_8tap_horiz_ps_8x4_neon);
> +        p.pu[LUMA_8x8].luma_hps   = PFX(interp_8tap_horiz_ps_8x8_neon);
> +        p.pu[LUMA_8x16].luma_hps  = PFX(interp_8tap_horiz_ps_8x16_neon);
> +        p.pu[LUMA_8x32].luma_hps  = PFX(interp_8tap_horiz_ps_8x32_neon);
> +        p.pu[LUMA_12x16].luma_hps = PFX(interp_8tap_horiz_ps_12x16_neon);
> +        p.pu[LUMA_24x32].luma_hps = PFX(interp_8tap_horiz_ps_24x32_neon);
> +#if !AUTO_VECTORIZE || GCC_VERSION < GCC_5_1_0 /* gcc_version < gcc-5.1.0
> */
> +        p.pu[LUMA_16x4].luma_hps  = PFX(interp_8tap_horiz_ps_16x4_neon);
> +        p.pu[LUMA_16x8].luma_hps  = PFX(interp_8tap_horiz_ps_16x8_neon);
> +        p.pu[LUMA_16x12].luma_hps = PFX(interp_8tap_horiz_ps_16x12_neon);
> +        p.pu[LUMA_16x16].luma_hps = PFX(interp_8tap_horiz_ps_16x16_neon);
> +        p.pu[LUMA_16x32].luma_hps = PFX(interp_8tap_horiz_ps_16x32_neon);
> +        p.pu[LUMA_16x64].luma_hps = PFX(interp_8tap_horiz_ps_16x64_neon);
> +        p.pu[LUMA_32x8].luma_hps  = PFX(interp_8tap_horiz_ps_32x8_neon);
> +        p.pu[LUMA_32x16].luma_hps = PFX(interp_8tap_horiz_ps_32x16_neon);
> +        p.pu[LUMA_32x24].luma_hps = PFX(interp_8tap_horiz_ps_32x24_neon);
> +        p.pu[LUMA_32x32].luma_hps = PFX(interp_8tap_horiz_ps_32x32_neon);
> +        p.pu[LUMA_32x64].luma_hps = PFX(interp_8tap_horiz_ps_32x64_neon);
> +        p.pu[LUMA_48x64].luma_hps = PFX(interp_8tap_horiz_ps_48x64_neon);
> +        p.pu[LUMA_64x16].luma_hps = PFX(interp_8tap_horiz_ps_64x16_neon);
> +        p.pu[LUMA_64x32].luma_hps = PFX(interp_8tap_horiz_ps_64x32_neon);
> +        p.pu[LUMA_64x48].luma_hps = PFX(interp_8tap_horiz_ps_64x48_neon);
> +        p.pu[LUMA_64x64].luma_hps = PFX(interp_8tap_horiz_ps_64x64_neon);
> +#endif
> +
> +        p.pu[LUMA_8x4].luma_hvpp   =  interp_8tap_hv_pp_cpu<LUMA_8x4>;
> +        p.pu[LUMA_8x8].luma_hvpp   =  interp_8tap_hv_pp_cpu<LUMA_8x8>;
> +        p.pu[LUMA_8x16].luma_hvpp  =  interp_8tap_hv_pp_cpu<LUMA_8x16>;
> +        p.pu[LUMA_8x32].luma_hvpp  =  interp_8tap_hv_pp_cpu<LUMA_8x32>;
> +        p.pu[LUMA_12x16].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_12x16>;
> +#if !AUTO_VECTORIZE || GCC_VERSION < GCC_5_1_0 /* gcc_version < gcc-5.1.0
> */
> +        p.pu[LUMA_16x4].luma_hvpp  =  interp_8tap_hv_pp_cpu<LUMA_16x4>;
> +        p.pu[LUMA_16x8].luma_hvpp  =  interp_8tap_hv_pp_cpu<LUMA_16x8>;
> +        p.pu[LUMA_16x12].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_16x12>;
> +        p.pu[LUMA_16x16].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_16x16>;
> +        p.pu[LUMA_16x32].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_16x32>;
> +        p.pu[LUMA_16x64].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_16x64>;
> +        p.pu[LUMA_32x16].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_32x16>;
> +        p.pu[LUMA_32x24].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_32x24>;
> +        p.pu[LUMA_32x32].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_32x32>;
> +        p.pu[LUMA_32x64].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_32x64>;
> +        p.pu[LUMA_48x64].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_48x64>;
> +        p.pu[LUMA_64x16].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_64x16>;
> +        p.pu[LUMA_64x32].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_64x32>;
> +        p.pu[LUMA_64x48].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_64x48>;
> +        p.pu[LUMA_64x64].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_64x64>;
> +#if !AUTO_VECTORIZE || GCC_VERSION < GCC_4_9_0 /* gcc_version < gcc-4.9.0
> */
> +        p.pu[LUMA_4x4].luma_hvpp   =  interp_8tap_hv_pp_cpu<LUMA_4x4>;
> +        p.pu[LUMA_4x8].luma_hvpp   =  interp_8tap_hv_pp_cpu<LUMA_4x8>;
> +        p.pu[LUMA_4x16].luma_hvpp  =  interp_8tap_hv_pp_cpu<LUMA_4x16>;
> +        p.pu[LUMA_24x32].luma_hvpp =  interp_8tap_hv_pp_cpu<LUMA_24x32>;
> +        p.pu[LUMA_32x8].luma_hvpp  =  interp_8tap_hv_pp_cpu<LUMA_32x8>;
> +#endif
> +#endif
> +
> +#if !HIGH_BIT_DEPTH
> +        p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_4x4_neon);
> +#endif // !HIGH_BIT_DEPTH
> +
> +    }
> +}
> +} // namespace X265_NS
> diff --git a/source/common/aarch64/asm.S b/source/common/aarch64/asm.S
> new file mode 100644
> index 000000000..5f020a11a
> --- /dev/null
> +++ b/source/common/aarch64/asm.S
> @@ -0,0 +1,69 @@
>
> +/*****************************************************************************
> + * Copyright (C) 2020 MulticoreWare, Inc
> + *
> + * Authors: Hongbin Liu <liuhongbin1 at huawei.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111,
> USA.
> + *
> + * This program is also available under a commercial proprietary license.
> + * For more information, contact us at license @ x265.com.
> +
> *****************************************************************************/
> +
> +.arch           armv8-a
> +
> +#ifdef PREFIX
> +#define EXTERN_ASM _
> +#else
> +#define EXTERN_ASM
> +#endif
> +
> +#ifdef __ELF__
> +#define ELF
> +#else
> +#define ELF @
> +#endif
> +
> +#define HAVE_AS_FUNC 1
> +
> +#if HAVE_AS_FUNC
> +#define FUNC
> +#else
> +#define FUNC @
> +#endif
> +
> +.macro function name, export=1
> +    .macro endfunc
> +ELF     .size   \name, . - \name
> +FUNC    .endfunc
> +        .purgem endfunc
> +    .endm
> +        .align  2
> +.if \export == 1
> +        .global EXTERN_ASM\name
> +ELF     .hidden EXTERN_ASM\name
> +ELF     .type   EXTERN_ASM\name, %function
> +FUNC    .func   EXTERN_ASM\name
> +EXTERN_ASM\name:
> +.else
> +ELF     .hidden \name
> +ELF     .type   \name, %function
> +FUNC    .func   \name
> +\name:
> +.endif
> +.endm
> +
> +
> +#define FENC_STRIDE 64
> +#define FDEC_STRIDE 32
> diff --git a/source/common/aarch64/ipfilter8.S
> b/source/common/aarch64/ipfilter8.S
> new file mode 100644
> index 000000000..908c7db46
> --- /dev/null
> +++ b/source/common/aarch64/ipfilter8.S
> @@ -0,0 +1,414 @@
>
> +/*****************************************************************************
> + * Copyright (C) 2020 MulticoreWare, Inc
> + *
> + * Authors: Yimeng Su <yimeng.su at huawei.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111,
> USA.
> + *
> + * This program is also available under a commercial proprietary license.
> + * For more information, contact us at license @ x265.com.
> +
> *****************************************************************************/
> +
> +#include "asm.S"
> +
> +.section .rodata
> +
> +.align 4
> +
> +.text
> +
> +
> +
> +.macro qpel_filter_0_32b
> +    movi            v24.8h, #64
> +    uxtl            v19.8h, v5.8b
> +    smull           v17.4s, v19.4h, v24.4h
> +    smull2          v18.4s, v19.8h, v24.8h
> +.endm
> +
> +.macro qpel_filter_1_32b
> +    movi            v16.8h, #58
> +    uxtl            v19.8h, v5.8b
> +    smull           v17.4s, v19.4h, v16.4h
> +    smull2          v18.4s, v19.8h, v16.8h
> +
> +    movi            v24.8h, #10
> +    uxtl            v21.8h, v1.8b
> +    smull           v19.4s, v21.4h, v24.4h
> +    smull2          v20.4s, v21.8h, v24.8h
> +
> +    movi            v16.8h, #17
> +    uxtl            v23.8h, v2.8b
> +    smull           v21.4s, v23.4h, v16.4h
> +    smull2          v22.4s, v23.8h, v16.8h
> +
> +    movi            v24.8h, #5
> +    uxtl            v1.8h, v6.8b
> +    smull           v23.4s, v1.4h, v24.4h
> +    smull2          v16.4s, v1.8h, v24.8h
> +
> +    sub             v17.4s, v17.4s, v19.4s
> +    sub             v18.4s, v18.4s, v20.4s
> +
> +    uxtl            v1.8h, v4.8b
> +    sshll           v19.4s, v1.4h, #2
> +    sshll2          v20.4s, v1.8h, #2
> +
> +    add             v17.4s, v17.4s, v21.4s
> +    add             v18.4s, v18.4s, v22.4s
> +
> +    uxtl            v1.8h, v0.8b
> +    uxtl            v2.8h, v3.8b
> +    ssubl           v21.4s, v2.4h, v1.4h
> +    ssubl2          v22.4s, v2.8h, v1.8h
> +
> +    add             v17.4s, v17.4s, v19.4s
> +    add             v18.4s, v18.4s, v20.4s
> +    sub             v21.4s, v21.4s, v23.4s
> +    sub             v22.4s, v22.4s, v16.4s
> +    add             v17.4s, v17.4s, v21.4s
> +    add             v18.4s, v18.4s, v22.4s
> +.endm
> +
> +.macro qpel_filter_2_32b
> +    movi            v16.4s, #11
> +    uxtl            v19.8h, v5.8b
> +    uxtl            v20.8h, v2.8b
> +    saddl           v17.4s, v19.4h, v20.4h
> +    saddl2          v18.4s, v19.8h, v20.8h
> +
> +    uxtl            v21.8h, v1.8b
> +    uxtl            v22.8h, v6.8b
> +    saddl           v19.4s, v21.4h, v22.4h
> +    saddl2          v20.4s, v21.8h, v22.8h
> +
> +    mul             v19.4s, v19.4s, v16.4s
> +    mul             v20.4s, v20.4s, v16.4s
> +
> +    movi            v16.4s, #40
> +    mul             v17.4s, v17.4s, v16.4s
> +    mul             v18.4s, v18.4s, v16.4s
> +
> +    uxtl            v21.8h, v4.8b
> +    uxtl            v22.8h, v3.8b
> +    saddl           v23.4s, v21.4h, v22.4h
> +    saddl2          v16.4s, v21.8h, v22.8h
> +
> +    uxtl            v1.8h, v0.8b
> +    uxtl            v2.8h, v7.8b
> +    saddl           v21.4s, v1.4h, v2.4h
> +    saddl2          v22.4s, v1.8h, v2.8h
> +
> +    shl             v23.4s, v23.4s, #2
> +    shl             v16.4s, v16.4s, #2
> +
> +    add             v19.4s, v19.4s, v21.4s
> +    add             v20.4s, v20.4s, v22.4s
> +    add             v17.4s, v17.4s, v23.4s
> +    add             v18.4s, v18.4s, v16.4s
> +    sub             v17.4s, v17.4s, v19.4s
> +    sub             v18.4s, v18.4s, v20.4s
> +.endm
> +
> +.macro qpel_filter_3_32b
> +    movi            v16.8h, #17
> +    movi            v24.8h, #5
> +
> +    uxtl            v19.8h, v5.8b
> +    smull           v17.4s, v19.4h, v16.4h
> +    smull2          v18.4s, v19.8h, v16.8h
> +
> +    uxtl            v21.8h, v1.8b
> +    smull           v19.4s, v21.4h, v24.4h
> +    smull2          v20.4s, v21.8h, v24.8h
> +
> +    movi            v16.8h, #58
> +    uxtl            v23.8h, v2.8b
> +    smull           v21.4s, v23.4h, v16.4h
> +    smull2          v22.4s, v23.8h, v16.8h
> +
> +    movi            v24.8h, #10
> +    uxtl            v1.8h, v6.8b
> +    smull           v23.4s, v1.4h, v24.4h
> +    smull2          v16.4s, v1.8h, v24.8h
> +
> +    sub             v17.4s, v17.4s, v19.4s
> +    sub             v18.4s, v18.4s, v20.4s
> +
> +    uxtl            v1.8h, v3.8b
> +    sshll           v19.4s, v1.4h, #2
> +    sshll2          v20.4s, v1.8h, #2
> +
> +    add             v17.4s, v17.4s, v21.4s
> +    add             v18.4s, v18.4s, v22.4s
> +
> +    uxtl            v1.8h, v4.8b
> +    uxtl            v2.8h, v7.8b
> +    ssubl           v21.4s, v1.4h, v2.4h
> +    ssubl2          v22.4s, v1.8h, v2.8h
> +
> +    add             v17.4s, v17.4s, v19.4s
> +    add             v18.4s, v18.4s, v20.4s
> +    sub             v21.4s, v21.4s, v23.4s
> +    sub             v22.4s, v22.4s, v16.4s
> +    add             v17.4s, v17.4s, v21.4s
> +    add             v18.4s, v18.4s, v22.4s
> +.endm
> +
> +
> +
> +
> +.macro vextin8
> +    ld1             {v3.16b}, [x11], #16
> +    mov             v7.d[0], v3.d[1]
> +    ext             v0.8b, v3.8b, v7.8b, #1
> +    ext             v4.8b, v3.8b, v7.8b, #2
> +    ext             v1.8b, v3.8b, v7.8b, #3
> +    ext             v5.8b, v3.8b, v7.8b, #4
> +    ext             v2.8b, v3.8b, v7.8b, #5
> +    ext             v6.8b, v3.8b, v7.8b, #6
> +    ext             v3.8b, v3.8b, v7.8b, #7
> +.endm
> +
> +
> +
> +// void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t*
> dst, intptr_t dstStride, int coeffIdx, int isRowExt)
> +.macro HPS_FILTER a b filterhps
> +    mov             w12, #8192
> +    mov             w6, w10
> +    sub             x3, x3, #\a
> +    lsl             x3, x3, #1
> +    mov             w9, #\a
> +    cmp             w9, #4
> +    b.eq            14f
> +    cmp             w9, #12
> +    b.eq            15f
> +    b               7f
> +14:
> +    HPS_FILTER_4 \a \b \filterhps
> +    b               10f
> +15:
> +    HPS_FILTER_12 \a \b \filterhps
> +    b               10f
> +7:
> +    cmp             w5, #0
> +    b.eq            8f
> +    cmp             w5, #1
> +    b.eq            9f
> +8:
> +loop1_hps_\filterhps\()_\a\()x\b\()_rowext0:
> +    mov             w7, #\a
> +    lsr             w7, w7, #3
> +    mov             x11, x0
> +    sub             x11, x11, #4
> +loop2_hps_\filterhps\()_\a\()x\b\()_rowext0:
> +    vextin8
> +    \filterhps
> +    dup             v16.4s, w12
> +    sub             v17.4s, v17.4s, v16.4s
> +    sub             v18.4s, v18.4s, v16.4s
> +    xtn             v0.4h, v17.4s
> +    xtn2            v0.8h, v18.4s
> +    st1             {v0.8h}, [x2], #16
> +    subs            w7, w7, #1
> +    sub             x11, x11, #8
> +    b.ne            loop2_hps_\filterhps\()_\a\()x\b\()_rowext0
> +    subs            w6, w6, #1
> +    add             x0, x0, x1
> +    add             x2, x2, x3
> +    b.ne            loop1_hps_\filterhps\()_\a\()x\b\()_rowext0
> +    b               10f
> +9:
> +loop3_hps_\filterhps\()_\a\()x\b\()_rowext1:
> +    mov             w7, #\a
> +    lsr             w7, w7, #3
> +    mov             x11, x0
> +    sub             x11, x11, #4
> +loop4_hps_\filterhps\()_\a\()x\b\()_rowext1:
> +    vextin8
> +    \filterhps
> +    dup             v16.4s, w12
> +    sub             v17.4s, v17.4s, v16.4s
> +    sub             v18.4s, v18.4s, v16.4s
> +    xtn             v0.4h, v17.4s
> +    xtn2            v0.8h, v18.4s
> +    st1             {v0.8h}, [x2], #16
> +    subs            w7, w7, #1
> +    sub             x11, x11, #8
> +    b.ne            loop4_hps_\filterhps\()_\a\()x\b\()_rowext1
> +    subs            w6, w6, #1
> +    add             x0, x0, x1
> +    add             x2, x2, x3
> +    b.ne            loop3_hps_\filterhps\()_\a\()x\b\()_rowext1
> +10:
> +.endm
> +
> +.macro HPS_FILTER_4 w h filterhps
> +    cmp             w5, #0
> +    b.eq            11f
> +    cmp             w5, #1
> +    b.eq            12f
> +11:
> +loop4_hps_\filterhps\()_\w\()x\h\()_rowext0:
> +    mov             x11, x0
> +    sub             x11, x11, #4
> +    vextin8
> +    \filterhps
> +    dup             v16.4s, w12
> +    sub             v17.4s, v17.4s, v16.4s
> +    xtn             v0.4h, v17.4s
> +    st1             {v0.4h}, [x2], #8
> +    sub             x11, x11, #8
> +    subs            w6, w6, #1
> +    add             x0, x0, x1
> +    add             x2, x2, x3
> +    b.ne            loop4_hps_\filterhps\()_\w\()x\h\()_rowext0
> +    b               13f
> +12:
> +loop5_hps_\filterhps\()_\w\()x\h\()_rowext1:
> +    mov             x11, x0
> +    sub             x11, x11, #4
> +    vextin8
> +    \filterhps
> +    dup             v16.4s, w12
> +    sub             v17.4s, v17.4s, v16.4s
> +    xtn             v0.4h, v17.4s
> +    st1             {v0.4h}, [x2], #8
> +    sub             x11, x11, #8
> +    subs            w6, w6, #1
> +    add             x0, x0, x1
> +    add             x2, x2, x3
> +    b.ne            loop5_hps_\filterhps\()_\w\()x\h\()_rowext1
> +13:
> +.endm
> +
> +.macro HPS_FILTER_12 w h filterhps
> +    cmp             w5, #0
> +    b.eq            14f
> +    cmp             w5, #1
> +    b.eq            15f
> +14:
> +loop12_hps_\filterhps\()_\w\()x\h\()_rowext0:
> +    mov             x11, x0
> +    sub             x11, x11, #4
> +    vextin8
> +    \filterhps
> +    dup             v16.4s, w12
> +    sub             v17.4s, v17.4s, v16.4s
> +    sub             v18.4s, v18.4s, v16.4s
> +    xtn             v0.4h, v17.4s
> +    xtn2            v0.8h, v18.4s
> +    st1             {v0.8h}, [x2], #16
> +    sub             x11, x11, #8
> +
> +    vextin8
> +    \filterhps
> +    dup             v16.4s, w12
> +    sub             v17.4s, v17.4s, v16.4s
> +    xtn             v0.4h, v17.4s
> +    st1             {v0.4h}, [x2], #8
> +    add             x2, x2, x3
> +    subs            w6, w6, #1
> +    add             x0, x0, x1
> +    b.ne            loop12_hps_\filterhps\()_\w\()x\h\()_rowext0
> +    b               16f
> +15:
> +loop12_hps_\filterhps\()_\w\()x\h\()_rowext1:
> +    mov             x11, x0
> +    sub             x11, x11, #4
> +    vextin8
> +    \filterhps
> +    dup             v16.4s, w12
> +    sub             v17.4s, v17.4s, v16.4s
> +    sub             v18.4s, v18.4s, v16.4s
> +    xtn             v0.4h, v17.4s
> +    xtn2            v0.8h, v18.4s
> +    st1             {v0.8h}, [x2], #16
> +    sub             x11, x11, #8
> +
> +    vextin8
> +    \filterhps
> +    dup             v16.4s, w12
> +    sub             v17.4s, v17.4s, v16.4s
> +    xtn             v0.4h, v17.4s
> +    st1             {v0.4h}, [x2], #8
> +    add             x2, x2, x3
> +    subs            w6, w6, #1
> +    add             x0, x0, x1
> +    b.ne            loop12_hps_\filterhps\()_\w\()x\h\()_rowext1
> +16:
> +.endm
> +
> +// void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t*
> dst, intptr_t dstStride, int coeffIdx, int isRowExt)
> +.macro LUMA_HPS w h
> +function x265_interp_8tap_horiz_ps_\w\()x\h\()_neon
> +    mov             w10, #\h
> +    cmp             w5, #0
> +    b.eq            6f
> +    sub             x0, x0, x1, lsl #2
> +
> +    add             x0, x0, x1
> +    add             w10, w10, #7
> +6:
> +    cmp             w4, #0
> +    b.eq            0f
> +    cmp             w4, #1
> +    b.eq            1f
> +    cmp             w4, #2
> +    b.eq            2f
> +    cmp             w4, #3
> +    b.eq            3f
> +0:
> +    HPS_FILTER  \w \h qpel_filter_0_32b
> +    b               5f
> +1:
> +    HPS_FILTER  \w \h qpel_filter_1_32b
> +    b               5f
> +2:
> +    HPS_FILTER  \w \h qpel_filter_2_32b
> +    b               5f
> +3:
> +    HPS_FILTER  \w \h qpel_filter_3_32b
> +    b               5f
> +5:
> +    ret
> +endfunc
> +.endm
> +
> +LUMA_HPS    4 4
> +LUMA_HPS    4 8
> +LUMA_HPS    4 16
> +LUMA_HPS    8 4
> +LUMA_HPS    8 8
> +LUMA_HPS    8 16
> +LUMA_HPS    8 32
> +LUMA_HPS    12 16
> +LUMA_HPS    16 4
> +LUMA_HPS    16 8
> +LUMA_HPS    16 12
> +LUMA_HPS    16 16
> +LUMA_HPS    16 32
> +LUMA_HPS    16 64
> +LUMA_HPS    24 32
> +LUMA_HPS    32 8
> +LUMA_HPS    32 16
> +LUMA_HPS    32 24
> +LUMA_HPS    32 32
> +LUMA_HPS    32 64
> +LUMA_HPS    48 64
> +LUMA_HPS    64 16
> +LUMA_HPS    64 32
> +LUMA_HPS    64 48
> +LUMA_HPS    64 64
> diff --git a/source/common/aarch64/ipfilter8.h
> b/source/common/aarch64/ipfilter8.h
> new file mode 100644
> index 000000000..f9ed91e2e
> --- /dev/null
> +++ b/source/common/aarch64/ipfilter8.h
> @@ -0,0 +1,55 @@
>
> +/*****************************************************************************
> + * Copyright (C) 2020 MulticoreWare, Inc
> + *
> + * Authors: Yimeng Su <yimeng.su at huawei.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111,
> USA.
> + *
> + * This program is also available under a commercial proprietary license.
> + * For more information, contact us at license @ x265.com.
> +
> *****************************************************************************/
> +
> +#ifndef X265_IPFILTER8_AARCH64_H
> +#define X265_IPFILTER8_AARCH64_H
> +
> +
> +void x265_interp_8tap_horiz_ps_4x4_neon(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_4x8_neon(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_4x16_neon(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_8x4_neon(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_8x8_neon(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_8x16_neon(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_8x32_neon(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_12x16_neon(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_16x4_neon(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_16x8_neon(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_16x12_neon(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_16x16_neon(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_16x32_neon(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_16x64_neon(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_24x32_neon(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_32x8_neon(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_32x16_neon(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_32x24_neon(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_32x32_neon(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_32x64_neon(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_48x64_neon(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_64x16_neon(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_64x32_neon(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_64x48_neon(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_64x64_neon(const pixel* src, intptr_t
> srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +
> +
> +#endif // ifndef X265_IPFILTER8_AARCH64_H
> diff --git a/source/common/aarch64/mc-a.S b/source/common/aarch64/mc-a.S
> new file mode 100644
> index 000000000..cbaf9b501
> --- /dev/null
> +++ b/source/common/aarch64/mc-a.S
> @@ -0,0 +1,63 @@
>
> +/*****************************************************************************
> + * Copyright (C) 2020 MulticoreWare, Inc
> + *
> + * Authors: Hongbin Liu <liuhongbin1 at huawei.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111,
> USA.
> + *
> + * This program is also available under a commercial proprietary license.
> + * For more information, contact us at license @ x265.com.
> +
> *****************************************************************************/
> +
> +#include "asm.S"
> +
> +.section .rodata
> +
> +.align 4
> +
> +.text
> +
> +.macro pixel_avg_pp_4xN_neon h
> +function x265_pixel_avg_pp_4x\h\()_neon
> +.rept \h
> +    ld1             {v0.s}[0], [x2], x3
> +    ld1             {v1.s}[0], [x4], x5
> +    urhadd          v2.8b, v0.8b, v1.8b
> +    st1             {v2.s}[0], [x0], x1
> +.endr
> +    ret
> +endfunc
> +.endm
> +
> +pixel_avg_pp_4xN_neon 4
> +pixel_avg_pp_4xN_neon 8
> +pixel_avg_pp_4xN_neon 16
> +
> +.macro pixel_avg_pp_8xN_neon h
> +function x265_pixel_avg_pp_8x\h\()_neon
> +.rept \h
> +    ld1             {v0.8b}, [x2], x3
> +    ld1             {v1.8b}, [x4], x5
> +    urhadd          v2.8b, v0.8b, v1.8b
> +    st1             {v2.8b}, [x0], x1
> +.endr
> +    ret
> +endfunc
> +.endm
> +
> +pixel_avg_pp_8xN_neon 4
> +pixel_avg_pp_8xN_neon 8
> +pixel_avg_pp_8xN_neon 16
> +pixel_avg_pp_8xN_neon 32
> diff --git a/source/common/aarch64/pixel-util.S
> b/source/common/aarch64/pixel-util.S
> new file mode 100644
> index 000000000..a085ebdfa
> --- /dev/null
> +++ b/source/common/aarch64/pixel-util.S
> @@ -0,0 +1,419 @@
>
> +/*****************************************************************************
> + * Copyright (C) 2020 MulticoreWare, Inc
> + *
> + * Authors: Yimeng Su <yimeng.su at huawei.com>
> + *          Hongbin Liu <liuhongbin1 at huawei.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111,
> USA.
> + *
> + * This program is also available under a commercial proprietary license.
> + * For more information, contact us at license @ x265.com.
> +
> *****************************************************************************/
> +
> +#include "asm.S"
> +
> +.section .rodata
> +
> +.align 4
> +
> +.text
> +
> +.macro x265_satd_4x8_8x4_end_neon
> +    add             v0.8h, v4.8h, v6.8h
> +    add             v1.8h, v5.8h, v7.8h
> +    sub             v2.8h, v4.8h, v6.8h
> +    sub             v3.8h, v5.8h, v7.8h
> +
> +    trn1            v16.8h, v0.8h, v1.8h
> +    trn2            v17.8h, v0.8h, v1.8h
> +    add             v4.8h, v16.8h, v17.8h
> +    trn1            v18.8h, v2.8h, v3.8h
> +    trn2            v19.8h, v2.8h, v3.8h
> +    sub             v5.8h, v16.8h, v17.8h
> +    add             v6.8h, v18.8h, v19.8h
> +    sub             v7.8h, v18.8h, v19.8h
> +    trn1            v0.4s, v4.4s, v6.4s
> +    trn2            v2.4s, v4.4s, v6.4s
> +    abs             v0.8h, v0.8h
> +    trn1            v1.4s, v5.4s, v7.4s
> +    trn2            v3.4s, v5.4s, v7.4s
> +    abs             v2.8h, v2.8h
> +    abs             v1.8h, v1.8h
> +    abs             v3.8h, v3.8h
> +    umax            v0.8h, v0.8h, v2.8h
> +    umax            v1.8h, v1.8h, v3.8h
> +    add             v0.8h, v0.8h, v1.8h
> +    uaddlv          s0, v0.8h
> +.endm
> +
> +.macro pixel_satd_4x8_neon
> +    ld1r             {v1.2s}, [x2], x3
> +    ld1r            {v0.2s}, [x0], x1
> +    ld1r            {v3.2s}, [x2], x3
> +    ld1r            {v2.2s}, [x0], x1
> +    ld1r            {v5.2s}, [x2], x3
> +    ld1r            {v4.2s}, [x0], x1
> +    ld1r            {v7.2s}, [x2], x3
> +    ld1r            {v6.2s}, [x0], x1
> +
> +    ld1             {v1.s}[1], [x2], x3
> +    ld1             {v0.s}[1], [x0], x1
> +    usubl           v0.8h, v0.8b, v1.8b
> +    ld1             {v3.s}[1], [x2], x3
> +    ld1             {v2.s}[1], [x0], x1
> +    usubl           v1.8h, v2.8b, v3.8b
> +    ld1             {v5.s}[1], [x2], x3
> +    ld1             {v4.s}[1], [x0], x1
> +    usubl           v2.8h, v4.8b, v5.8b
> +    ld1             {v7.s}[1], [x2], x3
> +    add             v4.8h, v0.8h, v1.8h
> +    sub             v5.8h, v0.8h, v1.8h
> +    ld1             {v6.s}[1], [x0], x1
> +    usubl           v3.8h, v6.8b, v7.8b
> +    add         v6.8h, v2.8h, v3.8h
> +    sub         v7.8h, v2.8h, v3.8h
> +    x265_satd_4x8_8x4_end_neon
> +.endm
> +
> +// template<int w, int h>
> +// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2,
> intptr_t stride_pix2)
> +function x265_pixel_satd_4x8_neon
> +    pixel_satd_4x8_neon
> +    mov               w0, v0.s[0]
> +    ret
> +endfunc
> +
> +// template<int w, int h>
> +// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2,
> intptr_t stride_pix2)
> +function x265_pixel_satd_4x16_neon
> +    eor             w4, w4, w4
> +    pixel_satd_4x8_neon
> +    mov               w5, v0.s[0]
> +    add             w4, w4, w5
> +    pixel_satd_4x8_neon
> +    mov               w5, v0.s[0]
> +    add             w0, w5, w4
> +    ret
> +endfunc
> +
> +// template<int w, int h>
> +// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2,
> intptr_t stride_pix2)
> +function x265_pixel_satd_4x32_neon
> +    eor             w4, w4, w4
> +.rept 4
> +    pixel_satd_4x8_neon
> +    mov             w5, v0.s[0]
> +    add             w4, w4, w5
> +.endr
> +    mov             w0, w4
> +    ret
> +endfunc
> +
> +// template<int w, int h>
> +// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2,
> intptr_t stride_pix2)
> +function x265_pixel_satd_12x16_neon
> +    mov             x4, x0
> +    mov             x5, x2
> +    eor             w7, w7, w7
> +    pixel_satd_4x8_neon
> +    mov             w6, v0.s[0]
> +    add             w7, w7, w6
> +    pixel_satd_4x8_neon
> +    mov             w6, v0.s[0]
> +    add             w7, w7, w6
> +
> +    add             x0, x4, #4
> +    add             x2, x5, #4
> +    pixel_satd_4x8_neon
> +    mov             w6, v0.s[0]
> +    add             w7, w7, w6
> +    pixel_satd_4x8_neon
> +    mov             w6, v0.s[0]
> +    add             w7, w7, w6
> +
> +    add             x0, x4, #8
> +    add             x2, x5, #8
> +    pixel_satd_4x8_neon
> +    mov             w6, v0.s[0]
> +    add             w7, w7, w6
> +    pixel_satd_4x8_neon
> +    mov             w6, v0.s[0]
> +    add             w0, w7, w6
> +    ret
> +endfunc
> +
> +// template<int w, int h>
> +// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2,
> intptr_t stride_pix2)
> +function x265_pixel_satd_12x32_neon
> +    mov             x4, x0
> +    mov             x5, x2
> +    eor             w7, w7, w7
> +.rept 4
> +    pixel_satd_4x8_neon
> +    mov             w6, v0.s[0]
> +    add             w7, w7, w6
> +.endr
> +
> +    add             x0, x4, #4
> +    add             x2, x5, #4
> +.rept 4
> +    pixel_satd_4x8_neon
> +    mov             w6, v0.s[0]
> +    add             w7, w7, w6
> +.endr
> +
> +    add             x0, x4, #8
> +    add             x2, x5, #8
> +.rept 4
> +    pixel_satd_4x8_neon
> +    mov             w6, v0.s[0]
> +    add             w7, w7, w6
> +.endr
> +
> +    mov             w0, w7
> +    ret
> +endfunc
> +
> +// template<int w, int h>
> +// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2,
> intptr_t stride_pix2)
> +function x265_pixel_satd_8x8_neon
> +    eor             w4, w4, w4
> +    mov             x6, x0
> +    mov             x7, x2
> +    pixel_satd_4x8_neon
> +    mov             w5, v0.s[0]
> +    add             w4, w4, w5
> +    add             x0, x6, #4
> +    add             x2, x7, #4
> +    pixel_satd_4x8_neon
> +    mov             w5, v0.s[0]
> +    add             w0, w4, w5
> +    ret
> +endfunc
> +
> +// int psyCost_pp(const pixel* source, intptr_t sstride, const pixel*
> recon, intptr_t rstride)
> +function x265_psyCost_4x4_neon
> +    ld1r            {v4.2s}, [x0], x1
> +    ld1r            {v5.2s}, [x0], x1
> +    ld1             {v4.s}[1], [x0], x1
> +    ld1             {v5.s}[1], [x0], x1
> +
> +    ld1r            {v6.2s}, [x2], x3
> +    ld1r            {v7.2s}, [x2], x3
> +    ld1             {v6.s}[1], [x2], x3
> +    ld1             {v7.s}[1], [x2], x3
> +
> +    uaddl           v2.8h, v4.8b, v5.8b
> +    usubl           v3.8h, v4.8b, v5.8b
> +    uaddl           v18.8h, v6.8b, v7.8b
> +    usubl           v19.8h, v6.8b, v7.8b
> +
> +    mov             v20.d[0], v2.d[1]
> +    add             v0.4h, v2.4h, v20.4h
> +    sub             v1.4h, v2.4h, v20.4h
> +    mov             v21.d[0], v3.d[1]
> +    add             v22.4h, v3.4h, v21.4h
> +    sub             v23.4h, v3.4h, v21.4h
> +
> +    mov             v24.d[0], v18.d[1]
> +    add             v16.4h, v18.4h, v24.4h
> +    sub             v17.4h, v18.4h, v24.4h
> +    mov             v25.d[0], v19.d[1]
> +    add             v26.4h, v19.4h, v25.4h
> +    sub             v27.4h, v19.4h, v25.4h
> +
> +    mov             v0.d[1], v22.d[0]
> +    mov             v1.d[1], v23.d[0]
> +    trn1            v22.8h, v0.8h, v1.8h
> +    trn2            v23.8h, v0.8h, v1.8h
> +    mov             v16.d[1], v26.d[0]
> +    mov             v17.d[1], v27.d[0]
> +    trn1            v26.8h, v16.8h, v17.8h
> +    trn2            v27.8h, v16.8h, v17.8h
> +
> +    add             v2.8h, v22.8h, v23.8h
> +    sub             v3.8h, v22.8h, v23.8h
> +    add             v18.8h, v26.8h, v27.8h
> +    sub             v19.8h, v26.8h, v27.8h
> +
> +    uaddl           v20.8h, v4.8b, v5.8b
> +    uaddl           v21.8h, v6.8b, v7.8b
> +
> +    trn1            v0.4s, v2.4s, v3.4s
> +    trn2            v1.4s, v2.4s, v3.4s
> +    trn1            v16.4s, v18.4s, v19.4s
> +    trn2            v17.4s, v18.4s, v19.4s
> +    abs             v0.8h, v0.8h
> +    abs             v16.8h, v16.8h
> +    abs             v1.8h, v1.8h
> +    abs             v17.8h, v17.8h
> +
> +    uaddlv          s20, v20.8h
> +    uaddlv          s21, v21.8h
> +    mov             v20.s[1], v21.s[0]
> +
> +    smax            v0.8h, v0.8h, v1.8h
> +    smax            v16.8h, v16.8h, v17.8h
> +
> +    trn1            v4.2d, v0.2d, v16.2d
> +    trn2            v5.2d, v0.2d, v16.2d
> +    add             v0.8h, v4.8h, v5.8h
> +    mov             v4.d[0], v0.d[1]
> +    uaddlv          s0, v0.4h
> +    uaddlv          s4, v4.4h
> +
> +    ushr            v20.2s, v20.2s, #2
> +    mov             v0.s[1], v4.s[0]
> +    sub             v0.2s, v0.2s, v20.2s
> +    mov             w0, v0.s[0]
> +    mov             w1, v0.s[1]
> +    subs            w0, w0, w1
> +    cneg            w0, w0, mi
> +
> +    ret
> +endfunc
> +
> +// uint32_t quant_c(const int16_t* coef, const int32_t* quantCoeff,
> int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)
> +function x265_quant_neon
> +    mov             w9, #1
> +    lsl             w9, w9, w4
> +    dup             v0.2s, w9
> +    neg             w9, w4
> +    dup             v1.4s, w9
> +    add             w9, w9, #8
> +    dup             v2.4s, w9
> +    dup             v3.4s, w5
> +
> +    lsr             w6, w6, #2
> +    eor             v4.16b, v4.16b, v4.16b
> +    eor             w10, w10, w10
> +    eor             v17.16b, v17.16b, v17.16b
> +
> +.loop_quant:
> +
> +    ld1             {v18.4h}, [x0], #8
> +    ld1             {v7.4s}, [x1], #16
> +    sxtl            v6.4s, v18.4h
> +
> +    cmlt            v5.4s, v6.4s, #0
> +
> +    abs             v6.4s, v6.4s
> +
> +
> +    mul             v6.4s, v6.4s, v7.4s
> +
> +    add             v7.4s, v6.4s, v3.4s
> +    sshl            v7.4s, v7.4s, v1.4s
> +
> +    mls             v6.4s, v7.4s, v0.s[0]
> +    sshl            v16.4s, v6.4s, v2.4s
> +    st1             {v16.4s}, [x2], #16
> +
> +    // numsig
> +    cmeq            v16.4s, v7.4s, v17.4s
> +    add             v4.4s, v4.4s, v16.4s
> +    add             w10, w10, #4
> +
> +    // level *= sign
> +    eor             v16.16b, v7.16b, v5.16b
> +    sub             v16.4s, v16.4s, v5.4s
> +    sqxtn           v5.4h, v16.4s
> +    st1             {v5.4h}, [x3], #8
> +
> +    subs            w6, w6, #1
> +    b.ne             .loop_quant
> +
> +    addv            s4, v4.4s
> +    mov             w9, v4.s[0]
> +    add             w0, w10, w9
> +    ret
> +endfunc
> +
> +.macro satd_4x4_neon
> +    ld1             {v1.s}[0], [x2], x3
> +    ld1             {v0.s}[0], [x0], x1
> +    ld1             {v3.s}[0], [x2], x3
> +    ld1             {v2.s}[0], [x0], x1
> +
> +    ld1             {v1.s}[1], [x2], x3
> +    ld1             {v0.s}[1], [x0], x1
> +    ld1             {v3.s}[1], [x2], x3
> +    ld1             {v2.s}[1], [x0], x1
> +
> +    usubl           v4.8h, v0.8b, v1.8b
> +    usubl           v5.8h, v2.8b, v3.8b
> +
> +    add             v6.8h, v4.8h, v5.8h
> +    sub             v7.8h, v4.8h, v5.8h
> +
> +    mov             v4.d[0], v6.d[1]
> +    add             v0.8h, v6.8h, v4.8h
> +    sub             v2.8h, v6.8h, v4.8h
> +
> +    mov             v5.d[0], v7.d[1]
> +    add             v1.8h, v7.8h, v5.8h
> +    sub             v3.8h, v7.8h, v5.8h
> +
> +    trn1            v4.4h, v0.4h, v1.4h
> +    trn2            v5.4h, v0.4h, v1.4h
> +
> +    trn1            v6.4h, v2.4h, v3.4h
> +    trn2            v7.4h, v2.4h, v3.4h
> +
> +    add             v0.4h, v4.4h, v5.4h
> +    sub             v1.4h, v4.4h, v5.4h
> +
> +    add             v2.4h, v6.4h, v7.4h
> +    sub             v3.4h, v6.4h, v7.4h
> +
> +    trn1            v4.2s, v0.2s, v1.2s
> +    trn2            v5.2s, v0.2s, v1.2s
> +
> +    trn1            v6.2s, v2.2s, v3.2s
> +    trn2            v7.2s, v2.2s, v3.2s
> +
> +    abs             v4.4h, v4.4h
> +    abs             v5.4h, v5.4h
> +    abs             v6.4h, v6.4h
> +    abs             v7.4h, v7.4h
> +
> +    smax            v1.4h, v4.4h, v5.4h
> +    smax            v2.4h, v6.4h, v7.4h
> +
> +    add             v0.4h, v1.4h, v2.4h
> +    uaddlp          v0.2s, v0.4h
> +    uaddlp          v0.1d, v0.2s
> +.endm
> +
> +// int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel*
> pix2, intptr_t stride_pix2)
> +function x265_pixel_satd_4x4_neon
> +    satd_4x4_neon
> +    umov            x0, v0.d[0]
> +    ret
> +endfunc
> +
> +// int satd_8x4(const pixel* pix1, intptr_t stride_pix1, const pixel*
> pix2, intptr_t stride_pix2)
> +function x265_pixel_satd_8x4_neon
> +    mov             x4, x0
> +    mov             x5, x2
> +    satd_4x4_neon
> +    add             x0, x4, #4
> +    add             x2, x5, #4
> +    umov            x6, v0.d[0]
> +    satd_4x4_neon
> +    umov            x0, v0.d[0]
> +    add             x0, x0, x6
> +    ret
> +endfunc
> diff --git a/source/common/aarch64/pixel-util.h
> b/source/common/aarch64/pixel-util.h
> new file mode 100644
> index 000000000..043488468
> --- /dev/null
> +++ b/source/common/aarch64/pixel-util.h
> @@ -0,0 +1,40 @@
>
> +/*****************************************************************************
> + * Copyright (C) 2020 MulticoreWare, Inc
> + *
> + * Authors: Yimeng Su <yimeng.su at huawei.com>
> + *          Hongbin Liu <liuhongbin1 at huawei.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111,
> USA.
> + *
> + * This program is also available under a commercial proprietary license.
> + * For more information, contact us at license @ x265.com.
> +
> *****************************************************************************/
> +
> +#ifndef X265_PIXEL_UTIL_AARCH64_H
> +#define X265_PIXEL_UTIL_AARCH64_H
> +
> +int x265_pixel_satd_4x4_neon(const pixel* pix1, intptr_t stride_pix1,
> const pixel* pix2, intptr_t stride_pix2);
> +int x265_pixel_satd_4x8_neon(const pixel* pix1, intptr_t stride_pix1,
> const pixel* pix2, intptr_t stride_pix2);
> +int x265_pixel_satd_4x16_neon(const pixel* pix1, intptr_t stride_pix1,
> const pixel* pix2, intptr_t stride_pix2);
> +int x265_pixel_satd_4x32_neon(const pixel* pix1, intptr_t stride_pix1,
> const pixel* pix2, intptr_t stride_pix2);
> +int x265_pixel_satd_8x4_neon(const pixel* pix1, intptr_t stride_pix1,
> const pixel* pix2, intptr_t stride_pix2);
> +int x265_pixel_satd_8x8_neon(const pixel* pix1, intptr_t stride_pix1,
> const pixel* pix2, intptr_t stride_pix2);
> +int x265_pixel_satd_12x16_neon(const pixel* pix1, intptr_t stride_pix1,
> const pixel* pix2, intptr_t stride_pix2);
> +int x265_pixel_satd_12x32_neon(const pixel* pix1, intptr_t stride_pix1,
> const pixel* pix2, intptr_t stride_pix2);
> +
> +uint32_t x265_quant_neon(const int16_t* coef, const int32_t* quantCoeff,
> int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
> +int PFX(psyCost_4x4_neon)(const pixel* source, intptr_t sstride, const
> pixel* recon, intptr_t rstride);
> +
> +#endif // ifndef X265_PIXEL_UTIL_AARCH64_H
> diff --git a/source/common/aarch64/pixel.h b/source/common/aarch64/pixel.h
> new file mode 100644
> index 000000000..179c2f4ec
> --- /dev/null
> +++ b/source/common/aarch64/pixel.h
> @@ -0,0 +1,105 @@
>
> +/*****************************************************************************
> + * Copyright (C) 2020 MulticoreWare, Inc
> + *
> + * Authors: Hongbin Liu <liuhongbin1 at huawei.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111,
> USA.
> + *
> + * This program is also available under a commercial proprietary license.
> + * For more information, contact us at license @ x265.com.
> +
> *****************************************************************************/
> +
> +#ifndef X265_I386_PIXEL_AARCH64_H
> +#define X265_I386_PIXEL_AARCH64_H
> +
> +void x265_pixel_avg_pp_4x4_neon (pixel* dst, intptr_t dstride, const
> pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_4x8_neon (pixel* dst, intptr_t dstride, const
> pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_4x16_neon (pixel* dst, intptr_t dstride, const
> pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_8x4_neon (pixel* dst, intptr_t dstride, const
> pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_8x8_neon (pixel* dst, intptr_t dstride, const
> pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_8x16_neon (pixel* dst, intptr_t dstride, const
> pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_8x32_neon (pixel* dst, intptr_t dstride, const
> pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_12x16_neon (pixel* dst, intptr_t dstride, const
> pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_16x4_neon (pixel* dst, intptr_t dstride, const
> pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_16x8_neon (pixel* dst, intptr_t dstride, const
> pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_16x12_neon (pixel* dst, intptr_t dstride, const
> pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_16x16_neon (pixel* dst, intptr_t dstride, const
> pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_16x32_neon (pixel* dst, intptr_t dstride, const
> pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_16x64_neon (pixel* dst, intptr_t dstride, const
> pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_24x32_neon (pixel* dst, intptr_t dstride, const
> pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_32x8_neon (pixel* dst, intptr_t dstride, const
> pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_32x16_neon (pixel* dst, intptr_t dstride, const
> pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_32x24_neon (pixel* dst, intptr_t dstride, const
> pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_32x32_neon (pixel* dst, intptr_t dstride, const
> pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_32x64_neon (pixel* dst, intptr_t dstride, const
> pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_48x64_neon (pixel* dst, intptr_t dstride, const
> pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_64x16_neon (pixel* dst, intptr_t dstride, const
> pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_64x32_neon (pixel* dst, intptr_t dstride, const
> pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_64x48_neon (pixel* dst, intptr_t dstride, const
> pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_64x64_neon (pixel* dst, intptr_t dstride, const
> pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +
> +void x265_sad_x3_4x4_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_4x8_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_4x16_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_8x4_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_8x8_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_8x16_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_8x32_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_12x16_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_16x4_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_16x8_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_16x12_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_16x16_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_16x32_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_16x64_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_24x32_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_32x8_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_32x16_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_32x24_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_32x32_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_32x64_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_48x64_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_64x16_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_64x32_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_64x48_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_64x64_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +
> +void x265_sad_x4_4x4_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride,
> int32_t* res);
> +void x265_sad_x4_4x8_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride,
> int32_t* res);
> +void x265_sad_x4_4x16_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride,
> int32_t* res);
> +void x265_sad_x4_8x4_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride,
> int32_t* res);
> +void x265_sad_x4_8x8_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride,
> int32_t* res);
> +void x265_sad_x4_8x16_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride,
> int32_t* res);
> +void x265_sad_x4_8x32_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride,
> int32_t* res);
> +void x265_sad_x4_12x16_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride,
> int32_t* res);
> +void x265_sad_x4_16x4_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride,
> int32_t* res);
> +void x265_sad_x4_16x8_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride,
> int32_t* res);
> +void x265_sad_x4_16x12_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride,
> int32_t* res);
> +void x265_sad_x4_16x16_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride,
> int32_t* res);
> +void x265_sad_x4_16x32_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride,
> int32_t* res);
> +void x265_sad_x4_16x64_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride,
> int32_t* res);
> +void x265_sad_x4_24x32_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride,
> int32_t* res);
> +void x265_sad_x4_32x8_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride,
> int32_t* res);
> +void x265_sad_x4_32x16_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride,
> int32_t* res);
> +void x265_sad_x4_32x24_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride,
> int32_t* res);
> +void x265_sad_x4_32x32_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride,
> int32_t* res);
> +void x265_sad_x4_32x64_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride,
> int32_t* res);
> +void x265_sad_x4_48x64_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride,
> int32_t* res);
> +void x265_sad_x4_64x16_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride,
> int32_t* res);
> +void x265_sad_x4_64x32_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride,
> int32_t* res);
> +void x265_sad_x4_64x48_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride,
> int32_t* res);
> +void x265_sad_x4_64x64_neon(const pixel* fenc, const pixel* fref0, const
> pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride,
> int32_t* res);
> +
> +#endif // ifndef X265_I386_PIXEL_AARCH64_H
> diff --git a/source/common/aarch64/sad-a.S b/source/common/aarch64/sad-a.S
> new file mode 100644
> index 000000000..c27cce5ce
> --- /dev/null
> +++ b/source/common/aarch64/sad-a.S
> @@ -0,0 +1,105 @@
>
> +/*****************************************************************************
> + * Copyright (C) 2020 MulticoreWare, Inc
> + *
> + * Authors: Hongbin Liu <liuhongbin1 at huawei.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111,
> USA.
> + *
> + * This program is also available under a commercial proprietary license.
> + * For more information, contact us at license @ x265.com.
> +
> *****************************************************************************/
> +
> +#include "asm.S"
> +
> +.section .rodata
> +
> +.align 4
> +
> +.text
> +
> +.macro SAD_X_START_8 x
> +    ld1             {v0.8b}, [x0], x9
> +.if \x == 3
> +    ld1             {v1.8b}, [x1], x4
> +    ld1             {v2.8b}, [x2], x4
> +    ld1             {v3.8b}, [x3], x4
> +.elseif \x == 4
> +    ld1             {v1.8b}, [x1], x5
> +    ld1             {v2.8b}, [x2], x5
> +    ld1             {v3.8b}, [x3], x5
> +    ld1             {v4.8b}, [x4], x5
> +.endif
> +    uabdl           v16.8h, v0.8b, v1.8b
> +    uabdl           v17.8h, v0.8b, v2.8b
> +    uabdl           v18.8h, v0.8b, v3.8b
> +.if \x == 4
> +    uabdl           v19.8h, v0.8b, v4.8b
> +.endif
> +.endm
> +
> +.macro SAD_X_8 x
> +    ld1             {v0.8b}, [x0], x9
> +.if \x == 3
> +    ld1             {v1.8b}, [x1], x4
> +    ld1             {v2.8b}, [x2], x4
> +    ld1             {v3.8b}, [x3], x4
> +.elseif \x == 4
> +    ld1             {v1.8b}, [x1], x5
> +    ld1             {v2.8b}, [x2], x5
> +    ld1             {v3.8b}, [x3], x5
> +    ld1             {v4.8b}, [x4], x5
> +.endif
> +    uabal           v16.8h, v0.8b, v1.8b
> +    uabal           v17.8h, v0.8b, v2.8b
> +    uabal           v18.8h, v0.8b, v3.8b
> +.if \x == 4
> +    uabal           v19.8h, v0.8b, v4.8b
> +.endif
> +.endm
> +
> +.macro SAD_X_8xN x, h
> +function x265_sad_x\x\()_8x\h\()_neon
> +    mov             x9, #FENC_STRIDE
> +    SAD_X_START_8 \x
> +.rept \h - 1
> +    SAD_X_8 \x
> +.endr
> +    uaddlv          s0, v16.8h
> +    uaddlv          s1, v17.8h
> +    uaddlv          s2, v18.8h
> +.if \x == 4
> +    uaddlv          s3, v19.8h
> +.endif
> +
> +.if \x == 3
> +    stp             s0, s1, [x5]
> +    str             s2, [x5, #8]
> +.elseif \x == 4
> +    stp             s0, s1, [x6]
> +    stp             s2, s3, [x6, #8]
> +.endif
> +    ret
> +endfunc
> +.endm
> +
> +SAD_X_8xN 3 4
> +SAD_X_8xN 3 8
> +SAD_X_8xN 3 16
> +SAD_X_8xN 3 32
> +
> +SAD_X_8xN 4 4
> +SAD_X_8xN 4 8
> +SAD_X_8xN 4 16
> +SAD_X_8xN 4 32
> --
> 2.21.0.windows.1
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>


-- 
Regards,
*Aruna Matheswaran,*
Video Codec Engineer,
Media & AI analytics BU,
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20200319/44a0be4f/attachment-0001.html>


More information about the x265-devel mailing list