[x265] [PATCH] Add aarch64 support - Part 1
Aruna Matheswaran
aruna at multicorewareinc.com
Thu Mar 19 09:16:56 CET 2020
Pushed the patch series to default. Thanks!
On Thu, Feb 27, 2020 at 8:02 AM Xiyuan Wang <wangxiyuan1007 at gmail.com>
wrote:
> From: wangxiyuan <wangxiyuan at huawei.com>
>
> This patch adds some common assembly optimization functions for the aarch64
> platform. These functions won't work until the Part 2 patch is merged.
> ---
> source/common/aarch64/asm-primitives.cpp | 219 ++++++++++++
> source/common/aarch64/asm.S | 69 ++++
> source/common/aarch64/ipfilter8.S | 414 ++++++++++++++++++++++
> source/common/aarch64/ipfilter8.h | 55 +++
> source/common/aarch64/mc-a.S | 63 ++++
> source/common/aarch64/pixel-util.S | 419 +++++++++++++++++++++++
> source/common/aarch64/pixel-util.h | 40 +++
> source/common/aarch64/pixel.h | 105 ++++++
> source/common/aarch64/sad-a.S | 105 ++++++
> 9 files changed, 1489 insertions(+)
> create mode 100644 source/common/aarch64/asm-primitives.cpp
> create mode 100644 source/common/aarch64/asm.S
> create mode 100644 source/common/aarch64/ipfilter8.S
> create mode 100644 source/common/aarch64/ipfilter8.h
> create mode 100644 source/common/aarch64/mc-a.S
> create mode 100644 source/common/aarch64/pixel-util.S
> create mode 100644 source/common/aarch64/pixel-util.h
> create mode 100644 source/common/aarch64/pixel.h
> create mode 100644 source/common/aarch64/sad-a.S
>
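For anyone reading Part 1 in isolation: nothing here is reachable until the Part 2
glue (build system and primitive dispatch) lands. A minimal sketch of the expected
call pattern follows, assuming the usual x265 primitive-setup flow; the wrapper name
and anything not shown in this patch are illustrative assumptions, not code from the
series:

    // Hypothetical dispatch glue (expected to arrive with Part 2), shown only to
    // make the intent of asm-primitives.cpp concrete.
    #include "primitives.h"   // EncoderPrimitives (x265 tree)
    #include "x265.h"         // X265_CPU_NEON

    namespace X265_NS {
    void setupPrimitivesAarch64(EncoderPrimitives& asmPrims, EncoderPrimitives& cPrims, int cpuMask)
    {
        if (cpuMask & X265_CPU_NEON)
        {
            setupAssemblyPrimitives(asmPrims, cpuMask);        // NEON overrides added below
            setupAliasCPrimitives(cPrims, asmPrims, cpuMask);  // re-alias luma_vsp to C (see workaround note)
        }
    }
    }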
> diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
> new file mode 100644
> index 000000000..6fe8c968c
> --- /dev/null
> +++ b/source/common/aarch64/asm-primitives.cpp
> @@ -0,0 +1,219 @@
>
> +/*****************************************************************************
> + * Copyright (C) 2020 MulticoreWare, Inc
> + *
> + * Authors: Hongbin Liu <liuhongbin1 at huawei.com>
> + * Yimeng Su <yimeng.su at huawei.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111,
> USA.
> + *
> + * This program is also available under a commercial proprietary license.
> + * For more information, contact us at license @ x265.com.
> +
> *****************************************************************************/
> +
> +#include "common.h"
> +#include "primitives.h"
> +#include "x265.h"
> +#include "cpu.h"
> +
> +
> +#if defined(__GNUC__)
> +#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
> +#endif
> +
> +#define GCC_4_9_0 40900
> +#define GCC_5_1_0 50100
> +
> +extern "C" {
> +#include "pixel.h"
> +#include "pixel-util.h"
> +#include "ipfilter8.h"
> +}
> +
> +namespace X265_NS {
> +// private x265 namespace
> +
> +
> +template<int size>
> +void interp_8tap_hv_pp_cpu(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY)
> +{
> + ALIGN_VAR_32(int16_t, immed[MAX_CU_SIZE * (MAX_CU_SIZE + NTAPS_LUMA - 1)]);
> + const int halfFilterSize = NTAPS_LUMA >> 1;
> + const int immedStride = MAX_CU_SIZE;
> +
> + primitives.pu[size].luma_hps(src, srcStride, immed, immedStride, idxX, 1);
> + primitives.pu[size].luma_vsp(immed + (halfFilterSize - 1) * immedStride, immedStride, dst, dstStride, idxY);
> +}
> +
> +
> +/* Temporary workaround: the luma_vsp assembly primitives are not complete yet,
> + * but interp_8tap_hv_pp_cpu mixes C and assembly primitives; without this
> + * aliasing a segmentation fault occurs. */
> +void setupAliasCPrimitives(EncoderPrimitives &cp, EncoderPrimitives &asmp, int cpuMask)
> +{
> + if (cpuMask & X265_CPU_NEON)
> + {
> + asmp.pu[LUMA_8x4].luma_vsp = cp.pu[LUMA_8x4].luma_vsp;
> + asmp.pu[LUMA_8x8].luma_vsp = cp.pu[LUMA_8x8].luma_vsp;
> + asmp.pu[LUMA_8x16].luma_vsp = cp.pu[LUMA_8x16].luma_vsp;
> + asmp.pu[LUMA_8x32].luma_vsp = cp.pu[LUMA_8x32].luma_vsp;
> + asmp.pu[LUMA_12x16].luma_vsp = cp.pu[LUMA_12x16].luma_vsp;
> +#if !AUTO_VECTORIZE || GCC_VERSION < GCC_5_1_0 /* gcc_version < gcc-5.1.0 */
> + asmp.pu[LUMA_16x4].luma_vsp = cp.pu[LUMA_16x4].luma_vsp;
> + asmp.pu[LUMA_16x8].luma_vsp = cp.pu[LUMA_16x8].luma_vsp;
> + asmp.pu[LUMA_16x12].luma_vsp = cp.pu[LUMA_16x12].luma_vsp;
> + asmp.pu[LUMA_16x16].luma_vsp = cp.pu[LUMA_16x16].luma_vsp;
> + asmp.pu[LUMA_16x32].luma_vsp = cp.pu[LUMA_16x32].luma_vsp;
> + asmp.pu[LUMA_16x64].luma_vsp = cp.pu[LUMA_16x64].luma_vsp;
> + asmp.pu[LUMA_32x16].luma_vsp = cp.pu[LUMA_32x16].luma_vsp;
> + asmp.pu[LUMA_32x24].luma_vsp = cp.pu[LUMA_32x24].luma_vsp;
> + asmp.pu[LUMA_32x32].luma_vsp = cp.pu[LUMA_32x32].luma_vsp;
> + asmp.pu[LUMA_32x64].luma_vsp = cp.pu[LUMA_32x64].luma_vsp;
> + asmp.pu[LUMA_48x64].luma_vsp = cp.pu[LUMA_48x64].luma_vsp;
> + asmp.pu[LUMA_64x16].luma_vsp = cp.pu[LUMA_64x16].luma_vsp;
> + asmp.pu[LUMA_64x32].luma_vsp = cp.pu[LUMA_64x32].luma_vsp;
> + asmp.pu[LUMA_64x48].luma_vsp = cp.pu[LUMA_64x48].luma_vsp;
> + asmp.pu[LUMA_64x64].luma_vsp = cp.pu[LUMA_64x64].luma_vsp;
> +#if !AUTO_VECTORIZE || GCC_VERSION < GCC_4_9_0 /* gcc_version < gcc-4.9.0 */
> + asmp.pu[LUMA_4x4].luma_vsp = cp.pu[LUMA_4x4].luma_vsp;
> + asmp.pu[LUMA_4x8].luma_vsp = cp.pu[LUMA_4x8].luma_vsp;
> + asmp.pu[LUMA_4x16].luma_vsp = cp.pu[LUMA_4x16].luma_vsp;
> + asmp.pu[LUMA_24x32].luma_vsp = cp.pu[LUMA_24x32].luma_vsp;
> + asmp.pu[LUMA_32x8].luma_vsp = cp.pu[LUMA_32x8].luma_vsp;
> +#endif
> +#endif
> + }
> +}
> +
> +
> +void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask)
> +{
> + if (cpuMask & X265_CPU_NEON)
> + {
> + p.pu[LUMA_4x4].satd = PFX(pixel_satd_4x4_neon);
> + p.pu[LUMA_4x8].satd = PFX(pixel_satd_4x8_neon);
> + p.pu[LUMA_4x16].satd = PFX(pixel_satd_4x16_neon);
> + p.pu[LUMA_8x4].satd = PFX(pixel_satd_8x4_neon);
> + p.pu[LUMA_8x8].satd = PFX(pixel_satd_8x8_neon);
> + p.pu[LUMA_12x16].satd = PFX(pixel_satd_12x16_neon);
> +
> + p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd = PFX(pixel_satd_4x4_neon);
> + p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd = PFX(pixel_satd_4x8_neon);
> + p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd = PFX(pixel_satd_4x16_neon);
> + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd = PFX(pixel_satd_8x4_neon);
> + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd = PFX(pixel_satd_8x8_neon);
> + p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd = PFX(pixel_satd_12x16_neon);
> +
> + p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd = PFX(pixel_satd_4x4_neon);
> + p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd = PFX(pixel_satd_4x8_neon);
> + p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd = PFX(pixel_satd_4x16_neon);
> + p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd = PFX(pixel_satd_4x32_neon);
> + p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd = PFX(pixel_satd_8x4_neon);
> + p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd = PFX(pixel_satd_8x8_neon);
> + p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd = PFX(pixel_satd_12x32_neon);
> +
> + p.pu[LUMA_4x4].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_4x4_neon);
> + p.pu[LUMA_4x8].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_4x8_neon);
> + p.pu[LUMA_4x16].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_4x16_neon);
> + p.pu[LUMA_8x4].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_8x4_neon);
> + p.pu[LUMA_8x8].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_8x8_neon);
> + p.pu[LUMA_8x16].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_8x16_neon);
> + p.pu[LUMA_8x32].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_8x32_neon);
> +
> + p.pu[LUMA_4x4].pixelavg_pp[ALIGNED] = PFX(pixel_avg_pp_4x4_neon);
> + p.pu[LUMA_4x8].pixelavg_pp[ALIGNED] = PFX(pixel_avg_pp_4x8_neon);
> + p.pu[LUMA_4x16].pixelavg_pp[ALIGNED] = PFX(pixel_avg_pp_4x16_neon);
> + p.pu[LUMA_8x4].pixelavg_pp[ALIGNED] = PFX(pixel_avg_pp_8x4_neon);
> + p.pu[LUMA_8x8].pixelavg_pp[ALIGNED] = PFX(pixel_avg_pp_8x8_neon);
> + p.pu[LUMA_8x16].pixelavg_pp[ALIGNED] = PFX(pixel_avg_pp_8x16_neon);
> + p.pu[LUMA_8x32].pixelavg_pp[ALIGNED] = PFX(pixel_avg_pp_8x32_neon);
> +
> + p.pu[LUMA_8x4].sad_x3 = PFX(sad_x3_8x4_neon);
> + p.pu[LUMA_8x8].sad_x3 = PFX(sad_x3_8x8_neon);
> + p.pu[LUMA_8x16].sad_x3 = PFX(sad_x3_8x16_neon);
> + p.pu[LUMA_8x32].sad_x3 = PFX(sad_x3_8x32_neon);
> +
> + p.pu[LUMA_8x4].sad_x4 = PFX(sad_x4_8x4_neon);
> + p.pu[LUMA_8x8].sad_x4 = PFX(sad_x4_8x8_neon);
> + p.pu[LUMA_8x16].sad_x4 = PFX(sad_x4_8x16_neon);
> + p.pu[LUMA_8x32].sad_x4 = PFX(sad_x4_8x32_neon);
> +
> + // quant
> + p.quant = PFX(quant_neon);
> + // luma_hps
> + p.pu[LUMA_4x4].luma_hps = PFX(interp_8tap_horiz_ps_4x4_neon);
> + p.pu[LUMA_4x8].luma_hps = PFX(interp_8tap_horiz_ps_4x8_neon);
> + p.pu[LUMA_4x16].luma_hps = PFX(interp_8tap_horiz_ps_4x16_neon);
> + p.pu[LUMA_8x4].luma_hps = PFX(interp_8tap_horiz_ps_8x4_neon);
> + p.pu[LUMA_8x8].luma_hps = PFX(interp_8tap_horiz_ps_8x8_neon);
> + p.pu[LUMA_8x16].luma_hps = PFX(interp_8tap_horiz_ps_8x16_neon);
> + p.pu[LUMA_8x32].luma_hps = PFX(interp_8tap_horiz_ps_8x32_neon);
> + p.pu[LUMA_12x16].luma_hps = PFX(interp_8tap_horiz_ps_12x16_neon);
> + p.pu[LUMA_24x32].luma_hps = PFX(interp_8tap_horiz_ps_24x32_neon);
> +#if !AUTO_VECTORIZE || GCC_VERSION < GCC_5_1_0 /* gcc_version < gcc-5.1.0 */
> + p.pu[LUMA_16x4].luma_hps = PFX(interp_8tap_horiz_ps_16x4_neon);
> + p.pu[LUMA_16x8].luma_hps = PFX(interp_8tap_horiz_ps_16x8_neon);
> + p.pu[LUMA_16x12].luma_hps = PFX(interp_8tap_horiz_ps_16x12_neon);
> + p.pu[LUMA_16x16].luma_hps = PFX(interp_8tap_horiz_ps_16x16_neon);
> + p.pu[LUMA_16x32].luma_hps = PFX(interp_8tap_horiz_ps_16x32_neon);
> + p.pu[LUMA_16x64].luma_hps = PFX(interp_8tap_horiz_ps_16x64_neon);
> + p.pu[LUMA_32x8].luma_hps = PFX(interp_8tap_horiz_ps_32x8_neon);
> + p.pu[LUMA_32x16].luma_hps = PFX(interp_8tap_horiz_ps_32x16_neon);
> + p.pu[LUMA_32x24].luma_hps = PFX(interp_8tap_horiz_ps_32x24_neon);
> + p.pu[LUMA_32x32].luma_hps = PFX(interp_8tap_horiz_ps_32x32_neon);
> + p.pu[LUMA_32x64].luma_hps = PFX(interp_8tap_horiz_ps_32x64_neon);
> + p.pu[LUMA_48x64].luma_hps = PFX(interp_8tap_horiz_ps_48x64_neon);
> + p.pu[LUMA_64x16].luma_hps = PFX(interp_8tap_horiz_ps_64x16_neon);
> + p.pu[LUMA_64x32].luma_hps = PFX(interp_8tap_horiz_ps_64x32_neon);
> + p.pu[LUMA_64x48].luma_hps = PFX(interp_8tap_horiz_ps_64x48_neon);
> + p.pu[LUMA_64x64].luma_hps = PFX(interp_8tap_horiz_ps_64x64_neon);
> +#endif
> +
> + p.pu[LUMA_8x4].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_8x4>;
> + p.pu[LUMA_8x8].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_8x8>;
> + p.pu[LUMA_8x16].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_8x16>;
> + p.pu[LUMA_8x32].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_8x32>;
> + p.pu[LUMA_12x16].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_12x16>;
> +#if !AUTO_VECTORIZE || GCC_VERSION < GCC_5_1_0 /* gcc_version < gcc-5.1.0 */
> + p.pu[LUMA_16x4].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x4>;
> + p.pu[LUMA_16x8].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x8>;
> + p.pu[LUMA_16x12].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x12>;
> + p.pu[LUMA_16x16].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x16>;
> + p.pu[LUMA_16x32].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x32>;
> + p.pu[LUMA_16x64].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_16x64>;
> + p.pu[LUMA_32x16].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_32x16>;
> + p.pu[LUMA_32x24].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_32x24>;
> + p.pu[LUMA_32x32].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_32x32>;
> + p.pu[LUMA_32x64].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_32x64>;
> + p.pu[LUMA_48x64].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_48x64>;
> + p.pu[LUMA_64x16].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_64x16>;
> + p.pu[LUMA_64x32].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_64x32>;
> + p.pu[LUMA_64x48].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_64x48>;
> + p.pu[LUMA_64x64].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_64x64>;
> +#if !AUTO_VECTORIZE || GCC_VERSION < GCC_4_9_0 /* gcc_version < gcc-4.9.0 */
> + p.pu[LUMA_4x4].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_4x4>;
> + p.pu[LUMA_4x8].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_4x8>;
> + p.pu[LUMA_4x16].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_4x16>;
> + p.pu[LUMA_24x32].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_24x32>;
> + p.pu[LUMA_32x8].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_32x8>;
> +#endif
> +#endif
> +
> +#if !HIGH_BIT_DEPTH
> + p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_4x4_neon);
> +#endif // !HIGH_BIT_DEPTH
> +
> + }
> +}
> +} // namespace X265_NS
> diff --git a/source/common/aarch64/asm.S b/source/common/aarch64/asm.S
> new file mode 100644
> index 000000000..5f020a11a
> --- /dev/null
> +++ b/source/common/aarch64/asm.S
> @@ -0,0 +1,69 @@
>
> +/*****************************************************************************
> + * Copyright (C) 2020 MulticoreWare, Inc
> + *
> + * Authors: Hongbin Liu <liuhongbin1 at huawei.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111,
> USA.
> + *
> + * This program is also available under a commercial proprietary license.
> + * For more information, contact us at license @ x265.com.
> +
> *****************************************************************************/
> +
> +.arch armv8-a
> +
> +#ifdef PREFIX
> +#define EXTERN_ASM _
> +#else
> +#define EXTERN_ASM
> +#endif
> +
> +#ifdef __ELF__
> +#define ELF
> +#else
> +#define ELF @
> +#endif
> +
> +#define HAVE_AS_FUNC 1
> +
> +#if HAVE_AS_FUNC
> +#define FUNC
> +#else
> +#define FUNC @
> +#endif
> +
> +.macro function name, export=1
> + .macro endfunc
> +ELF .size \name, . - \name
> +FUNC .endfunc
> + .purgem endfunc
> + .endm
> + .align 2
> +.if \export == 1
> + .global EXTERN_ASM\name
> +ELF .hidden EXTERN_ASM\name
> +ELF .type EXTERN_ASM\name, %function
> +FUNC .func EXTERN_ASM\name
> +EXTERN_ASM\name:
> +.else
> +ELF .hidden \name
> +ELF .type \name, %function
> +FUNC .func \name
> +\name:
> +.endif
> +.endm
> +
> +
> +#define FENC_STRIDE 64
> +#define FDEC_STRIDE 32
> diff --git a/source/common/aarch64/ipfilter8.S b/source/common/aarch64/ipfilter8.S
> new file mode 100644
> index 000000000..908c7db46
> --- /dev/null
> +++ b/source/common/aarch64/ipfilter8.S
> @@ -0,0 +1,414 @@
>
> +/*****************************************************************************
> + * Copyright (C) 2020 MulticoreWare, Inc
> + *
> + * Authors: Yimeng Su <yimeng.su at huawei.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111,
> USA.
> + *
> + * This program is also available under a commercial proprietary license.
> + * For more information, contact us at license @ x265.com.
> +
> *****************************************************************************/
> +
> +#include "asm.S"
> +
> +.section .rodata
> +
> +.align 4
> +
> +.text
> +
> +
> +
> +.macro qpel_filter_0_32b
> + movi v24.8h, #64
> + uxtl v19.8h, v5.8b
> + smull v17.4s, v19.4h, v24.4h
> + smull2 v18.4s, v19.8h, v24.8h
> +.endm
> +
> +.macro qpel_filter_1_32b
> + movi v16.8h, #58
> + uxtl v19.8h, v5.8b
> + smull v17.4s, v19.4h, v16.4h
> + smull2 v18.4s, v19.8h, v16.8h
> +
> + movi v24.8h, #10
> + uxtl v21.8h, v1.8b
> + smull v19.4s, v21.4h, v24.4h
> + smull2 v20.4s, v21.8h, v24.8h
> +
> + movi v16.8h, #17
> + uxtl v23.8h, v2.8b
> + smull v21.4s, v23.4h, v16.4h
> + smull2 v22.4s, v23.8h, v16.8h
> +
> + movi v24.8h, #5
> + uxtl v1.8h, v6.8b
> + smull v23.4s, v1.4h, v24.4h
> + smull2 v16.4s, v1.8h, v24.8h
> +
> + sub v17.4s, v17.4s, v19.4s
> + sub v18.4s, v18.4s, v20.4s
> +
> + uxtl v1.8h, v4.8b
> + sshll v19.4s, v1.4h, #2
> + sshll2 v20.4s, v1.8h, #2
> +
> + add v17.4s, v17.4s, v21.4s
> + add v18.4s, v18.4s, v22.4s
> +
> + uxtl v1.8h, v0.8b
> + uxtl v2.8h, v3.8b
> + ssubl v21.4s, v2.4h, v1.4h
> + ssubl2 v22.4s, v2.8h, v1.8h
> +
> + add v17.4s, v17.4s, v19.4s
> + add v18.4s, v18.4s, v20.4s
> + sub v21.4s, v21.4s, v23.4s
> + sub v22.4s, v22.4s, v16.4s
> + add v17.4s, v17.4s, v21.4s
> + add v18.4s, v18.4s, v22.4s
> +.endm
> +
> +.macro qpel_filter_2_32b
> + movi v16.4s, #11
> + uxtl v19.8h, v5.8b
> + uxtl v20.8h, v2.8b
> + saddl v17.4s, v19.4h, v20.4h
> + saddl2 v18.4s, v19.8h, v20.8h
> +
> + uxtl v21.8h, v1.8b
> + uxtl v22.8h, v6.8b
> + saddl v19.4s, v21.4h, v22.4h
> + saddl2 v20.4s, v21.8h, v22.8h
> +
> + mul v19.4s, v19.4s, v16.4s
> + mul v20.4s, v20.4s, v16.4s
> +
> + movi v16.4s, #40
> + mul v17.4s, v17.4s, v16.4s
> + mul v18.4s, v18.4s, v16.4s
> +
> + uxtl v21.8h, v4.8b
> + uxtl v22.8h, v3.8b
> + saddl v23.4s, v21.4h, v22.4h
> + saddl2 v16.4s, v21.8h, v22.8h
> +
> + uxtl v1.8h, v0.8b
> + uxtl v2.8h, v7.8b
> + saddl v21.4s, v1.4h, v2.4h
> + saddl2 v22.4s, v1.8h, v2.8h
> +
> + shl v23.4s, v23.4s, #2
> + shl v16.4s, v16.4s, #2
> +
> + add v19.4s, v19.4s, v21.4s
> + add v20.4s, v20.4s, v22.4s
> + add v17.4s, v17.4s, v23.4s
> + add v18.4s, v18.4s, v16.4s
> + sub v17.4s, v17.4s, v19.4s
> + sub v18.4s, v18.4s, v20.4s
> +.endm
> +
> +.macro qpel_filter_3_32b
> + movi v16.8h, #17
> + movi v24.8h, #5
> +
> + uxtl v19.8h, v5.8b
> + smull v17.4s, v19.4h, v16.4h
> + smull2 v18.4s, v19.8h, v16.8h
> +
> + uxtl v21.8h, v1.8b
> + smull v19.4s, v21.4h, v24.4h
> + smull2 v20.4s, v21.8h, v24.8h
> +
> + movi v16.8h, #58
> + uxtl v23.8h, v2.8b
> + smull v21.4s, v23.4h, v16.4h
> + smull2 v22.4s, v23.8h, v16.8h
> +
> + movi v24.8h, #10
> + uxtl v1.8h, v6.8b
> + smull v23.4s, v1.4h, v24.4h
> + smull2 v16.4s, v1.8h, v24.8h
> +
> + sub v17.4s, v17.4s, v19.4s
> + sub v18.4s, v18.4s, v20.4s
> +
> + uxtl v1.8h, v3.8b
> + sshll v19.4s, v1.4h, #2
> + sshll2 v20.4s, v1.8h, #2
> +
> + add v17.4s, v17.4s, v21.4s
> + add v18.4s, v18.4s, v22.4s
> +
> + uxtl v1.8h, v4.8b
> + uxtl v2.8h, v7.8b
> + ssubl v21.4s, v1.4h, v2.4h
> + ssubl2 v22.4s, v1.8h, v2.8h
> +
> + add v17.4s, v17.4s, v19.4s
> + add v18.4s, v18.4s, v20.4s
> + sub v21.4s, v21.4s, v23.4s
> + sub v22.4s, v22.4s, v16.4s
> + add v17.4s, v17.4s, v21.4s
> + add v18.4s, v18.4s, v22.4s
> +.endm
> +
> +
> +
> +
> +.macro vextin8
> + ld1 {v3.16b}, [x11], #16
> + mov v7.d[0], v3.d[1]
> + ext v0.8b, v3.8b, v7.8b, #1
> + ext v4.8b, v3.8b, v7.8b, #2
> + ext v1.8b, v3.8b, v7.8b, #3
> + ext v5.8b, v3.8b, v7.8b, #4
> + ext v2.8b, v3.8b, v7.8b, #5
> + ext v6.8b, v3.8b, v7.8b, #6
> + ext v3.8b, v3.8b, v7.8b, #7
> +.endm
> +
> +
> +
> +// void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
> +.macro HPS_FILTER a b filterhps
> + mov w12, #8192
> + mov w6, w10
> + sub x3, x3, #\a
> + lsl x3, x3, #1
> + mov w9, #\a
> + cmp w9, #4
> + b.eq 14f
> + cmp w9, #12
> + b.eq 15f
> + b 7f
> +14:
> + HPS_FILTER_4 \a \b \filterhps
> + b 10f
> +15:
> + HPS_FILTER_12 \a \b \filterhps
> + b 10f
> +7:
> + cmp w5, #0
> + b.eq 8f
> + cmp w5, #1
> + b.eq 9f
> +8:
> +loop1_hps_\filterhps\()_\a\()x\b\()_rowext0:
> + mov w7, #\a
> + lsr w7, w7, #3
> + mov x11, x0
> + sub x11, x11, #4
> +loop2_hps_\filterhps\()_\a\()x\b\()_rowext0:
> + vextin8
> + \filterhps
> + dup v16.4s, w12
> + sub v17.4s, v17.4s, v16.4s
> + sub v18.4s, v18.4s, v16.4s
> + xtn v0.4h, v17.4s
> + xtn2 v0.8h, v18.4s
> + st1 {v0.8h}, [x2], #16
> + subs w7, w7, #1
> + sub x11, x11, #8
> + b.ne loop2_hps_\filterhps\()_\a\()x\b\()_rowext0
> + subs w6, w6, #1
> + add x0, x0, x1
> + add x2, x2, x3
> + b.ne loop1_hps_\filterhps\()_\a\()x\b\()_rowext0
> + b 10f
> +9:
> +loop3_hps_\filterhps\()_\a\()x\b\()_rowext1:
> + mov w7, #\a
> + lsr w7, w7, #3
> + mov x11, x0
> + sub x11, x11, #4
> +loop4_hps_\filterhps\()_\a\()x\b\()_rowext1:
> + vextin8
> + \filterhps
> + dup v16.4s, w12
> + sub v17.4s, v17.4s, v16.4s
> + sub v18.4s, v18.4s, v16.4s
> + xtn v0.4h, v17.4s
> + xtn2 v0.8h, v18.4s
> + st1 {v0.8h}, [x2], #16
> + subs w7, w7, #1
> + sub x11, x11, #8
> + b.ne loop4_hps_\filterhps\()_\a\()x\b\()_rowext1
> + subs w6, w6, #1
> + add x0, x0, x1
> + add x2, x2, x3
> + b.ne loop3_hps_\filterhps\()_\a\()x\b\()_rowext1
> +10:
> +.endm
> +
> +.macro HPS_FILTER_4 w h filterhps
> + cmp w5, #0
> + b.eq 11f
> + cmp w5, #1
> + b.eq 12f
> +11:
> +loop4_hps_\filterhps\()_\w\()x\h\()_rowext0:
> + mov x11, x0
> + sub x11, x11, #4
> + vextin8
> + \filterhps
> + dup v16.4s, w12
> + sub v17.4s, v17.4s, v16.4s
> + xtn v0.4h, v17.4s
> + st1 {v0.4h}, [x2], #8
> + sub x11, x11, #8
> + subs w6, w6, #1
> + add x0, x0, x1
> + add x2, x2, x3
> + b.ne loop4_hps_\filterhps\()_\w\()x\h\()_rowext0
> + b 13f
> +12:
> +loop5_hps_\filterhps\()_\w\()x\h\()_rowext1:
> + mov x11, x0
> + sub x11, x11, #4
> + vextin8
> + \filterhps
> + dup v16.4s, w12
> + sub v17.4s, v17.4s, v16.4s
> + xtn v0.4h, v17.4s
> + st1 {v0.4h}, [x2], #8
> + sub x11, x11, #8
> + subs w6, w6, #1
> + add x0, x0, x1
> + add x2, x2, x3
> + b.ne loop5_hps_\filterhps\()_\w\()x\h\()_rowext1
> +13:
> +.endm
> +
> +.macro HPS_FILTER_12 w h filterhps
> + cmp w5, #0
> + b.eq 14f
> + cmp w5, #1
> + b.eq 15f
> +14:
> +loop12_hps_\filterhps\()_\w\()x\h\()_rowext0:
> + mov x11, x0
> + sub x11, x11, #4
> + vextin8
> + \filterhps
> + dup v16.4s, w12
> + sub v17.4s, v17.4s, v16.4s
> + sub v18.4s, v18.4s, v16.4s
> + xtn v0.4h, v17.4s
> + xtn2 v0.8h, v18.4s
> + st1 {v0.8h}, [x2], #16
> + sub x11, x11, #8
> +
> + vextin8
> + \filterhps
> + dup v16.4s, w12
> + sub v17.4s, v17.4s, v16.4s
> + xtn v0.4h, v17.4s
> + st1 {v0.4h}, [x2], #8
> + add x2, x2, x3
> + subs w6, w6, #1
> + add x0, x0, x1
> + b.ne loop12_hps_\filterhps\()_\w\()x\h\()_rowext0
> + b 16f
> +15:
> +loop12_hps_\filterhps\()_\w\()x\h\()_rowext1:
> + mov x11, x0
> + sub x11, x11, #4
> + vextin8
> + \filterhps
> + dup v16.4s, w12
> + sub v17.4s, v17.4s, v16.4s
> + sub v18.4s, v18.4s, v16.4s
> + xtn v0.4h, v17.4s
> + xtn2 v0.8h, v18.4s
> + st1 {v0.8h}, [x2], #16
> + sub x11, x11, #8
> +
> + vextin8
> + \filterhps
> + dup v16.4s, w12
> + sub v17.4s, v17.4s, v16.4s
> + xtn v0.4h, v17.4s
> + st1 {v0.4h}, [x2], #8
> + add x2, x2, x3
> + subs w6, w6, #1
> + add x0, x0, x1
> + b.ne loop12_hps_\filterhps\()_\w\()x\h\()_rowext1
> +16:
> +.endm
> +
> +// void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
> +.macro LUMA_HPS w h
> +function x265_interp_8tap_horiz_ps_\w\()x\h\()_neon
> + mov w10, #\h
> + cmp w5, #0
> + b.eq 6f
> + sub x0, x0, x1, lsl #2
> +
> + add x0, x0, x1
> + add w10, w10, #7
> +6:
> + cmp w4, #0
> + b.eq 0f
> + cmp w4, #1
> + b.eq 1f
> + cmp w4, #2
> + b.eq 2f
> + cmp w4, #3
> + b.eq 3f
> +0:
> + HPS_FILTER \w \h qpel_filter_0_32b
> + b 5f
> +1:
> + HPS_FILTER \w \h qpel_filter_1_32b
> + b 5f
> +2:
> + HPS_FILTER \w \h qpel_filter_2_32b
> + b 5f
> +3:
> + HPS_FILTER \w \h qpel_filter_3_32b
> + b 5f
> +5:
> + ret
> +endfunc
> +.endm
> +
> +LUMA_HPS 4 4
> +LUMA_HPS 4 8
> +LUMA_HPS 4 16
> +LUMA_HPS 8 4
> +LUMA_HPS 8 8
> +LUMA_HPS 8 16
> +LUMA_HPS 8 32
> +LUMA_HPS 12 16
> +LUMA_HPS 16 4
> +LUMA_HPS 16 8
> +LUMA_HPS 16 12
> +LUMA_HPS 16 16
> +LUMA_HPS 16 32
> +LUMA_HPS 16 64
> +LUMA_HPS 24 32
> +LUMA_HPS 32 8
> +LUMA_HPS 32 16
> +LUMA_HPS 32 24
> +LUMA_HPS 32 32
> +LUMA_HPS 32 64
> +LUMA_HPS 48 64
> +LUMA_HPS 64 16
> +LUMA_HPS 64 32
> +LUMA_HPS 64 48
> +LUMA_HPS 64 64
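The kernels above are dense, so here is a hedged C sketch of what each LUMA_HPS
WxH entry computes on the 8-bit path: an 8-tap horizontal filter writing 16-bit
intermediates with the internal offset subtracted and no shift (matching the
#8192 subtraction and xtn narrowing above), plus the 3-row top extension when
isRowExt is set. The table holds the standard HEVC luma taps; the names and the
explicit width/height parameters are illustrative, not x265's exact internals:

    #include <stdint.h>

    static const int g_luma_taps[4][8] = {
        {  0, 0,   0, 64,  0,   0, 0,  0 },
        { -1, 4, -10, 58, 17,  -5, 1,  0 },
        { -1, 4, -11, 40, 40, -11, 4, -1 },
        {  0, 1,  -5, 17, 58, -10, 4, -1 },
    };

    static void interp_horiz_ps_ref(const uint8_t* src, intptr_t srcStride,
                                    int16_t* dst, intptr_t dstStride,
                                    int coeffIdx, int isRowExt, int w, int h)
    {
        const int* c = g_luma_taps[coeffIdx];
        if (isRowExt)                  // extra rows for a following vertical pass
        {
            src -= 3 * srcStride;      // halfFilterSize - 1
            h += 7;                    // NTAPS_LUMA - 1
        }
        for (int y = 0; y < h; y++, src += srcStride, dst += dstStride)
            for (int x = 0; x < w; x++)
            {
                int sum = 0;
                for (int k = 0; k < 8; k++)
                    sum += c[k] * src[x + k - 3];  // taps centred as in vextin8
                dst[x] = (int16_t)(sum - 8192);    // internal-precision offset, shift 0 at 8-bit depth
            }
    }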
> diff --git a/source/common/aarch64/ipfilter8.h b/source/common/aarch64/ipfilter8.h
> new file mode 100644
> index 000000000..f9ed91e2e
> --- /dev/null
> +++ b/source/common/aarch64/ipfilter8.h
> @@ -0,0 +1,55 @@
>
> +/*****************************************************************************
> + * Copyright (C) 2020 MulticoreWare, Inc
> + *
> + * Authors: Yimeng Su <yimeng.su at huawei.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111,
> USA.
> + *
> + * This program is also available under a commercial proprietary license.
> + * For more information, contact us at license @ x265.com.
> +
> *****************************************************************************/
> +
> +#ifndef X265_IPFILTER8_AARCH64_H
> +#define X265_IPFILTER8_AARCH64_H
> +
> +
> +void x265_interp_8tap_horiz_ps_4x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_4x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_4x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_8x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_8x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_8x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_8x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_12x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_16x4_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_16x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_16x12_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_16x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_16x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_16x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_24x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_32x8_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_32x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_32x24_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_32x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_32x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_48x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_64x16_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_64x32_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_64x48_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +void x265_interp_8tap_horiz_ps_64x64_neon(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
> +
> +
> +#endif // ifndef X265_IPFILTER8_AARCH64_H
> diff --git a/source/common/aarch64/mc-a.S b/source/common/aarch64/mc-a.S
> new file mode 100644
> index 000000000..cbaf9b501
> --- /dev/null
> +++ b/source/common/aarch64/mc-a.S
> @@ -0,0 +1,63 @@
>
> +/*****************************************************************************
> + * Copyright (C) 2020 MulticoreWare, Inc
> + *
> + * Authors: Hongbin Liu <liuhongbin1 at huawei.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111,
> USA.
> + *
> + * This program is also available under a commercial proprietary license.
> + * For more information, contact us at license @ x265.com.
> +
> *****************************************************************************/
> +
> +#include "asm.S"
> +
> +.section .rodata
> +
> +.align 4
> +
> +.text
> +
> +.macro pixel_avg_pp_4xN_neon h
> +function x265_pixel_avg_pp_4x\h\()_neon
> +.rept \h
> + ld1 {v0.s}[0], [x2], x3
> + ld1 {v1.s}[0], [x4], x5
> + urhadd v2.8b, v0.8b, v1.8b
> + st1 {v2.s}[0], [x0], x1
> +.endr
> + ret
> +endfunc
> +.endm
> +
> +pixel_avg_pp_4xN_neon 4
> +pixel_avg_pp_4xN_neon 8
> +pixel_avg_pp_4xN_neon 16
> +
> +.macro pixel_avg_pp_8xN_neon h
> +function x265_pixel_avg_pp_8x\h\()_neon
> +.rept \h
> + ld1 {v0.8b}, [x2], x3
> + ld1 {v1.8b}, [x4], x5
> + urhadd v2.8b, v0.8b, v1.8b
> + st1 {v2.8b}, [x0], x1
> +.endr
> + ret
> +endfunc
> +.endm
> +
> +pixel_avg_pp_8xN_neon 4
> +pixel_avg_pp_8xN_neon 8
> +pixel_avg_pp_8xN_neon 16
> +pixel_avg_pp_8xN_neon 32
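For reference, a hedged C sketch of the scalar operation these urhadd-based
kernels implement: a rounding average of two prediction blocks. The helper name
and the explicit width/height parameters are illustrative, and the trailing int
weight argument carried by the real primitive signature is ignored here:

    #include <stdint.h>

    static void pixel_avg_pp_ref(uint8_t* dst, intptr_t dstride,
                                 const uint8_t* src0, intptr_t sstride0,
                                 const uint8_t* src1, intptr_t sstride1,
                                 int w, int h)
    {
        for (int y = 0; y < h; y++)
        {
            for (int x = 0; x < w; x++)
                dst[x] = (uint8_t)((src0[x] + src1[x] + 1) >> 1);  // same rounding as urhadd
            dst += dstride;
            src0 += sstride0;
            src1 += sstride1;
        }
    }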
> diff --git a/source/common/aarch64/pixel-util.S b/source/common/aarch64/pixel-util.S
> new file mode 100644
> index 000000000..a085ebdfa
> --- /dev/null
> +++ b/source/common/aarch64/pixel-util.S
> @@ -0,0 +1,419 @@
>
> +/*****************************************************************************
> + * Copyright (C) 2020 MulticoreWare, Inc
> + *
> + * Authors: Yimeng Su <yimeng.su at huawei.com>
> + * Hongbin Liu <liuhongbin1 at huawei.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111,
> USA.
> + *
> + * This program is also available under a commercial proprietary license.
> + * For more information, contact us at license @ x265.com.
> +
> *****************************************************************************/
> +
> +#include "asm.S"
> +
> +.section .rodata
> +
> +.align 4
> +
> +.text
> +
> +.macro x265_satd_4x8_8x4_end_neon
> + add v0.8h, v4.8h, v6.8h
> + add v1.8h, v5.8h, v7.8h
> + sub v2.8h, v4.8h, v6.8h
> + sub v3.8h, v5.8h, v7.8h
> +
> + trn1 v16.8h, v0.8h, v1.8h
> + trn2 v17.8h, v0.8h, v1.8h
> + add v4.8h, v16.8h, v17.8h
> + trn1 v18.8h, v2.8h, v3.8h
> + trn2 v19.8h, v2.8h, v3.8h
> + sub v5.8h, v16.8h, v17.8h
> + add v6.8h, v18.8h, v19.8h
> + sub v7.8h, v18.8h, v19.8h
> + trn1 v0.4s, v4.4s, v6.4s
> + trn2 v2.4s, v4.4s, v6.4s
> + abs v0.8h, v0.8h
> + trn1 v1.4s, v5.4s, v7.4s
> + trn2 v3.4s, v5.4s, v7.4s
> + abs v2.8h, v2.8h
> + abs v1.8h, v1.8h
> + abs v3.8h, v3.8h
> + umax v0.8h, v0.8h, v2.8h
> + umax v1.8h, v1.8h, v3.8h
> + add v0.8h, v0.8h, v1.8h
> + uaddlv s0, v0.8h
> +.endm
> +
> +.macro pixel_satd_4x8_neon
> + ld1r {v1.2s}, [x2], x3
> + ld1r {v0.2s}, [x0], x1
> + ld1r {v3.2s}, [x2], x3
> + ld1r {v2.2s}, [x0], x1
> + ld1r {v5.2s}, [x2], x3
> + ld1r {v4.2s}, [x0], x1
> + ld1r {v7.2s}, [x2], x3
> + ld1r {v6.2s}, [x0], x1
> +
> + ld1 {v1.s}[1], [x2], x3
> + ld1 {v0.s}[1], [x0], x1
> + usubl v0.8h, v0.8b, v1.8b
> + ld1 {v3.s}[1], [x2], x3
> + ld1 {v2.s}[1], [x0], x1
> + usubl v1.8h, v2.8b, v3.8b
> + ld1 {v5.s}[1], [x2], x3
> + ld1 {v4.s}[1], [x0], x1
> + usubl v2.8h, v4.8b, v5.8b
> + ld1 {v7.s}[1], [x2], x3
> + add v4.8h, v0.8h, v1.8h
> + sub v5.8h, v0.8h, v1.8h
> + ld1 {v6.s}[1], [x0], x1
> + usubl v3.8h, v6.8b, v7.8b
> + add v6.8h, v2.8h, v3.8h
> + sub v7.8h, v2.8h, v3.8h
> + x265_satd_4x8_8x4_end_neon
> +.endm
> +
> +// template<int w, int h>
> +// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
> +function x265_pixel_satd_4x8_neon
> + pixel_satd_4x8_neon
> + mov w0, v0.s[0]
> + ret
> +endfunc
> +
> +// template<int w, int h>
> +// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
> +function x265_pixel_satd_4x16_neon
> + eor w4, w4, w4
> + pixel_satd_4x8_neon
> + mov w5, v0.s[0]
> + add w4, w4, w5
> + pixel_satd_4x8_neon
> + mov w5, v0.s[0]
> + add w0, w5, w4
> + ret
> +endfunc
> +
> +// template<int w, int h>
> +// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
> +function x265_pixel_satd_4x32_neon
> + eor w4, w4, w4
> +.rept 4
> + pixel_satd_4x8_neon
> + mov w5, v0.s[0]
> + add w4, w4, w5
> +.endr
> + mov w0, w4
> + ret
> +endfunc
> +
> +// template<int w, int h>
> +// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
> +function x265_pixel_satd_12x16_neon
> + mov x4, x0
> + mov x5, x2
> + eor w7, w7, w7
> + pixel_satd_4x8_neon
> + mov w6, v0.s[0]
> + add w7, w7, w6
> + pixel_satd_4x8_neon
> + mov w6, v0.s[0]
> + add w7, w7, w6
> +
> + add x0, x4, #4
> + add x2, x5, #4
> + pixel_satd_4x8_neon
> + mov w6, v0.s[0]
> + add w7, w7, w6
> + pixel_satd_4x8_neon
> + mov w6, v0.s[0]
> + add w7, w7, w6
> +
> + add x0, x4, #8
> + add x2, x5, #8
> + pixel_satd_4x8_neon
> + mov w6, v0.s[0]
> + add w7, w7, w6
> + pixel_satd_4x8_neon
> + mov w6, v0.s[0]
> + add w0, w7, w6
> + ret
> +endfunc
> +
> +// template<int w, int h>
> +// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
> +function x265_pixel_satd_12x32_neon
> + mov x4, x0
> + mov x5, x2
> + eor w7, w7, w7
> +.rept 4
> + pixel_satd_4x8_neon
> + mov w6, v0.s[0]
> + add w7, w7, w6
> +.endr
> +
> + add x0, x4, #4
> + add x2, x5, #4
> +.rept 4
> + pixel_satd_4x8_neon
> + mov w6, v0.s[0]
> + add w7, w7, w6
> +.endr
> +
> + add x0, x4, #8
> + add x2, x5, #8
> +.rept 4
> + pixel_satd_4x8_neon
> + mov w6, v0.s[0]
> + add w7, w7, w6
> +.endr
> +
> + mov w0, w7
> + ret
> +endfunc
> +
> +// template<int w, int h>
> +// int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
> +function x265_pixel_satd_8x8_neon
> + eor w4, w4, w4
> + mov x6, x0
> + mov x7, x2
> + pixel_satd_4x8_neon
> + mov w5, v0.s[0]
> + add w4, w4, w5
> + add x0, x6, #4
> + add x2, x7, #4
> + pixel_satd_4x8_neon
> + mov w5, v0.s[0]
> + add w0, w4, w5
> + ret
> +endfunc
> +
> +// int psyCost_pp(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride)
> +function x265_psyCost_4x4_neon
> + ld1r {v4.2s}, [x0], x1
> + ld1r {v5.2s}, [x0], x1
> + ld1 {v4.s}[1], [x0], x1
> + ld1 {v5.s}[1], [x0], x1
> +
> + ld1r {v6.2s}, [x2], x3
> + ld1r {v7.2s}, [x2], x3
> + ld1 {v6.s}[1], [x2], x3
> + ld1 {v7.s}[1], [x2], x3
> +
> + uaddl v2.8h, v4.8b, v5.8b
> + usubl v3.8h, v4.8b, v5.8b
> + uaddl v18.8h, v6.8b, v7.8b
> + usubl v19.8h, v6.8b, v7.8b
> +
> + mov v20.d[0], v2.d[1]
> + add v0.4h, v2.4h, v20.4h
> + sub v1.4h, v2.4h, v20.4h
> + mov v21.d[0], v3.d[1]
> + add v22.4h, v3.4h, v21.4h
> + sub v23.4h, v3.4h, v21.4h
> +
> + mov v24.d[0], v18.d[1]
> + add v16.4h, v18.4h, v24.4h
> + sub v17.4h, v18.4h, v24.4h
> + mov v25.d[0], v19.d[1]
> + add v26.4h, v19.4h, v25.4h
> + sub v27.4h, v19.4h, v25.4h
> +
> + mov v0.d[1], v22.d[0]
> + mov v1.d[1], v23.d[0]
> + trn1 v22.8h, v0.8h, v1.8h
> + trn2 v23.8h, v0.8h, v1.8h
> + mov v16.d[1], v26.d[0]
> + mov v17.d[1], v27.d[0]
> + trn1 v26.8h, v16.8h, v17.8h
> + trn2 v27.8h, v16.8h, v17.8h
> +
> + add v2.8h, v22.8h, v23.8h
> + sub v3.8h, v22.8h, v23.8h
> + add v18.8h, v26.8h, v27.8h
> + sub v19.8h, v26.8h, v27.8h
> +
> + uaddl v20.8h, v4.8b, v5.8b
> + uaddl v21.8h, v6.8b, v7.8b
> +
> + trn1 v0.4s, v2.4s, v3.4s
> + trn2 v1.4s, v2.4s, v3.4s
> + trn1 v16.4s, v18.4s, v19.4s
> + trn2 v17.4s, v18.4s, v19.4s
> + abs v0.8h, v0.8h
> + abs v16.8h, v16.8h
> + abs v1.8h, v1.8h
> + abs v17.8h, v17.8h
> +
> + uaddlv s20, v20.8h
> + uaddlv s21, v21.8h
> + mov v20.s[1], v21.s[0]
> +
> + smax v0.8h, v0.8h, v1.8h
> + smax v16.8h, v16.8h, v17.8h
> +
> + trn1 v4.2d, v0.2d, v16.2d
> + trn2 v5.2d, v0.2d, v16.2d
> + add v0.8h, v4.8h, v5.8h
> + mov v4.d[0], v0.d[1]
> + uaddlv s0, v0.4h
> + uaddlv s4, v4.4h
> +
> + ushr v20.2s, v20.2s, #2
> + mov v0.s[1], v4.s[0]
> + sub v0.2s, v0.2s, v20.2s
> + mov w0, v0.s[0]
> + mov w1, v0.s[1]
> + subs w0, w0, w1
> + cneg w0, w0, mi
> +
> + ret
> +endfunc
> +
> +// uint32_t quant_c(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)
> +function x265_quant_neon
> + mov w9, #1
> + lsl w9, w9, w4
> + dup v0.2s, w9
> + neg w9, w4
> + dup v1.4s, w9
> + add w9, w9, #8
> + dup v2.4s, w9
> + dup v3.4s, w5
> +
> + lsr w6, w6, #2
> + eor v4.16b, v4.16b, v4.16b
> + eor w10, w10, w10
> + eor v17.16b, v17.16b, v17.16b
> +
> +.loop_quant:
> +
> + ld1 {v18.4h}, [x0], #8
> + ld1 {v7.4s}, [x1], #16
> + sxtl v6.4s, v18.4h
> +
> + cmlt v5.4s, v6.4s, #0
> +
> + abs v6.4s, v6.4s
> +
> +
> + mul v6.4s, v6.4s, v7.4s
> +
> + add v7.4s, v6.4s, v3.4s
> + sshl v7.4s, v7.4s, v1.4s
> +
> + mls v6.4s, v7.4s, v0.s[0]
> + sshl v16.4s, v6.4s, v2.4s
> + st1 {v16.4s}, [x2], #16
> +
> + // numsig
> + cmeq v16.4s, v7.4s, v17.4s
> + add v4.4s, v4.4s, v16.4s
> + add w10, w10, #4
> +
> + // level *= sign
> + eor v16.16b, v7.16b, v5.16b
> + sub v16.4s, v16.4s, v5.4s
> + sqxtn v5.4h, v16.4s
> + st1 {v5.4h}, [x3], #8
> +
> + subs w6, w6, #1
> + b.ne .loop_quant
> +
> + addv s4, v4.4s
> + mov w9, v4.s[0]
> + add w0, w10, w9
> + ret
> +endfunc
> +
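A hedged C sketch of the scalar quant that x265_quant_neon above maps to,
reconstructed from memory of x265's quant_c reference; treat the clamping and
helper naming as illustrative:

    #include <stdint.h>
    #include <stdlib.h>

    static uint32_t quant_ref(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU,
                              int16_t* qCoef, int qBits, int add, int numCoeff)
    {
        uint32_t numSig = 0;
        int qBits8 = qBits - 8;
        for (int i = 0; i < numCoeff; i++)
        {
            int level = coef[i];
            int sign = (level < 0) ? -1 : 1;
            int tmplevel = abs(level) * quantCoeff[i];
            level = (tmplevel + add) >> qBits;
            deltaU[i] = (tmplevel - (level << qBits)) >> qBits8;  // matches the sshl by (8 - qBits)
            if (level)
                numSig++;
            level *= sign;                                        // eor/sub sign restore above
            if (level > 32767) level = 32767;                     // sqxtn saturation
            if (level < -32768) level = -32768;
            qCoef[i] = (int16_t)level;
        }
        return numSig;
    }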
> +.macro satd_4x4_neon
> + ld1 {v1.s}[0], [x2], x3
> + ld1 {v0.s}[0], [x0], x1
> + ld1 {v3.s}[0], [x2], x3
> + ld1 {v2.s}[0], [x0], x1
> +
> + ld1 {v1.s}[1], [x2], x3
> + ld1 {v0.s}[1], [x0], x1
> + ld1 {v3.s}[1], [x2], x3
> + ld1 {v2.s}[1], [x0], x1
> +
> + usubl v4.8h, v0.8b, v1.8b
> + usubl v5.8h, v2.8b, v3.8b
> +
> + add v6.8h, v4.8h, v5.8h
> + sub v7.8h, v4.8h, v5.8h
> +
> + mov v4.d[0], v6.d[1]
> + add v0.8h, v6.8h, v4.8h
> + sub v2.8h, v6.8h, v4.8h
> +
> + mov v5.d[0], v7.d[1]
> + add v1.8h, v7.8h, v5.8h
> + sub v3.8h, v7.8h, v5.8h
> +
> + trn1 v4.4h, v0.4h, v1.4h
> + trn2 v5.4h, v0.4h, v1.4h
> +
> + trn1 v6.4h, v2.4h, v3.4h
> + trn2 v7.4h, v2.4h, v3.4h
> +
> + add v0.4h, v4.4h, v5.4h
> + sub v1.4h, v4.4h, v5.4h
> +
> + add v2.4h, v6.4h, v7.4h
> + sub v3.4h, v6.4h, v7.4h
> +
> + trn1 v4.2s, v0.2s, v1.2s
> + trn2 v5.2s, v0.2s, v1.2s
> +
> + trn1 v6.2s, v2.2s, v3.2s
> + trn2 v7.2s, v2.2s, v3.2s
> +
> + abs v4.4h, v4.4h
> + abs v5.4h, v5.4h
> + abs v6.4h, v6.4h
> + abs v7.4h, v7.4h
> +
> + smax v1.4h, v4.4h, v5.4h
> + smax v2.4h, v6.4h, v7.4h
> +
> + add v0.4h, v1.4h, v2.4h
> + uaddlp v0.2s, v0.4h
> + uaddlp v0.1d, v0.2s
> +.endm
> +
> +// int satd_4x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
> +function x265_pixel_satd_4x4_neon
> + satd_4x4_neon
> + umov x0, v0.d[0]
> + ret
> +endfunc
> +
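For orientation, a hedged scalar sketch of the SATD definition these kernels
target (sum of absolute values of the 4x4 Hadamard-transformed differences,
halved, per the x264/x265 convention); the NEON code above appears to fold the
final halving into the pairwise max step instead. The helper name is
illustrative:

    #include <stdint.h>
    #include <stdlib.h>

    static int satd_4x4_ref(const uint8_t* pix1, intptr_t s1, const uint8_t* pix2, intptr_t s2)
    {
        int d[4][4], sum = 0;
        for (int i = 0; i < 4; i++)
            for (int j = 0; j < 4; j++)
                d[i][j] = pix1[i * s1 + j] - pix2[i * s2 + j];
        for (int i = 0; i < 4; i++)            // horizontal Hadamard
        {
            int a0 = d[i][0] + d[i][1], a1 = d[i][0] - d[i][1];
            int a2 = d[i][2] + d[i][3], a3 = d[i][2] - d[i][3];
            d[i][0] = a0 + a2; d[i][2] = a0 - a2;
            d[i][1] = a1 + a3; d[i][3] = a1 - a3;
        }
        for (int j = 0; j < 4; j++)            // vertical Hadamard + accumulate
        {
            int a0 = d[0][j] + d[1][j], a1 = d[0][j] - d[1][j];
            int a2 = d[2][j] + d[3][j], a3 = d[2][j] - d[3][j];
            sum += abs(a0 + a2) + abs(a0 - a2) + abs(a1 + a3) + abs(a1 - a3);
        }
        return sum >> 1;
    }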
> +// int satd_8x4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
> +function x265_pixel_satd_8x4_neon
> + mov x4, x0
> + mov x5, x2
> + satd_4x4_neon
> + add x0, x4, #4
> + add x2, x5, #4
> + umov x6, v0.d[0]
> + satd_4x4_neon
> + umov x0, v0.d[0]
> + add x0, x0, x6
> + ret
> +endfunc
> diff --git a/source/common/aarch64/pixel-util.h b/source/common/aarch64/pixel-util.h
> new file mode 100644
> index 000000000..043488468
> --- /dev/null
> +++ b/source/common/aarch64/pixel-util.h
> @@ -0,0 +1,40 @@
>
> +/*****************************************************************************
> + * Copyright (C) 2020 MulticoreWare, Inc
> + *
> + * Authors: Yimeng Su <yimeng.su at huawei.com>
> + * Hongbin Liu <liuhongbin1 at huawei.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111,
> USA.
> + *
> + * This program is also available under a commercial proprietary license.
> + * For more information, contact us at license @ x265.com.
> +
> *****************************************************************************/
> +
> +#ifndef X265_PIXEL_UTIL_AARCH64_H
> +#define X265_PIXEL_UTIL_AARCH64_H
> +
> +int x265_pixel_satd_4x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
> +int x265_pixel_satd_4x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
> +int x265_pixel_satd_4x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
> +int x265_pixel_satd_4x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
> +int x265_pixel_satd_8x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
> +int x265_pixel_satd_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
> +int x265_pixel_satd_12x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
> +int x265_pixel_satd_12x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
> +
> +uint32_t x265_quant_neon(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
> +int PFX(psyCost_4x4_neon)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
> +
> +#endif // ifndef X265_PIXEL_UTIL_AARCH64_H
> diff --git a/source/common/aarch64/pixel.h b/source/common/aarch64/pixel.h
> new file mode 100644
> index 000000000..179c2f4ec
> --- /dev/null
> +++ b/source/common/aarch64/pixel.h
> @@ -0,0 +1,105 @@
>
> +/*****************************************************************************
> + * Copyright (C) 2020 MulticoreWare, Inc
> + *
> + * Authors: Hongbin Liu <liuhongbin1 at huawei.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111,
> USA.
> + *
> + * This program is also available under a commercial proprietary license.
> + * For more information, contact us at license @ x265.com.
> +
> *****************************************************************************/
> +
> +#ifndef X265_I386_PIXEL_AARCH64_H
> +#define X265_I386_PIXEL_AARCH64_H
> +
> +void x265_pixel_avg_pp_4x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_4x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_4x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_8x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_8x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_8x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_8x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_12x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_16x4_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_16x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_16x12_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_16x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_16x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_16x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_24x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_32x8_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_32x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_32x24_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_32x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_32x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_48x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_64x16_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_64x32_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_64x48_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +void x265_pixel_avg_pp_64x64_neon (pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int);
> +
> +void x265_sad_x3_4x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_4x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_4x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_8x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_8x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_8x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_8x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_12x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_16x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_16x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_16x12_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_16x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_16x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_16x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_24x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_32x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_32x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_32x24_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_32x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_32x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_48x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_64x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_64x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_64x48_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +void x265_sad_x3_64x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, intptr_t frefstride, int32_t* res);
> +
> +void x265_sad_x4_4x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
> +void x265_sad_x4_4x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
> +void x265_sad_x4_4x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
> +void x265_sad_x4_8x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
> +void x265_sad_x4_8x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
> +void x265_sad_x4_8x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
> +void x265_sad_x4_8x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
> +void x265_sad_x4_12x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
> +void x265_sad_x4_16x4_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
> +void x265_sad_x4_16x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
> +void x265_sad_x4_16x12_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
> +void x265_sad_x4_16x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
> +void x265_sad_x4_16x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
> +void x265_sad_x4_16x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
> +void x265_sad_x4_24x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
> +void x265_sad_x4_32x8_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
> +void x265_sad_x4_32x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
> +void x265_sad_x4_32x24_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
> +void x265_sad_x4_32x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
> +void x265_sad_x4_32x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
> +void x265_sad_x4_48x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
> +void x265_sad_x4_64x16_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
> +void x265_sad_x4_64x32_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
> +void x265_sad_x4_64x48_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
> +void x265_sad_x4_64x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
> +
> +#endif // ifndef X265_I386_PIXEL_AARCH64_H
> diff --git a/source/common/aarch64/sad-a.S b/source/common/aarch64/sad-a.S
> new file mode 100644
> index 000000000..c27cce5ce
> --- /dev/null
> +++ b/source/common/aarch64/sad-a.S
> @@ -0,0 +1,105 @@
>
> +/*****************************************************************************
> + * Copyright (C) 2020 MulticoreWare, Inc
> + *
> + * Authors: Hongbin Liu <liuhongbin1 at huawei.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111,
> USA.
> + *
> + * This program is also available under a commercial proprietary license.
> + * For more information, contact us at license @ x265.com.
> +
> *****************************************************************************/
> +
> +#include "asm.S"
> +
> +.section .rodata
> +
> +.align 4
> +
> +.text
> +
> +.macro SAD_X_START_8 x
> + ld1 {v0.8b}, [x0], x9
> +.if \x == 3
> + ld1 {v1.8b}, [x1], x4
> + ld1 {v2.8b}, [x2], x4
> + ld1 {v3.8b}, [x3], x4
> +.elseif \x == 4
> + ld1 {v1.8b}, [x1], x5
> + ld1 {v2.8b}, [x2], x5
> + ld1 {v3.8b}, [x3], x5
> + ld1 {v4.8b}, [x4], x5
> +.endif
> + uabdl v16.8h, v0.8b, v1.8b
> + uabdl v17.8h, v0.8b, v2.8b
> + uabdl v18.8h, v0.8b, v3.8b
> +.if \x == 4
> + uabdl v19.8h, v0.8b, v4.8b
> +.endif
> +.endm
> +
> +.macro SAD_X_8 x
> + ld1 {v0.8b}, [x0], x9
> +.if \x == 3
> + ld1 {v1.8b}, [x1], x4
> + ld1 {v2.8b}, [x2], x4
> + ld1 {v3.8b}, [x3], x4
> +.elseif \x == 4
> + ld1 {v1.8b}, [x1], x5
> + ld1 {v2.8b}, [x2], x5
> + ld1 {v3.8b}, [x3], x5
> + ld1 {v4.8b}, [x4], x5
> +.endif
> + uabal v16.8h, v0.8b, v1.8b
> + uabal v17.8h, v0.8b, v2.8b
> + uabal v18.8h, v0.8b, v3.8b
> +.if \x == 4
> + uabal v19.8h, v0.8b, v4.8b
> +.endif
> +.endm
> +
> +.macro SAD_X_8xN x, h
> +function x265_sad_x\x\()_8x\h\()_neon
> + mov x9, #FENC_STRIDE
> + SAD_X_START_8 \x
> +.rept \h - 1
> + SAD_X_8 \x
> +.endr
> + uaddlv s0, v16.8h
> + uaddlv s1, v17.8h
> + uaddlv s2, v18.8h
> +.if \x == 4
> + uaddlv s3, v19.8h
> +.endif
> +
> +.if \x == 3
> + stp s0, s1, [x5]
> + str s2, [x5, #8]
> +.elseif \x == 4
> + stp s0, s1, [x6]
> + stp s2, s3, [x6, #8]
> +.endif
> + ret
> +endfunc
> +.endm
> +
> +SAD_X_8xN 3 4
> +SAD_X_8xN 3 8
> +SAD_X_8xN 3 16
> +SAD_X_8xN 3 32
> +
> +SAD_X_8xN 4 4
> +SAD_X_8xN 4 8
> +SAD_X_8xN 4 16
> +SAD_X_8xN 4 32
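A hedged C sketch of the sad_x3 variant these macros implement: the encoder
block walks the fixed FENC_STRIDE (64) from asm.S while the three candidate
references share one stride; the x4 form just adds fref3 and res[3]. The helper
name and the explicit height parameter are illustrative:

    #include <stdint.h>
    #include <stdlib.h>

    static void sad_x3_8xh_ref(const uint8_t* fenc, const uint8_t* fref0, const uint8_t* fref1,
                               const uint8_t* fref2, intptr_t frefstride, int32_t* res, int h)
    {
        res[0] = res[1] = res[2] = 0;
        for (int y = 0; y < h; y++)
        {
            for (int x = 0; x < 8; x++)
            {
                res[0] += abs(fenc[x] - fref0[x]);
                res[1] += abs(fenc[x] - fref1[x]);
                res[2] += abs(fenc[x] - fref2[x]);
            }
            fenc += 64;                       // FENC_STRIDE
            fref0 += frefstride;
            fref1 += frefstride;
            fref2 += frefstride;
        }
    }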
> --
> 2.21.0.windows.1
>
--
Regards,
*Aruna Matheswaran,*
Video Codec Engineer,
Media & AI analytics BU,