[x265] [PATCH] Add aarch64 support - Part 2

Gopi Satykrishna Akisetty gopi.satykrishna at multicorewareinc.com
Tue Mar 17 17:08:58 CET 2020


On Tue, Mar 17, 2020 at 2:59 PM Suyimeng <yimeng.su at huawei.com> wrote:

>
>
> *From:* x265-devel [mailto:x265-devel-bounces at videolan.org] *On Behalf Of
> *Gopi Satykrishna Akisetty
> *Sent:* Tuesday, March 17, 2020 4:53 PM
> *To:* Development for x265 <x265-devel at videolan.org>
> *Subject:* Re: [x265] [PATCH] Add aarch64 support - Part 2
>
>
>
>
>
>
>
> On Tue, Mar 17, 2020 at 9:50 AM Suyimeng <yimeng.su at huawei.com> wrote:
>
>
>
> *From:* x265-devel [mailto:x265-devel-bounces at videolan.org] *On Behalf Of
> *Gopi Satykrishna Akisetty
> *Sent:* Monday, March 16, 2020 9:37 PM
> *To:* Development for x265 <x265-devel at videolan.org>
> *Subject:* Re: [x265] [PATCH] Add aarch64 support - Part 2
>
>
>
>
>
>
>
> On Thu, Feb 27, 2020 at 8:04 AM Xiyuan Wang <wangxiyuan1007 at gmail.com>
> wrote:
>
> From: wangxiyuan <wangxiyuan at huawei.com>
>
>
> This patch adds aarch64 build & compile support. It must be merged
> after the Part 1 patch.
> ---
>  build/aarch64-linux/crosscompile.cmake  |  15 ++
>  build/aarch64-linux/make-Makefiles.bash |   4 +
>  source/CMakeLists.txt                   |  38 +++-
>  source/common/CMakeLists.txt            |  35 ++-
>  source/common/arm/asm-primitives.cpp    | 291 ++++++++++++------------
>  source/common/cpu.cpp                   |   4 +
>  source/common/pixel.cpp                 |   9 +
>  source/common/primitives.h              |  11 +
>  source/test/CMakeLists.txt              |  16 +-
>  source/test/testbench.cpp               |  16 ++
>  source/test/testharness.h               |   5 +
>  11 files changed, 274 insertions(+), 170 deletions(-)
>  create mode 100644 build/aarch64-linux/crosscompile.cmake
>  create mode 100644 build/aarch64-linux/make-Makefiles.bash
>
> diff --git a/build/aarch64-linux/crosscompile.cmake
> b/build/aarch64-linux/crosscompile.cmake
> new file mode 100644
> index 000000000..41c8217f2
> --- /dev/null
> +++ b/build/aarch64-linux/crosscompile.cmake
> @@ -0,0 +1,15 @@
> +# CMake toolchain file for cross compiling x265 for aarch64
> +# This feature is only supported as experimental. Use with caution.
> +# Please report bugs on bitbucket
> +# Run cmake with: cmake -DCMAKE_TOOLCHAIN_FILE=crosscompile.cmake -G "Unix Makefiles" ../../source && ccmake ../../source
> +
> +set(CROSS_COMPILE_ARM 1)
> +set(CMAKE_SYSTEM_NAME Linux)
> +set(CMAKE_SYSTEM_PROCESSOR aarch64)
> +
> +# specify the cross compiler
> +set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc)
> +set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++)
> +
> +# specify the target environment
> +SET(CMAKE_FIND_ROOT_PATH  /usr/aarch64-linux-gnu)
> diff --git a/build/aarch64-linux/make-Makefiles.bash
> b/build/aarch64-linux/make-Makefiles.bash
> new file mode 100644
> index 000000000..c9582da0a
> --- /dev/null
> +++ b/build/aarch64-linux/make-Makefiles.bash
> @@ -0,0 +1,4 @@
> +#!/bin/bash
> +# Run this from within a bash shell
> +
> +cmake -DCMAKE_TOOLCHAIN_FILE="crosscompile.cmake" -G "Unix Makefiles" ../../source && ccmake ../../source
> diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
> index 5d2474d97..7734eafbb 100644
> --- a/source/CMakeLists.txt
> +++ b/source/CMakeLists.txt
> @@ -40,7 +40,7 @@ SET(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake" "${CMAKE_MODULE_PATH}")
>  # System architecture detection
>  string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" SYSPROC)
>  set(X86_ALIASES x86 i386 i686 x86_64 amd64)
> -set(ARM_ALIASES armv6l armv7l)
> +set(ARM_ALIASES armv6l armv7l aarch64)
>  list(FIND X86_ALIASES "${SYSPROC}" X86MATCH)
>  list(FIND ARM_ALIASES "${SYSPROC}" ARMMATCH)
>  set(POWER_ALIASES ppc64 ppc64le)
> @@ -70,9 +70,15 @@ elseif(ARMMATCH GREATER "-1")
>      else()
>          set(CROSS_COMPILE_ARM 0)
>      endif()
> -    message(STATUS "Detected ARM target processor")
>      set(ARM 1)
> -    add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1)
> +    if("${CMAKE_SIZEOF_VOID_P}" MATCHES 8)
> +        message(STATUS "Detected ARM64 target processor")
> +        set(ARM64 1)
> +        add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=1 -DHAVE_ARMV6=0)
> +    else()
> +        message(STATUS "Detected ARM target processor")
> +        add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=1)
> +    endif()
>  else()
>      message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown")
>      message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}")
> @@ -231,14 +237,24 @@ if(GCC)
>          endif()
>      endif()
>      if(ARM AND CROSS_COMPILE_ARM)
> -        set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm -fPIC)
> +        if(ARM64)
> +            set(ARM_ARGS -fPIC)
> +        else()
> +            set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm -fPIC)
> +        endif()
> +        message(STATUS "cross compile arm")
>      elseif(ARM)
> -        find_package(Neon)
> -        if(CPU_HAS_NEON)
> -            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
> +        if(ARM64)
> +            set(ARM_ARGS -fPIC)
>              add_definitions(-DHAVE_NEON)
>          else()
> -            set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
> +            find_package(Neon)
> +            if(CPU_HAS_NEON)
> +                set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)
> +                add_definitions(-DHAVE_NEON)
> +            else()
> +                set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)
> +            endif()
>          endif()
>      endif()
>      add_definitions(${ARM_ARGS})
> @@ -518,7 +534,11 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
>      # compile ARM arch asm files here
>          enable_language(ASM)
>          foreach(ASM ${ARM_ASMS})
> -            set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/arm/${ASM})
> +            if(ARM64)
> +                set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM})
> +            else()
> +                set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/arm/${ASM})
> +            endif()
>              list(APPEND ASM_SRCS ${ASM_SRC})
>              list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
>              add_custom_command(
> diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
> index c70bb108c..c021e603e 100644
> --- a/source/common/CMakeLists.txt
> +++ b/source/common/CMakeLists.txt
> @@ -14,7 +14,7 @@ if(EXTRA_LIB)
>  endif(EXTRA_LIB)
>
>  if(ENABLE_ASSEMBLY)
> -    set_source_files_properties(threading.cpp primitives.cpp PROPERTIES COMPILE_FLAGS -DENABLE_ASSEMBLY=1)
> +    set_source_files_properties(threading.cpp primitives.cpp pixel.cpp PROPERTIES COMPILE_FLAGS -DENABLE_ASSEMBLY=1)
>      list(APPEND VFLAGS "-DENABLE_ASSEMBLY=1")
>  endif(ENABLE_ASSEMBLY)
>
> @@ -84,16 +84,33 @@ if(ENABLE_ASSEMBLY AND X86)
>  endif(ENABLE_ASSEMBLY AND X86)
>
>  if(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))
> -    set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h)
> +    if(ARM64)
> +        if(GCC AND (CMAKE_CXX_FLAGS_RELEASE MATCHES "-O3"))
> +            message(STATUS "Detected CXX compiler using -O3 optimization level")
> +            add_definitions(-DAUTO_VECTORIZE=1)
> +        endif()
> +        set(C_SRCS asm-primitives.cpp pixel.h ipfilter8.h)
>
> -    # add ARM assembly/intrinsic files here
> -    set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S ipfilter8.S dct-a.S)
> -    set(VEC_PRIMITIVES)
> +        # add ARM assembly/intrinsic files here
> +        set(A_SRCS asm.S mc-a.S sad-a.S pixel-util.S ipfilter8.S)
> +        set(VEC_PRIMITIVES)
>
> -    set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
> -    foreach(SRC ${C_SRCS})
> -        set(ASM_PRIMITIVES ${ASM_PRIMITIVES} arm/${SRC})
> -    endforeach()
> +        set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
> +        foreach(SRC ${C_SRCS})
> +            set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
> +        endforeach()
> +    else()
> +        set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h)
> +
> +        # add ARM assembly/intrinsic files here
> +        set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S ipfilter8.S dct-a.S)
> +        set(VEC_PRIMITIVES)
> +
> +        set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
> +        foreach(SRC ${C_SRCS})
> +            set(ASM_PRIMITIVES ${ASM_PRIMITIVES} arm/${SRC})
> +        endforeach()
> +    endif()
>      source_group(Assembly FILES ${ASM_PRIMITIVES})
>  endif(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))
>
> diff --git a/source/common/arm/asm-primitives.cpp
> b/source/common/arm/asm-primitives.cpp
> index 422217845..7f11503f9 100644
> --- a/source/common/arm/asm-primitives.cpp
> +++ b/source/common/arm/asm-primitives.cpp
> @@ -5,6 +5,7 @@
>   *          Praveen Kumar Tiwari <praveen at multicorewareinc.com>
>   *          Min Chen <chenm003 at 163.com> <min.chen at multicorewareinc.com>
>   *          Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
> + *          Hongbin Liu<liuhongbin1 at huawei.com>
>   *
>   * This program is free software; you can redistribute it and/or modify
>   * it under the terms of the GNU General Public License as published by
> @@ -48,77 +49,77 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, int
> cpuMask)
>          p.ssim_4x4x2_core = PFX(ssim_4x4x2_core_neon);
>
>          // addAvg
> -         p.pu[LUMA_4x4].addAvg   = PFX(addAvg_4x4_neon);
> -         p.pu[LUMA_4x8].addAvg   = PFX(addAvg_4x8_neon);
> -         p.pu[LUMA_4x16].addAvg  = PFX(addAvg_4x16_neon);
> -         p.pu[LUMA_8x4].addAvg   = PFX(addAvg_8x4_neon);
> -         p.pu[LUMA_8x8].addAvg   = PFX(addAvg_8x8_neon);
> -         p.pu[LUMA_8x16].addAvg  = PFX(addAvg_8x16_neon);
> -         p.pu[LUMA_8x32].addAvg  = PFX(addAvg_8x32_neon);
> -         p.pu[LUMA_12x16].addAvg = PFX(addAvg_12x16_neon);
> -         p.pu[LUMA_16x4].addAvg  = PFX(addAvg_16x4_neon);
> -         p.pu[LUMA_16x8].addAvg  = PFX(addAvg_16x8_neon);
> -         p.pu[LUMA_16x12].addAvg = PFX(addAvg_16x12_neon);
> -         p.pu[LUMA_16x16].addAvg = PFX(addAvg_16x16_neon);
> -         p.pu[LUMA_16x32].addAvg = PFX(addAvg_16x32_neon);
> -         p.pu[LUMA_16x64].addAvg = PFX(addAvg_16x64_neon);
> -         p.pu[LUMA_24x32].addAvg = PFX(addAvg_24x32_neon);
> -         p.pu[LUMA_32x8].addAvg  = PFX(addAvg_32x8_neon);
> -         p.pu[LUMA_32x16].addAvg = PFX(addAvg_32x16_neon);
> -         p.pu[LUMA_32x24].addAvg = PFX(addAvg_32x24_neon);
> -         p.pu[LUMA_32x32].addAvg = PFX(addAvg_32x32_neon);
> -         p.pu[LUMA_32x64].addAvg = PFX(addAvg_32x64_neon);
> -         p.pu[LUMA_48x64].addAvg = PFX(addAvg_48x64_neon);
> -         p.pu[LUMA_64x16].addAvg = PFX(addAvg_64x16_neon);
> -         p.pu[LUMA_64x32].addAvg = PFX(addAvg_64x32_neon);
> -         p.pu[LUMA_64x48].addAvg = PFX(addAvg_64x48_neon);
> -         p.pu[LUMA_64x64].addAvg = PFX(addAvg_64x64_neon);
> +         p.pu[LUMA_4x4].addAvg[NONALIGNED]   = PFX(addAvg_4x4_neon);
> +         p.pu[LUMA_4x8].addAvg[NONALIGNED]   = PFX(addAvg_4x8_neon);
> +         p.pu[LUMA_4x16].addAvg[NONALIGNED]  = PFX(addAvg_4x16_neon);
> +         p.pu[LUMA_8x4].addAvg[NONALIGNED]   = PFX(addAvg_8x4_neon);
> +         p.pu[LUMA_8x8].addAvg[NONALIGNED]   = PFX(addAvg_8x8_neon);
> +         p.pu[LUMA_8x16].addAvg[NONALIGNED]  = PFX(addAvg_8x16_neon);
> +         p.pu[LUMA_8x32].addAvg[NONALIGNED]  = PFX(addAvg_8x32_neon);
> +         p.pu[LUMA_12x16].addAvg[NONALIGNED] = PFX(addAvg_12x16_neon);
> +         p.pu[LUMA_16x4].addAvg[NONALIGNED]  = PFX(addAvg_16x4_neon);
> +         p.pu[LUMA_16x8].addAvg[NONALIGNED]  = PFX(addAvg_16x8_neon);
> +         p.pu[LUMA_16x12].addAvg[NONALIGNED] = PFX(addAvg_16x12_neon);
> +         p.pu[LUMA_16x16].addAvg[NONALIGNED] = PFX(addAvg_16x16_neon);
> +         p.pu[LUMA_16x32].addAvg[NONALIGNED] = PFX(addAvg_16x32_neon);
> +         p.pu[LUMA_16x64].addAvg[NONALIGNED] = PFX(addAvg_16x64_neon);
> +         p.pu[LUMA_24x32].addAvg[NONALIGNED] = PFX(addAvg_24x32_neon);
> +         p.pu[LUMA_32x8].addAvg[NONALIGNED]  = PFX(addAvg_32x8_neon);
> +         p.pu[LUMA_32x16].addAvg[NONALIGNED] = PFX(addAvg_32x16_neon);
> +         p.pu[LUMA_32x24].addAvg[NONALIGNED] = PFX(addAvg_32x24_neon);
> +         p.pu[LUMA_32x32].addAvg[NONALIGNED] = PFX(addAvg_32x32_neon);
> +         p.pu[LUMA_32x64].addAvg[NONALIGNED] = PFX(addAvg_32x64_neon);
> +         p.pu[LUMA_48x64].addAvg[NONALIGNED] = PFX(addAvg_48x64_neon);
> +         p.pu[LUMA_64x16].addAvg[NONALIGNED] = PFX(addAvg_64x16_neon);
> +         p.pu[LUMA_64x32].addAvg[NONALIGNED] = PFX(addAvg_64x32_neon);
> +         p.pu[LUMA_64x48].addAvg[NONALIGNED] = PFX(addAvg_64x48_neon);
> +         p.pu[LUMA_64x64].addAvg[NONALIGNED] = PFX(addAvg_64x64_neon);
>
>          // chroma addAvg
> -        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].addAvg   =
> PFX(addAvg_4x2_neon);
> -        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].addAvg   =
> PFX(addAvg_4x4_neon);
> -        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].addAvg   =
> PFX(addAvg_4x8_neon);
> -        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].addAvg  =
> PFX(addAvg_4x16_neon);
> -        p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].addAvg   =
> PFX(addAvg_6x8_neon);
> -        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].addAvg   =
> PFX(addAvg_8x2_neon);
> -        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].addAvg   =
> PFX(addAvg_8x4_neon);
> -        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].addAvg   =
> PFX(addAvg_8x6_neon);
> -        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].addAvg   =
> PFX(addAvg_8x8_neon);
> -        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].addAvg  =
> PFX(addAvg_8x16_neon);
> -        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].addAvg  =
> PFX(addAvg_8x32_neon);
> -        p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].addAvg =
> PFX(addAvg_12x16_neon);
> -        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].addAvg  =
> PFX(addAvg_16x4_neon);
> -        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].addAvg  =
> PFX(addAvg_16x8_neon);
> -        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].addAvg =
> PFX(addAvg_16x12_neon);
> -        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].addAvg =
> PFX(addAvg_16x16_neon);
> -        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].addAvg =
> PFX(addAvg_16x32_neon);
> -        p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].addAvg =
> PFX(addAvg_24x32_neon);
> -        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg  =
> PFX(addAvg_32x8_neon);
> -        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg =
> PFX(addAvg_32x16_neon);
> -        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg =
> PFX(addAvg_32x24_neon);
> -        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg =
> PFX(addAvg_32x32_neon);
> -
> -        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].addAvg   =
> PFX(addAvg_4x8_neon);
> -        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].addAvg  =
> PFX(addAvg_4x16_neon);
> -        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].addAvg  =
> PFX(addAvg_4x32_neon);
> -        p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].addAvg  =
> PFX(addAvg_6x16_neon);
> -        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].addAvg   =
> PFX(addAvg_8x4_neon);
> -        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].addAvg   =
> PFX(addAvg_8x8_neon);
> -        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].addAvg  =
> PFX(addAvg_8x12_neon);
> -        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].addAvg  =
> PFX(addAvg_8x16_neon);
> -        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].addAvg  =
> PFX(addAvg_8x32_neon);
> -        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].addAvg  =
> PFX(addAvg_8x64_neon);
> -        p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].addAvg =
> PFX(addAvg_12x32_neon);
> -        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].addAvg  =
> PFX(addAvg_16x8_neon);
> -        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].addAvg =
> PFX(addAvg_16x16_neon);
> -        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].addAvg =
> PFX(addAvg_16x24_neon);
> -        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].addAvg =
> PFX(addAvg_16x32_neon);
> -        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].addAvg =
> PFX(addAvg_16x64_neon);
> -        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].addAvg =
> PFX(addAvg_24x64_neon);
> -        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg =
> PFX(addAvg_32x16_neon);
> -        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg =
> PFX(addAvg_32x32_neon);
> -        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg =
> PFX(addAvg_32x48_neon);
> -        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg =
> PFX(addAvg_32x64_neon);
> +        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].addAvg[NONALIGNED]   =
> PFX(addAvg_4x2_neon);
> +        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].addAvg[NONALIGNED]   =
> PFX(addAvg_4x4_neon);
> +        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].addAvg[NONALIGNED]   =
> PFX(addAvg_4x8_neon);
> +        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].addAvg[NONALIGNED]  =
> PFX(addAvg_4x16_neon);
> +        p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].addAvg[NONALIGNED]   =
> PFX(addAvg_6x8_neon);
> +        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].addAvg[NONALIGNED]   =
> PFX(addAvg_8x2_neon);
> +        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].addAvg[NONALIGNED]   =
> PFX(addAvg_8x4_neon);
> +        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].addAvg[NONALIGNED]   =
> PFX(addAvg_8x6_neon);
> +        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].addAvg[NONALIGNED]   =
> PFX(addAvg_8x8_neon);
> +        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].addAvg[NONALIGNED]  =
> PFX(addAvg_8x16_neon);
> +        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].addAvg[NONALIGNED]  =
> PFX(addAvg_8x32_neon);
> +        p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].addAvg[NONALIGNED] =
> PFX(addAvg_12x16_neon);
> +        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].addAvg[NONALIGNED]  =
> PFX(addAvg_16x4_neon);
> +        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].addAvg[NONALIGNED]  =
> PFX(addAvg_16x8_neon);
> +        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].addAvg[NONALIGNED] =
> PFX(addAvg_16x12_neon);
> +        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].addAvg[NONALIGNED] =
> PFX(addAvg_16x16_neon);
> +        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].addAvg[NONALIGNED] =
> PFX(addAvg_16x32_neon);
> +        p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].addAvg[NONALIGNED] =
> PFX(addAvg_24x32_neon);
> +        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg[NONALIGNED]  =
> PFX(addAvg_32x8_neon);
> +        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg[NONALIGNED] =
> PFX(addAvg_32x16_neon);
> +        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg[NONALIGNED] =
> PFX(addAvg_32x24_neon);
> +        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg[NONALIGNED] =
> PFX(addAvg_32x32_neon);
> +
> +        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].addAvg[NONALIGNED]   =
> PFX(addAvg_4x8_neon);
> +        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].addAvg[NONALIGNED]  =
> PFX(addAvg_4x16_neon);
> +        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].addAvg[NONALIGNED]  =
> PFX(addAvg_4x32_neon);
> +        p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].addAvg[NONALIGNED]  =
> PFX(addAvg_6x16_neon);
> +        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].addAvg[NONALIGNED]   =
> PFX(addAvg_8x4_neon);
> +        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].addAvg[NONALIGNED]   =
> PFX(addAvg_8x8_neon);
> +        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].addAvg[NONALIGNED]  =
> PFX(addAvg_8x12_neon);
> +        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].addAvg[NONALIGNED]  =
> PFX(addAvg_8x16_neon);
> +        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].addAvg[NONALIGNED]  =
> PFX(addAvg_8x32_neon);
> +        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].addAvg[NONALIGNED]  =
> PFX(addAvg_8x64_neon);
> +        p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].addAvg[NONALIGNED] =
> PFX(addAvg_12x32_neon);
> +        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].addAvg[NONALIGNED]  =
> PFX(addAvg_16x8_neon);
> +        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].addAvg[NONALIGNED] =
> PFX(addAvg_16x16_neon);
> +        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].addAvg[NONALIGNED] =
> PFX(addAvg_16x24_neon);
> +        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].addAvg[NONALIGNED] =
> PFX(addAvg_16x32_neon);
> +        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].addAvg[NONALIGNED] =
> PFX(addAvg_16x64_neon);
> +        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].addAvg[NONALIGNED] =
> PFX(addAvg_24x64_neon);
> +        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg[NONALIGNED] =
> PFX(addAvg_32x16_neon);
> +        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg[NONALIGNED] =
> PFX(addAvg_32x32_neon);
> +        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg[NONALIGNED] =
> PFX(addAvg_32x48_neon);
> +        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg[NONALIGNED] =
> PFX(addAvg_32x64_neon);
>
>          // quant
>           p.quant = PFX(quant_neon);
> @@ -402,7 +403,7 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, int
> cpuMask)
>          p.scale2D_64to32  = PFX(scale2D_64to32_neon);
>
>          // scale1D_128to64
> -        p.scale1D_128to64 = PFX(scale1D_128to64_neon);
> +        p.scale1D_128to64[NONALIGNED] = PFX(scale1D_128to64_neon);
>
>          // copy_count
>          p.cu[BLOCK_4x4].copy_cnt     = PFX(copy_cnt_4_neon);
> @@ -411,37 +412,37 @@ void setupAssemblyPrimitives(EncoderPrimitives &p,
> int cpuMask)
>          p.cu[BLOCK_32x32].copy_cnt   = PFX(copy_cnt_32_neon);
>
>          // filterPixelToShort
> -        p.pu[LUMA_4x4].convert_p2s   = PFX(filterPixelToShort_4x4_neon);
> -        p.pu[LUMA_4x8].convert_p2s   = PFX(filterPixelToShort_4x8_neon);
> -        p.pu[LUMA_4x16].convert_p2s  = PFX(filterPixelToShort_4x16_neon);
> -        p.pu[LUMA_8x4].convert_p2s   = PFX(filterPixelToShort_8x4_neon);
> -        p.pu[LUMA_8x8].convert_p2s   = PFX(filterPixelToShort_8x8_neon);
> -        p.pu[LUMA_8x16].convert_p2s  = PFX(filterPixelToShort_8x16_neon);
> -        p.pu[LUMA_8x32].convert_p2s  = PFX(filterPixelToShort_8x32_neon);
> -        p.pu[LUMA_12x16].convert_p2s = PFX(filterPixelToShort_12x16_neon);
> -        p.pu[LUMA_16x4].convert_p2s  = PFX(filterPixelToShort_16x4_neon);
> -        p.pu[LUMA_16x8].convert_p2s  = PFX(filterPixelToShort_16x8_neon);
> -        p.pu[LUMA_16x12].convert_p2s = PFX(filterPixelToShort_16x12_neon);
> -        p.pu[LUMA_16x16].convert_p2s = PFX(filterPixelToShort_16x16_neon);
> -        p.pu[LUMA_16x32].convert_p2s = PFX(filterPixelToShort_16x32_neon);
> -        p.pu[LUMA_16x64].convert_p2s = PFX(filterPixelToShort_16x64_neon);
> -        p.pu[LUMA_24x32].convert_p2s = PFX(filterPixelToShort_24x32_neon);
> -        p.pu[LUMA_32x8].convert_p2s  = PFX(filterPixelToShort_32x8_neon);
> -        p.pu[LUMA_32x16].convert_p2s = PFX(filterPixelToShort_32x16_neon);
> -        p.pu[LUMA_32x24].convert_p2s = PFX(filterPixelToShort_32x24_neon);
> -        p.pu[LUMA_32x32].convert_p2s = PFX(filterPixelToShort_32x32_neon);
> -        p.pu[LUMA_32x64].convert_p2s = PFX(filterPixelToShort_32x64_neon);
> -        p.pu[LUMA_48x64].convert_p2s = PFX(filterPixelToShort_48x64_neon);
> -        p.pu[LUMA_64x16].convert_p2s = PFX(filterPixelToShort_64x16_neon);
> -        p.pu[LUMA_64x32].convert_p2s = PFX(filterPixelToShort_64x32_neon);
> -        p.pu[LUMA_64x48].convert_p2s = PFX(filterPixelToShort_64x48_neon);
> -        p.pu[LUMA_64x64].convert_p2s = PFX(filterPixelToShort_64x64_neon);
> +        p.pu[LUMA_4x4].convert_p2s[NONALIGNED]   =
> PFX(filterPixelToShort_4x4_neon);
> +        p.pu[LUMA_4x8].convert_p2s[NONALIGNED]   =
> PFX(filterPixelToShort_4x8_neon);
> +        p.pu[LUMA_4x16].convert_p2s[NONALIGNED]  =
> PFX(filterPixelToShort_4x16_neon);
> +        p.pu[LUMA_8x4].convert_p2s[NONALIGNED]   =
> PFX(filterPixelToShort_8x4_neon);
> +        p.pu[LUMA_8x8].convert_p2s[NONALIGNED]   =
> PFX(filterPixelToShort_8x8_neon);
> +        p.pu[LUMA_8x16].convert_p2s[NONALIGNED]  =
> PFX(filterPixelToShort_8x16_neon);
> +        p.pu[LUMA_8x32].convert_p2s[NONALIGNED]  =
> PFX(filterPixelToShort_8x32_neon);
> +        p.pu[LUMA_12x16].convert_p2s[NONALIGNED] =
> PFX(filterPixelToShort_12x16_neon);
> +        p.pu[LUMA_16x4].convert_p2s[NONALIGNED]  =
> PFX(filterPixelToShort_16x4_neon);
> +        p.pu[LUMA_16x8].convert_p2s[NONALIGNED]  =
> PFX(filterPixelToShort_16x8_neon);
> +        p.pu[LUMA_16x12].convert_p2s[NONALIGNED] =
> PFX(filterPixelToShort_16x12_neon);
> +        p.pu[LUMA_16x16].convert_p2s[NONALIGNED] =
> PFX(filterPixelToShort_16x16_neon);
> +        p.pu[LUMA_16x32].convert_p2s[NONALIGNED] =
> PFX(filterPixelToShort_16x32_neon);
> +        p.pu[LUMA_16x64].convert_p2s[NONALIGNED] =
> PFX(filterPixelToShort_16x64_neon);
> +        p.pu[LUMA_24x32].convert_p2s[NONALIGNED] =
> PFX(filterPixelToShort_24x32_neon);
> +        p.pu[LUMA_32x8].convert_p2s[NONALIGNED]  =
> PFX(filterPixelToShort_32x8_neon);
> +        p.pu[LUMA_32x16].convert_p2s[NONALIGNED] =
> PFX(filterPixelToShort_32x16_neon);
> +        p.pu[LUMA_32x24].convert_p2s[NONALIGNED] =
> PFX(filterPixelToShort_32x24_neon);
> +        p.pu[LUMA_32x32].convert_p2s[NONALIGNED] =
> PFX(filterPixelToShort_32x32_neon);
> +        p.pu[LUMA_32x64].convert_p2s[NONALIGNED] =
> PFX(filterPixelToShort_32x64_neon);
> +        p.pu[LUMA_48x64].convert_p2s[NONALIGNED] =
> PFX(filterPixelToShort_48x64_neon);
> +        p.pu[LUMA_64x16].convert_p2s[NONALIGNED] =
> PFX(filterPixelToShort_64x16_neon);
> +        p.pu[LUMA_64x32].convert_p2s[NONALIGNED] =
> PFX(filterPixelToShort_64x32_neon);
> +        p.pu[LUMA_64x48].convert_p2s[NONALIGNED] =
> PFX(filterPixelToShort_64x48_neon);
> +        p.pu[LUMA_64x64].convert_p2s[NONALIGNED] =
> PFX(filterPixelToShort_64x64_neon);
>
>          // Block_fill
> -        p.cu[BLOCK_4x4].blockfill_s   = PFX(blockfill_s_4x4_neon);
> -        p.cu[BLOCK_8x8].blockfill_s   = PFX(blockfill_s_8x8_neon);
> -        p.cu[BLOCK_16x16].blockfill_s = PFX(blockfill_s_16x16_neon);
> -        p.cu[BLOCK_32x32].blockfill_s = PFX(blockfill_s_32x32_neon);
> +        p.cu[BLOCK_4x4].blockfill_s[NONALIGNED]   =
> PFX(blockfill_s_4x4_neon);
> +        p.cu[BLOCK_8x8].blockfill_s[NONALIGNED]   =
> PFX(blockfill_s_8x8_neon);
> +        p.cu[BLOCK_16x16].blockfill_s[NONALIGNED] =
> PFX(blockfill_s_16x16_neon);
> +        p.cu[BLOCK_32x32].blockfill_s[NONALIGNED] =
> PFX(blockfill_s_32x32_neon);
>
>          // Blockcopy_ss
>          p.cu[BLOCK_4x4].copy_ss   = PFX(blockcopy_ss_4x4_neon);
> @@ -495,21 +496,21 @@ void setupAssemblyPrimitives(EncoderPrimitives &p,
> int cpuMask)
>          p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_sp =
> PFX(blockcopy_sp_32x64_neon);
>
>          // pixel_add_ps
> -        p.cu[BLOCK_4x4].add_ps   = PFX(pixel_add_ps_4x4_neon);
> -        p.cu[BLOCK_8x8].add_ps   = PFX(pixel_add_ps_8x8_neon);
> -        p.cu[BLOCK_16x16].add_ps = PFX(pixel_add_ps_16x16_neon);
> -        p.cu[BLOCK_32x32].add_ps = PFX(pixel_add_ps_32x32_neon);
> -        p.cu[BLOCK_64x64].add_ps = PFX(pixel_add_ps_64x64_neon);
> +        p.cu[BLOCK_4x4].add_ps[NONALIGNED]   =
> PFX(pixel_add_ps_4x4_neon);
> +        p.cu[BLOCK_8x8].add_ps[NONALIGNED]   =
> PFX(pixel_add_ps_8x8_neon);
> +        p.cu[BLOCK_16x16].add_ps[NONALIGNED] =
> PFX(pixel_add_ps_16x16_neon);
> +        p.cu[BLOCK_32x32].add_ps[NONALIGNED] =
> PFX(pixel_add_ps_32x32_neon);
> +        p.cu[BLOCK_64x64].add_ps[NONALIGNED] =
> PFX(pixel_add_ps_64x64_neon);
>
>          // chroma add_ps
> -        p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].add_ps   =
> PFX(pixel_add_ps_4x4_neon);
> -        p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].add_ps   =
> PFX(pixel_add_ps_8x8_neon);
> -        p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps =
> PFX(pixel_add_ps_16x16_neon);
> -        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps =
> PFX(pixel_add_ps_32x32_neon);
> -        p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].add_ps   =
> PFX(pixel_add_ps_4x8_neon);
> -        p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].add_ps  =
> PFX(pixel_add_ps_8x16_neon);
> -        p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].add_ps =
> PFX(pixel_add_ps_16x32_neon);
> -        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps =
> PFX(pixel_add_ps_32x64_neon);
> +        p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].add_ps[NONALIGNED]   =
> PFX(pixel_add_ps_4x4_neon);
> +        p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].add_ps[NONALIGNED]   =
> PFX(pixel_add_ps_8x8_neon);
> +        p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps[NONALIGNED] =
> PFX(pixel_add_ps_16x16_neon);
> +        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps[NONALIGNED] =
> PFX(pixel_add_ps_32x32_neon);
> +        p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].add_ps[NONALIGNED]   =
> PFX(pixel_add_ps_4x8_neon);
> +        p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].add_ps[NONALIGNED]  =
> PFX(pixel_add_ps_8x16_neon);
> +        p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].add_ps[NONALIGNED] =
> PFX(pixel_add_ps_16x32_neon);
> +        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps[NONALIGNED] =
> PFX(pixel_add_ps_32x64_neon);
>
>          // cpy2Dto1D_shr
>          p.cu[BLOCK_4x4].cpy2Dto1D_shr   = PFX(cpy2Dto1D_shr_4x4_neon);
> @@ -518,10 +519,10 @@ void setupAssemblyPrimitives(EncoderPrimitives &p,
> int cpuMask)
>          p.cu[BLOCK_32x32].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32x32_neon);
>
>          // ssd_s
> -        p.cu[BLOCK_4x4].ssd_s   = PFX(pixel_ssd_s_4x4_neon);
> -        p.cu[BLOCK_8x8].ssd_s   = PFX(pixel_ssd_s_8x8_neon);
> -        p.cu[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16x16_neon);
> -        p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32x32_neon);
> +        p.cu[BLOCK_4x4].ssd_s[NONALIGNED]   = PFX(pixel_ssd_s_4x4_neon);
> +        p.cu[BLOCK_8x8].ssd_s[NONALIGNED]   = PFX(pixel_ssd_s_8x8_neon);
> +        p.cu[BLOCK_16x16].ssd_s[NONALIGNED] =
> PFX(pixel_ssd_s_16x16_neon);
> +        p.cu[BLOCK_32x32].ssd_s[NONALIGNED] =
> PFX(pixel_ssd_s_32x32_neon);
>
>          // sse_ss
>          p.cu[BLOCK_4x4].sse_ss   = PFX(pixel_sse_ss_4x4_neon);
> @@ -548,10 +549,10 @@ void setupAssemblyPrimitives(EncoderPrimitives &p,
> int cpuMask)
>          p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps =
> PFX(pixel_sub_ps_32x64_neon);
>
>          // calc_Residual
> -        p.cu[BLOCK_4x4].calcresidual   = PFX(getResidual4_neon);
> -        p.cu[BLOCK_8x8].calcresidual   = PFX(getResidual8_neon);
> -        p.cu[BLOCK_16x16].calcresidual = PFX(getResidual16_neon);
> -        p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_neon);
> +        p.cu[BLOCK_4x4].calcresidual[NONALIGNED]   =
> PFX(getResidual4_neon);
> +        p.cu[BLOCK_8x8].calcresidual[NONALIGNED]   =
> PFX(getResidual8_neon);
> +        p.cu[BLOCK_16x16].calcresidual[NONALIGNED] =
> PFX(getResidual16_neon);
> +        p.cu[BLOCK_32x32].calcresidual[NONALIGNED] =
> PFX(getResidual32_neon);
>
>          // sse_pp
>          p.cu[BLOCK_4x4].sse_pp   = PFX(pixel_sse_pp_4x4_neon);
> @@ -722,31 +723,31 @@ void setupAssemblyPrimitives(EncoderPrimitives &p,
> int cpuMask)
>          p.pu[LUMA_64x64].sad_x4 = PFX(sad_x4_64x64_neon);
>
>          // pixel_avg_pp
> -        p.pu[LUMA_4x4].pixelavg_pp   = PFX(pixel_avg_pp_4x4_neon);
> -        p.pu[LUMA_4x8].pixelavg_pp   = PFX(pixel_avg_pp_4x8_neon);
> -        p.pu[LUMA_4x16].pixelavg_pp  = PFX(pixel_avg_pp_4x16_neon);
> -        p.pu[LUMA_8x4].pixelavg_pp   = PFX(pixel_avg_pp_8x4_neon);
> -        p.pu[LUMA_8x8].pixelavg_pp   = PFX(pixel_avg_pp_8x8_neon);
> -        p.pu[LUMA_8x16].pixelavg_pp  = PFX(pixel_avg_pp_8x16_neon);
> -        p.pu[LUMA_8x32].pixelavg_pp  = PFX(pixel_avg_pp_8x32_neon);
> -        p.pu[LUMA_12x16].pixelavg_pp = PFX(pixel_avg_pp_12x16_neon);
> -        p.pu[LUMA_16x4].pixelavg_pp  = PFX(pixel_avg_pp_16x4_neon);
> -        p.pu[LUMA_16x8].pixelavg_pp  = PFX(pixel_avg_pp_16x8_neon);
> -        p.pu[LUMA_16x12].pixelavg_pp = PFX(pixel_avg_pp_16x12_neon);
> -        p.pu[LUMA_16x16].pixelavg_pp = PFX(pixel_avg_pp_16x16_neon);
> -        p.pu[LUMA_16x32].pixelavg_pp = PFX(pixel_avg_pp_16x32_neon);
> -        p.pu[LUMA_16x64].pixelavg_pp = PFX(pixel_avg_pp_16x64_neon);
> -        p.pu[LUMA_24x32].pixelavg_pp = PFX(pixel_avg_pp_24x32_neon);
> -        p.pu[LUMA_32x8].pixelavg_pp  = PFX(pixel_avg_pp_32x8_neon);
> -        p.pu[LUMA_32x16].pixelavg_pp = PFX(pixel_avg_pp_32x16_neon);
> -        p.pu[LUMA_32x24].pixelavg_pp = PFX(pixel_avg_pp_32x24_neon);
> -        p.pu[LUMA_32x32].pixelavg_pp = PFX(pixel_avg_pp_32x32_neon);
> -        p.pu[LUMA_32x64].pixelavg_pp = PFX(pixel_avg_pp_32x64_neon);
> -        p.pu[LUMA_48x64].pixelavg_pp = PFX(pixel_avg_pp_48x64_neon);
> -        p.pu[LUMA_64x16].pixelavg_pp = PFX(pixel_avg_pp_64x16_neon);
> -        p.pu[LUMA_64x32].pixelavg_pp = PFX(pixel_avg_pp_64x32_neon);
> -        p.pu[LUMA_64x48].pixelavg_pp = PFX(pixel_avg_pp_64x48_neon);
> -        p.pu[LUMA_64x64].pixelavg_pp = PFX(pixel_avg_pp_64x64_neon);
> +        p.pu[LUMA_4x4].pixelavg_pp[NONALIGNED]   =
> PFX(pixel_avg_pp_4x4_neon);
> +        p.pu[LUMA_4x8].pixelavg_pp[NONALIGNED]   =
> PFX(pixel_avg_pp_4x8_neon);
> +        p.pu[LUMA_4x16].pixelavg_pp[NONALIGNED]  =
> PFX(pixel_avg_pp_4x16_neon);
> +        p.pu[LUMA_8x4].pixelavg_pp[NONALIGNED]   =
> PFX(pixel_avg_pp_8x4_neon);
> +        p.pu[LUMA_8x8].pixelavg_pp[NONALIGNED]   =
> PFX(pixel_avg_pp_8x8_neon);
> +        p.pu[LUMA_8x16].pixelavg_pp[NONALIGNED]  =
> PFX(pixel_avg_pp_8x16_neon);
> +        p.pu[LUMA_8x32].pixelavg_pp[NONALIGNED]  =
> PFX(pixel_avg_pp_8x32_neon);
> +        p.pu[LUMA_12x16].pixelavg_pp[NONALIGNED] =
> PFX(pixel_avg_pp_12x16_neon);
> +        p.pu[LUMA_16x4].pixelavg_pp[NONALIGNED]  =
> PFX(pixel_avg_pp_16x4_neon);
> +        p.pu[LUMA_16x8].pixelavg_pp[NONALIGNED]  =
> PFX(pixel_avg_pp_16x8_neon);
> +        p.pu[LUMA_16x12].pixelavg_pp[NONALIGNED] =
> PFX(pixel_avg_pp_16x12_neon);
> +        p.pu[LUMA_16x16].pixelavg_pp[NONALIGNED] =
> PFX(pixel_avg_pp_16x16_neon);
> +        p.pu[LUMA_16x32].pixelavg_pp[NONALIGNED] =
> PFX(pixel_avg_pp_16x32_neon);
> +        p.pu[LUMA_16x64].pixelavg_pp[NONALIGNED] =
> PFX(pixel_avg_pp_16x64_neon);
> +        p.pu[LUMA_24x32].pixelavg_pp[NONALIGNED] =
> PFX(pixel_avg_pp_24x32_neon);
> +        p.pu[LUMA_32x8].pixelavg_pp[NONALIGNED]  =
> PFX(pixel_avg_pp_32x8_neon);
> +        p.pu[LUMA_32x16].pixelavg_pp[NONALIGNED] =
> PFX(pixel_avg_pp_32x16_neon);
> +        p.pu[LUMA_32x24].pixelavg_pp[NONALIGNED] =
> PFX(pixel_avg_pp_32x24_neon);
> +        p.pu[LUMA_32x32].pixelavg_pp[NONALIGNED] =
> PFX(pixel_avg_pp_32x32_neon);
> +        p.pu[LUMA_32x64].pixelavg_pp[NONALIGNED] =
> PFX(pixel_avg_pp_32x64_neon);
> +        p.pu[LUMA_48x64].pixelavg_pp[NONALIGNED] =
> PFX(pixel_avg_pp_48x64_neon);
> +        p.pu[LUMA_64x16].pixelavg_pp[NONALIGNED] =
> PFX(pixel_avg_pp_64x16_neon);
> +        p.pu[LUMA_64x32].pixelavg_pp[NONALIGNED] =
> PFX(pixel_avg_pp_64x32_neon);
> +        p.pu[LUMA_64x48].pixelavg_pp[NONALIGNED] =
> PFX(pixel_avg_pp_64x48_neon);
> +        p.pu[LUMA_64x64].pixelavg_pp[NONALIGNED] =
> PFX(pixel_avg_pp_64x64_neon);
>
>          // planecopy
>          p.planecopy_cp = PFX(pixel_planecopy_cp_neon);
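
A quick note for readers unfamiliar with the newer primitive tables: the reason every assignment above gained a [NONALIGNED] index is that these entries are now small arrays selected by buffer alignment, and this patch registers the NEON kernels only in the non-aligned slot. Below is a minimal, purely illustrative C++ sketch of the idea; the struct and helper are simplified stand-ins, not the actual x265 definitions:

    #include <cstdint>

    // Illustrative only: simplified stand-ins for the x265 primitive tables.
    enum { NONALIGNED = 0, ALIGNED = 1 };

    typedef void (*addavg_t)(const int16_t* src0, const int16_t* src1,
                             uint8_t* dst, intptr_t src0Stride,
                             intptr_t src1Stride, intptr_t dstStride);

    struct PUPrimitives
    {
        addavg_t addAvg[2];   // indexed by NONALIGNED / ALIGNED
    };

    // Hypothetical aliasing step: when no alignment-specialised kernel
    // exists, reuse the non-aligned one so the ALIGNED slot never stays
    // a null pointer.
    static void aliasAlignmentSlots(PUPrimitives& pu)
    {
        if (!pu.addAvg[ALIGNED])
            pu.addAvg[ALIGNED] = pu.addAvg[NONALIGNED];
    }
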
> diff --git a/source/common/cpu.cpp b/source/common/cpu.cpp
> index 26c82ea50..2eacfe4a9 100644
> --- a/source/common/cpu.cpp
> +++ b/source/common/cpu.cpp
> @@ -5,6 +5,8 @@
>   *          Laurent Aimar <fenrir at via.ecp.fr>
>   *          Fiona Glaser <fiona at x264.com>
>   *          Steve Borho <steve at borho.org>
> + *          Hongbin Liu <liuhongbin1 at huawei.com>
> + *          Yimeng Su <yimeng.su at huawei.com>
>   *
>   * This program is free software; you can redistribute it and/or modify
>   * it under the terms of the GNU General Public License as published by
> @@ -367,6 +369,8 @@ uint32_t cpu_detect(bool benableavx512)
>      flags |= PFX(cpu_fast_neon_mrc_test)() ? X265_CPU_FAST_NEON_MRC : 0;
>  #endif
>      // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9
> (fast mrc)
> +#elif X265_ARCH_ARM64
> +    flags |= X265_CPU_NEON;
>  #endif // if HAVE_ARMV6
>      return flags;
>  }
> diff --git a/source/common/pixel.cpp b/source/common/pixel.cpp
> index 99b84449c..e4f890cd5 100644
> --- a/source/common/pixel.cpp
> +++ b/source/common/pixel.cpp
> @@ -5,6 +5,7 @@
>   *          Mandar Gurav <mandar at multicorewareinc.com>
>   *          Mahesh Pittala <mahesh at multicorewareinc.com>
>   *          Min Chen <min.chen at multicorewareinc.com>
> + *          Hongbin Liu<liuhongbin1 at huawei.com>
>   *
>   * This program is free software; you can redistribute it and/or modify
>   * it under the terms of the GNU General Public License as published by
> @@ -265,6 +266,10 @@ int satd4(const pixel* pix1, intptr_t stride_pix1,
> const pixel* pix2, intptr_t s
>  {
>      int satd = 0;
>
> +#if ENABLE_ASSEMBLY && X265_ARCH_ARM64
> +    pixelcmp_t satd_4x4 = x265_pixel_satd_4x4_neon;
> +#endif
>
> Is there any specific reason why the above code is added? Is this a kind of
> temporary fix for an output mismatch between the C and asm code?
>
> No, the C and asm outputs match. Currently we have completed only some of the
> satd primitives. This is a workaround so that all satd primitives benefit from
> the asm code. Maybe the code style is not ideal.
>
> If I understand correctly, you are trying to use a combination of C and
> asm code for all the other kernel sizes for which the asm implementation
> is not yet complete?
>
> Yes, you are right.
>
OK. If this code block is going to be removed in future patches, where you
will implement the asm for the remaining satd kernels, then this patch is
good to be pushed.
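
For anyone skimming the thread, here is a minimal sketch of the composition being discussed, with simplified types; the C placeholder kernel below is hypothetical, and in the patch the pointer is instead redirected to x265_pixel_satd_4x4_neon. Because satd4() walks the block in 4x4 tiles, swapping in the NEON 4x4 (and 8x4) kernel speeds up every larger satd size until dedicated asm kernels land:

    #include <cstdint>

    typedef int (*pixelcmp_t)(const uint8_t* pix1, intptr_t stride1,
                              const uint8_t* pix2, intptr_t stride2);

    // Hypothetical C stand-in; the patch points satd_4x4 at
    // x265_pixel_satd_4x4_neon on aarch64 builds instead.
    static int satd_4x4_c(const uint8_t*, intptr_t, const uint8_t*, intptr_t)
    {
        return 0;
    }

    template<int w, int h>
    int satd4(const uint8_t* pix1, intptr_t stride1,
              const uint8_t* pix2, intptr_t stride2)
    {
        pixelcmp_t satd_4x4 = satd_4x4_c;  // aarch64: x265_pixel_satd_4x4_neon
        int satd = 0;
        for (int row = 0; row < h; row += 4)      // cover the block in 4x4 tiles
            for (int col = 0; col < w; col += 4)
                satd += satd_4x4(pix1 + row * stride1 + col, stride1,
                                 pix2 + row * stride2 + col, stride2);
        return satd;
    }
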

>
>
> +
>      for (int row = 0; row < h; row += 4)
>          for (int col = 0; col < w; col += 4)
>              satd += satd_4x4(pix1 + row * stride_pix1 + col, stride_pix1,
> @@ -279,6 +284,10 @@ int satd8(const pixel* pix1, intptr_t stride_pix1,
> const pixel* pix2, intptr_t s
>  {
>      int satd = 0;
>
> +#if ENABLE_ASSEMBLY && X265_ARCH_ARM64
> +    pixelcmp_t satd_8x4 = x265_pixel_satd_8x4_neon;
> +#endif
> +
>
> Same comment as above.
>
> Same response.
>
>      for (int row = 0; row < h; row += 4)
>          for (int col = 0; col < w; col += 8)
>              satd += satd_8x4(pix1 + row * stride_pix1 + col, stride_pix1,
> diff --git a/source/common/primitives.h b/source/common/primitives.h
> index 5c64952fb..0b52f84de 100644
> --- a/source/common/primitives.h
> +++ b/source/common/primitives.h
> @@ -8,6 +8,8 @@
>   *          Rajesh Paulraj <rajesh at multicorewareinc.com>
>   *          Praveen Kumar Tiwari <praveen at multicorewareinc.com>
>   *          Min Chen <chenm003 at 163.com>
> + *          Hongbin Liu<liuhongbin1 at huawei.com>
> + *          Yimeng Su <yimeng.su at huawei.com>
>   *
>   * This program is free software; you can redistribute it and/or modify
>   * it under the terms of the GNU General Public License as published by
> @@ -467,6 +469,9 @@ void setupCPrimitives(EncoderPrimitives &p);
>  void setupInstrinsicPrimitives(EncoderPrimitives &p, int cpuMask);
>  void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask);
>  void setupAliasPrimitives(EncoderPrimitives &p);
> +#if X265_ARCH_ARM64
> +void setupAliasCPrimitives(EncoderPrimitives &cp, EncoderPrimitives
> &asmp, int cpuMask);
> +#endif
>  #if HAVE_ALTIVEC
>  void setupPixelPrimitives_altivec(EncoderPrimitives &p);
>  void setupDCTPrimitives_altivec(EncoderPrimitives &p);
> @@ -481,4 +486,10 @@ extern const char* PFX(version_str);
>  extern const char* PFX(build_info_str);
>  #endif
>
> +#if ENABLE_ASSEMBLY && X265_ARCH_ARM64
> +extern "C" {
> +#include "aarch64/pixel-util.h"
> +}
> +#endif
> +
>  #endif // ifndef X265_PRIMITIVES_H
> diff --git a/source/test/CMakeLists.txt b/source/test/CMakeLists.txt
> index 260195f53..9abaf31ff 100644
> --- a/source/test/CMakeLists.txt
> +++ b/source/test/CMakeLists.txt
> @@ -23,13 +23,15 @@ endif(X86)
>
>  # add ARM assembly files
>  if(ARM OR CROSS_COMPILE_ARM)
> -    enable_language(ASM)
> -    set(NASM_SRC checkasm-arm.S)
> -    add_custom_command(
> -        OUTPUT checkasm-arm.obj
> -        COMMAND ${CMAKE_CXX_COMPILER}
> -        ARGS ${NASM_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/checkasm-arm.S -o checkasm-arm.obj
> -        DEPENDS checkasm-arm.S)
> +    if(NOT ARM64)
> +        enable_language(ASM)
> +        set(NASM_SRC checkasm-arm.S)
> +        add_custom_command(
> +            OUTPUT checkasm-arm.obj
> +            COMMAND ${CMAKE_CXX_COMPILER}
> +            ARGS ${NASM_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/checkasm-arm.S -o checkasm-arm.obj
> +            DEPENDS checkasm-arm.S)
> +    endif()
>  endif(ARM OR CROSS_COMPILE_ARM)
>
>  # add PowerPC assembly files
> diff --git a/source/test/testbench.cpp b/source/test/testbench.cpp
> index ac14f9710..8db8c0c25 100644
> --- a/source/test/testbench.cpp
> +++ b/source/test/testbench.cpp
> @@ -5,6 +5,7 @@
>   *          Mandar Gurav <mandar at multicorewareinc.com>
>   *          Mahesh Pittala <mahesh at multicorewareinc.com>
>   *          Min Chen <chenm003 at 163.com>
> + *          Yimeng Su <yimeng.su at huawei.com>
>   *
>   * This program is free software; you can redistribute it and/or modify
>   * it under the terms of the GNU General Public License as published by
> @@ -208,6 +209,14 @@ int main(int argc, char *argv[])
>          EncoderPrimitives asmprim;
>          memset(&asmprim, 0, sizeof(asmprim));
>          setupAssemblyPrimitives(asmprim, test_arch[i].flag);
> +
> +#if X265_ARCH_ARM64
> +        /* Temporary workaround because luma_vsp assembly primitive has not been completed
> +         * but interp_8tap_hv_pp_cpu uses mixed C primitive and assembly primitive.
> +         * Otherwise, segment fault occurs. */
> +        setupAliasCPrimitives(cprim, asmprim, test_arch[i].flag);
> +#endif
> +
>          setupAliasPrimitives(asmprim);
>          memcpy(&primitives, &asmprim, sizeof(EncoderPrimitives));
>          for (size_t h = 0; h < sizeof(harness) / sizeof(TestHarness*);
> h++)
> @@ -232,6 +241,13 @@ int main(int argc, char *argv[])
>  #endif
>      setupAssemblyPrimitives(optprim, cpuid);
>
> +#if X265_ARCH_ARM64
> +    /* Temporary workaround because luma_vsp assembly primitive has not been completed
> +     * but interp_8tap_hv_pp_cpu uses mixed C primitive and assembly primitive.
> +     * Otherwise, segment fault occurs. */
> +    setupAliasCPrimitives(cprim, optprim, cpuid);
> +#endif
> +
>      /* Note that we do not setup aliases for performance tests, that would be
>       * redundant. The testbench only verifies they are correctly aliased */
>
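
Since this patch only declares setupAliasCPrimitives() (its body comes with the aarch64 sources from Part 1, which are not shown here), here is a rough and purely hypothetical sketch of the kind of aliasing the workaround above relies on. The member names such as luma_vsp come from the existing primitive tables, but the real function may cover more entries:

    #include "common.h"
    #include "primitives.h"

    namespace X265_NS {

    // Hypothetical sketch, not the actual implementation: copy the C luma_vsp
    // entries over the still-empty asm slots so that mixed kernels such as
    // interp_8tap_hv_pp_cpu never call a null function pointer.
    void setupAliasCPrimitives(EncoderPrimitives& cp, EncoderPrimitives& asmp, int cpuMask)
    {
        if (cpuMask & X265_CPU_NEON)
        {
            for (int i = 0; i < NUM_PU_SIZES; i++)
            {
                if (!asmp.pu[i].luma_vsp)
                    asmp.pu[i].luma_vsp = cp.pu[i].luma_vsp;
            }
        }
    }

    }
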
> diff --git a/source/test/testharness.h b/source/test/testharness.h
> index 771551583..6e680953f 100644
> --- a/source/test/testharness.h
> +++ b/source/test/testharness.h
> @@ -3,6 +3,7 @@
>   *
>   * Authors: Steve Borho <steve at borho.org>
>   *          Min Chen <chenm003 at 163.com>
> + *          Yimeng Su <yimeng.su at huawei.com>
>   *
>   * This program is free software; you can redistribute it and/or modify
>   * it under the terms of the GNU General Public License as published by
> @@ -81,11 +82,15 @@ static inline uint32_t __rdtsc(void)
>  #if X265_ARCH_X86
>      asm volatile("rdtsc" : "=a" (a) ::"edx");
>  #elif X265_ARCH_ARM
> +#if X265_ARCH_ARM64
> +    asm volatile("mrs %0, cntvct_el0" : "=r"(a));
> +#else
>      // TOD-DO: verify following inline asm to get cpu Timestamp Counter for ARM arch
>      // asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(a));
>
>      // TO-DO: replace clock() function with appropriate ARM cpu instructions
>      a = clock();
> +#endif
>  #endif
>      return a;
>  }
> --
> 2.21.0.windows.1
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>