<div dir="ltr"><div dir="ltr"><br></div><br><div class="gmail_quote"><div class="gmail_attr" dir="ltr">On Tue, Mar 17, 2020 at 9:50 AM Suyimeng &lt;<a href="mailto:yimeng.su@huawei.com">yimeng.su@huawei.com</a>&gt; wrote:<br></div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;padding-left:1ex;border-left-color:rgb(204,204,204);border-left-width:1px;border-left-style:solid">
<div lang="ZH-CN">
<div class="gmail-m_3550197732202863311WordSection1">
<p class="MsoNormal"><span lang="EN-US" style="color:rgb(31,73,125);font-family:'Calibri',sans-serif;font-size:10.5pt"><u></u>&nbsp;<u></u></span></p>
<p class="MsoNormal"><b><span lang="EN-US" style="font-family:'Calibri',sans-serif;font-size:11pt">From:</span></b><span lang="EN-US" style="font-family:'Calibri',sans-serif;font-size:11pt"> x265-devel [mailto:<a href="mailto:x265-devel-bounces@videolan.org" target="_blank">x265-devel-bounces@videolan.org</a>]
<b>On Behalf Of </b>Gopi Satykrishna Akisetty<br>
<b>Sent:</b> Monday, March 16, 2020 9:37 PM<br>
<b>To:</b> Development for x265 &lt;<a href="mailto:x265-devel@videolan.org" target="_blank">x265-devel@videolan.org</a>&gt;<br>
<b>Subject:</b> Re: [x265] [PATCH] Add aarch64 support - Part 2<u></u><u></u></span></p>
<p class="MsoNormal"><span lang="EN-US"><u></u>&nbsp;<u></u></span></p>
<div>
<div>
<p class="MsoNormal"><span lang="EN-US"><u></u>&nbsp;<u></u></span></p>
</div>
<p class="MsoNormal"><span lang="EN-US"><u></u>&nbsp;<u></u></span></p>
<div>
<div>
<p class="MsoNormal"><span lang="EN-US">On Thu, Feb 27, 2020 at 8:04 AM Xiyuan Wang &lt;<a href="mailto:wangxiyuan1007@gmail.com" target="_blank">wangxiyuan1007@gmail.com</a>&gt; wrote:<u></u><u></u></span></p>
</div>
<blockquote style="border-width:medium medium medium 1pt;border-style:none none none solid;border-color:currentColor currentColor currentColor rgb(204,204,204);margin:5pt 0cm 5pt 4.8pt;padding:0cm 0cm 0cm 6pt">
<div>
<div>
<div>
<p class="MsoNormal"><span lang="EN-US">From: wangxiyuan &lt;<a href="mailto:wangxiyuan@huawei.com" target="_blank">wangxiyuan@huawei.com</a>&gt;<u></u><u></u></span></p>
</div>
<p class="MsoNormal"><span lang="EN-US"><br>
This patch adds aarch64 build & compile support. This patch must be<br>
merged after the patch Part 1.<br>
---<br>
 build/aarch64-linux/crosscompile.cmake | 15 ++<br>
 build/aarch64-linux/make-Makefiles.bash |  4 +<br>
 source/CMakeLists.txt          | 38 +++-<br>
 source/common/CMakeLists.txt      | 35 ++-<br>
 source/common/arm/asm-primitives.cpp  | 291 ++++++++++++------------<br>
 source/common/cpu.cpp          |  4 +<br>
 source/common/pixel.cpp         |  9 +<br>
 source/common/primitives.h       | 11 +<br>
 source/test/CMakeLists.txt       | 16 +-<br>
 source/test/testbench.cpp        | 16 ++<br>
 source/test/testharness.h        |  5 +<br>
 11 files changed, 274 insertions(+), 170 deletions(-)<br>
 create mode 100644 build/aarch64-linux/crosscompile.cmake<br>
 create mode 100644 build/aarch64-linux/make-Makefiles.bash<br>
<br>
diff --git a/build/aarch64-linux/crosscompile.cmake b/build/aarch64-linux/crosscompile.cmake<br>
new file mode 100644<br>
index 000000000..41c8217f2<br>
--- /dev/null<br>
+++ b/build/aarch64-linux/crosscompile.cmake<br>
@@ -0,0 +1,15 @@<br>
+# CMake toolchain file for cross compiling x265 for aarch64<br>
+# This feature is only supported as experimental. Use with caution.<br>
+# Please report bugs on bitbucket<br>
+# Run cmake with: cmake -DCMAKE_TOOLCHAIN_FILE=crosscompile.cmake -G "Unix Makefiles" ../../source && ccmake ../../source<br>
+<br>
+set(CROSS_COMPILE_ARM 1)<br>
+set(CMAKE_SYSTEM_NAME Linux)<br>
+set(CMAKE_SYSTEM_PROCESSOR aarch64)<br>
+<br>
+# specify the cross compiler<br>
+set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc)<br>
+set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++)<br>
+<br>
+# specify the target environment<br>
+SET(CMAKE_FIND_ROOT_PATH /usr/aarch64-linux-gnu)<br>
diff --git a/build/aarch64-linux/make-Makefiles.bash b/build/aarch64-linux/make-Makefiles.bash<br>
new file mode 100644<br>
index 000000000..c9582da0a<br>
--- /dev/null<br>
+++ b/build/aarch64-linux/make-Makefiles.bash<br>
@@ -0,0 +1,4 @@<br>
+#!/bin/bash<br>
+# Run this from within a bash shell<br>
+<br>
+cmake -DCMAKE_TOOLCHAIN_FILE="crosscompile.cmake" -G "Unix Makefiles" ../../source && ccmake ../../source<br>
diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt<br>
index 5d2474d97..7734eafbb 100644<br>
--- a/source/CMakeLists.txt<br>
+++ b/source/CMakeLists.txt<br>
@@ -40,7 +40,7 @@ SET(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake" "${CMAKE_MODULE_PATH}")<br>
 # System architecture detection<br>
 string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" SYSPROC)<br>
 set(X86_ALIASES x86 i386 i686 x86_64 amd64)<br>
-set(ARM_ALIASES armv6l armv7l)<br>
+set(ARM_ALIASES armv6l armv7l aarch64)<br>
 list(FIND X86_ALIASES "${SYSPROC}" X86MATCH)<br>
 list(FIND ARM_ALIASES "${SYSPROC}" ARMMATCH)<br>
 set(POWER_ALIASES ppc64 ppc64le)<br>
@@ -70,9 +70,15 @@ elseif(ARMMATCH GREATER "-1")<br>
   else()<br>
     set(CROSS_COMPILE_ARM 0)<br>
   endif()<br>
-&nbsp;&nbsp;message(STATUS "Detected ARM target processor")<br>
   set(ARM 1)<br>
-&nbsp;&nbsp;add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1)<br>
+&nbsp;&nbsp;if("${CMAKE_SIZEOF_VOID_P}" MATCHES 8)<br>
+&nbsp;&nbsp;&nbsp;&nbsp;message(STATUS "Detected ARM64 target processor")<br>
+&nbsp;&nbsp;&nbsp;&nbsp;set(ARM64 1)<br>
+&nbsp;&nbsp;&nbsp;&nbsp;add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=1 -DHAVE_ARMV6=0)<br>
+&nbsp;&nbsp;else()<br>
+&nbsp;&nbsp;&nbsp;&nbsp;message(STATUS "Detected ARM target processor")<br>
+&nbsp;&nbsp;&nbsp;&nbsp;add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 -DHAVE_ARMV6=1)<br>
+&nbsp;&nbsp;endif()<br>
 else()<br>
   message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown")<br>
   message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}")<br>
@@ -231,14 +237,24 @@ if(GCC)<br>
     endif()<br>
   endif()<br>
   if(ARM AND CROSS_COMPILE_ARM)<br>
-&nbsp;&nbsp;&nbsp;&nbsp;set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm -fPIC)<br>
+&nbsp;&nbsp;&nbsp;&nbsp;if(ARM64)<br>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;set(ARM_ARGS -fPIC)<br>
+&nbsp;&nbsp;&nbsp;&nbsp;else()<br>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm -fPIC)<br>
+&nbsp;&nbsp;&nbsp;&nbsp;endif()<br>
+&nbsp;&nbsp;&nbsp;&nbsp;message(STATUS "cross compile arm")<br>
   elseif(ARM)<br>
-&nbsp;&nbsp;&nbsp;&nbsp;find_package(Neon)<br>
-&nbsp;&nbsp;&nbsp;&nbsp;if(CPU_HAS_NEON)<br>
-&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)<br>
+&nbsp;&nbsp;&nbsp;&nbsp;if(ARM64)<br>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;set(ARM_ARGS -fPIC)<br>
       add_definitions(-DHAVE_NEON)<br>
     else()<br>
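-&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)<br>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;find_package(Neon)<br>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;if(CPU_HAS_NEON)<br>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm -fPIC)<br>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;add_definitions(-DHAVE_NEON)<br>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;else()<br>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm)<br>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;endif()<br>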
     endif()<br>
   endif()<br>
   add_definitions(${ARM_ARGS})<br>
@@ -518,7 +534,11 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)<br>
   # compile ARM arch asm files here<br>
     enable_language(ASM)<br>
     foreach(ASM ${ARM_ASMS})<br>
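-&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/arm/${ASM})<br>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;if(ARM64)<br>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM})<br>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;else()<br>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/arm/${ASM})<br>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;endif()<br>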
       list(APPEND ASM_SRCS ${ASM_SRC})<br>
       list(APPEND ASM_OBJS ${ASM}.${SUFFIX})<br>
       add_custom_command(<br>
diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt<br>
index c70bb108c..c021e603e 100644<br>
--- a/source/common/CMakeLists.txt<br>
+++ b/source/common/CMakeLists.txt<br>
@@ -14,7 +14,7 @@ if(EXTRA_LIB)<br>
 endif(EXTRA_LIB)<br>
<br>
 if(ENABLE_ASSEMBLY)<br>
-&nbsp;&nbsp;set_source_files_properties(threading.cpp primitives.cpp PROPERTIES COMPILE_FLAGS -DENABLE_ASSEMBLY=1)<br>
+&nbsp;&nbsp;set_source_files_properties(threading.cpp primitives.cpp pixel.cpp PROPERTIES COMPILE_FLAGS -DENABLE_ASSEMBLY=1)<br>
   list(APPEND VFLAGS "-DENABLE_ASSEMBLY=1")<br>
 endif(ENABLE_ASSEMBLY)<br>
<br>
@@ -84,16 +84,33 @@ if(ENABLE_ASSEMBLY AND X86)<br>
 endif(ENABLE_ASSEMBLY AND X86)<br>
<br>
 if(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))<br>
-&nbsp;&nbsp;set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h)<br>
+&nbsp;&nbsp;if(ARM64)<br>
+&nbsp;&nbsp;&nbsp;&nbsp;if(GCC AND (CMAKE_CXX_FLAGS_RELEASE MATCHES "-O3"))<br>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;message(STATUS "Detected CXX compiler using -O3 optimization level")<br>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;add_definitions(-DAUTO_VECTORIZE=1)<br>
+&nbsp;&nbsp;&nbsp;&nbsp;endif()<br>
+&nbsp;&nbsp;&nbsp;&nbsp;set(C_SRCS asm-primitives.cpp pixel.h ipfilter8.h)<br>
<br>
-&nbsp;&nbsp;# add ARM assembly/intrinsic files here<br>
-&nbsp;&nbsp;set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S ipfilter8.S dct-a.S)<br>
-&nbsp;&nbsp;set(VEC_PRIMITIVES)<br>
+&nbsp;&nbsp;&nbsp;&nbsp;# add ARM assembly/intrinsic files here<br>
+&nbsp;&nbsp;&nbsp;&nbsp;set(A_SRCS asm.S mc-a.S sad-a.S pixel-util.S ipfilter8.S)<br>
+&nbsp;&nbsp;&nbsp;&nbsp;set(VEC_PRIMITIVES)<br>
<br>
-&nbsp;&nbsp;set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")<br>
-&nbsp;&nbsp;foreach(SRC ${C_SRCS})<br>
-&nbsp;&nbsp;&nbsp;&nbsp;set(ASM_PRIMITIVES ${ASM_PRIMITIVES} arm/${SRC})<br>
-&nbsp;&nbsp;endforeach()<br>
+&nbsp;&nbsp;&nbsp;&nbsp;set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")<br>
+&nbsp;&nbsp;&nbsp;&nbsp;foreach(SRC ${C_SRCS})<br>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})<br>
+&nbsp;&nbsp;&nbsp;&nbsp;endforeach()<br>
+&nbsp;&nbsp;else()<br>
+&nbsp;&nbsp;&nbsp;&nbsp;set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h)<br>
+<br>
+&nbsp;&nbsp;&nbsp;&nbsp;# add ARM assembly/intrinsic files here<br>
+&nbsp;&nbsp;&nbsp;&nbsp;set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S blockcopy8.S ipfilter8.S dct-a.S)<br>
+&nbsp;&nbsp;&nbsp;&nbsp;set(VEC_PRIMITIVES)<br>
+<br>
+&nbsp;&nbsp;&nbsp;&nbsp;set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")<br>
+&nbsp;&nbsp;&nbsp;&nbsp;foreach(SRC ${C_SRCS})<br>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;set(ASM_PRIMITIVES ${ASM_PRIMITIVES} arm/${SRC})<br>
+&nbsp;&nbsp;&nbsp;&nbsp;endforeach()<br>
+&nbsp;&nbsp;endif()<br>
   source_group(Assembly FILES ${ASM_PRIMITIVES})<br>
 endif(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM))<br>
<br>
diff --git a/source/common/arm/asm-primitives.cpp b/source/common/arm/asm-primitives.cpp<br>
index 422217845..7f11503f9 100644<br>
--- a/source/common/arm/asm-primitives.cpp<br>
+++ b/source/common/arm/asm-primitives.cpp<br>
@@ -5,6 +5,7 @@<br>
 *     Praveen Kumar Tiwari <<a href="mailto:praveen@multicorewareinc.com" target="_blank">praveen@multicorewareinc.com</a>><br>
 *     Min Chen <<a href="mailto:chenm003@163.com" target="_blank">chenm003@163.com</a>> <<a href="mailto:min.chen@multicorewareinc.com" target="_blank">min.chen@multicorewareinc.com</a>><br>
 *     Dnyaneshwar Gorade <<a href="mailto:dnyaneshwar@multicorewareinc.com" target="_blank">dnyaneshwar@multicorewareinc.com</a>><br>
+ *&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Hongbin Liu&lt;<a href="mailto:liuhongbin1@huawei.com" target="_blank">liuhongbin1@huawei.com</a>&gt;<br>
 *<br>
 * This program is free software; you can redistribute it and/or modify<br>
 * it under the terms of the GNU General Public License as published by<br>
@@ -48,77 +49,77 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask)<br>
     p.ssim_4x4x2_core = PFX(ssim_4x4x2_core_neon);<br>
<br>
     // addAvg<br>
-     p.pu[LUMA_4x4].addAvg  = PFX(addAvg_4x4_neon);<br>
-     p.pu[LUMA_4x8].addAvg  = PFX(addAvg_4x8_neon);<br>
-     p.pu[LUMA_4x16].addAvg = PFX(addAvg_4x16_neon);<br>
-     p.pu[LUMA_8x4].addAvg  = PFX(addAvg_8x4_neon);<br>
-     p.pu[LUMA_8x8].addAvg  = PFX(addAvg_8x8_neon);<br>
-     p.pu[LUMA_8x16].addAvg = PFX(addAvg_8x16_neon);<br>
-     p.pu[LUMA_8x32].addAvg = PFX(addAvg_8x32_neon);<br>
-Â Â Â Â Â p.pu[LUMA_12x16].addAvg = PFX(addAvg_12x16_neon);<br>
-     p.pu[LUMA_16x4].addAvg = PFX(addAvg_16x4_neon);<br>
-     p.pu[LUMA_16x8].addAvg = PFX(addAvg_16x8_neon);<br>
-Â Â Â Â Â p.pu[LUMA_16x12].addAvg = PFX(addAvg_16x12_neon);<br>
-Â Â Â Â Â p.pu[LUMA_16x16].addAvg = PFX(addAvg_16x16_neon);<br>
-Â Â Â Â Â p.pu[LUMA_16x32].addAvg = PFX(addAvg_16x32_neon);<br>
-Â Â Â Â Â p.pu[LUMA_16x64].addAvg = PFX(addAvg_16x64_neon);<br>
-Â Â Â Â Â p.pu[LUMA_24x32].addAvg = PFX(addAvg_24x32_neon);<br>
-     p.pu[LUMA_32x8].addAvg = PFX(addAvg_32x8_neon);<br>
-Â Â Â Â Â p.pu[LUMA_32x16].addAvg = PFX(addAvg_32x16_neon);<br>
-Â Â Â Â Â p.pu[LUMA_32x24].addAvg = PFX(addAvg_32x24_neon);<br>
-Â Â Â Â Â p.pu[LUMA_32x32].addAvg = PFX(addAvg_32x32_neon);<br>
-Â Â Â Â Â p.pu[LUMA_32x64].addAvg = PFX(addAvg_32x64_neon);<br>
-Â Â Â Â Â p.pu[LUMA_48x64].addAvg = PFX(addAvg_48x64_neon);<br>
-Â Â Â Â Â p.pu[LUMA_64x16].addAvg = PFX(addAvg_64x16_neon);<br>
-Â Â Â Â Â p.pu[LUMA_64x32].addAvg = PFX(addAvg_64x32_neon);<br>
-Â Â Â Â Â p.pu[LUMA_64x48].addAvg = PFX(addAvg_64x48_neon);<br>
-Â Â Â Â Â p.pu[LUMA_64x64].addAvg = PFX(addAvg_64x64_neon);<br>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;p.pu[LUMA_4x4].addAvg[NONALIGNED]&nbsp;&nbsp;= PFX(addAvg_4x4_neon);<br>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;p.pu[LUMA_4x8].addAvg[NONALIGNED]&nbsp;&nbsp;= PFX(addAvg_4x8_neon);<br>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;p.pu[LUMA_4x16].addAvg[NONALIGNED]&nbsp;= PFX(addAvg_4x16_neon);<br>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;p.pu[LUMA_8x4].addAvg[NONALIGNED]&nbsp;&nbsp;= PFX(addAvg_8x4_neon);<br>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;p.pu[LUMA_8x8].addAvg[NONALIGNED]&nbsp;&nbsp;= PFX(addAvg_8x8_neon);<br>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;p.pu[LUMA_8x16].addAvg[NONALIGNED]&nbsp;= PFX(addAvg_8x16_neon);<br>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;p.pu[LUMA_8x32].addAvg[NONALIGNED]&nbsp;= PFX(addAvg_8x32_neon);<br>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;p.pu[LUMA_12x16].addAvg[NONALIGNED] = PFX(addAvg_12x16_neon);<br>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;p.pu[LUMA_16x4].addAvg[NONALIGNED]&nbsp;= PFX(addAvg_16x4_neon);<br>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;p.pu[LUMA_16x8].addAvg[NONALIGNED]&nbsp;= PFX(addAvg_16x8_neon);<br>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;p.pu[LUMA_16x12].addAvg[NONALIGNED] = PFX(addAvg_16x12_neon);<br>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;p.pu[LUMA_16x16].addAvg[NONALIGNED] = PFX(addAvg_16x16_neon);<br>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;p.pu[LUMA_16x32].addAvg[NONALIGNED] = PFX(addAvg_16x32_neon);<br>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;p.pu[LUMA_16x64].addAvg[NONALIGNED] = PFX(addAvg_16x64_neon);<br>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;p.pu[LUMA_24x32].addAvg[NONALIGNED] = PFX(addAvg_24x32_neon);<br>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;p.pu[LUMA_32x8].addAvg[NONALIGNED]&nbsp;= PFX(addAvg_32x8_neon);<br>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;p.pu[LUMA_32x16].addAvg[NONALIGNED] = PFX(addAvg_32x16_neon);<br>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;p.pu[LUMA_32x24].addAvg[NONALIGNED] = PFX(addAvg_32x24_neon);<br>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;p.pu[LUMA_32x32].addAvg[NONALIGNED] = PFX(addAvg_32x32_neon);<br>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;p.pu[LUMA_32x64].addAvg[NONALIGNED] = PFX(addAvg_32x64_neon);<br>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;p.pu[LUMA_48x64].addAvg[NONALIGNED] = PFX(addAvg_48x64_neon);<br>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;p.pu[LUMA_64x16].addAvg[NONALIGNED] = PFX(addAvg_64x16_neon);<br>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;p.pu[LUMA_64x32].addAvg[NONALIGNED] = PFX(addAvg_64x32_neon);<br>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;p.pu[LUMA_64x48].addAvg[NONALIGNED] = PFX(addAvg_64x48_neon);<br>
+&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;p.pu[LUMA_64x64].addAvg[NONALIGNED] = PFX(addAvg_64x64_neon);<br>
<br>
     // chroma addAvg<br>
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].addAvg  = PFX(addAvg_4x2_neon);<br>
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].addAvg  = PFX(addAvg_4x4_neon);<br>
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].addAvg  = PFX(addAvg_4x8_neon);<br>
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].addAvg = PFX(addAvg_4x16_neon);<br>
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].addAvg  = PFX(addAvg_6x8_neon);<br>
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].addAvg  = PFX(addAvg_8x2_neon);<br>
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].addAvg  = PFX(addAvg_8x4_neon);<br>
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].addAvg  = PFX(addAvg_8x6_neon);<br>
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].addAvg  = PFX(addAvg_8x8_neon);<br>
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].addAvg = PFX(addAvg_8x16_neon);<br>
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].addAvg = PFX(addAvg_8x32_neon);<br>
-Â Â Â Â p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].addAvg = PFX(addAvg_12x16_neon);<br>
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].addAvg = PFX(addAvg_16x4_neon);<br>
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].addAvg = PFX(addAvg_16x8_neon);<br>
-Â Â Â Â p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].addAvg = PFX(addAvg_16x12_neon);<br>
-Â Â Â Â p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].addAvg = PFX(addAvg_16x16_neon);<br>
-Â Â Â Â p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].addAvg = PFX(addAvg_16x32_neon);<br>
-Â Â Â Â p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].addAvg = PFX(addAvg_24x32_neon);<br>
-    p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg = PFX(addAvg_32x8_neon);<br>
-Â Â Â Â p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg = PFX(addAvg_32x16_neon);<br>
-Â Â Â Â p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg = PFX(addAvg_32x24_neon);<br>
-Â Â Â Â p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg = PFX(addAvg_32x32_neon);<br>
-<br>
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].addAvg  = PFX(addAvg_4x8_neon);<br>
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].addAvg = PFX(addAvg_4x16_neon);<br>
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].addAvg = PFX(addAvg_4x32_neon);<br>
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].addAvg = PFX(addAvg_6x16_neon);<br>
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].addAvg  = PFX(addAvg_8x4_neon);<br>
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].addAvg  = PFX(addAvg_8x8_neon);<br>
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].addAvg = PFX(addAvg_8x12_neon);<br>
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].addAvg = PFX(addAvg_8x16_neon);<br>
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].addAvg = PFX(addAvg_8x32_neon);<br>
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].addAvg = PFX(addAvg_8x64_neon);<br>
-Â Â Â Â p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].addAvg = PFX(addAvg_12x32_neon);<br>
-    p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].addAvg = PFX(addAvg_16x8_neon);<br>
-Â Â Â Â p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].addAvg = PFX(addAvg_16x16_neon);<br>
-Â Â Â Â p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].addAvg = PFX(addAvg_16x24_neon);<br>
-Â Â Â Â p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].addAvg = PFX(addAvg_16x32_neon);<br>
-Â Â Â Â p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].addAvg = PFX(addAvg_16x64_neon);<br>
-Â Â Â Â p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].addAvg = PFX(addAvg_24x64_neon);<br>
-Â Â Â Â p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg = PFX(addAvg_32x16_neon);<br>
-Â Â Â Â p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg = PFX(addAvg_32x32_neon);<br>
-Â Â Â Â p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg = PFX(addAvg_32x48_neon);<br>
-Â Â Â Â p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg = PFX(addAvg_32x64_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].addAvg[NONALIGNED]Â Â = PFX(addAvg_4x2_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].addAvg[NONALIGNED]Â Â = PFX(addAvg_4x4_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].addAvg[NONALIGNED]Â Â = PFX(addAvg_4x8_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].addAvg[NONALIGNED]Â = PFX(addAvg_4x16_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].addAvg[NONALIGNED]Â Â = PFX(addAvg_6x8_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].addAvg[NONALIGNED]Â Â = PFX(addAvg_8x2_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].addAvg[NONALIGNED]Â Â = PFX(addAvg_8x4_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].addAvg[NONALIGNED]Â Â = PFX(addAvg_8x6_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].addAvg[NONALIGNED]Â Â = PFX(addAvg_8x8_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].addAvg[NONALIGNED]Â = PFX(addAvg_8x16_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].addAvg[NONALIGNED]Â = PFX(addAvg_8x32_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].addAvg[NONALIGNED] = PFX(addAvg_12x16_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].addAvg[NONALIGNED]Â = PFX(addAvg_16x4_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].addAvg[NONALIGNED]Â = PFX(addAvg_16x8_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].addAvg[NONALIGNED] = PFX(addAvg_16x12_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].addAvg[NONALIGNED] = PFX(addAvg_16x16_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].addAvg[NONALIGNED] = PFX(addAvg_16x32_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].addAvg[NONALIGNED] = PFX(addAvg_24x32_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg[NONALIGNED]Â = PFX(addAvg_32x8_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg[NONALIGNED] = PFX(addAvg_32x16_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg[NONALIGNED] = PFX(addAvg_32x24_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg[NONALIGNED] = PFX(addAvg_32x32_neon);<br>
+<br>
+Â Â Â Â p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].addAvg[NONALIGNED]Â Â = PFX(addAvg_4x8_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].addAvg[NONALIGNED]Â = PFX(addAvg_4x16_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].addAvg[NONALIGNED]Â = PFX(addAvg_4x32_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].addAvg[NONALIGNED]Â = PFX(addAvg_6x16_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].addAvg[NONALIGNED]Â Â = PFX(addAvg_8x4_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].addAvg[NONALIGNED]Â Â = PFX(addAvg_8x8_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].addAvg[NONALIGNED]Â = PFX(addAvg_8x12_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].addAvg[NONALIGNED]Â = PFX(addAvg_8x16_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].addAvg[NONALIGNED]Â = PFX(addAvg_8x32_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].addAvg[NONALIGNED]Â = PFX(addAvg_8x64_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].addAvg[NONALIGNED] = PFX(addAvg_12x32_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].addAvg[NONALIGNED]Â = PFX(addAvg_16x8_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].addAvg[NONALIGNED] = PFX(addAvg_16x16_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].addAvg[NONALIGNED] = PFX(addAvg_16x24_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].addAvg[NONALIGNED] = PFX(addAvg_16x32_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].addAvg[NONALIGNED] = PFX(addAvg_16x64_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].addAvg[NONALIGNED] = PFX(addAvg_24x64_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg[NONALIGNED] = PFX(addAvg_32x16_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg[NONALIGNED] = PFX(addAvg_32x32_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg[NONALIGNED] = PFX(addAvg_32x48_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg[NONALIGNED] = PFX(addAvg_32x64_neon);<br>
<br>
     // quant<br>
     p.quant = PFX(quant_neon);<br>
@@ -402,7 +403,7 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask)<br>
     p.scale2D_64to32 = PFX(scale2D_64to32_neon);<br>
<br>
     // scale1D_128to64<br>
-&nbsp;&nbsp;&nbsp;&nbsp;p.scale1D_128to64 = PFX(scale1D_128to64_neon);<br>
+&nbsp;&nbsp;&nbsp;&nbsp;p.scale1D_128to64[NONALIGNED] = PFX(scale1D_128to64_neon);<br>
<br>
     // copy_count<br>
     <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_4x4].copy_cnt   = PFX(copy_cnt_4_neon);<br>
@@ -411,37 +412,37 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask)<br>
     <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_32x32].copy_cnt  = PFX(copy_cnt_32_neon);<br>
<br>
     // filterPixelToShort<br>
-    p.pu[LUMA_4x4].convert_p2s  = PFX(filterPixelToShort_4x4_neon);<br>
-    p.pu[LUMA_4x8].convert_p2s  = PFX(filterPixelToShort_4x8_neon);<br>
-    p.pu[LUMA_4x16].convert_p2s = PFX(filterPixelToShort_4x16_neon);<br>
-    p.pu[LUMA_8x4].convert_p2s  = PFX(filterPixelToShort_8x4_neon);<br>
-    p.pu[LUMA_8x8].convert_p2s  = PFX(filterPixelToShort_8x8_neon);<br>
-    p.pu[LUMA_8x16].convert_p2s = PFX(filterPixelToShort_8x16_neon);<br>
-    p.pu[LUMA_8x32].convert_p2s = PFX(filterPixelToShort_8x32_neon);<br>
-Â Â Â Â p.pu[LUMA_12x16].convert_p2s = PFX(filterPixelToShort_12x16_neon);<br>
-    p.pu[LUMA_16x4].convert_p2s = PFX(filterPixelToShort_16x4_neon);<br>
-    p.pu[LUMA_16x8].convert_p2s = PFX(filterPixelToShort_16x8_neon);<br>
-Â Â Â Â p.pu[LUMA_16x12].convert_p2s = PFX(filterPixelToShort_16x12_neon);<br>
-Â Â Â Â p.pu[LUMA_16x16].convert_p2s = PFX(filterPixelToShort_16x16_neon);<br>
-Â Â Â Â p.pu[LUMA_16x32].convert_p2s = PFX(filterPixelToShort_16x32_neon);<br>
-Â Â Â Â p.pu[LUMA_16x64].convert_p2s = PFX(filterPixelToShort_16x64_neon);<br>
-Â Â Â Â p.pu[LUMA_24x32].convert_p2s = PFX(filterPixelToShort_24x32_neon);<br>
-    p.pu[LUMA_32x8].convert_p2s = PFX(filterPixelToShort_32x8_neon);<br>
-Â Â Â Â p.pu[LUMA_32x16].convert_p2s = PFX(filterPixelToShort_32x16_neon);<br>
-Â Â Â Â p.pu[LUMA_32x24].convert_p2s = PFX(filterPixelToShort_32x24_neon);<br>
-Â Â Â Â p.pu[LUMA_32x32].convert_p2s = PFX(filterPixelToShort_32x32_neon);<br>
-Â Â Â Â p.pu[LUMA_32x64].convert_p2s = PFX(filterPixelToShort_32x64_neon);<br>
-Â Â Â Â p.pu[LUMA_48x64].convert_p2s = PFX(filterPixelToShort_48x64_neon);<br>
-Â Â Â Â p.pu[LUMA_64x16].convert_p2s = PFX(filterPixelToShort_64x16_neon);<br>
-Â Â Â Â p.pu[LUMA_64x32].convert_p2s = PFX(filterPixelToShort_64x32_neon);<br>
-Â Â Â Â p.pu[LUMA_64x48].convert_p2s = PFX(filterPixelToShort_64x48_neon);<br>
-Â Â Â Â p.pu[LUMA_64x64].convert_p2s = PFX(filterPixelToShort_64x64_neon);<br>
+Â Â Â Â p.pu[LUMA_4x4].convert_p2s[NONALIGNED]Â Â = PFX(filterPixelToShort_4x4_neon);<br>
+Â Â Â Â p.pu[LUMA_4x8].convert_p2s[NONALIGNED]Â Â = PFX(filterPixelToShort_4x8_neon);<br>
+Â Â Â Â p.pu[LUMA_4x16].convert_p2s[NONALIGNED]Â = PFX(filterPixelToShort_4x16_neon);<br>
+Â Â Â Â p.pu[LUMA_8x4].convert_p2s[NONALIGNED]Â Â = PFX(filterPixelToShort_8x4_neon);<br>
+Â Â Â Â p.pu[LUMA_8x8].convert_p2s[NONALIGNED]Â Â = PFX(filterPixelToShort_8x8_neon);<br>
+Â Â Â Â p.pu[LUMA_8x16].convert_p2s[NONALIGNED]Â = PFX(filterPixelToShort_8x16_neon);<br>
+Â Â Â Â p.pu[LUMA_8x32].convert_p2s[NONALIGNED]Â = PFX(filterPixelToShort_8x32_neon);<br>
+Â Â Â Â p.pu[LUMA_12x16].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_12x16_neon);<br>
+Â Â Â Â p.pu[LUMA_16x4].convert_p2s[NONALIGNED]Â = PFX(filterPixelToShort_16x4_neon);<br>
+Â Â Â Â p.pu[LUMA_16x8].convert_p2s[NONALIGNED]Â = PFX(filterPixelToShort_16x8_neon);<br>
+Â Â Â Â p.pu[LUMA_16x12].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_16x12_neon);<br>
+Â Â Â Â p.pu[LUMA_16x16].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_16x16_neon);<br>
+Â Â Â Â p.pu[LUMA_16x32].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_16x32_neon);<br>
+Â Â Â Â p.pu[LUMA_16x64].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_16x64_neon);<br>
+Â Â Â Â p.pu[LUMA_24x32].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_24x32_neon);<br>
+Â Â Â Â p.pu[LUMA_32x8].convert_p2s[NONALIGNED]Â = PFX(filterPixelToShort_32x8_neon);<br>
+Â Â Â Â p.pu[LUMA_32x16].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_32x16_neon);<br>
+Â Â Â Â p.pu[LUMA_32x24].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_32x24_neon);<br>
+Â Â Â Â p.pu[LUMA_32x32].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_32x32_neon);<br>
+Â Â Â Â p.pu[LUMA_32x64].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_32x64_neon);<br>
+Â Â Â Â p.pu[LUMA_48x64].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_48x64_neon);<br>
+Â Â Â Â p.pu[LUMA_64x16].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_64x16_neon);<br>
+Â Â Â Â p.pu[LUMA_64x32].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_64x32_neon);<br>
+Â Â Â Â p.pu[LUMA_64x48].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_64x48_neon);<br>
+Â Â Â Â p.pu[LUMA_64x64].convert_p2s[NONALIGNED] = PFX(filterPixelToShort_64x64_neon);<br>
<br>
     // Block_fill<br>
-    <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_4x4].blockfill_s  = PFX(blockfill_s_4x4_neon);<br>
-    <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_8x8].blockfill_s  = PFX(blockfill_s_8x8_neon);<br>
-Â Â Â Â <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_16x16].blockfill_s = PFX(blockfill_s_16x16_neon);<br>
-Â Â Â Â <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_32x32].blockfill_s = PFX(blockfill_s_32x32_neon);<br>
+&nbsp;&nbsp;&nbsp;&nbsp;p.cu[BLOCK_4x4].blockfill_s[NONALIGNED]&nbsp;&nbsp;= PFX(blockfill_s_4x4_neon);<br>
+&nbsp;&nbsp;&nbsp;&nbsp;p.cu[BLOCK_8x8].blockfill_s[NONALIGNED]&nbsp;&nbsp;= PFX(blockfill_s_8x8_neon);<br>
+&nbsp;&nbsp;&nbsp;&nbsp;p.cu[BLOCK_16x16].blockfill_s[NONALIGNED] = PFX(blockfill_s_16x16_neon);<br>
+&nbsp;&nbsp;&nbsp;&nbsp;p.cu[BLOCK_32x32].blockfill_s[NONALIGNED] = PFX(blockfill_s_32x32_neon);<br>
<br>
     // Blockcopy_ss<br>
     <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_4x4].copy_ss  = PFX(blockcopy_ss_4x4_neon);<br>
@@ -495,21 +496,21 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask)<br>
     p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_sp = PFX(blockcopy_sp_32x64_neon);<br>
<br>
     // pixel_add_ps<br>
-    <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_4x4].add_ps  = PFX(pixel_add_ps_4x4_neon);<br>
-    <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_8x8].add_ps  = PFX(pixel_add_ps_8x8_neon);<br>
-Â Â Â Â <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_16x16].add_ps = PFX(pixel_add_ps_16x16_neon);<br>
-Â Â Â Â <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_32x32].add_ps = PFX(pixel_add_ps_32x32_neon);<br>
-Â Â Â Â <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_64x64].add_ps = PFX(pixel_add_ps_64x64_neon);<br>
+Â Â Â Â <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_4x4].add_ps[NONALIGNED]Â Â = PFX(pixel_add_ps_4x4_neon);<br>
+Â Â Â Â <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_8x8].add_ps[NONALIGNED]Â Â = PFX(pixel_add_ps_8x8_neon);<br>
+Â Â Â Â <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_16x16].add_ps[NONALIGNED] = PFX(pixel_add_ps_16x16_neon);<br>
+Â Â Â Â <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_32x32].add_ps[NONALIGNED] = PFX(pixel_add_ps_32x32_neon);<br>
+Â Â Â Â <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_64x64].add_ps[NONALIGNED] = PFX(pixel_add_ps_64x64_neon);<br>
<br>
     // chroma add_ps<br>
-    p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].add_ps  = PFX(pixel_add_ps_4x4_neon);<br>
-    p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].add_ps  = PFX(pixel_add_ps_8x8_neon);<br>
-Â Â Â Â p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps = PFX(pixel_add_ps_16x16_neon);<br>
-Â Â Â Â p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps = PFX(pixel_add_ps_32x32_neon);<br>
-    p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].add_ps  = PFX(pixel_add_ps_4x8_neon);<br>
-    p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].add_ps = PFX(pixel_add_ps_8x16_neon);<br>
-Â Â Â Â p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].add_ps = PFX(pixel_add_ps_16x32_neon);<br>
-Â Â Â Â p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps = PFX(pixel_add_ps_32x64_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].add_ps[NONALIGNED]Â Â = PFX(pixel_add_ps_4x4_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].add_ps[NONALIGNED]Â Â = PFX(pixel_add_ps_8x8_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps[NONALIGNED] = PFX(pixel_add_ps_16x16_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps[NONALIGNED] = PFX(pixel_add_ps_32x32_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].add_ps[NONALIGNED]Â Â = PFX(pixel_add_ps_4x8_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].add_ps[NONALIGNED]Â = PFX(pixel_add_ps_8x16_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].add_ps[NONALIGNED] = PFX(pixel_add_ps_16x32_neon);<br>
+Â Â Â Â p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps[NONALIGNED] = PFX(pixel_add_ps_32x64_neon);<br>
<br>
     // cpy2Dto1D_shr<br>
     <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_4x4].cpy2Dto1D_shr  = PFX(cpy2Dto1D_shr_4x4_neon);<br>
@@ -518,10 +519,10 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask)<br>
     <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_32x32].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32x32_neon);<br>
<br>
     // ssd_s<br>
-    <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_4x4].ssd_s  = PFX(pixel_ssd_s_4x4_neon);<br>
-    <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_8x8].ssd_s  = PFX(pixel_ssd_s_8x8_neon);<br>
-Â Â Â Â <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16x16_neon);<br>
-Â Â Â Â <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32x32_neon);<br>
+Â Â Â Â <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_4x4].ssd_s[NONALIGNED]Â Â = PFX(pixel_ssd_s_4x4_neon);<br>
+Â Â Â Â <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_8x8].ssd_s[NONALIGNED]Â Â = PFX(pixel_ssd_s_8x8_neon);<br>
+Â Â Â Â <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_16x16].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_16x16_neon);<br>
+Â Â Â Â <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_32x32].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_32x32_neon);<br>
<br>
     // sse_ss<br>
     <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_4x4].sse_ss  = PFX(pixel_sse_ss_4x4_neon);<br>
@@ -548,10 +549,10 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask)<br>
     p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = PFX(pixel_sub_ps_32x64_neon);<br>
<br>
     // calc_Residual<br>
-    <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_4x4].calcresidual  = PFX(getResidual4_neon);<br>
-    <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_8x8].calcresidual  = PFX(getResidual8_neon);<br>
-Â Â Â Â <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_16x16].calcresidual = PFX(getResidual16_neon);<br>
-Â Â Â Â <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_32x32].calcresidual = PFX(getResidual32_neon);<br>
+Â Â Â Â <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_4x4].calcresidual[NONALIGNED]Â Â = PFX(getResidual4_neon);<br>
+Â Â Â Â <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_8x8].calcresidual[NONALIGNED]Â Â = PFX(getResidual8_neon);<br>
+Â Â Â Â <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_16x16].calcresidual[NONALIGNED] = PFX(getResidual16_neon);<br>
+Â Â Â Â <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_32x32].calcresidual[NONALIGNED] = PFX(getResidual32_neon);<br>
<br>
     // sse_pp<br>
     <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_4x4].sse_pp  = PFX(pixel_sse_pp_4x4_neon);<br>
@@ -722,31 +723,31 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask)<br>
     p.pu[LUMA_64x64].sad_x4 = PFX(sad_x4_64x64_neon);<br>
<br>
     // pixel_avg_pp<br>
-    p.pu[LUMA_4x4].pixelavg_pp  = PFX(pixel_avg_pp_4x4_neon);<br>
-    p.pu[LUMA_4x8].pixelavg_pp  = PFX(pixel_avg_pp_4x8_neon);<br>
-    p.pu[LUMA_4x16].pixelavg_pp = PFX(pixel_avg_pp_4x16_neon);<br>
-    p.pu[LUMA_8x4].pixelavg_pp  = PFX(pixel_avg_pp_8x4_neon);<br>
-    p.pu[LUMA_8x8].pixelavg_pp  = PFX(pixel_avg_pp_8x8_neon);<br>
-    p.pu[LUMA_8x16].pixelavg_pp = PFX(pixel_avg_pp_8x16_neon);<br>
-    p.pu[LUMA_8x32].pixelavg_pp = PFX(pixel_avg_pp_8x32_neon);<br>
-Â Â Â Â p.pu[LUMA_12x16].pixelavg_pp = PFX(pixel_avg_pp_12x16_neon);<br>
-    p.pu[LUMA_16x4].pixelavg_pp = PFX(pixel_avg_pp_16x4_neon);<br>
-    p.pu[LUMA_16x8].pixelavg_pp = PFX(pixel_avg_pp_16x8_neon);<br>
-Â Â Â Â p.pu[LUMA_16x12].pixelavg_pp = PFX(pixel_avg_pp_16x12_neon);<br>
-Â Â Â Â p.pu[LUMA_16x16].pixelavg_pp = PFX(pixel_avg_pp_16x16_neon);<br>
-Â Â Â Â p.pu[LUMA_16x32].pixelavg_pp = PFX(pixel_avg_pp_16x32_neon);<br>
-Â Â Â Â p.pu[LUMA_16x64].pixelavg_pp = PFX(pixel_avg_pp_16x64_neon);<br>
-Â Â Â Â p.pu[LUMA_24x32].pixelavg_pp = PFX(pixel_avg_pp_24x32_neon);<br>
-    p.pu[LUMA_32x8].pixelavg_pp = PFX(pixel_avg_pp_32x8_neon);<br>
-Â Â Â Â p.pu[LUMA_32x16].pixelavg_pp = PFX(pixel_avg_pp_32x16_neon);<br>
-Â Â Â Â p.pu[LUMA_32x24].pixelavg_pp = PFX(pixel_avg_pp_32x24_neon);<br>
-Â Â Â Â p.pu[LUMA_32x32].pixelavg_pp = PFX(pixel_avg_pp_32x32_neon);<br>
-Â Â Â Â p.pu[LUMA_32x64].pixelavg_pp = PFX(pixel_avg_pp_32x64_neon);<br>
-Â Â Â Â p.pu[LUMA_48x64].pixelavg_pp = PFX(pixel_avg_pp_48x64_neon);<br>
-Â Â Â Â p.pu[LUMA_64x16].pixelavg_pp = PFX(pixel_avg_pp_64x16_neon);<br>
-Â Â Â Â p.pu[LUMA_64x32].pixelavg_pp = PFX(pixel_avg_pp_64x32_neon);<br>
-Â Â Â Â p.pu[LUMA_64x48].pixelavg_pp = PFX(pixel_avg_pp_64x48_neon);<br>
-Â Â Â Â p.pu[LUMA_64x64].pixelavg_pp = PFX(pixel_avg_pp_64x64_neon);<br>
+Â Â Â Â p.pu[LUMA_4x4].pixelavg_pp[NONALIGNED]Â Â = PFX(pixel_avg_pp_4x4_neon);<br>
+Â Â Â Â p.pu[LUMA_4x8].pixelavg_pp[NONALIGNED]Â Â = PFX(pixel_avg_pp_4x8_neon);<br>
+Â Â Â Â p.pu[LUMA_4x16].pixelavg_pp[NONALIGNED]Â = PFX(pixel_avg_pp_4x16_neon);<br>
+Â Â Â Â p.pu[LUMA_8x4].pixelavg_pp[NONALIGNED]Â Â = PFX(pixel_avg_pp_8x4_neon);<br>
+Â Â Â Â p.pu[LUMA_8x8].pixelavg_pp[NONALIGNED]Â Â = PFX(pixel_avg_pp_8x8_neon);<br>
+Â Â Â Â p.pu[LUMA_8x16].pixelavg_pp[NONALIGNED]Â = PFX(pixel_avg_pp_8x16_neon);<br>
+Â Â Â Â p.pu[LUMA_8x32].pixelavg_pp[NONALIGNED]Â = PFX(pixel_avg_pp_8x32_neon);<br>
+Â Â Â Â p.pu[LUMA_12x16].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_12x16_neon);<br>
+Â Â Â Â p.pu[LUMA_16x4].pixelavg_pp[NONALIGNED]Â = PFX(pixel_avg_pp_16x4_neon);<br>
+Â Â Â Â p.pu[LUMA_16x8].pixelavg_pp[NONALIGNED]Â = PFX(pixel_avg_pp_16x8_neon);<br>
+Â Â Â Â p.pu[LUMA_16x12].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_16x12_neon);<br>
+Â Â Â Â p.pu[LUMA_16x16].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_16x16_neon);<br>
+Â Â Â Â p.pu[LUMA_16x32].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_16x32_neon);<br>
+Â Â Â Â p.pu[LUMA_16x64].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_16x64_neon);<br>
+Â Â Â Â p.pu[LUMA_24x32].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_24x32_neon);<br>
+Â Â Â Â p.pu[LUMA_32x8].pixelavg_pp[NONALIGNED]Â = PFX(pixel_avg_pp_32x8_neon);<br>
+Â Â Â Â p.pu[LUMA_32x16].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_32x16_neon);<br>
+Â Â Â Â p.pu[LUMA_32x24].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_32x24_neon);<br>
+Â Â Â Â p.pu[LUMA_32x32].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_32x32_neon);<br>
+Â Â Â Â p.pu[LUMA_32x64].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_32x64_neon);<br>
+Â Â Â Â p.pu[LUMA_48x64].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_48x64_neon);<br>
+Â Â Â Â p.pu[LUMA_64x16].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_64x16_neon);<br>
+Â Â Â Â p.pu[LUMA_64x32].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_64x32_neon);<br>
+Â Â Â Â p.pu[LUMA_64x48].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_64x48_neon);<br>
+Â Â Â Â p.pu[LUMA_64x64].pixelavg_pp[NONALIGNED] = PFX(pixel_avg_pp_64x64_neon);<br>
<br>
     // planecopy<br>
     p.planecopy_cp = PFX(pixel_planecopy_cp_neon);<br>
diff --git a/source/common/cpu.cpp b/source/common/cpu.cpp<br>
index 26c82ea50..2eacfe4a9 100644<br>
--- a/source/common/cpu.cpp<br>
+++ b/source/common/cpu.cpp<br>
@@ -5,6 +5,8 @@<br>
 *     Laurent Aimar <<a href="mailto:fenrir@via.ecp.fr" target="_blank">fenrir@via.ecp.fr</a>><br>
 *     Fiona Glaser <<a href="mailto:fiona@x264.com" target="_blank">fiona@x264.com</a>><br>
 *     Steve Borho <<a href="mailto:steve@borho.org" target="_blank">steve@borho.org</a>><br>
+ *&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Hongbin Liu &lt;<a href="mailto:liuhongbin1@huawei.com" target="_blank">liuhongbin1@huawei.com</a>&gt;<br>
+ *&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;Yimeng Su &lt;<a href="mailto:yimeng.su@huawei.com" target="_blank">yimeng.su@huawei.com</a>&gt;<br>
 *<br>
 * This program is free software; you can redistribute it and/or modify<br>
 * it under the terms of the GNU General Public License as published by<br>
@@ -367,6 +369,8 @@ uint32_t cpu_detect(bool benableavx512)<br>
   flags |= PFX(cpu_fast_neon_mrc_test)() ? X265_CPU_FAST_NEON_MRC : 0;<br>
 #endif<br>
   // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)<br>
+#elif X265_ARCH_ARM64<br>
+&nbsp;&nbsp;flags |= X265_CPU_NEON;<br>
 #endif // if HAVE_ARMV6<br>
   return flags;<br>
 }<br>
diff --git a/source/common/pixel.cpp b/source/common/pixel.cpp<br>
index 99b84449c..e4f890cd5 100644<br>
--- a/source/common/pixel.cpp<br>
+++ b/source/common/pixel.cpp<br>
@@ -5,6 +5,7 @@<br>
 *     Mandar Gurav <<a href="mailto:mandar@multicorewareinc.com" target="_blank">mandar@multicorewareinc.com</a>><br>
 *     Mahesh Pittala <<a href="mailto:mahesh@multicorewareinc.com" target="_blank">mahesh@multicorewareinc.com</a>><br>
 *     Min Chen <<a href="mailto:min.chen@multicorewareinc.com" target="_blank">min.chen@multicorewareinc.com</a>><br>
+ *Â Â Â Â Â Hongbin Liu<<a href="mailto:liuhongbin1@huawei.com" target="_blank">liuhongbin1@huawei.com</a>><br>
 *<br>
 * This program is free software; you can redistribute it and/or modify<br>
 * it under the terms of the GNU General Public License as published by<br>
@@ -265,6 +266,10 @@ int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t s<br>
 {<br>
   int satd = 0;<br>
<br>
+#if ENABLE_ASSEMBLY && X265_ARCH_ARM64<br>
+&nbsp;&nbsp;pixelcmp_t satd_4x4 = x265_pixel_satd_4x4_neon;<br>
+#endif<u></u><u></u></span></p>
</div>
</div>
</blockquote>
<div>
<p class="MsoNormal"><span lang="EN-US">Is there any specific reason why the above code is added? Is this a kind of temporary fix for the output mismatch between C and asm code? <u></u><u></u></span></p>
<p class="MsoNormal"><span lang="EN-US">No, the C and asm outputs match. Currently we have only completed some of the satd primitives. This is a workaround that lets all satd primitives benefit from the asm code. It may be bad code style.</span></p></div></div></div></div></div></blockquote><div>If I understand correctly, you are trying to use a combination of C and asm code for all the other kernel sizes whose asm implementation you have not completed yet? </div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;padding-left:1ex;border-left-color:rgb(204,204,204);border-left-width:1px;border-left-style:solid"><div lang="ZH-CN"><div class="gmail-m_3550197732202863311WordSection1"><div><div><div><p class="MsoNormal"><span lang="EN-US" style="color:rgb(31,73,125);font-family:'Calibri',sans-serif;font-size:10.5pt"><u></u><u></u></span></p>
<p class="MsoNormal"><span lang="EN-US" style="color:rgb(31,73,125);font-family:'Calibri',sans-serif;font-size:10.5pt"><u></u>&nbsp;<u></u></span></p>
</div>
<blockquote style="border-width:medium medium medium 1pt;border-style:none none none solid;border-color:currentColor currentColor currentColor rgb(204,204,204);margin:5pt 0cm 5pt 4.8pt;padding:0cm 0cm 0cm 6pt">
<div>
<div>
<p class="MsoNormal"><span lang="EN-US">+<br>
   for (int row = 0; row &lt; h; row += 4)<br>
     for (int col = 0; col &lt; w; col += 4)<br>
       satd += satd_4x4(pix1 + row * stride_pix1 + col, stride_pix1,<br>
@@ -279,6 +284,10 @@ int satd8(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t s<br>
 {<br>
   int satd = 0;<br>
<br>
+#if ENABLE_ASSEMBLY && X265_ARCH_ARM64<br>
+&nbsp;&nbsp;pixelcmp_t satd_8x4 = x265_pixel_satd_8x4_neon;<br>
+#endif<br>
+<u></u><u></u></span></p>
</div>
</div>
</blockquote>
<div>
<p class="MsoNormal"><span lang="EN-US">Same comment as above. <u></u><u></u></span></p>
<p class="MsoNormal"><span lang="EN-US">Same response.<u></u><u></u></span></p>
</div>
<blockquote style="border-width:medium medium medium 1pt;border-style:none none none solid;border-color:currentColor currentColor currentColor rgb(204,204,204);margin:5pt 0cm 5pt 4.8pt;padding:0cm 0cm 0cm 6pt">
<div>
<div>
<p class="MsoNormal" style="margin-bottom:12pt"><span lang="EN-US">&nbsp;&nbsp;&nbsp;for (int row = 0; row &lt; h; row += 4)<br>
     for (int col = 0; col &lt; w; col += 8)<br>
       satd += satd_8x4(pix1 + row * stride_pix1 + col, stride_pix1,<br>
diff --git a/source/common/primitives.h b/source/common/primitives.h<br>
index 5c64952fb..0b52f84de 100644<br>
--- a/source/common/primitives.h<br>
+++ b/source/common/primitives.h<br>
@@ -8,6 +8,8 @@<br>
 *     Rajesh Paulraj <<a href="mailto:rajesh@multicorewareinc.com" target="_blank">rajesh@multicorewareinc.com</a>><br>
 *     Praveen Kumar Tiwari <<a href="mailto:praveen@multicorewareinc.com" target="_blank">praveen@multicorewareinc.com</a>><br>
 *     Min Chen <<a href="mailto:chenm003@163.com" target="_blank">chenm003@163.com</a>><br>
+ *Â Â Â Â Â Hongbin Liu<<a href="mailto:liuhongbin1@huawei.com" target="_blank">liuhongbin1@huawei.com</a>><br>
+ *Â Â Â Â Â Yimeng Su <<a href="mailto:yimeng.su@huawei.com" target="_blank">yimeng.su@huawei.com</a>><br>
 *<br>
 * This program is free software; you can redistribute it and/or modify<br>
 * it under the terms of the GNU General Public License as published by<br>
@@ -467,6 +469,9 @@ void setupCPrimitives(EncoderPrimitives &p);<br>
 void setupInstrinsicPrimitives(EncoderPrimitives &p, int cpuMask);<br>
 void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask);<br>
 void setupAliasPrimitives(EncoderPrimitives &p);<br>
+#if X265_ARCH_ARM64<br>
+void setupAliasCPrimitives(EncoderPrimitives &cp, EncoderPrimitives &asmp, int cpuMask);<br>
+#endif<br>
 #if HAVE_ALTIVEC<br>
 void setupPixelPrimitives_altivec(EncoderPrimitives &p);<br>
 void setupDCTPrimitives_altivec(EncoderPrimitives &p);<br>
@@ -481,4 +486,10 @@ extern const char* PFX(version_str);<br>
 extern const char* PFX(build_info_str);<br>
 #endif<br>
<br>
+#if ENABLE_ASSEMBLY && X265_ARCH_ARM64<br>
+extern "C" {<br>
+#include "aarch64/pixel-util.h"<br>
+}<br>
+#endif<br>
+<br>
 #endif // ifndef X265_PRIMITIVES_H<br>
diff --git a/source/test/CMakeLists.txt b/source/test/CMakeLists.txt<br>
index 260195f53..9abaf31ff 100644<br>
--- a/source/test/CMakeLists.txt<br>
+++ b/source/test/CMakeLists.txt<br>
@@ -23,13 +23,15 @@ endif(X86)<br>
<br>
 # add ARM assembly files<br>
 if(ARM OR CROSS_COMPILE_ARM)<br>
-Â Â enable_language(ASM)<br>
-Â Â set(NASM_SRC checkasm-arm.S)<br>
-Â Â add_custom_command(<br>
-Â Â Â Â OUTPUT checkasm-arm.obj<br>
-Â Â Â Â COMMAND ${CMAKE_CXX_COMPILER}<br>
-Â Â Â Â ARGS ${NASM_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/checkasm-arm.S -o checkasm-arm.obj<br>
-Â Â Â Â DEPENDS checkasm-arm.S)<br>
+Â Â if(NOT ARM64)<br>
+Â Â Â Â enable_language(ASM)<br>
+Â Â Â Â set(NASM_SRC checkasm-arm.S)<br>
+Â Â Â Â add_custom_command(<br>
+Â Â Â Â Â Â OUTPUT checkasm-arm.obj<br>
+Â Â Â Â Â Â COMMAND ${CMAKE_CXX_COMPILER}<br>
+Â Â Â Â Â Â ARGS ${NASM_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/checkasm-arm.S -o checkasm-arm.obj<br>
+Â Â Â Â Â Â DEPENDS checkasm-arm.S)<br>
+Â Â endif()<br>
 endif(ARM OR CROSS_COMPILE_ARM)<br>
<br>
 # add PowerPC assembly files<br>
diff --git a/source/test/testbench.cpp b/source/test/testbench.cpp<br>
index ac14f9710..8db8c0c25 100644<br>
--- a/source/test/testbench.cpp<br>
+++ b/source/test/testbench.cpp<br>
@@ -5,6 +5,7 @@<br>
 *     Mandar Gurav <<a href="mailto:mandar@multicorewareinc.com" target="_blank">mandar@multicorewareinc.com</a>><br>
 *     Mahesh Pittala <<a href="mailto:mahesh@multicorewareinc.com" target="_blank">mahesh@multicorewareinc.com</a>><br>
 *     Min Chen <<a href="mailto:chenm003@163.com" target="_blank">chenm003@163.com</a>><br>
+ *Â Â Â Â Â Yimeng Su <<a href="mailto:yimeng.su@huawei.com" target="_blank">yimeng.su@huawei.com</a>><br>
 *<br>
 * This program is free software; you can redistribute it and/or modify<br>
 * it under the terms of the GNU General Public License as published by<br>
@@ -208,6 +209,14 @@ int main(int argc, char *argv[])<br>
     EncoderPrimitives asmprim;<br>
     memset(&asmprim, 0, sizeof(asmprim));<br>
     setupAssemblyPrimitives(asmprim, test_arch[i].flag);<br>
+<br>
+#if X265_ARCH_ARM64<br>
+Â Â Â Â /* Temporary workaround because luma_vsp assembly primitive has not been completed<br>
+Â Â Â Â Â * but interp_8tap_hv_pp_cpu uses mixed C primitive and assembly primitive.<br>
+Â Â Â Â Â * Otherwise, segment fault occurs. */<br>
+Â Â Â Â setupAliasCPrimitives(cprim, asmprim, test_arch[i].flag);<br>
+#endif<br>
+<br>
     setupAliasPrimitives(asmprim);<br>
     memcpy(&primitives, &asmprim, sizeof(EncoderPrimitives));<br>
     for (size_t h = 0; h < sizeof(harness) / sizeof(TestHarness*); h++)<br>
@@ -232,6 +241,13 @@ int main(int argc, char *argv[])<br>
 #endif<br>
   setupAssemblyPrimitives(optprim, cpuid);<br>
<br>
+#if X265_ARCH_ARM64<br>
+Â Â /* Temporary workaround because luma_vsp assembly primitive has not been completed<br>
+Â Â Â * but interp_8tap_hv_pp_cpu uses mixed C primitive and assembly primitive.<br>
+Â Â Â * Otherwise, segment fault occurs. */<br>
+Â Â setupAliasCPrimitives(cprim, optprim, cpuid);<br>
+#endif<br>
+<br>
   /* Note that we do not setup aliases for performance tests, that would be<br>
   * redundant. The testbench only verifies they are correctly aliased */<br>
<br>
diff --git a/source/test/testharness.h b/source/test/testharness.h<br>
index 771551583..6e680953f 100644<br>
--- a/source/test/testharness.h<br>
+++ b/source/test/testharness.h<br>
@@ -3,6 +3,7 @@<br>
 *<br>
 * Authors: Steve Borho <<a href="mailto:steve@borho.org" target="_blank">steve@borho.org</a>><br>
 *     Min Chen <<a href="mailto:chenm003@163.com" target="_blank">chenm003@163.com</a>><br>
+ *Â Â Â Â Â Yimeng Su <<a href="mailto:yimeng.su@huawei.com" target="_blank">yimeng.su@huawei.com</a>><br>
 *<br>
 * This program is free software; you can redistribute it and/or modify<br>
 * it under the terms of the GNU General Public License as published by<br>
@@ -81,11 +82,15 @@ static inline uint32_t __rdtsc(void)<br>
 #if X265_ARCH_X86<br>
   asm volatile("rdtsc" : "=a" (a) ::"edx");<br>
 #elif X265_ARCH_ARM<br>
+#if X265_ARCH_ARM64<br>
+Â Â asm volatile("mrs %0, cntvct_el0" : "=r"(a));<br>
+#else<br>
   // TOD-DO: verify following inline asm to get cpu Timestamp Counter for ARM arch<br>
   // asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(a));<br>
<br>
   // TO-DO: replace clock() function with appropriate ARM cpu instructions<br>
   a = clock();<br>
+#endif<br>
 #endif<br>
   return a;<br>
 }<br>
-- <br>
2.21.0.windows.1<u></u><u></u></span></p>
</div>
</div>
<p class="MsoNormal"><span lang="EN-US">_______________________________________________<br>
x265-devel mailing list<br>
<a href="mailto:x265-devel@videolan.org" target="_blank">x265-devel@videolan.org</a><br>
<a href="https://mailman.videolan.org/listinfo/x265-devel" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><u></u><u></u></span></p>
</blockquote>
</div>
</div>
</div>
</div>
_______________________________________________<br>
x265-devel mailing list<br>
<a href="mailto:x265-devel@videolan.org" target="_blank">x265-devel@videolan.org</a><br>
<a href="https://mailman.videolan.org/listinfo/x265-devel" target="_blank" rel="noreferrer">https://mailman.videolan.org/listinfo/x265-devel</a><br>
</blockquote></div></div>