[x265] [PATCH v2 7/8] AArch64: Add Armv8.4 Neon DotProd implementations of SAD
Hari Limaye
hari.limaye at arm.com
Tue Jul 30 15:46:16 UTC 2024
Add implementations of SAD primitives using Neon DotProd
instructions, which are mandatory from Armv8.4.
The UABD, UDOT instruction sequences use wider (32-bit)
accumulators, which simplify the reductions.
---
source/CMakeLists.txt | 12 ++
source/common/CMakeLists.txt | 2 +
source/common/aarch64/asm-primitives.cpp | 37 +++++
source/common/aarch64/fun-decls.h | 19 +++
source/common/aarch64/sad-neon-dotprod.S | 170 +++++++++++++++++++++++
5 files changed, 240 insertions(+)
create mode 100644 source/common/aarch64/sad-neon-dotprod.S
diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
index c1179d276..8288fd234 100755
--- a/source/CMakeLists.txt
+++ b/source/CMakeLists.txt
@@ -669,6 +669,18 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
DEPENDS ${ASM_SRC})
endforeach()
endif()
+ if(CPU_HAS_NEON_DOTPROD)
+ foreach(ASM ${ARM_ASMS_NEON_DOTPROD})
+ set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM})
+ list(APPEND ASM_SRCS ${ASM_SRC})
+ list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
+ add_custom_command(
+ OUTPUT ${ASM}.${SUFFIX}
+ COMMAND ${CMAKE_CXX_COMPILER}
+ ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
+ DEPENDS ${ASM_SRC})
+ endforeach()
+ endif()
elseif(X86)
# compile X86 arch asm files here
foreach(ASM ${MSVC_ASMS})
diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
index e00e9c1ca..8982dcc7f 100644
--- a/source/common/CMakeLists.txt
+++ b/source/common/CMakeLists.txt
@@ -111,11 +111,13 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
set(A_SRCS asm.S mc-a.S mc-a-common.S sad-a.S pixel-util.S pixel-util-common.S p2s.S p2s-common.S ipfilter.S ipfilter-common.S blockcopy8.S blockcopy8-common.S ssd-a.S ssd-a-common.S)
set(A_SRCS_SVE asm-sve.S blockcopy8-sve.S p2s-sve.S pixel-util-sve.S ssd-a-sve.S)
set(A_SRCS_SVE2 mc-a-sve2.S pixel-util-sve2.S ipfilter-sve2.S ssd-a-sve2.S)
+ set(A_SRCS_NEON_DOTPROD sad-neon-dotprod.S)
set(VEC_PRIMITIVES)
set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
set(ARM_ASMS_SVE "${A_SRCS_SVE}" CACHE INTERNAL "ARM Assembly Sources that use SVE instruction set")
set(ARM_ASMS_SVE2 "${A_SRCS_SVE2}" CACHE INTERNAL "ARM Assembly Sources that use SVE2 instruction set")
+ set(ARM_ASMS_NEON_DOTPROD "${A_SRCS_NEON_DOTPROD}" CACHE INTERNAL "Arm Assembly Sources that use the Neon DotProd extension")
foreach(SRC ${C_SRCS_NEON})
set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
endforeach()
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index b1e6b817b..705881abf 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -154,6 +154,23 @@ extern "C" {
p.pu[LUMA_64x48].prim = fncdef PFX(filterPixelToShort ## _64x48_ ## sve); \
p.pu[LUMA_64x16].prim = fncdef PFX(filterPixelToShort ## _64x16_ ## sve); \
p.pu[LUMA_48x64].prim = fncdef PFX(filterPixelToShort ## _48x64_ ## sve)
+#define LUMA_PU_TYPED_MULTIPLE_16(prim, fncdef, fname, cpu) \
+ p.pu[LUMA_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \
+ p.pu[LUMA_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu); \
+ p.pu[LUMA_64x64].prim = fncdef PFX(fname ## _64x64_ ## cpu); \
+ p.pu[LUMA_16x8].prim = fncdef PFX(fname ## _16x8_ ## cpu); \
+ p.pu[LUMA_16x32].prim = fncdef PFX(fname ## _16x32_ ## cpu); \
+ p.pu[LUMA_32x16].prim = fncdef PFX(fname ## _32x16_ ## cpu); \
+ p.pu[LUMA_64x32].prim = fncdef PFX(fname ## _64x32_ ## cpu); \
+ p.pu[LUMA_32x64].prim = fncdef PFX(fname ## _32x64_ ## cpu); \
+ p.pu[LUMA_16x12].prim = fncdef PFX(fname ## _16x12_ ## cpu); \
+ p.pu[LUMA_16x4].prim = fncdef PFX(fname ## _16x4_ ## cpu); \
+ p.pu[LUMA_32x24].prim = fncdef PFX(fname ## _32x24_ ## cpu); \
+ p.pu[LUMA_32x8].prim = fncdef PFX(fname ## _32x8_ ## cpu); \
+ p.pu[LUMA_64x48].prim = fncdef PFX(fname ## _64x48_ ## cpu); \
+ p.pu[LUMA_48x64].prim = fncdef PFX(fname ## _48x64_ ## cpu); \
+ p.pu[LUMA_64x16].prim = fncdef PFX(fname ## _64x16_ ## cpu); \
+ p.pu[LUMA_16x64].prim = fncdef PFX(fname ## _16x64_ ## cpu)
#define ALL_LUMA_PU(prim, fname, cpu) ALL_LUMA_PU_TYPED(prim, , fname, cpu)
#define LUMA_PU_MULTIPLE_ARCHS_1(prim, fname, cpu) LUMA_PU_TYPED_MULTIPLE_ARCHS_1(prim, , fname, cpu)
#define LUMA_PU_MULTIPLE_ARCHS_2(prim, fname, cpu) LUMA_PU_TYPED_MULTIPLE_ARCHS_2(prim, , fname, cpu)
@@ -161,6 +178,7 @@ extern "C" {
#define LUMA_PU_MULTIPLE_ARCHS_3(prim, fname, cpu) LUMA_PU_TYPED_MULTIPLE_ARCHS_3(prim, , fname, cpu)
#define LUMA_PU_CAN_USE_SVE2(prim, fname) LUMA_PU_TYPED_CAN_USE_SVE2(prim, , fname)
#define LUMA_PU_SVE_FILTER_PIXEL_TO_SHORT(prim) LUMA_PU_TYPED_SVE_FILTER_PIXEL_TO_SHORT(prim, )
+#define LUMA_PU_MULTIPLE_16(prim, fname, cpu) LUMA_PU_TYPED_MULTIPLE_16(prim, , fname, cpu)
#define ALL_LUMA_PU_T(prim, fname) \
@@ -1160,12 +1178,31 @@ void setupSve2Primitives(EncoderPrimitives &)
#endif // !HIGH_BIT_DEPTH
#endif // defined(HAVE_SVE2)
+#ifdef HAVE_NEON_DOTPROD
+#if !HIGH_BIT_DEPTH
+void setupNeonDotProdPrimitives(EncoderPrimitives &p)
+{
+ LUMA_PU_MULTIPLE_16(sad, pixel_sad, neon_dotprod);
+}
+#else // !HIGH_BIT_DEPTH
+void setupNeonDotProdPrimitives(EncoderPrimitives &)
+{
+}
+#endif // !HIGH_BIT_DEPTH
+#endif // HAVE_NEON_DOTPROD
+
void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask)
{
if (cpuMask & X265_CPU_NEON)
{
setupNeonPrimitives(p);
}
+#ifdef HAVE_NEON_DOTPROD
+ if (cpuMask & X265_CPU_NEON_DOTPROD)
+ {
+ setupNeonDotProdPrimitives(p);
+ }
+#endif
#ifdef HAVE_SVE
if (cpuMask & X265_CPU_SVE)
{
diff --git a/source/common/aarch64/fun-decls.h b/source/common/aarch64/fun-decls.h
index ec17deda2..ad357f245 100644
--- a/source/common/aarch64/fun-decls.h
+++ b/source/common/aarch64/fun-decls.h
@@ -69,6 +69,24 @@
ret PFX(name ## _64x16_ ## cpu)(__VA_ARGS__); \
ret PFX(name ## _16x64_ ## cpu)(__VA_ARGS__)
+#define FUNCDEF_PU_MULT_16(ret, name, cpu, ...) \
+ ret PFX(name ## _16x16_ ## cpu)(__VA_ARGS__); \
+ ret PFX(name ## _32x32_ ## cpu)(__VA_ARGS__); \
+ ret PFX(name ## _64x64_ ## cpu)(__VA_ARGS__); \
+ ret PFX(name ## _16x8_ ## cpu)(__VA_ARGS__); \
+ ret PFX(name ## _16x32_ ## cpu)(__VA_ARGS__); \
+ ret PFX(name ## _32x16_ ## cpu)(__VA_ARGS__); \
+ ret PFX(name ## _64x32_ ## cpu)(__VA_ARGS__); \
+ ret PFX(name ## _32x64_ ## cpu)(__VA_ARGS__); \
+ ret PFX(name ## _16x12_ ## cpu)(__VA_ARGS__); \
+ ret PFX(name ## _16x4_ ## cpu)(__VA_ARGS__); \
+ ret PFX(name ## _32x24_ ## cpu)(__VA_ARGS__); \
+ ret PFX(name ## _32x8_ ## cpu)(__VA_ARGS__); \
+ ret PFX(name ## _64x48_ ## cpu)(__VA_ARGS__); \
+ ret PFX(name ## _48x64_ ## cpu)(__VA_ARGS__); \
+ ret PFX(name ## _64x16_ ## cpu)(__VA_ARGS__); \
+ ret PFX(name ## _16x64_ ## cpu)(__VA_ARGS__)
+
#define FUNCDEF_CHROMA_PU(ret, name, cpu, ...) \
FUNCDEF_PU(ret, name, cpu, __VA_ARGS__); \
ret PFX(name ## _4x2_ ## cpu)(__VA_ARGS__); \
@@ -154,6 +172,7 @@ DECLS(neon);
DECLS(sve);
DECLS(sve2);
+FUNCDEF_PU_MULT_16(int, pixel_sad, neon_dotprod, const pixel*, intptr_t, const pixel*, intptr_t);
void PFX(pixel_planecopy_cp_neon(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift));
diff --git a/source/common/aarch64/sad-neon-dotprod.S b/source/common/aarch64/sad-neon-dotprod.S
new file mode 100644
index 000000000..c51ddb527
--- /dev/null
+++ b/source/common/aarch64/sad-neon-dotprod.S
@@ -0,0 +1,170 @@
+/*****************************************************************************
+ * Copyright (C) 2024 MulticoreWare, Inc
+ *
+ * Authors: Hari Limaye <hari.limaye at arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.arch armv8.2-a+dotprod
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.align 4
+
+.text
+
+// Fully unrolled with single accumulator for smaller block heights.
+.macro SAD_NEON_DOTPROD_16_S h
+function PFX(pixel_sad_16x\h\()_neon_dotprod)
+ movi v0.16b, #0
+ movi v1.16b, #1
+.rept \h - 2
+ ldr q2, [x0]
+ ldr q3, [x2]
+ add x0, x0, x1
+ add x2, x2, x3
+ uabd v4.16b, v2.16b, v3.16b
+ udot v0.4s, v4.16b, v1.16b
+.endr
+ ldr q2, [x0]
+ ldr q3, [x2]
+ uabd v4.16b, v2.16b, v3.16b
+ udot v0.4s, v4.16b, v1.16b
+ ldr q2, [x0, x1]
+ ldr q3, [x2, x3]
+ uabd v4.16b, v2.16b, v3.16b
+ udot v0.4s, v4.16b, v1.16b
+
+ addv s0, v0.4s
+ fmov w0, s0
+ ret
+endfunc
+.endm
+
+.macro SAD_NEON_DOTPROD_START
+ // v31: 1 across all lanes for use in UDOT instructions.
+ movi v31.16b, #1
+ movi v16.16b, #0
+ movi v17.16b, #0
+.endm
+
+.macro SAD_NEON_DOTPROD_END
+ add v16.4s, v16.4s, v17.4s
+ addv s0, v16.4s
+ fmov w0, s0
+ ret
+.endm
+
+// Fully unrolled.
+.macro SAD_NEON_DOTPROD_16 h
+function PFX(pixel_sad_16x\h\()_neon_dotprod)
+ SAD_NEON_DOTPROD_START
+.rept \h / 2
+ ld1 {v0.16b}, [x0], x1
+ ld1 {v1.16b}, [x0], x1
+ ld1 {v2.16b}, [x2], x3
+ ld1 {v3.16b}, [x2], x3
+ uabd v20.16b, v0.16b, v2.16b
+ udot v16.4s, v20.16b, v31.16b
+ uabd v21.16b, v1.16b, v3.16b
+ udot v17.4s, v21.16b, v31.16b
+.endr
+ SAD_NEON_DOTPROD_END
+endfunc
+.endm
+
+// Process four rows of width 32.
+.macro SAD_NEON_DOTPROD_32
+.rept 4
+ ld1 {v0.16b-v1.16b}, [x0], x1
+ ld1 {v2.16b-v3.16b}, [x2], x3
+ uabd v20.16b, v0.16b, v2.16b
+ udot v16.4s, v20.16b, v31.16b
+ uabd v21.16b, v1.16b, v3.16b
+ udot v17.4s, v21.16b, v31.16b
+.endr
+.endm
+
+// Process four rows of width 48.
+.macro SAD_NEON_DOTPROD_48
+.rept 4
+ ld1 {v0.16b-v2.16b}, [x0], x1
+ ld1 {v4.16b-v6.16b}, [x2], x3
+ uabd v20.16b, v0.16b, v4.16b
+ udot v16.4s, v20.16b, v31.16b
+ uabd v21.16b, v1.16b, v5.16b
+ udot v17.4s, v21.16b, v31.16b
+ uabd v20.16b, v2.16b, v6.16b
+ udot v16.4s, v20.16b, v31.16b
+.endr
+.endm
+
+// Process four rows of width 64.
+.macro SAD_NEON_DOTPROD_64
+.rept 4
+ ld1 {v0.16b-v3.16b}, [x0], x1
+ ld1 {v4.16b-v7.16b}, [x2], x3
+ uabd v20.16b, v0.16b, v4.16b
+ udot v16.4s, v20.16b, v31.16b
+ uabd v21.16b, v1.16b, v5.16b
+ udot v17.4s, v21.16b, v31.16b
+ uabd v20.16b, v2.16b, v6.16b
+ udot v16.4s, v20.16b, v31.16b
+ uabd v21.16b, v3.16b, v7.16b
+ udot v17.4s, v21.16b, v31.16b
+.endr
+.endm
+
+// Loop unrolled to process 4 rows per iteration.
+.macro SAD_NEON_DOTPROD_LOOP w, h
+function PFX(pixel_sad_\w\()x\h\()_neon_dotprod)
+ SAD_NEON_DOTPROD_START
+ mov w9, #\h/4
+.Loop_\w\()x\h:
+ sub w9, w9, #1
+
+ SAD_NEON_DOTPROD_\w
+
+ cbnz w9, .Loop_\w\()x\h
+ SAD_NEON_DOTPROD_END
+endfunc
+.endm
+
+SAD_NEON_DOTPROD_16_S 4
+SAD_NEON_DOTPROD_16_S 8
+SAD_NEON_DOTPROD_16_S 12
+SAD_NEON_DOTPROD_16_S 16
+SAD_NEON_DOTPROD_16 32
+SAD_NEON_DOTPROD_16 64
+SAD_NEON_DOTPROD_LOOP 32, 8
+SAD_NEON_DOTPROD_LOOP 32, 16
+SAD_NEON_DOTPROD_LOOP 32, 24
+SAD_NEON_DOTPROD_LOOP 32, 32
+SAD_NEON_DOTPROD_LOOP 32, 64
+SAD_NEON_DOTPROD_LOOP 48, 64
+SAD_NEON_DOTPROD_LOOP 64, 16
+SAD_NEON_DOTPROD_LOOP 64, 32
+SAD_NEON_DOTPROD_LOOP 64, 48
+SAD_NEON_DOTPROD_LOOP 64, 64
--
2.42.1
-------------- next part --------------
A non-text attachment was scrubbed...
Name: v2-0007-AArch64-Add-Armv8.4-Neon-DotProd-implementations-.patch
Type: text/x-patch
Size: 13452 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20240730/763dbaee/attachment.bin>
More information about the x265-devel
mailing list