[x265] [PATCH 3/4] AArch64: Add SBD pixel_var Neon DotProd intrinsics implementations
Li Zhang
li.zhang2 at arm.com
Tue Jun 17 18:23:14 UTC 2025
Add Neon DotProd intrinsics implementation for the standard bit-depth
pixel_var functions.
This implementation is 1.2x-2.4x faster than the existing Armv8.0 Neon
implementation.
---
source/common/CMakeLists.txt | 2 +-
source/common/aarch64/asm-primitives.cpp | 1 +
.../aarch64/pixel-prim-neon-dotprod.cpp | 111 ++++++++++++++++++
source/common/aarch64/pixel-prim.h | 3 +
4 files changed, 116 insertions(+), 1 deletion(-)
create mode 100644 source/common/aarch64/pixel-prim-neon-dotprod.cpp
diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
index 405ec0b2d..14a837429 100644
--- a/source/common/CMakeLists.txt
+++ b/source/common/CMakeLists.txt
@@ -105,7 +105,7 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
# Add Arm intrinsics files here.
set(C_SRCS_NEON asm-primitives.cpp pixel-prim.h pixel-prim.cpp filter-prim.h filter-prim.cpp dct-prim.h dct-prim.cpp loopfilter-prim.cpp loopfilter-prim.h intrapred-prim.cpp arm64-utils.cpp arm64-utils.h fun-decls.h sao-prim.cpp mem-neon.h)
- set(C_SRCS_NEON_DOTPROD filter-neon-dotprod.cpp)
+ set(C_SRCS_NEON_DOTPROD filter-neon-dotprod.cpp pixel-prim-neon-dotprod.cpp)
set(C_SRCS_NEON_I8MM filter-neon-i8mm.cpp)
set(C_SRCS_SVE sao-prim-sve.cpp dct-prim-sve.cpp filter-prim-sve.h filter-prim-sve.cpp)
set(C_SRCS_SVE2 sao-prim-sve2.cpp)
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index d8b0beb8f..e3f8788dd 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -776,6 +776,7 @@ void setupIntrinsicPrimitives(EncoderPrimitives &p, int cpuMask)
if (cpuMask & X265_CPU_NEON_DOTPROD)
{
setupFilterPrimitives_neon_dotprod(p);
+ setupPixelPrimitives_neon_dotprod(p);
}
#endif
#ifdef HAVE_NEON_I8MM
diff --git a/source/common/aarch64/pixel-prim-neon-dotprod.cpp b/source/common/aarch64/pixel-prim-neon-dotprod.cpp
new file mode 100644
index 000000000..16e9fb2c2
--- /dev/null
+++ b/source/common/aarch64/pixel-prim-neon-dotprod.cpp
@@ -0,0 +1,111 @@
+/*****************************************************************************
+ * Copyright (C) 2025 MulticoreWare, Inc
+ *
+ * Authors: Li Zhang <li.zhang2 at arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "pixel-prim.h"
+#include "mem-neon.h"
+
+#include <arm_neon.h>
+
+namespace
+{
+#if !HIGH_BIT_DEPTH
+// Computes the sum of the 8-bit samples of a square block and the sum of their
+// squares, using the Neon DotProd (UDOT) intrinsics.
+//
+// pix      - pointer to the top-left sample of the block.
+// i_stride - distance, in samples, between the starts of consecutive rows.
+//
+// Returns one packed 64-bit value: the sample sum in the low 32 bits and the
+// sum of squares in the high 32 bits.
+//
+// The branches cover size >= 16, size == 8 and size == 4; any other size falls
+// through all of them without returning a value. The wide path consumes
+// columns in whole groups of 16 and rows in pairs.
+template<int size>
+uint64_t pixel_var_neon_dotprod(const uint8_t *pix, intptr_t i_stride)
+{
+    if (size >= 16)
+    {
+        // A dot product against all-ones adds up each group of four bytes,
+        // which yields the plain sample sum in 32-bit lanes.
+        const uint8x16_t ones = vdupq_n_u8(1);
+
+        // Separate accumulators for the two rows handled per iteration.
+        uint32x4_t sum_row0 = vdupq_n_u32(0);
+        uint32x4_t sum_row1 = vdupq_n_u32(0);
+        uint32x4_t sqr_row0 = vdupq_n_u32(0);
+        uint32x4_t sqr_row1 = vdupq_n_u32(0);
+
+        for (int row = 0; row < size; row += 2, pix += 2 * i_stride)
+        {
+            for (int col = 0; col + 16 <= size; col += 16)
+            {
+                // Presumably fills px[0] / px[1] with 16 samples from two
+                // rows i_stride apart -- confirm against mem-neon.h.
+                uint8x16_t px[2];
+                load_u8x16xn<2>(pix + col, i_stride, px);
+
+                sum_row0 = vdotq_u32(sum_row0, px[0], ones);
+                sum_row1 = vdotq_u32(sum_row1, px[1], ones);
+
+                // Dotting a vector with itself accumulates the squares.
+                sqr_row0 = vdotq_u32(sqr_row0, px[0], px[0]);
+                sqr_row1 = vdotq_u32(sqr_row1, px[1], px[1]);
+            }
+        }
+
+        const uint64_t total_sum = vaddvq_u32(vaddq_u32(sum_row0, sum_row1));
+        const uint64_t total_sqr = vaddlvq_u32(vaddq_u32(sqr_row0, sqr_row1));
+
+        return total_sum + (total_sqr << 32);
+    }
+    if (size == 8)
+    {
+        // Each 16-bit lane receives one sample per row: at most 8 * 255.
+        uint16x8_t sum_acc = vdupq_n_u16(0);
+        uint32x2_t sqr_acc = vdup_n_u32(0);
+
+        for (int row = 0; row < size; ++row, pix += i_stride)
+        {
+            const uint8x8_t px = vld1_u8(pix);
+
+            sum_acc = vaddw_u8(sum_acc, px);
+            sqr_acc = vdot_u32(sqr_acc, px, px);
+        }
+
+        const uint64_t total_sum = vaddvq_u16(sum_acc);
+        const uint64_t total_sqr = vaddlv_u32(sqr_acc);
+
+        return total_sum + (total_sqr << 32);
+    }
+    if (size == 4)
+    {
+        uint16x8_t sum_acc = vdupq_n_u16(0);
+        uint32x2_t sqr_acc = vdup_n_u32(0);
+
+        for (int row = 0; row < size; row += 2, pix += 2 * i_stride)
+        {
+            // Presumably packs two 4-sample rows into one 8-byte vector --
+            // confirm against mem-neon.h.
+            const uint8x8_t px = load_u8x4x2(pix, i_stride);
+
+            sum_acc = vaddw_u8(sum_acc, px);
+            sqr_acc = vdot_u32(sqr_acc, px, px);
+        }
+
+        const uint64_t total_sum = vaddvq_u16(sum_acc);
+        const uint64_t total_sqr = vaddlv_u32(sqr_acc);
+
+        return total_sum + (total_sqr << 32);
+    }
+}
+#endif // !HIGH_BIT_DEPTH
+}
+
+namespace X265_NS
+{
+// Installs the Neon DotProd pixel_var kernels into the encoder primitives.
+//
+// p - primitives structure to update. In standard bit-depth builds the .var
+//     entry of the 4x4, 8x8, 16x16, 32x32 and 64x64 CU sizes is overwritten.
+//
+// High bit-depth builds have no DotProd pixel_var kernel here, so p is left
+// untouched.
+void setupPixelPrimitives_neon_dotprod(EncoderPrimitives &p)
+{
+#if !HIGH_BIT_DEPTH
+    p.cu[BLOCK_4x4].var   = pixel_var_neon_dotprod<4>;
+    p.cu[BLOCK_8x8].var   = pixel_var_neon_dotprod<8>;
+    p.cu[BLOCK_16x16].var = pixel_var_neon_dotprod<16>;
+    p.cu[BLOCK_32x32].var = pixel_var_neon_dotprod<32>;
+    p.cu[BLOCK_64x64].var = pixel_var_neon_dotprod<64>;
+#else
+    // Nothing to install; reference p so that builds using -Wextra do not
+    // emit an unused-parameter warning for the otherwise empty body.
+    (void)p;
+#endif // !HIGH_BIT_DEPTH
+}
+}
diff --git a/source/common/aarch64/pixel-prim.h b/source/common/aarch64/pixel-prim.h
index da9f1822b..74271b10c 100644
--- a/source/common/aarch64/pixel-prim.h
+++ b/source/common/aarch64/pixel-prim.h
@@ -15,6 +15,9 @@ namespace X265_NS
void setupPixelPrimitives_neon(EncoderPrimitives &p);
+#if defined(HAVE_NEON_DOTPROD)
+void setupPixelPrimitives_neon_dotprod(EncoderPrimitives &p);
+#endif
}
--
2.39.5 (Apple Git-154)
-------------- next part --------------
From 11b3ec5a9817077262989e858dd5ef7910fe956a Mon Sep 17 00:00:00 2001
Message-Id: <11b3ec5a9817077262989e858dd5ef7910fe956a.1750183023.git.li.zhang2 at arm.com>
In-Reply-To: <cover.1750183023.git.li.zhang2 at arm.com>
References: <cover.1750183023.git.li.zhang2 at arm.com>
From: Li Zhang <li.zhang2 at arm.com>
Date: Mon, 16 Jun 2025 17:23:00 +0200
Subject: [PATCH 3/4] AArch64: Add SBD pixel_var Neon DotProd intrinsics
implementations
Add Neon DotProd intrinsics implementation for the standard bit-depth
pixel_var functions.
This implementation is 1.2x-2.4x faster than the existing Armv8.0 Neon
implementation.
---
source/common/CMakeLists.txt | 2 +-
source/common/aarch64/asm-primitives.cpp | 1 +
.../aarch64/pixel-prim-neon-dotprod.cpp | 111 ++++++++++++++++++
source/common/aarch64/pixel-prim.h | 3 +
4 files changed, 116 insertions(+), 1 deletion(-)
create mode 100644 source/common/aarch64/pixel-prim-neon-dotprod.cpp
diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
index 405ec0b2d..14a837429 100644
--- a/source/common/CMakeLists.txt
+++ b/source/common/CMakeLists.txt
@@ -105,7 +105,7 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
# Add Arm intrinsics files here.
set(C_SRCS_NEON asm-primitives.cpp pixel-prim.h pixel-prim.cpp filter-prim.h filter-prim.cpp dct-prim.h dct-prim.cpp loopfilter-prim.cpp loopfilter-prim.h intrapred-prim.cpp arm64-utils.cpp arm64-utils.h fun-decls.h sao-prim.cpp mem-neon.h)
- set(C_SRCS_NEON_DOTPROD filter-neon-dotprod.cpp)
+ set(C_SRCS_NEON_DOTPROD filter-neon-dotprod.cpp pixel-prim-neon-dotprod.cpp)
set(C_SRCS_NEON_I8MM filter-neon-i8mm.cpp)
set(C_SRCS_SVE sao-prim-sve.cpp dct-prim-sve.cpp filter-prim-sve.h filter-prim-sve.cpp)
set(C_SRCS_SVE2 sao-prim-sve2.cpp)
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index d8b0beb8f..e3f8788dd 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -776,6 +776,7 @@ void setupIntrinsicPrimitives(EncoderPrimitives &p, int cpuMask)
if (cpuMask & X265_CPU_NEON_DOTPROD)
{
setupFilterPrimitives_neon_dotprod(p);
+ setupPixelPrimitives_neon_dotprod(p);
}
#endif
#ifdef HAVE_NEON_I8MM
diff --git a/source/common/aarch64/pixel-prim-neon-dotprod.cpp b/source/common/aarch64/pixel-prim-neon-dotprod.cpp
new file mode 100644
index 000000000..16e9fb2c2
--- /dev/null
+++ b/source/common/aarch64/pixel-prim-neon-dotprod.cpp
@@ -0,0 +1,111 @@
+/*****************************************************************************
+ * Copyright (C) 2025 MulticoreWare, Inc
+ *
+ * Authors: Li Zhang <li.zhang2 at arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "pixel-prim.h"
+#include "mem-neon.h"
+
+#include <arm_neon.h>
+
+namespace
+{
+#if !HIGH_BIT_DEPTH
+// Computes the sum of the 8-bit samples of a square block and the sum of their
+// squares, using the Neon DotProd (UDOT) intrinsics.
+//
+// pix      - pointer to the top-left sample of the block.
+// i_stride - distance, in samples, between the starts of consecutive rows.
+//
+// Returns one packed 64-bit value: the sample sum in the low 32 bits and the
+// sum of squares in the high 32 bits.
+//
+// The branches cover size >= 16, size == 8 and size == 4; any other size falls
+// through all of them without returning a value. The wide path consumes
+// columns in whole groups of 16 and rows in pairs.
+template<int size>
+uint64_t pixel_var_neon_dotprod(const uint8_t *pix, intptr_t i_stride)
+{
+    if (size >= 16)
+    {
+        // A dot product against all-ones adds up each group of four bytes,
+        // which yields the plain sample sum in 32-bit lanes.
+        const uint8x16_t ones = vdupq_n_u8(1);
+
+        // Separate accumulators for the two rows handled per iteration.
+        uint32x4_t sum_row0 = vdupq_n_u32(0);
+        uint32x4_t sum_row1 = vdupq_n_u32(0);
+        uint32x4_t sqr_row0 = vdupq_n_u32(0);
+        uint32x4_t sqr_row1 = vdupq_n_u32(0);
+
+        for (int row = 0; row < size; row += 2, pix += 2 * i_stride)
+        {
+            for (int col = 0; col + 16 <= size; col += 16)
+            {
+                // Presumably fills px[0] / px[1] with 16 samples from two
+                // rows i_stride apart -- confirm against mem-neon.h.
+                uint8x16_t px[2];
+                load_u8x16xn<2>(pix + col, i_stride, px);
+
+                sum_row0 = vdotq_u32(sum_row0, px[0], ones);
+                sum_row1 = vdotq_u32(sum_row1, px[1], ones);
+
+                // Dotting a vector with itself accumulates the squares.
+                sqr_row0 = vdotq_u32(sqr_row0, px[0], px[0]);
+                sqr_row1 = vdotq_u32(sqr_row1, px[1], px[1]);
+            }
+        }
+
+        const uint64_t total_sum = vaddvq_u32(vaddq_u32(sum_row0, sum_row1));
+        const uint64_t total_sqr = vaddlvq_u32(vaddq_u32(sqr_row0, sqr_row1));
+
+        return total_sum + (total_sqr << 32);
+    }
+    if (size == 8)
+    {
+        // Each 16-bit lane receives one sample per row: at most 8 * 255.
+        uint16x8_t sum_acc = vdupq_n_u16(0);
+        uint32x2_t sqr_acc = vdup_n_u32(0);
+
+        for (int row = 0; row < size; ++row, pix += i_stride)
+        {
+            const uint8x8_t px = vld1_u8(pix);
+
+            sum_acc = vaddw_u8(sum_acc, px);
+            sqr_acc = vdot_u32(sqr_acc, px, px);
+        }
+
+        const uint64_t total_sum = vaddvq_u16(sum_acc);
+        const uint64_t total_sqr = vaddlv_u32(sqr_acc);
+
+        return total_sum + (total_sqr << 32);
+    }
+    if (size == 4)
+    {
+        uint16x8_t sum_acc = vdupq_n_u16(0);
+        uint32x2_t sqr_acc = vdup_n_u32(0);
+
+        for (int row = 0; row < size; row += 2, pix += 2 * i_stride)
+        {
+            // Presumably packs two 4-sample rows into one 8-byte vector --
+            // confirm against mem-neon.h.
+            const uint8x8_t px = load_u8x4x2(pix, i_stride);
+
+            sum_acc = vaddw_u8(sum_acc, px);
+            sqr_acc = vdot_u32(sqr_acc, px, px);
+        }
+
+        const uint64_t total_sum = vaddvq_u16(sum_acc);
+        const uint64_t total_sqr = vaddlv_u32(sqr_acc);
+
+        return total_sum + (total_sqr << 32);
+    }
+}
+#endif // !HIGH_BIT_DEPTH
+}
+
+namespace X265_NS
+{
+// Installs the Neon DotProd pixel_var kernels into the encoder primitives.
+//
+// p - primitives structure to update. In standard bit-depth builds the .var
+//     entry of the 4x4, 8x8, 16x16, 32x32 and 64x64 CU sizes is overwritten.
+//
+// High bit-depth builds have no DotProd pixel_var kernel here, so p is left
+// untouched.
+void setupPixelPrimitives_neon_dotprod(EncoderPrimitives &p)
+{
+#if !HIGH_BIT_DEPTH
+    p.cu[BLOCK_4x4].var   = pixel_var_neon_dotprod<4>;
+    p.cu[BLOCK_8x8].var   = pixel_var_neon_dotprod<8>;
+    p.cu[BLOCK_16x16].var = pixel_var_neon_dotprod<16>;
+    p.cu[BLOCK_32x32].var = pixel_var_neon_dotprod<32>;
+    p.cu[BLOCK_64x64].var = pixel_var_neon_dotprod<64>;
+#else
+    // Nothing to install; reference p so that builds using -Wextra do not
+    // emit an unused-parameter warning for the otherwise empty body.
+    (void)p;
+#endif // !HIGH_BIT_DEPTH
+}
+}
diff --git a/source/common/aarch64/pixel-prim.h b/source/common/aarch64/pixel-prim.h
index da9f1822b..74271b10c 100644
--- a/source/common/aarch64/pixel-prim.h
+++ b/source/common/aarch64/pixel-prim.h
@@ -15,6 +15,9 @@ namespace X265_NS
void setupPixelPrimitives_neon(EncoderPrimitives &p);
+#if defined(HAVE_NEON_DOTPROD)
+void setupPixelPrimitives_neon_dotprod(EncoderPrimitives &p);
+#endif
}
--
2.39.5 (Apple Git-154)
More information about the x265-devel
mailing list