[x265] [PATCH v2 4/4] AArch64: Add HBD pixel_var SVE intrinsics implementations
Li Zhang
li.zhang2 at arm.com
Fri Jun 20 09:34:17 UTC 2025
Add an SVE intrinsics implementation of the high bit-depth pixel_var
primitives, making use of the SVE 16-bit dot product instruction.

This implementation is 1.0x-1.7x faster than the existing Armv8.0 Neon
implementation, depending on the block size.
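For reference, pixel_var returns a single uint64_t with the block's pixel
sum packed into the low 32 bits and the sum of squared pixels into the high
32 bits, which is the packing the intrinsics below reproduce. A minimal
scalar sketch of that contract (the name pixel_var_scalar_ref is
illustrative and not part of x265):

#include <cstdint>
#include <cstddef>

// Illustrative scalar reference for an NxN high bit-depth block: it mirrors
// the sum/sqr packing used by the SVE intrinsics in this patch.
static uint64_t pixel_var_scalar_ref(const uint16_t *pix, intptr_t stride, int size)
{
    uint64_t sum = 0, sqr = 0;
    for (int h = 0; h < size; h++)
    {
        for (int w = 0; w < size; w++)
        {
            sum += pix[w];
            sqr += (uint64_t)pix[w] * pix[w];
        }
        pix += stride;
    }
    return sum + (sqr << 32); // sum in the low 32 bits, sum of squares above
}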
---
source/common/CMakeLists.txt | 2 +-
source/common/aarch64/asm-primitives.cpp | 1 +
source/common/aarch64/neon-sve-bridge.h | 7 ++
source/common/aarch64/pixel-prim-sve.cpp | 141 +++++++++++++++++++++++
source/common/aarch64/pixel-prim.h | 3 +
5 files changed, 153 insertions(+), 1 deletion(-)
create mode 100644 source/common/aarch64/pixel-prim-sve.cpp
diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
index 14a837429..fdb15e756 100644
--- a/source/common/CMakeLists.txt
+++ b/source/common/CMakeLists.txt
@@ -107,7 +107,7 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
set(C_SRCS_NEON asm-primitives.cpp pixel-prim.h pixel-prim.cpp filter-prim.h filter-prim.cpp dct-prim.h dct-prim.cpp loopfilter-prim.cpp loopfilter-prim.h intrapred-prim.cpp arm64-utils.cpp arm64-utils.h fun-decls.h sao-prim.cpp mem-neon.h)
set(C_SRCS_NEON_DOTPROD filter-neon-dotprod.cpp pixel-prim-neon-dotprod.cpp)
set(C_SRCS_NEON_I8MM filter-neon-i8mm.cpp)
- set(C_SRCS_SVE sao-prim-sve.cpp dct-prim-sve.cpp filter-prim-sve.h filter-prim-sve.cpp)
+ set(C_SRCS_SVE sao-prim-sve.cpp dct-prim-sve.cpp filter-prim-sve.h filter-prim-sve.cpp pixel-prim-sve.cpp)
set(C_SRCS_SVE2 sao-prim-sve2.cpp)
enable_language(ASM)
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index e3f8788dd..49d980616 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -791,6 +791,7 @@ void setupIntrinsicPrimitives(EncoderPrimitives &p, int cpuMask)
setupFilterPrimitives_sve(p);
setupSaoPrimitives_sve(p);
setupDCTPrimitives_sve(p);
+ setupPixelPrimitives_sve(p);
}
#endif
#if defined(HAVE_SVE2) && HAVE_SVE_BRIDGE
diff --git a/source/common/aarch64/neon-sve-bridge.h b/source/common/aarch64/neon-sve-bridge.h
index 48f89ea6e..6b450474a 100644
--- a/source/common/aarch64/neon-sve-bridge.h
+++ b/source/common/aarch64/neon-sve-bridge.h
@@ -58,6 +58,13 @@ static inline int64x2_t x265_sdotq_s16(int64x2_t acc, int16x8_t x, int16x8_t y)
svset_neonq_s16(svundef_s16(), s0), \
svset_neonq_s16(svundef_s16(), f), lane))
+static inline uint64x2_t x265_udotq_u16(uint64x2_t acc, uint16x8_t x, uint16x8_t y)
+{
+ return svget_neonq_u64(svdot_u64(svset_neonq_u64(svundef_u64(), acc),
+ svset_neonq_u16(svundef_u16(), x),
+ svset_neonq_u16(svundef_u16(), y)));
+}
+
static inline uint16x8_t x265_tblq_u16(uint16x8_t x, uint16x8_t idx)
{
return svget_neonq_u16(svtbl_u16(svset_neonq_u16(svundef_u16(), x),
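A brief usage sketch of the new x265_udotq_u16 bridge helper added above
(the wrapper name accumulate_var_u16x8 is illustrative; it assumes an
SVE-enabled build where x265's neon-sve-bridge.h is available): each UDOT
accumulates four 16-bit elements into every 64-bit lane, so one instruction
per vector covers both the pixel sum and the sum of squares that the
Armv8.0 Neon path builds from widening add/multiply-accumulate chains.

#include <arm_neon.h>
#include "neon-sve-bridge.h" // x265_udotq_u16, as added above

// Illustrative helper: fold one vector of eight pixels into running
// 64-bit sum and sum-of-squares accumulators.
static inline void accumulate_var_u16x8(uint16x8_t s, uint64x2_t &sum, uint64x2_t &sqr)
{
    sum = x265_udotq_u16(sum, s, vdupq_n_u16(1)); // dot with ones: horizontal add
    sqr = x265_udotq_u16(sqr, s, s);              // dot with itself: sum of squares
}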
diff --git a/source/common/aarch64/pixel-prim-sve.cpp b/source/common/aarch64/pixel-prim-sve.cpp
new file mode 100644
index 000000000..3bcb993cc
--- /dev/null
+++ b/source/common/aarch64/pixel-prim-sve.cpp
@@ -0,0 +1,141 @@
+/*****************************************************************************
+ * Copyright (C) 2025 MulticoreWare, Inc
+ *
+ * Authors: Li Zhang <li.zhang2 at arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "pixel-prim.h"
+#include "mem-neon.h"
+#include "neon-sve-bridge.h"
+
+#include <arm_neon.h>
+
+namespace
+{
+#if HIGH_BIT_DEPTH
+template<int size>
+uint64_t pixel_var_sve(const uint16_t *pix, intptr_t i_stride)
+{
+ if (size > 16)
+ {
+ uint64x2_t sum[2] = { vdupq_n_u64(0), vdupq_n_u64(0) };
+ uint64x2_t sqr[2] = { vdupq_n_u64(0), vdupq_n_u64(0) };
+
+ for (int h = 0; h < size; ++h)
+ {
+ for (int w = 0; w + 16 <= size; w += 16)
+ {
+ uint16x8_t s[2];
+ load_u16x8xn<2>(pix + w, 8, s);
+
+ sum[0] = x265_udotq_u16(sum[0], s[0], vdupq_n_u16(1));
+ sum[1] = x265_udotq_u16(sum[1], s[1], vdupq_n_u16(1));
+
+ sqr[0] = x265_udotq_u16(sqr[0], s[0], s[0]);
+ sqr[1] = x265_udotq_u16(sqr[1], s[1], s[1]);
+ }
+
+ pix += i_stride;
+ }
+
+ sum[0] = vaddq_u64(sum[0], sum[1]);
+ sqr[0] = vaddq_u64(sqr[0], sqr[1]);
+
+ return vaddvq_u64(sum[0]) + (vaddvq_u64(sqr[0]) << 32);
+ }
+ if (size == 16)
+ {
+ uint16x8_t sum[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };
+ uint64x2_t sqr[2] = { vdupq_n_u64(0), vdupq_n_u64(0) };
+
+ for (int h = 0; h < size; ++h)
+ {
+ uint16x8_t s[2];
+ load_u16x8xn<2>(pix, 8, s);
+
+ sum[0] = vaddq_u16(sum[0], s[0]);
+ sum[1] = vaddq_u16(sum[1], s[1]);
+
+ sqr[0] = x265_udotq_u16(sqr[0], s[0], s[0]);
+ sqr[1] = x265_udotq_u16(sqr[1], s[1], s[1]);
+
+ pix += i_stride;
+ }
+
+ uint32x4_t sum_u32 = vpaddlq_u16(sum[0]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[1]);
+ sqr[0] = vaddq_u64(sqr[0], sqr[1]);
+
+ return vaddvq_u32(sum_u32) + (vaddvq_u64(sqr[0]) << 32);
+ }
+ if (size == 8)
+ {
+ uint16x8_t sum = vdupq_n_u16(0);
+ uint64x2_t sqr = vdupq_n_u64(0);
+
+ for (int h = 0; h < size; ++h)
+ {
+ uint16x8_t s = vld1q_u16(pix);
+
+ sum = vaddq_u16(sum, s);
+ sqr = x265_udotq_u16(sqr, s, s);
+
+ pix += i_stride;
+ }
+
+ return vaddlvq_u16(sum) + (vaddvq_u64(sqr) << 32);
+ }
+ if (size == 4) {
+ uint16x4_t sum = vdup_n_u16(0);
+ uint32x4_t sqr = vdupq_n_u32(0);
+
+ for (int h = 0; h < size; ++h)
+ {
+ uint16x4_t s = vld1_u16(pix);
+
+ sum = vadd_u16(sum, s);
+ sqr = vmlal_u16(sqr, s, s);
+
+ pix += i_stride;
+ }
+
+ return vaddv_u16(sum) + (vaddlvq_u32(sqr) << 32);
+ }
+}
+#endif // HIGH_BIT_DEPTH
+}
+
+namespace X265_NS
+{
+#if HIGH_BIT_DEPTH
+void setupPixelPrimitives_sve(EncoderPrimitives &p)
+{
+ p.cu[BLOCK_4x4].var = pixel_var_sve<4>;
+ p.cu[BLOCK_8x8].var = pixel_var_sve<8>;
+ p.cu[BLOCK_16x16].var = pixel_var_sve<16>;
+ p.cu[BLOCK_32x32].var = pixel_var_sve<32>;
+ p.cu[BLOCK_64x64].var = pixel_var_sve<64>;
+}
+#else // !HIGH_BIT_DEPTH
+void setupPixelPrimitives_sve(EncoderPrimitives &)
+{
+}
+#endif // HIGH_BIT_DEPTH
+}
diff --git a/source/common/aarch64/pixel-prim.h b/source/common/aarch64/pixel-prim.h
index 74271b10c..dac00995e 100644
--- a/source/common/aarch64/pixel-prim.h
+++ b/source/common/aarch64/pixel-prim.h
@@ -19,6 +19,9 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p);
void setupPixelPrimitives_neon_dotprod(EncoderPrimitives &p);
#endif
+#if defined(HAVE_SVE) && HAVE_SVE_BRIDGE
+void setupPixelPrimitives_sve(EncoderPrimitives &p);
+#endif
}
--
2.39.5 (Apple Git-154)