[x265] [PATCH 3/4] AArch64: Add SBD pixel_var Neon DotProd intrinsics implementations

Fri Jun 20 09:37:06 UTC 2025

Hi Chen,

Sorry, I did not notice it! I have sent a new v2 patch series with the fixes.

Thanks,
Li

From: chen <chenm003 at 163.com>
Date: Friday, 2025. June 20. at 6:25
To: Development for x265 <x265-devel at videolan.org>
Cc: nd <nd at arm.com>, Li Zhang <Li.Zhang2 at arm.com>
Subject: Re:[x265] [PATCH 3/4] AArch64: Add SBD pixel_var Neon DotProd intrinsics implementations

Hi Li,

We found a warning when building with HIGH_BIT_DEPTH=1; could you please fix it?

+void setupPixelPrimitives_neon_dotprod(EncoderPrimitives &p)

+{

+#if !HIGH_BIT_DEPTH

+    p.cu[BLOCK_4x4].var   = pixel_var_neon_dotprod<4>;

+    p.cu[BLOCK_8x8].var   = pixel_var_neon_dotprod<8>;

+    p.cu[BLOCK_16x16].var = pixel_var_neon_dotprod<16>;

+    p.cu[BLOCK_32x32].var = pixel_var_neon_dotprod<32>;

+    p.cu[BLOCK_64x64].var = pixel_var_neon_dotprod<64>;

+#endif // !HIGH_BIT_DEPTH

+}

At 2025-06-18 02:23:14, "Li Zhang" <li.zhang2 at arm.com> wrote:

>Add Neon DotProd intrinsics implementation for the standard bit-depth

>pixel_var functions.

>

>This implementation is 1.2x-2.4x faster than the existing Armv8.0 Neon

>implementation.

>---

> source/common/CMakeLists.txt                  |   2 +-

> source/common/aarch64/asm-primitives.cpp      |   1 +

> .../aarch64/pixel-prim-neon-dotprod.cpp       | 111 ++++++++++++++++++

> source/common/aarch64/pixel-prim.h            |   3 +

> 4 files changed, 116 insertions(+), 1 deletion(-)

> create mode 100644 source/common/aarch64/pixel-prim-neon-dotprod.cpp

>

>diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt

>index 405ec0b2d..14a837429 100644

>--- a/source/common/CMakeLists.txt

>+++ b/source/common/CMakeLists.txt

>@@ -105,7 +105,7 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))

>

>     # Add Arm intrinsics files here.

>     set(C_SRCS_NEON asm-primitives.cpp pixel-prim.h pixel-prim.cpp filter-prim.h filter-prim.cpp dct-prim.h dct-prim.cpp loopfilter-prim.cpp loopfilter-prim.h intrapred-prim.cpp arm64-utils.cpp arm64-utils.h fun-decls.h sao-prim.cpp  mem-neon.h)

>-    set(C_SRCS_NEON_DOTPROD filter-neon-dotprod.cpp)

>+    set(C_SRCS_NEON_DOTPROD filter-neon-dotprod.cpp pixel-prim-neon-dotprod.cpp)

>     set(C_SRCS_NEON_I8MM filter-neon-i8mm.cpp)

>     set(C_SRCS_SVE sao-prim-sve.cpp dct-prim-sve.cpp filter-prim-sve.h filter-prim-sve.cpp)

>     set(C_SRCS_SVE2 sao-prim-sve2.cpp)

>diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp

>index d8b0beb8f..e3f8788dd 100644

>--- a/source/common/aarch64/asm-primitives.cpp

>+++ b/source/common/aarch64/asm-primitives.cpp

>@@ -776,6 +776,7 @@ void setupIntrinsicPrimitives(EncoderPrimitives &p, int cpuMask)

>     if (cpuMask & X265_CPU_NEON_DOTPROD)

>     {

>         setupFilterPrimitives_neon_dotprod(p);

>+        setupPixelPrimitives_neon_dotprod(p);

>     }

> #endif

> #ifdef HAVE_NEON_I8MM

>diff --git a/source/common/aarch64/pixel-prim-neon-dotprod.cpp b/source/common/aarch64/pixel-prim-neon-dotprod.cpp

>new file mode 100644

>index 000000000..16e9fb2c2

>--- /dev/null

>+++ b/source/common/aarch64/pixel-prim-neon-dotprod.cpp

>@@ -0,0 +1,111 @@

>+/*****************************************************************************

>+ * Copyright (C) 2025 MulticoreWare, Inc

>+ *

>+ * Authors: Li Zhang <li.zhang2 at arm.com>

>+ *

>+ * This program is free software; you can redistribute it and/or modify

>+ * it under the terms of the GNU General Public License as published by

>+ * the Free Software Foundation; either version 2 of the License, or

>+ * (at your option) any later version.

>+ *

>+ * This program is distributed in the hope that it will be useful,

>+ * but WITHOUT ANY WARRANTY; without even the implied warranty of

>+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

>+ * GNU General Public License for more details.

>+ *

>+ * You should have received a copy of the GNU General Public License

>+ * along with this program; if not, write to the Free Software

>+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.

>+ *

>+ * This program is also available under a commercial proprietary license.

>+ * For more information, contact us at license @ x265.com.

>+ *****************************************************************************/

>+

>+#include "pixel-prim.h"

>+#include "mem-neon.h"

>+

>+#include <arm_neon.h>

>+

>+namespace

>+{

>+#if !HIGH_BIT_DEPTH

>+template<int size>

>+uint64_t pixel_var_neon_dotprod(const uint8_t *pix, intptr_t i_stride)

>+{

>+    if (size >= 16)

>+    {

>+        uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };

>+        uint32x4_t sqr[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };

>+

>+        for (int h = 0; h < size; h += 2)

>+        {

>+            for (int w = 0; w + 16 <= size; w += 16)

>+            {

>+                uint8x16_t s[2];

>+                load_u8x16xn<2>(pix + w, i_stride, s);

>+

>+                sum[0] = vdotq_u32(sum[0], s[0], vdupq_n_u8(1));

>+                sum[1] = vdotq_u32(sum[1], s[1], vdupq_n_u8(1));

>+

>+                sqr[0] = vdotq_u32(sqr[0], s[0], s[0]);

>+                sqr[1] = vdotq_u32(sqr[1], s[1], s[1]);

>+            }

>+

>+            pix += 2 * i_stride;

>+        }

>+

>+        sum[0] = vaddq_u32(sum[0], sum[1]);

>+        sqr[0] = vaddq_u32(sqr[0], sqr[1]);

>+

>+        return vaddvq_u32(sum[0]) + (vaddlvq_u32(sqr[0]) << 32);

>+    }

>+    if (size == 8)

>+    {

>+        uint16x8_t sum = vdupq_n_u16(0);

>+        uint32x2_t sqr = vdup_n_u32(0);

>+

>+        for (int h = 0; h < size; ++h)

>+        {

>+            uint8x8_t s = vld1_u8(pix);

>+

>+            sum = vaddw_u8(sum, s);

>+            sqr = vdot_u32(sqr, s, s);

>+

>+            pix += i_stride;

>+        }

>+

>+        return vaddvq_u16(sum) + (vaddlv_u32(sqr) << 32);

>+    }

>+    if (size == 4) {

>+        uint16x8_t sum = vdupq_n_u16(0);

>+        uint32x2_t sqr = vdup_n_u32(0);

>+

>+        for (int h = 0; h < size; h += 2)

>+        {

>+            uint8x8_t s = load_u8x4x2(pix, i_stride);

>+

>+            sum = vaddw_u8(sum, s);

>+            sqr = vdot_u32(sqr, s, s);

>+

>+            pix += 2 * i_stride;

>+        }

>+

>+        return vaddvq_u16(sum) + (vaddlv_u32(sqr) << 32);

>+    }

>+}

>+#endif // !HIGH_BIT_DEPTH

>+}

>+

>+namespace X265_NS

>+{

>+void setupPixelPrimitives_neon_dotprod(EncoderPrimitives &p)

>+{

>+#if !HIGH_BIT_DEPTH

>+    p.cu[BLOCK_4x4].var   = pixel_var_neon_dotprod<4>;

>+    p.cu[BLOCK_8x8].var   = pixel_var_neon_dotprod<8>;

>+    p.cu[BLOCK_16x16].var = pixel_var_neon_dotprod<16>;

>+    p.cu[BLOCK_32x32].var = pixel_var_neon_dotprod<32>;

>+    p.cu[BLOCK_64x64].var = pixel_var_neon_dotprod<64>;

>+#endif // !HIGH_BIT_DEPTH

>+}

>+}

>diff --git a/source/common/aarch64/pixel-prim.h b/source/common/aarch64/pixel-prim.h

>index da9f1822b..74271b10c 100644

>--- a/source/common/aarch64/pixel-prim.h

>+++ b/source/common/aarch64/pixel-prim.h

>@@ -15,6 +15,9 @@ namespace X265_NS

>

> void setupPixelPrimitives_neon(EncoderPrimitives &p);

>

>+#if defined(HAVE_NEON_DOTPROD)

>+void setupPixelPrimitives_neon_dotprod(EncoderPrimitives &p);

>+#endif

>

> }

>

>--

>2.39.5 (Apple Git-154)

>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20250620/0fff8ec7/attachment-0001.htm>