[x265] [PATCH 02/14] AArch64: Add Armv8.4 Neon DotProd implementations of luma_hpp
Hari Limaye
hari.limaye at arm.com
Fri Sep 6 13:33:01 UTC 2024
Add implementations of luma_hpp primitives using Neon DotProd
instructions, which are mandatory from Armv8.4.
Geomean uplift across all block sizes for luma filters, relative to
Armv8.0 Neon implementation:
Neoverse N1: 1.155x
Neoverse N2: 1.014x
Neoverse V1: 1.165x
Neoverse V2: 1.090x
For N2, V1 and V2 this implementation will be superseded by a Neon I8MM
implementation in a subsequent patch.
---
source/common/CMakeLists.txt | 7 +
source/common/aarch64/asm-primitives.cpp | 7 +
source/common/aarch64/filter-neon-dotprod.cpp | 235 ++++++++++++++++++
source/common/aarch64/filter-neon-dotprod.h | 37 +++
4 files changed, 286 insertions(+)
create mode 100644 source/common/aarch64/filter-neon-dotprod.cpp
create mode 100644 source/common/aarch64/filter-neon-dotprod.h
diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
index f998c4d92..f8167121e 100644
--- a/source/common/CMakeLists.txt
+++ b/source/common/CMakeLists.txt
@@ -104,6 +104,7 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
endif()
set(C_SRCS_NEON asm-primitives.cpp pixel-prim.h pixel-prim.cpp filter-prim.h filter-prim.cpp dct-prim.h dct-prim.cpp loopfilter-prim.cpp loopfilter-prim.h intrapred-prim.cpp arm64-utils.cpp arm64-utils.h fun-decls.h sao-prim.cpp mem-neon.h)
+ set(C_SRCS_NEON_DOTPROD filter-neon-dotprod.cpp)
set(C_SRCS_SVE sao-prim-sve.cpp dct-prim-sve.cpp)
set(C_SRCS_SVE2 sao-prim-sve2.cpp)
enable_language(ASM)
@@ -123,6 +124,12 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
endforeach()
+ if(CPU_HAS_NEON_DOTPROD)
+ foreach(SRC ${C_SRCS_NEON_DOTPROD})
+ set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
+ endforeach()
+ endif()
+
if(CPU_HAS_SVE AND HAVE_SVE_BRIDGE)
foreach(SRC ${C_SRCS_SVE})
set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index 7fd29bba1..e67901ca2 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -384,6 +384,7 @@ extern "C" {
#include "loopfilter-prim.h"
#include "intrapred-prim.h"
#include "sao-prim.h"
+#include "filter-neon-dotprod.h"
namespace X265_NS
{
@@ -1039,6 +1040,12 @@ void setupIntrinsicPrimitives(EncoderPrimitives &p, int cpuMask)
setupIntraPrimitives_neon(p);
setupSaoPrimitives_neon(p);
}
+#ifdef HAVE_NEON_DOTPROD
+ if (cpuMask & X265_CPU_NEON_DOTPROD)
+ {
+ setupFilterPrimitives_neon_dotprod(p);
+ }
+#endif
#if defined(HAVE_SVE) && HAVE_SVE_BRIDGE
if (cpuMask & X265_CPU_SVE)
{
diff --git a/source/common/aarch64/filter-neon-dotprod.cpp b/source/common/aarch64/filter-neon-dotprod.cpp
new file mode 100644
index 000000000..31269d300
--- /dev/null
+++ b/source/common/aarch64/filter-neon-dotprod.cpp
@@ -0,0 +1,235 @@
+/*****************************************************************************
+ * Copyright (C) 2024 MulticoreWare, Inc
+ *
+ * Authors: Hari Limaye <hari.limaye at arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "filter-neon-dotprod.h"
+
+#if !HIGH_BIT_DEPTH
+#include "mem-neon.h"
+#include <arm_neon.h>
+
+namespace {
+static const uint8_t dotprod_permute_tbl[48] = { // Three 16-byte index vectors, loaded with vld1q_u8_x3 and used as vqtbl1q_s8 indices.
+ 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, // Four 4-byte windows starting at bytes 0, 1, 2, 3.
+ 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, // Four 4-byte windows starting at bytes 4, 5, 6, 7.
+ 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 // Four 4-byte windows starting at bytes 8, 9, 10, 11.
+};
+
+uint8x8_t inline filter8_8_pp(uint8x16_t samples, const int8x8_t filter,
+ const int32x4_t constant, const uint8x16x3_t tbl)
+{
+ // Filter 16 input bytes with the 8 taps in `filter` and return 8 rounded, narrowed uint8_t results.
+ int8x16_t samples_s8 = // Subtract 128: uint8_t -> int8_t range for the signed dot product.
+ vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
+
+ // Permute input samples for dot product: each vector holds four overlapping 4-byte windows.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ int8x16_t perm_samples_0 = vqtbl1q_s8(samples_s8, tbl.val[0]);
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ int8x16_t perm_samples_1 = vqtbl1q_s8(samples_s8, tbl.val[1]);
+ // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+ int8x16_t perm_samples_2 = vqtbl1q_s8(samples_s8, tbl.val[2]);
+
+ int32x4_t dotprod_lo = vdotq_lane_s32(constant, perm_samples_0, filter, 0); // Taps 0-3 for outputs 0-3, accumulated onto `constant`.
+ int32x4_t dotprod_hi = vdotq_lane_s32(constant, perm_samples_1, filter, 0); // Taps 0-3 for outputs 4-7, accumulated onto `constant`.
+ dotprod_lo = vdotq_lane_s32(dotprod_lo, perm_samples_1, filter, 1); // Taps 4-7 for outputs 0-3.
+ dotprod_hi = vdotq_lane_s32(dotprod_hi, perm_samples_2, filter, 1); // Taps 4-7 for outputs 4-7.
+
+ // Narrow and combine to 16 bits, then rounding-shift right by IF_FILTER_PREC with unsigned saturation to 8 bits.
+ int16x8_t dotprod = vcombine_s16(vmovn_s32(dotprod_lo),
+ vmovn_s32(dotprod_hi));
+ return vqrshrun_n_s16(dotprod, IF_FILTER_PREC);
+}
+
+void inline init_sample_permute(uint8x8_t *samples, const uint8x16x3_t tbl,
+ int8x16_t *d)
+{
+ // For each of the four 8-byte vectors in `samples`, move to int8_t range (subtract 128) and write the tbl.val[0] permutation to d[0..3].
+ int8x8_t samples_s8[4];
+ samples_s8[0] = vreinterpret_s8_u8(vsub_u8(samples[0], vdup_n_u8(128)));
+ samples_s8[1] = vreinterpret_s8_u8(vsub_u8(samples[1], vdup_n_u8(128)));
+ samples_s8[2] = vreinterpret_s8_u8(vsub_u8(samples[2], vdup_n_u8(128)));
+ samples_s8[3] = vreinterpret_s8_u8(vsub_u8(samples[3], vdup_n_u8(128)));
+
+ // Permute input samples for dot product. The index pattern below only reaches byte 6, so the zero-filled upper half is never selected.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ d[0] = vqtbl1q_s8(vcombine_s8(samples_s8[0], vdup_n_s8(0)), tbl.val[0]);
+ d[1] = vqtbl1q_s8(vcombine_s8(samples_s8[1], vdup_n_s8(0)), tbl.val[0]);
+ d[2] = vqtbl1q_s8(vcombine_s8(samples_s8[2], vdup_n_s8(0)), tbl.val[0]);
+ d[3] = vqtbl1q_s8(vcombine_s8(samples_s8[3], vdup_n_s8(0)), tbl.val[0]);
+}
+
+uint8x8_t inline filter8_8_pp_reuse(uint8x16_t samples, const int8x8_t filter,
+ const int32x4_t constant,
+ const uint8x16x3_t tbl,
+ int8x16_t &perm_samples_0)
+{
+ // Same arithmetic as filter8_8_pp, except the first permuted vector is read from (and then updated in) perm_samples_0.
+ int8x16_t samples_s8 = // Subtract 128: uint8_t -> int8_t range for the signed dot product.
+ vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
+
+ // Permute input samples for dot product.
+ // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+ // Already in perm_samples_0.
+ // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+ int8x16_t perm_samples_1 = vqtbl1q_s8(samples_s8, tbl.val[1]);
+ // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+ int8x16_t perm_samples_2 = vqtbl1q_s8(samples_s8, tbl.val[2]);
+
+ int32x4_t dotprod_lo = vdotq_lane_s32(constant, perm_samples_0, filter, 0); // Taps 0-3 for outputs 0-3, accumulated onto `constant`.
+ int32x4_t dotprod_hi = vdotq_lane_s32(constant, perm_samples_1, filter, 0); // Taps 0-3 for outputs 4-7, accumulated onto `constant`.
+ dotprod_lo = vdotq_lane_s32(dotprod_lo, perm_samples_1, filter, 1); // Taps 4-7 for outputs 0-3.
+ dotprod_hi = vdotq_lane_s32(dotprod_hi, perm_samples_2, filter, 1); // Taps 4-7 for outputs 4-7.
+
+ // Save for re-use in next iteration: windows 8..11 here are windows 0..3 of a call whose samples start 8 bytes later.
+ perm_samples_0 = perm_samples_2;
+
+ // Narrow and combine to 16 bits, then rounding-shift right by IF_FILTER_PREC with unsigned saturation to 8 bits.
+ int16x8_t dotprod = vcombine_s16(vmovn_s32(dotprod_lo),
+ vmovn_s32(dotprod_hi));
+ return vqrshrun_n_s16(dotprod, IF_FILTER_PREC);
+}
+} // Unnamed namespace.
+
+namespace X265_NS {
+template<int width, int height>
+void interp8_horiz_pp_dotprod(const uint8_t *src, intptr_t srcStride,
+ uint8_t *dst, intptr_t dstStride, int coeffIdx)
+{ // Horizontal 8-tap filter of a width x height block from uint8_t src to uint8_t dst, using the g_lumaFilter[coeffIdx] taps.
+ const int N_TAPS = 8;
+
+ src -= N_TAPS / 2 - 1; // Step back 3 samples so the 8 taps span src[-3] .. src[+4] of each output position.
+
+ const uint8x16x3_t tbl = vld1q_u8_x3(dotprod_permute_tbl);
+ const int8x8_t filter = vmovn_s16(vld1q_s16(g_lumaFilter[coeffIdx])); // Narrow the eight 16-bit coefficients to int8_t.
+ // Correction accounting for sample range transform from uint8_t to int8_t: 128 * 64, where 64 is presumably the sum of the taps (TODO confirm against g_lumaFilter).
+ const int32x4_t c = vdupq_n_s32(64 * 128);
+
+ int row;
+ for (row = 0; row < height; row += 4) // Four rows per iteration, so height is expected to be a multiple of 4.
+ {
+ int col = 0;
+ if (width >= 32)
+ {
+ // Peel first sample permute to enable passing between iterations.
+ uint8x8_t s0[4];
+ load_u8x8xn<4>(src, srcStride, s0);
+ int8x16_t ps0[4];
+ init_sample_permute(s0, tbl, ps0);
+
+ for (; col + 16 <= width; col += 16) // 16 columns per iteration, as two 8-column halves.
+ {
+ uint8x16_t s_lo[4], s_hi[4];
+ load_u8x16xn<4>(src + col + 0, srcStride, s_lo);
+ load_u8x16xn<4>(src + col + 8, srcStride, s_hi);
+
+ uint8x8_t d_lo[4];
+ d_lo[0] = filter8_8_pp_reuse(s_lo[0], filter, c, tbl, ps0[0]); // Each call leaves ps0[i] ready for the block 8 bytes further along row i.
+ d_lo[1] = filter8_8_pp_reuse(s_lo[1], filter, c, tbl, ps0[1]);
+ d_lo[2] = filter8_8_pp_reuse(s_lo[2], filter, c, tbl, ps0[2]);
+ d_lo[3] = filter8_8_pp_reuse(s_lo[3], filter, c, tbl, ps0[3]);
+
+ uint8x8_t d_hi[4];
+ d_hi[0] = filter8_8_pp_reuse(s_hi[0], filter, c, tbl, ps0[0]); // Must follow the matching d_lo call: it consumes the ps0[i] that call produced.
+ d_hi[1] = filter8_8_pp_reuse(s_hi[1], filter, c, tbl, ps0[1]);
+ d_hi[2] = filter8_8_pp_reuse(s_hi[2], filter, c, tbl, ps0[2]);
+ d_hi[3] = filter8_8_pp_reuse(s_hi[3], filter, c, tbl, ps0[3]);
+
+ store_u8x8xn<4>(dst + col + 0, dstStride, d_lo);
+ store_u8x8xn<4>(dst + col + 8, dstStride, d_hi);
+ }
+ }
+ else
+ {
+ for (; col + 8 <= width; col += 8) // Narrower blocks: 8 columns per iteration, no permute re-use.
+ {
+ uint8x16_t s[4];
+ load_u8x16xn<4>(src + col, srcStride, s);
+
+ uint8x8_t d[4];
+ d[0] = filter8_8_pp(s[0], filter, c, tbl);
+ d[1] = filter8_8_pp(s[1], filter, c, tbl);
+ d[2] = filter8_8_pp(s[2], filter, c, tbl);
+ d[3] = filter8_8_pp(s[3], filter, c, tbl);
+
+ store_u8x8xn<4>(dst + col, dstStride, d);
+ }
+ }
+ for (; col < width; col += 4) // Leftover columns (reached when width is not a multiple of 8, e.g. 4 or 12).
+ {
+ uint8x16_t s[4];
+ load_u8x16xn<4>(src + col, srcStride, s); // NOTE(review): presumably reads 16 bytes per row even though only 4 outputs are kept - confirm source padding.
+
+ uint8x8_t d[4];
+ d[0] = filter8_8_pp(s[0], filter, c, tbl);
+ d[1] = filter8_8_pp(s[1], filter, c, tbl);
+ d[2] = filter8_8_pp(s[2], filter, c, tbl);
+ d[3] = filter8_8_pp(s[3], filter, c, tbl);
+
+ store_u8x4xn<4>(dst + col, dstStride, d); // Presumably stores only the low 4 bytes of each result vector (helper not shown here).
+ }
+
+ src += 4 * srcStride;
+ dst += 4 * dstStride;
+ }
+}
+
+#define LUMA_DOTPROD(W, H) /* Point the WxH luma_hpp entry at the matching template instantiation. */ \
+ p.pu[LUMA_ ## W ## x ## H].luma_hpp = interp8_horiz_pp_dotprod<W, H>;
+// Assign the DotProd luma_hpp implementation in `p` for every luma block size listed below.
+void setupFilterPrimitives_neon_dotprod(EncoderPrimitives &p)
+{
+ LUMA_DOTPROD(4, 4);
+ LUMA_DOTPROD(4, 8);
+ LUMA_DOTPROD(4, 16);
+ LUMA_DOTPROD(12, 16);
+ LUMA_DOTPROD(8, 4);
+ LUMA_DOTPROD(8, 8);
+ LUMA_DOTPROD(8, 16);
+ LUMA_DOTPROD(8, 32);
+ LUMA_DOTPROD(16, 4);
+ LUMA_DOTPROD(16, 8);
+ LUMA_DOTPROD(16, 12);
+ LUMA_DOTPROD(16, 16);
+ LUMA_DOTPROD(16, 32);
+ LUMA_DOTPROD(16, 64);
+ LUMA_DOTPROD(24, 32);
+ LUMA_DOTPROD(32, 8);
+ LUMA_DOTPROD(32, 16);
+ LUMA_DOTPROD(32, 24);
+ LUMA_DOTPROD(32, 32);
+ LUMA_DOTPROD(32, 64);
+ LUMA_DOTPROD(48, 64);
+ LUMA_DOTPROD(64, 16);
+ LUMA_DOTPROD(64, 32);
+ LUMA_DOTPROD(64, 48);
+ LUMA_DOTPROD(64, 64);
+}
+}
+
+#else // !HIGH_BIT_DEPTH
+namespace X265_NS {
+void setupFilterPrimitives_neon_dotprod(EncoderPrimitives &)
+{ // Intentionally empty: this file provides no DotProd primitives when HIGH_BIT_DEPTH is non-zero.
+}
+}
+#endif // !HIGH_BIT_DEPTH
diff --git a/source/common/aarch64/filter-neon-dotprod.h b/source/common/aarch64/filter-neon-dotprod.h
new file mode 100644
index 000000000..c4e541998
--- /dev/null
+++ b/source/common/aarch64/filter-neon-dotprod.h
@@ -0,0 +1,37 @@
+/*****************************************************************************
+ * Copyright (C) 2024 MulticoreWare, Inc
+ *
+ * Authors: Hari Limaye <hari.limaye at arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_COMMON_AARCH64_FILTER_NEON_DOTPROD_H
+#define X265_COMMON_AARCH64_FILTER_NEON_DOTPROD_H
+
+#if defined(HAVE_NEON_DOTPROD)
+
+#include "primitives.h"
+
+namespace X265_NS {
+void setupFilterPrimitives_neon_dotprod(EncoderPrimitives &p); // Defined in filter-neon-dotprod.cpp; sets the luma_hpp entries of p (no-op in HIGH_BIT_DEPTH builds).
+}
+
+#endif // defined(HAVE_NEON_DOTPROD)
+
+#endif // X265_COMMON_AARCH64_FILTER_NEON_DOTPROD_H
--
2.42.1
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0002-AArch64-Add-Armv8.4-Neon-DotProd-implementations-of-.patch
Type: text/x-patch
Size: 14567 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20240906/9437f7ee/attachment-0001.bin>
More information about the x265-devel
mailing list