[x265] [PATCH 08/14] AArch64: Add Armv8.6 Neon I8MM implementations of luma_hpp

Hari Limaye hari.limaye at arm.com
Fri Sep 6 13:34:46 UTC 2024


Add implementations of luma_hpp primitives using Neon I8MM instructions,
which are mandatory from Armv8.6.

Luma filters 1 and 3 are in fact 7-tap filters zero-padded to 8 taps.
We can exploit this to accelerate these cases with the Armv8.6 USMMLA
matrix multiply instruction, which does twice as much work as the
equivalent USDOT dot product instruction.
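
To sketch the approach for luma filter 1: the coefficients c0-c6 are
packed into a 2x8 signed matrix with the zero padding at opposite ends
of the two rows, and each 2x8 unsigned sample matrix holds two
overlapping windows of the source row:

    A = | s[n+0] ... s[n+7] |    B = | c0 c1 c2 c3 c4 c5 c6  0 |
        | s[n+2] ... s[n+9] |        |  0 c0 c1 c2 c3 c4 c5 c6 |

A single USMMLA accumulates the 2x2 product A x B^T, whose four int32
results are out[n], out[n+1], out[n+2] and out[n+3].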

Geomean uplift across all block sizes for luma filters, relative to
Armv8.4 Neon DotProd implementations:

    Neoverse N2: 1.481x
    Neoverse V1: 1.337x
    Neoverse V2: 1.399x
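
For reference, the operation being vectorised is equivalent to the
scalar sketch below (illustrative only; the helper name is not part of
this patch):

    static void luma_hpp_scalar(const uint8_t *src, intptr_t srcStride,
                                uint8_t *dst, intptr_t dstStride,
                                int width, int height, const int16_t c[8])
    {
        src -= 3; // centre the 8-tap filter on the output pixel
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
            {
                int sum = 0;
                for (int k = 0; k < 8; k++)
                    sum += c[k] * src[x + k];
                // Round and narrow: IF_FILTER_PREC is 6, so add 32,
                // shift right by 6 and clip to the 8-bit pixel range.
                sum = (sum + 32) >> 6;
                dst[x] = (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
            }
            src += srcStride;
            dst += dstStride;
        }
    }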
---
 source/common/CMakeLists.txt               |   7 +
 source/common/aarch64/asm-primitives.cpp   |   7 +
 source/common/aarch64/filter-neon-i8mm.cpp | 341 +++++++++++++++++++++
 source/common/aarch64/filter-neon-i8mm.h   |  37 +++
 4 files changed, 392 insertions(+)
 create mode 100644 source/common/aarch64/filter-neon-i8mm.cpp
 create mode 100644 source/common/aarch64/filter-neon-i8mm.h

diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
index f8167121e..4b7145132 100644
--- a/source/common/CMakeLists.txt
+++ b/source/common/CMakeLists.txt
@@ -105,6 +105,7 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
 
     set(C_SRCS_NEON asm-primitives.cpp pixel-prim.h pixel-prim.cpp filter-prim.h filter-prim.cpp dct-prim.h dct-prim.cpp loopfilter-prim.cpp loopfilter-prim.h intrapred-prim.cpp arm64-utils.cpp arm64-utils.h fun-decls.h sao-prim.cpp  mem-neon.h)
     set(C_SRCS_NEON_DOTPROD filter-neon-dotprod.cpp)
+    set(C_SRCS_NEON_I8MM filter-neon-i8mm.cpp)
     set(C_SRCS_SVE sao-prim-sve.cpp dct-prim-sve.cpp)
     set(C_SRCS_SVE2 sao-prim-sve2.cpp)
     enable_language(ASM)
@@ -124,6 +125,12 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
         set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
     endforeach()
 
+    if(CPU_HAS_NEON_I8MM)
+        foreach(SRC ${C_SRCS_NEON_I8MM})
+            set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
+        endforeach()
+    endif()
+
     if(CPU_HAS_NEON_DOTPROD)
         foreach(SRC ${C_SRCS_NEON_DOTPROD})
             set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index e67901ca2..dd3c2a4ba 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -385,6 +385,7 @@ extern "C" {
 #include "intrapred-prim.h"
 #include "sao-prim.h"
 #include "filter-neon-dotprod.h"
+#include "filter-neon-i8mm.h"
 
 namespace X265_NS
 {
@@ -1046,6 +1047,12 @@ void setupIntrinsicPrimitives(EncoderPrimitives &p, int cpuMask)
         setupFilterPrimitives_neon_dotprod(p);
     }
 #endif
+#ifdef HAVE_NEON_I8MM
+    if (cpuMask & X265_CPU_NEON_I8MM)
+    {
+        setupFilterPrimitives_neon_i8mm(p);
+    }
+#endif
 #if defined(HAVE_SVE) && HAVE_SVE_BRIDGE
     if (cpuMask & X265_CPU_SVE)
     {
diff --git a/source/common/aarch64/filter-neon-i8mm.cpp b/source/common/aarch64/filter-neon-i8mm.cpp
new file mode 100644
index 000000000..c19592fa1
--- /dev/null
+++ b/source/common/aarch64/filter-neon-i8mm.cpp
@@ -0,0 +1,341 @@
+/*****************************************************************************
+ * Copyright (C) 2024 MulticoreWare, Inc
+ *
+ * Authors: Hari Limaye <hari.limaye at arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#if defined(HAVE_NEON_I8MM)
+#include "filter-neon-i8mm.h"
+#if !HIGH_BIT_DEPTH
+
+#include "mem-neon.h"
+
+#include <arm_neon.h>
+
+namespace {
+static const uint8_t dotprod_permute_tbl[48] = {
+    0, 1,  2,  3, 1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5, 6,
+    4, 5,  6,  7, 5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10,
+    8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+};
+
+static const uint8_t matmul_permute_tbl[2][32] = {
+    // Permute for luma filter 1.
+    { 0,  1,  2,  3,  4,  5,  6,  7,  2,  3,  4,  5,  6,  7,  8,  9,
+      4,  5,  6,  7,  8,  9, 10, 11,  6,  7,  8,  9, 10, 11, 12, 13 },
+    // Permute for luma filter 3.
+    { 1,  2,  3,  4,  5,  6,  7,  8,  3,  4,  5,  6,  7,  8,  9, 10,
+      5,  6,  7,  8,  9, 10, 11, 12,  7,  8,  9, 10, 11, 12, 13, 14 }
+};
+
+static const int8_t matmul_luma_filter[2][16] = {
+    { -1, 4, -10, 58, 17, -5, 1, 0, 0, -1, 4, -10, 58, 17, -5, 1 },
+    { 1, -5, 17, 58, -10, 4, -1, 0, 0, 1, -5, 17, 58, -10, 4, -1 }
+};
+
+uint8x8_t inline filter8_8_pp(uint8x16_t samples, const int8x8_t filter,
+                              const uint8x16x3_t tbl)
+{
+    // Permute input samples for dot product.
+    // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+    uint8x16_t perm_s0 = vqtbl1q_u8(samples, tbl.val[0]);
+    // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
+    uint8x16_t perm_s1 = vqtbl1q_u8(samples, tbl.val[1]);
+    // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+    uint8x16_t perm_s2 = vqtbl1q_u8(samples, tbl.val[2]);
+
+    int32x4_t dotprod_lo = vusdotq_lane_s32(vdupq_n_s32(0), perm_s0, filter, 0);
+    dotprod_lo = vusdotq_lane_s32(dotprod_lo, perm_s1, filter, 1);
+    int32x4_t dotprod_hi = vusdotq_lane_s32(vdupq_n_s32(0), perm_s1, filter, 0);
+    dotprod_hi = vusdotq_lane_s32(dotprod_hi, perm_s2, filter, 1);
+
+    // Narrow and combine.
+    int16x8_t dotprod = vcombine_s16(vmovn_s32(dotprod_lo),
+                                     vmovn_s32(dotprod_hi));
+    return vqrshrun_n_s16(dotprod, IF_FILTER_PREC);
+}
+
+void inline init_sample_permute(uint8x8_t *samples, const uint8x16x3_t tbl,
+                                uint8x16_t *d)
+{
+    // Permute input samples for dot product.
+    // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+    d[0] = vqtbl1q_u8(vcombine_u8(samples[0], vdup_n_u8(0)), tbl.val[0]);
+    d[1] = vqtbl1q_u8(vcombine_u8(samples[1], vdup_n_u8(0)), tbl.val[0]);
+    d[2] = vqtbl1q_u8(vcombine_u8(samples[2], vdup_n_u8(0)), tbl.val[0]);
+    d[3] = vqtbl1q_u8(vcombine_u8(samples[3], vdup_n_u8(0)), tbl.val[0]);
+}
+
+uint8x8_t inline filter8_8_pp_reuse(uint8x16_t samples, const int8x8_t filter,
+                                    const uint8x16x3_t tbl, uint8x16_t &perm_s0)
+{
+    // Permute input samples for dot product.
+    // { 0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6 }
+    // Already in perm_s0.
+    // { 4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10 }
+    uint8x16_t perm_s1 = vqtbl1q_u8(samples, tbl.val[1]);
+    // { 8,  9, 10, 11,  9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+    uint8x16_t perm_s2 = vqtbl1q_u8(samples, tbl.val[2]);
+
+    int32x4_t dotprod_lo = vusdotq_lane_s32(vdupq_n_s32(0), perm_s0, filter, 0);
+    dotprod_lo = vusdotq_lane_s32(dotprod_lo, perm_s1, filter, 1);
+    int32x4_t dotprod_hi = vusdotq_lane_s32(vdupq_n_s32(0), perm_s1, filter, 0);
+    dotprod_hi = vusdotq_lane_s32(dotprod_hi, perm_s2, filter, 1);
+
+    // Save for re-use in next iteration.
+    perm_s0 = perm_s2;
+
+    // Narrow and combine.
+    int16x8_t dotprod = vcombine_s16(vmovn_s32(dotprod_lo),
+                                     vmovn_s32(dotprod_hi));
+    return vqrshrun_n_s16(dotprod, IF_FILTER_PREC);
+}
+
+uint8x8_t inline filter8_8_pp_matmul(uint8x16_t samples, const int8x16_t filter,
+                                     const uint8x16x2_t tbl)
+{
+    // Permute input samples for 8x2 by 2x8 matrix multiply.
+    uint8x16_t perm_s0 = vqtbl1q_u8(samples, tbl.val[0]);
+    uint8x16_t perm_s1 = vqtbl1q_u8(samples, tbl.val[1]);
+
+    int32x4_t matmul_lo = vusmmlaq_s32(vdupq_n_s32(0), perm_s0, filter);
+    int32x4_t matmul_hi = vusmmlaq_s32(vdupq_n_s32(0), perm_s1, filter);
+
+    // Narrow and combine.
+    int16x8_t matmul = vcombine_s16(vmovn_s32(matmul_lo), vmovn_s32(matmul_hi));
+    return vqrshrun_n_s16(matmul, IF_FILTER_PREC);
+}
+} // Unnamed namespace.
+
+namespace X265_NS {
+template<int width, int height>
+void inline interp8_horiz_pp_dotprod(const uint8_t *src, intptr_t srcStride,
+                                     uint8_t *dst, intptr_t dstStride,
+                                     int coeffIdx)
+{
+    const int N_TAPS = 8;
+    src -= N_TAPS / 2 - 1;
+
+    const uint8x16x3_t tbl = vld1q_u8_x3(dotprod_permute_tbl);
+    const int8x8_t filter = vmovn_s16(vld1q_s16(g_lumaFilter[coeffIdx]));
+
+    for (int row = 0; row < height; row += 4)
+    {
+        int col = 0;
+        if (width >= 32)
+        {
+            // Peel first sample permute to enable passing between iterations.
+            uint8x8_t s0[4];
+            load_u8x8xn<4>(src, srcStride, s0);
+            uint8x16_t ps0[4];
+            init_sample_permute(s0, tbl, ps0);
+
+            for (; (col + 16) <= width; col += 16)
+            {
+                uint8x16_t s_lo[4], s_hi[4];
+                load_u8x16xn<4>(src + col + 0, srcStride, s_lo);
+                load_u8x16xn<4>(src + col + 8, srcStride, s_hi);
+
+                uint8x8_t d_lo[4];
+                d_lo[0] = filter8_8_pp_reuse(s_lo[0], filter, tbl, ps0[0]);
+                d_lo[1] = filter8_8_pp_reuse(s_lo[1], filter, tbl, ps0[1]);
+                d_lo[2] = filter8_8_pp_reuse(s_lo[2], filter, tbl, ps0[2]);
+                d_lo[3] = filter8_8_pp_reuse(s_lo[3], filter, tbl, ps0[3]);
+
+                uint8x8_t d_hi[4];
+                d_hi[0] = filter8_8_pp_reuse(s_hi[0], filter, tbl, ps0[0]);
+                d_hi[1] = filter8_8_pp_reuse(s_hi[1], filter, tbl, ps0[1]);
+                d_hi[2] = filter8_8_pp_reuse(s_hi[2], filter, tbl, ps0[2]);
+                d_hi[3] = filter8_8_pp_reuse(s_hi[3], filter, tbl, ps0[3]);
+
+                store_u8x8xn<4>(dst + col + 0, dstStride, d_lo);
+                store_u8x8xn<4>(dst + col + 8, dstStride, d_hi);
+            }
+        }
+        else
+        {
+            for (; col + 8 <= width; col += 8)
+            {
+                uint8x16_t s[4];
+                load_u8x16xn<4>(src + col, srcStride, s);
+
+                uint8x8_t d[4];
+                d[0] = filter8_8_pp(s[0], filter, tbl);
+                d[1] = filter8_8_pp(s[1], filter, tbl);
+                d[2] = filter8_8_pp(s[2], filter, tbl);
+                d[3] = filter8_8_pp(s[3], filter, tbl);
+
+                store_u8x8xn<4>(dst + col, dstStride, d);
+            }
+        }
+        for (; col < width; col += 4)
+        {
+            uint8x16_t s[4];
+            load_u8x16xn<4>(src + col, srcStride, s);
+
+            uint8x8_t d[4];
+            d[0] = filter8_8_pp(s[0], filter, tbl);
+            d[1] = filter8_8_pp(s[1], filter, tbl);
+            d[2] = filter8_8_pp(s[2], filter, tbl);
+            d[3] = filter8_8_pp(s[3], filter, tbl);
+
+            store_u8x4xn<4>(dst + col, dstStride, d);
+        }
+
+        src += 4 * srcStride;
+        dst += 4 * dstStride;
+    }
+}
+
+template<int coeffIdx, int width, int height>
+void inline interp8_horiz_pp_matmul(const uint8_t *src, intptr_t srcStride,
+                                    uint8_t *dst, intptr_t dstStride)
+{
+    const int N_TAPS = 8;
+    src -= N_TAPS / 2 - 1;
+
+    // coeffIdx is 1 or 3 (the g_lumaFilter index); map it to 0 or 1 to
+    // select the matmul filter and permute table.
+    const int index = coeffIdx >> 1;
+    const uint8x16x2_t tbl = vld1q_u8_x2(matmul_permute_tbl[index]);
+    const int8x16_t filter = vld1q_s8(matmul_luma_filter[index]);
+
+    for (int row = 0; row < height; row += 4)
+    {
+        int col = 0;
+        if (width >= 32)
+        {
+            for (; (col + 16) <= width; col += 16)
+            {
+                uint8x16_t s_lo[4], s_hi[4];
+                load_u8x16xn<4>(src + col + 0, srcStride, s_lo);
+                load_u8x16xn<4>(src + col + 8, srcStride, s_hi);
+
+                uint8x8_t d_lo[4];
+                d_lo[0] = filter8_8_pp_matmul(s_lo[0], filter, tbl);
+                d_lo[1] = filter8_8_pp_matmul(s_lo[1], filter, tbl);
+                d_lo[2] = filter8_8_pp_matmul(s_lo[2], filter, tbl);
+                d_lo[3] = filter8_8_pp_matmul(s_lo[3], filter, tbl);
+
+                uint8x8_t d_hi[4];
+                d_hi[0] = filter8_8_pp_matmul(s_hi[0], filter, tbl);
+                d_hi[1] = filter8_8_pp_matmul(s_hi[1], filter, tbl);
+                d_hi[2] = filter8_8_pp_matmul(s_hi[2], filter, tbl);
+                d_hi[3] = filter8_8_pp_matmul(s_hi[3], filter, tbl);
+
+                store_u8x8xn<4>(dst + col + 0, dstStride, d_lo);
+                store_u8x8xn<4>(dst + col + 8, dstStride, d_hi);
+            }
+        }
+        else
+        {
+            for (; col + 8 <= width; col += 8)
+            {
+                uint8x16_t s[4];
+                load_u8x16xn<4>(src + col, srcStride, s);
+
+                uint8x8_t d[4];
+                d[0] = filter8_8_pp_matmul(s[0], filter, tbl);
+                d[1] = filter8_8_pp_matmul(s[1], filter, tbl);
+                d[2] = filter8_8_pp_matmul(s[2], filter, tbl);
+                d[3] = filter8_8_pp_matmul(s[3], filter, tbl);
+
+                store_u8x8xn<4>(dst + col, dstStride, d);
+            }
+        }
+        for (; col < width; col += 4)
+        {
+            uint8x16_t s[4];
+            load_u8x16xn<4>(src + col, srcStride, s);
+
+            uint8x8_t d[4];
+            d[0] = filter8_8_pp_matmul(s[0], filter, tbl);
+            d[1] = filter8_8_pp_matmul(s[1], filter, tbl);
+            d[2] = filter8_8_pp_matmul(s[2], filter, tbl);
+            d[3] = filter8_8_pp_matmul(s[3], filter, tbl);
+
+            store_u8x4xn<4>(dst + col, dstStride, d);
+        }
+
+        src += 4 * srcStride;
+        dst += 4 * dstStride;
+    }
+}
+
+template<int width, int height>
+void interp8_horiz_pp_i8mm(const uint8_t *src, intptr_t srcStride, uint8_t *dst,
+                           intptr_t dstStride, int coeffIdx)
+{
+    switch (coeffIdx)
+    {
+    case 1:
+        return interp8_horiz_pp_matmul<1, width, height>(src, srcStride, dst,
+                                                         dstStride);
+    case 2:
+        return interp8_horiz_pp_dotprod<width, height>(src, srcStride, dst,
+                                                       dstStride, coeffIdx);
+    case 3:
+        return interp8_horiz_pp_matmul<3, width, height>(src, srcStride, dst,
+                                                         dstStride);
+    }
+}
+
+#define LUMA_I8MM(W, H) \
+        p.pu[LUMA_ ## W ## x ## H].luma_hpp = interp8_horiz_pp_i8mm<W, H>;
+
+void setupFilterPrimitives_neon_i8mm(EncoderPrimitives &p)
+{
+    LUMA_I8MM(4, 4);
+    LUMA_I8MM(4, 8);
+    LUMA_I8MM(4, 16);
+    LUMA_I8MM(8, 4);
+    LUMA_I8MM(8, 8);
+    LUMA_I8MM(8, 16);
+    LUMA_I8MM(8, 32);
+    LUMA_I8MM(12, 16);
+    LUMA_I8MM(16, 4);
+    LUMA_I8MM(16, 8);
+    LUMA_I8MM(16, 12);
+    LUMA_I8MM(16, 16);
+    LUMA_I8MM(16, 32);
+    LUMA_I8MM(16, 64);
+    LUMA_I8MM(24, 32);
+    LUMA_I8MM(32, 8);
+    LUMA_I8MM(32, 16);
+    LUMA_I8MM(32, 24);
+    LUMA_I8MM(32, 32);
+    LUMA_I8MM(32, 64);
+    LUMA_I8MM(48, 64);
+    LUMA_I8MM(64, 16);
+    LUMA_I8MM(64, 32);
+    LUMA_I8MM(64, 48);
+    LUMA_I8MM(64, 64);
+}
+}
+
+#else // if !HIGH_BIT_DEPTH
+namespace X265_NS {
+void setupFilterPrimitives_neon_i8mm(EncoderPrimitives &)
+{
+}
+}
+#endif // !HIGH_BIT_DEPTH
+
+#endif // defined(HAVE_NEON_I8MM)
diff --git a/source/common/aarch64/filter-neon-i8mm.h b/source/common/aarch64/filter-neon-i8mm.h
new file mode 100644
index 000000000..aa9cd8225
--- /dev/null
+++ b/source/common/aarch64/filter-neon-i8mm.h
@@ -0,0 +1,37 @@
+/*****************************************************************************
+ * Copyright (C) 2024 MulticoreWare, Inc
+ *
+ * Authors: Hari Limaye <hari.limaye at arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_FILTER_NEON_I8MM_H
+#define X265_FILTER_NEON_I8MM_H
+
+#if defined(HAVE_NEON_I8MM)
+
+#include "primitives.h"
+
+namespace X265_NS {
+void setupFilterPrimitives_neon_i8mm(EncoderPrimitives &p);
+}
+
+#endif // defined(HAVE_NEON_I8MM)
+
+#endif // X265_FILTER_NEON_I8MM_H
-- 
2.42.1
