[x265] [PATCH 04/14] AArch64: Add Armv8.4 Neon DotProd implementations of filter_hpp

Fri Sep 6 13:33:24 UTC 2024

Add implementations of filter_hpp primitives using Neon DotProd
instructions, which are mandatory from Armv8.4.

Geomean uplift across all block sizes for chroma filters, relative to
Armv8.0 Neon implementation:

    Neoverse N1: 1.123x
    Neoverse N2: 0.921x
    Neoverse V1: 1.021x
    Neoverse V2: 0.995x

For N2, V1 and V2 this implementation will be superseded by a Neon I8MM
implementation in a subsequent patch.
---
 source/common/aarch64/filter-neon-dotprod.cpp | 212 ++++++++++++++++++
 1 file changed, 212 insertions(+)

diff --git a/source/common/aarch64/filter-neon-dotprod.cpp b/source/common/aarch64/filter-neon-dotprod.cpp
index 4cd052fb0..8247dc29a 100644
--- a/source/common/aarch64/filter-neon-dotprod.cpp
+++ b/source/common/aarch64/filter-neon-dotprod.cpp
@@ -186,6 +186,28 @@ int16x8_t inline filter8_8_ps_reuse(uint8x16_t samples, const int8x8_t filter,
     // Narrow and combine.
     return vcombine_s16(vmovn_s32(dotprod_lo), vmovn_s32(dotprod_hi));
 }
+
+uint8x8_t inline filter4_8_pp(uint8x16_t samples, const int8x8_t filter,
+                              const int32x4_t constant, const uint8x16x2_t tbl)
+{
+    // Transform sample range from uint8_t to int8_t for signed dot product.
+    int8x16_t samples_s8 =
+        vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
+
+    // Permute input samples for dot product.
+    // {0,  1,  2,  3 ,  1,  2,  3,  4 ,  2,  3,  4,  5 ,  3,  4,  5,  6}
+    int8x16_t perm_samples_0 = vqtbl1q_s8(samples_s8, tbl.val[0]);
+    // {4,  5,  6,  7 ,  5,  6,  7,  8 ,  6,  7,  8,  9 ,  7,  8,  9, 10}
+    int8x16_t perm_samples_1 = vqtbl1q_s8(samples_s8, tbl.val[1]);
+
+    int32x4_t dotprod_lo = vdotq_lane_s32(constant, perm_samples_0, filter, 0);
+    int32x4_t dotprod_hi = vdotq_lane_s32(constant, perm_samples_1, filter, 0);
+
+    // Narrow and combine.
+    int16x8_t dotprod = vcombine_s16(vmovn_s32(dotprod_lo),
+                                     vmovn_s32(dotprod_hi));
+    return vqrshrun_n_s16(dotprod, IF_FILTER_PREC);
+}
 } // Unnamed namespace.
 
 namespace X265_NS {
@@ -388,10 +410,124 @@ void interp8_horiz_ps_dotprod(const uint8_t *src, intptr_t srcStride,
     }
 }
 
+template<int N, int width, int height>
+void interp_horiz_pp_neon(const pixel *src, intptr_t srcStride, pixel *dst,
+                          intptr_t dstStride, int coeffIdx);
+
+template<int width, int height>
+void interp4_horiz_pp_dotprod(const uint8_t *src, intptr_t srcStride,
+                              uint8_t *dst, intptr_t dstStride, int coeffIdx)
+{
+    const int N_TAPS = 4;
+
+    if (coeffIdx == 4)
+        return interp_horiz_pp_neon<N_TAPS, width, height>(src, srcStride, dst,
+                                                           dstStride, coeffIdx);
+
+    src -= N_TAPS / 2 - 1;
+
+    const uint8x16x2_t tbl = vld1q_u8_x2(dotprod_permute_tbl);
+    const int16x4_t filter_16 = vld1_s16(g_chromaFilter[coeffIdx]);
+    const int8x8_t filter = vmovn_s16(vcombine_s16(filter_16, vdup_n_s16(0)));
+
+    // Correction accounting for sample range transform from uint8_t to int8_t.
+    const int32x4_t c = vdupq_n_s32(64 * 128);
+
+    for (int row = 0; row + 4 <= height; row += 4)
+    {
+        int col = 0;
+        for (; col + 16 <= width; col += 16)
+        {
+            uint8x16_t s0[4], s1[4];
+            load_u8x16xn<4>(src + col, srcStride, s0);
+            load_u8x16xn<4>(src + col + 8, srcStride, s1);
+
+            uint8x8_t d_lo[4];
+            d_lo[0] = filter4_8_pp(s0[0], filter, c, tbl);
+            d_lo[1] = filter4_8_pp(s0[1], filter, c, tbl);
+            d_lo[2] = filter4_8_pp(s0[2], filter, c, tbl);
+            d_lo[3] = filter4_8_pp(s0[3], filter, c, tbl);
+
+            uint8x8_t d_hi[4];
+            d_hi[0] = filter4_8_pp(s1[0], filter, c, tbl);
+            d_hi[1] = filter4_8_pp(s1[1], filter, c, tbl);
+            d_hi[2] = filter4_8_pp(s1[2], filter, c, tbl);
+            d_hi[3] = filter4_8_pp(s1[3], filter, c, tbl);
+
+            uint8x16_t d[4];
+            d[0] = vcombine_u8(d_lo[0], d_hi[0]);
+            d[1] = vcombine_u8(d_lo[1], d_hi[1]);
+            d[2] = vcombine_u8(d_lo[2], d_hi[2]);
+            d[3] = vcombine_u8(d_lo[3], d_hi[3]);
+
+            store_u8x16xn<4>(dst + col, dstStride, d);
+        }
+
+        for (; col + 8 <= width; col += 8)
+        {
+            uint8x16_t s[4];
+            load_u8x16xn<4>(src + col, srcStride, s);
+
+            uint8x8_t d[4];
+            d[0] = filter4_8_pp(s[0], filter, c, tbl);
+            d[1] = filter4_8_pp(s[1], filter, c, tbl);
+            d[2] = filter4_8_pp(s[2], filter, c, tbl);
+            d[3] = filter4_8_pp(s[3], filter, c, tbl);
+
+            store_u8x8xn<4>(dst + col, dstStride, d);
+        }
+
+        // Block sizes 12xH, 6xH, 4xH, 2xH.
+        if (width % 8 != 0)
+        {
+            uint8x16_t s[4];
+            load_u8x16xn<4>(src + col, srcStride, s);
+
+            uint8x8_t d[4];
+            d[0] = filter4_8_pp(s[0], filter, c, tbl);
+            d[1] = filter4_8_pp(s[1], filter, c, tbl);
+            d[2] = filter4_8_pp(s[2], filter, c, tbl);
+            d[3] = filter4_8_pp(s[3], filter, c, tbl);
+
+            const int n_store = width < 8 ? width : 4;
+            store_u8xnxm<n_store, 4>(dst + col, dstStride, d);
+        }
+
+        src += 4 * srcStride;
+        dst += 4 * dstStride;
+    }
+
+    // Block sizes 8x6, 8x2, 4x2.
+    if (height & 2)
+    {
+        uint8x16_t s[2];
+        load_u8x16xn<2>(src, srcStride, s);
+
+        uint8x8_t d[2];
+        d[0] = filter4_8_pp(s[0], filter, c, tbl);
+        d[1] = filter4_8_pp(s[1], filter, c, tbl);
+
+        const int n_store = width < 8 ? width : 8;
+        store_u8xnxm<n_store, 2>(dst, dstStride, d);
+    }
+}
+
 #define LUMA_DOTPROD(W, H) \
         p.pu[LUMA_ ## W ## x ## H].luma_hpp = interp8_horiz_pp_dotprod<W, H>; \
         p.pu[LUMA_ ## W ## x ## H].luma_hps = interp8_horiz_ps_dotprod<W, H>;
 
+#define CHROMA_420_DOTPROD(W, H) \
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_hpp = \
+            interp4_horiz_pp_dotprod<W, H>;
+
+#define CHROMA_422_DOTPROD(W, H) \
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].filter_hpp = \
+            interp4_horiz_pp_dotprod<W, H>;
+
+#define CHROMA_444_DOTPROD(W, H) \
+        p.chroma[X265_CSP_I444].pu[LUMA_ ## W ## x ## H].filter_hpp = \
+            interp4_horiz_pp_dotprod<W, H>;
+
 void setupFilterPrimitives_neon_dotprod(EncoderPrimitives &p)
 {
     LUMA_DOTPROD(4, 4);
@@ -419,6 +555,82 @@ void setupFilterPrimitives_neon_dotprod(EncoderPrimitives &p)
     LUMA_DOTPROD(64, 32);
     LUMA_DOTPROD(64, 48);
     LUMA_DOTPROD(64, 64);
+
+    CHROMA_420_DOTPROD(2, 4);
+    CHROMA_420_DOTPROD(2, 8);
+    CHROMA_420_DOTPROD(4, 2);
+    CHROMA_420_DOTPROD(4, 4);
+    CHROMA_420_DOTPROD(4, 8);
+    CHROMA_420_DOTPROD(4, 16);
+    CHROMA_420_DOTPROD(6, 8);
+    CHROMA_420_DOTPROD(12, 16);
+    CHROMA_420_DOTPROD(8, 2);
+    CHROMA_420_DOTPROD(8, 4);
+    CHROMA_420_DOTPROD(8, 6);
+    CHROMA_420_DOTPROD(8, 8);
+    CHROMA_420_DOTPROD(8, 16);
+    CHROMA_420_DOTPROD(8, 32);
+    CHROMA_420_DOTPROD(16, 4);
+    CHROMA_420_DOTPROD(16, 8);
+    CHROMA_420_DOTPROD(16, 12);
+    CHROMA_420_DOTPROD(16, 16);
+    CHROMA_420_DOTPROD(16, 32);
+    CHROMA_420_DOTPROD(24, 32);
+    CHROMA_420_DOTPROD(32, 8);
+    CHROMA_420_DOTPROD(32, 16);
+    CHROMA_420_DOTPROD(32, 24);
+    CHROMA_420_DOTPROD(32, 32);
+
+    CHROMA_422_DOTPROD(2, 8);
+    CHROMA_422_DOTPROD(2, 16);
+    CHROMA_422_DOTPROD(4, 4);
+    CHROMA_422_DOTPROD(4, 8);
+    CHROMA_422_DOTPROD(4, 16);
+    CHROMA_422_DOTPROD(4, 32);
+    CHROMA_422_DOTPROD(6, 16);
+    CHROMA_422_DOTPROD(12, 32);
+    CHROMA_422_DOTPROD(8, 4);
+    CHROMA_422_DOTPROD(8, 8);
+    CHROMA_422_DOTPROD(8, 12);
+    CHROMA_422_DOTPROD(8, 16);
+    CHROMA_422_DOTPROD(8, 32);
+    CHROMA_422_DOTPROD(8, 64);
+    CHROMA_422_DOTPROD(16, 8);
+    CHROMA_422_DOTPROD(16, 16);
+    CHROMA_422_DOTPROD(16, 24);
+    CHROMA_422_DOTPROD(16, 32);
+    CHROMA_422_DOTPROD(16, 64);
+    CHROMA_422_DOTPROD(24, 64);
+    CHROMA_422_DOTPROD(32, 16);
+    CHROMA_422_DOTPROD(32, 32);
+    CHROMA_422_DOTPROD(32, 48);
+    CHROMA_422_DOTPROD(32, 64);
+
+    CHROMA_444_DOTPROD(4, 4);
+    CHROMA_444_DOTPROD(4, 8);
+    CHROMA_444_DOTPROD(4, 16);
+    CHROMA_444_DOTPROD(12, 16);
+    CHROMA_444_DOTPROD(8, 4);
+    CHROMA_444_DOTPROD(8, 8);
+    CHROMA_444_DOTPROD(8, 16);
+    CHROMA_444_DOTPROD(8, 32);
+    CHROMA_444_DOTPROD(16, 4);
+    CHROMA_444_DOTPROD(16, 8);
+    CHROMA_444_DOTPROD(16, 12);
+    CHROMA_444_DOTPROD(16, 16);
+    CHROMA_444_DOTPROD(16, 32);
+    CHROMA_444_DOTPROD(16, 64);
+    CHROMA_444_DOTPROD(24, 32);
+    CHROMA_444_DOTPROD(32, 8);
+    CHROMA_444_DOTPROD(32, 16);
+    CHROMA_444_DOTPROD(32, 24);
+    CHROMA_444_DOTPROD(32, 32);
+    CHROMA_444_DOTPROD(32, 64);
+    CHROMA_444_DOTPROD(48, 64);
+    CHROMA_444_DOTPROD(64, 16);
+    CHROMA_444_DOTPROD(64, 32);
+    CHROMA_444_DOTPROD(64, 48);
+    CHROMA_444_DOTPROD(64, 64);
 }
 }
 
-- 
2.42.1

-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0004-AArch64-Add-Armv8.4-Neon-DotProd-implementations-of-.patch
Type: text/x-patch
Size: 9505 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20240906/ce97c79f/attachment-0001.bin>