[x265] [PATCH 04/12] AArch64: Support all block sizes in p2s Neon

Fri Aug 30 19:19:25 UTC 2024

The Neon intrinsics-based optimised p2s (pixel-to-short) implementation
currently only supports block sizes with widths that are a multiple of 8.
This patch adds support for all LUMA and CHROMA block sizes by processing
the remaining columns of each row in 4-wide and then 2-wide steps.
---
 source/common/aarch64/filter-prim.cpp | 36 ++++++++++++++++++++++++---
 source/common/aarch64/mem-neon.h      | 11 ++++++++
 2 files changed, 44 insertions(+), 3 deletions(-)

diff --git a/source/common/aarch64/filter-prim.cpp b/source/common/aarch64/filter-prim.cpp
index 1bccb9f86..adaae2132 100644
--- a/source/common/aarch64/filter-prim.cpp
+++ b/source/common/aarch64/filter-prim.cpp
@@ -1,6 +1,8 @@
 #if HAVE_NEON
 
 #include "filter-prim.h"
+#include "mem-neon.h"
+
 #include <arm_neon.h>
 
 namespace X265_NS
@@ -14,12 +16,12 @@ template<int width, int height>
 void filterPixelToShort_neon(const pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride)
 {
     const int shift = IF_INTERNAL_PREC - X265_DEPTH;
-    int row, col;
     const int16x8_t off = vdupq_n_s16(IF_INTERNAL_OFFS);
-    for (row = 0; row < height; row++)
+    for (int row = 0; row < height; row++)
     {
 
-        for (col = 0; col < width; col += 8)
+        int col = 0;
+        for (; col + 8 <= width; col += 8)
         {
             uint16x8_t in;
 
@@ -33,6 +35,34 @@ void filterPixelToShort_neon(const pixel *src, intptr_t srcStride, int16_t *dst,
             tmp = vsubq_s16(tmp, off);
             vst1q_s16(dst + col, tmp);
         }
+        for (; col + 4 <= width; col += 4)
+        {
+            uint16x4_t in;
+
+#if HIGH_BIT_DEPTH
+            in = vld1_u16(src + col);
+#else
+            in = vget_low_u16(vmovl_u8(vld1_u8(src + col)));
+#endif
+
+            int16x4_t tmp = vreinterpret_s16_u16(vshl_n_u16(in, shift));
+            tmp = vsub_s16(tmp, vget_low_s16(off));
+            vst1_s16(dst + col, tmp);
+        }
+        for (; col < width; col += 2)
+        {
+            uint16x4_t in;
+
+#if HIGH_BIT_DEPTH
+            in = vld1_u16(src + col);
+#else
+            in = vget_low_u16(vmovl_u8(vld1_u8(src + col)));
+#endif
+
+            int16x4_t tmp = vreinterpret_s16_u16(vshl_n_u16(in, shift));
+            tmp = vsub_s16(tmp, vget_low_s16(off));
+            store_s16x2xn<1>(dst + col, dstStride, &tmp);
+        }
 
         src += srcStride;
         dst += dstStride;
diff --git a/source/common/aarch64/mem-neon.h b/source/common/aarch64/mem-neon.h
index bee31a966..197017269 100644
--- a/source/common/aarch64/mem-neon.h
+++ b/source/common/aarch64/mem-neon.h
@@ -56,4 +56,15 @@ static void inline store_u8x4x1(uint8_t *d, const uint8x8_t s)
     vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(s), 0);
 }
 
+template<int N>
+static void inline store_s16x2xn(int16_t *dst, intptr_t dst_stride,
+                                 const int16x4_t *src)
+{
+    for (int i = 0; i < N; ++i)
+    {
+        vst1_lane_s32((int32_t*)dst, vreinterpret_s32_s16(src[i]), 0);
+        dst += dst_stride;
+    }
+}
+
 #endif // X265_COMMON_AARCH64_MEM_NEON_H
-- 
2.42.1

-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0004-AArch64-Support-all-block-sizes-in-p2s-Neon.patch
Type: text/x-patch
Size: 3420 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20240830/aa49c47d/attachment-0001.bin>