[x265] [PATCH 04/12] AArch64: Support all block sizes in p2s Neon
Hari Limaye
hari.limaye at arm.com
Fri Aug 30 19:19:25 UTC 2024
The Neon intrinsics-based optimised p2s implementation currently only
supports block sizes with widths that are a multiple of 8. This patch
adds support for all LUMA and CHROMA block sizes.
---
source/common/aarch64/filter-prim.cpp | 36 ++++++++++++++++++++++++---
source/common/aarch64/mem-neon.h | 11 ++++++++
2 files changed, 44 insertions(+), 3 deletions(-)
diff --git a/source/common/aarch64/filter-prim.cpp b/source/common/aarch64/filter-prim.cpp
index 1bccb9f86..adaae2132 100644
--- a/source/common/aarch64/filter-prim.cpp
+++ b/source/common/aarch64/filter-prim.cpp
@@ -1,6 +1,8 @@
#if HAVE_NEON
#include "filter-prim.h"
+#include "mem-neon.h"
+
#include <arm_neon.h>
namespace X265_NS
@@ -14,12 +16,12 @@ template<int width, int height>
void filterPixelToShort_neon(const pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride)
{
const int shift = IF_INTERNAL_PREC - X265_DEPTH;
- int row, col;
const int16x8_t off = vdupq_n_s16(IF_INTERNAL_OFFS);
- for (row = 0; row < height; row++)
+ for (int row = 0; row < height; row++)
{
- for (col = 0; col < width; col += 8)
+ int col = 0;
+ for (; col + 8 <= width; col += 8)
{
uint16x8_t in;
@@ -33,6 +35,34 @@ void filterPixelToShort_neon(const pixel *src, intptr_t srcStride, int16_t *dst,
tmp = vsubq_s16(tmp, off);
vst1q_s16(dst + col, tmp);
}
+ for (; col + 4 <= width; col += 4)
+ {
+ uint16x4_t in;
+
+#if HIGH_BIT_DEPTH
+ in = vld1_u16(src + col);
+#else
+ in = vget_low_u16(vmovl_u8(vld1_u8(src + col)));
+#endif
+
+ int16x4_t tmp = vreinterpret_s16_u16(vshl_n_u16(in, shift));
+ tmp = vsub_s16(tmp, vget_low_s16(off));
+ vst1_s16(dst + col, tmp);
+ }
+ for (; col < width; col += 2)
+ {
+ uint16x4_t in;
+
+#if HIGH_BIT_DEPTH
+ in = vld1_u16(src + col);
+#else
+ in = vget_low_u16(vmovl_u8(vld1_u8(src + col)));
+#endif
+
+ int16x4_t tmp = vreinterpret_s16_u16(vshl_n_u16(in, shift));
+ tmp = vsub_s16(tmp, vget_low_s16(off));
+ store_s16x2xn<1>(dst + col, dstStride, &tmp);
+ }
src += srcStride;
dst += dstStride;
diff --git a/source/common/aarch64/mem-neon.h b/source/common/aarch64/mem-neon.h
index bee31a966..197017269 100644
--- a/source/common/aarch64/mem-neon.h
+++ b/source/common/aarch64/mem-neon.h
@@ -56,4 +56,15 @@ static void inline store_u8x4x1(uint8_t *d, const uint8x8_t s)
vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(s), 0);
}
+template<int N> // N: number of rows written, fixed at compile time
+static void inline store_s16x2xn(int16_t *dst, intptr_t dst_stride,
+ const int16x4_t *src) // src: array of N vectors, one per output row
+{
+ for (int i = 0; i < N; ++i)
+ {
+ vst1_lane_s32((int32_t*)dst, vreinterpret_s32_s16(src[i]), 0); // store lane 0 viewed as 32 bits, i.e. only the first two int16 values of src[i]
+ dst += dst_stride; // step to the next row; stride is counted in int16_t elements
+ }
+}
+
#endif // X265_COMMON_AARCH64_MEM_NEON_H
--
2.42.1
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0004-AArch64-Support-all-block-sizes-in-p2s-Neon.patch
Type: text/x-patch
Size: 3420 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20240830/aa49c47d/attachment-0001.bin>
More information about the x265-devel
mailing list