[x265] [PATCH 09/18] AArch64: Use proper load/store intrinsics in filter-prim.cpp
Hari Limaye
hari.limaye at arm.com
Tue Aug 13 15:20:30 UTC 2024
Use proper load/store intrinsics instead of casts in
source/common/aarch64/filter-prim.cpp.
Refactoring to use explicit vector type conversions, and optimisations to
reduce the instruction count, are left to a later patch.
---
source/common/aarch64/filter-prim.cpp | 65 ++++++++++++---------------
1 file changed, 28 insertions(+), 37 deletions(-)
diff --git a/source/common/aarch64/filter-prim.cpp b/source/common/aarch64/filter-prim.cpp
index 986f5af6b..1898d362c 100644
--- a/source/common/aarch64/filter-prim.cpp
+++ b/source/common/aarch64/filter-prim.cpp
@@ -26,15 +26,14 @@ void filterPixelToShort_neon(const pixel *src, intptr_t srcStride, int16_t *dst,
int16x8_t in;
#if HIGH_BIT_DEPTH
- in = *(int16x8_t *)&src[col];
+ in = vld1q_u16(src + col);
#else
- in = vmovl_u8(*(uint8x8_t *)&src[col]);
+ in = vmovl_u8(vld1_u8(src + col));
#endif
int16x8_t tmp = vshlq_n_s16(in, shift);
tmp = vsubq_s16(tmp, off);
- *(int16x8_t *)&dst[col] = tmp;
-
+ vst1q_s16(dst + col, tmp);
}
src += srcStride;
@@ -53,8 +52,7 @@ void interp_horiz_pp_neon(const pixel *src, intptr_t srcStride, pixel *dst, intp
int cStride = 1;
src -= (N / 2 - 1) * cStride;
- int16x8_t vc;
- vc = *(int16x8_t *)coeff;
+ int16x8_t vc = vld1q_s16(coeff);
int16x4_t low_vc = vget_low_s16(vc);
int16x4_t high_vc = vget_high_s16(vc);
@@ -73,9 +71,9 @@ void interp_horiz_pp_neon(const pixel *src, intptr_t srcStride, pixel *dst, intp
for (int i = 0; i < N; i++)
{
#if HIGH_BIT_DEPTH
- input[i] = *(int16x8_t *)&src[col + i];
+ input[i] = vld1q_u16(src + col + i);
#else
- input[i] = vmovl_u8(*(uint8x8_t *)&src[col + i]);
+ input[i] = vmovl_u8(vld1_u8(src + col + i));
#endif
}
vsum1 = voffset;
@@ -113,10 +111,10 @@ void interp_horiz_pp_neon(const pixel *src, intptr_t srcStride, pixel *dst, intp
vsum = vminq_s16(vsum, vdupq_n_s16(maxVal));
vsum = vmaxq_s16(vsum, vdupq_n_s16(0));
#if HIGH_BIT_DEPTH
- *(int16x8_t *)&dst[col] = vsum;
+ vst1q_u16(dst + col, vsum);
#else
uint8x16_t usum = vuzp1q_u8(vsum, vsum);
- *(uint8x8_t *)&dst[col] = vget_low_u8(usum);
+ vst1_u8(dst + col, vget_low_u8(usum));
#endif
}
@@ -156,7 +154,7 @@ void interp_horiz_ps_neon(const uint16_t *src, intptr_t srcStride, int16_t *dst,
int16x8_t input[N];
for (int i = 0; i < N; i++)
{
- input[i] = vld1q_s16((int16_t *)&src[col + i]);
+ input[i] = vld1q_u16(src + col + i);
}
vsum = voffset;
@@ -219,8 +217,7 @@ void interp_horiz_ps_neon(const uint8_t *src, intptr_t srcStride, int16_t *dst,
src -= (N / 2 - 1) * srcStride;
blkheight += N - 1;
}
- int16x8_t vc;
- vc = *(int16x8_t *)coeff;
+ int16x8_t vc = vld1q_s16(coeff);
const int16x8_t voffset = vdupq_n_s16(offset);
const int16x8_t vhr = vdupq_n_s16(-shift);
@@ -236,7 +233,7 @@ void interp_horiz_ps_neon(const uint8_t *src, intptr_t srcStride, int16_t *dst,
for (int i = 0; i < N; i++)
{
- input[i] = vmovl_u8(*(uint8x8_t *)&src[col + i]);
+ input[i] = vmovl_u8(vld1_u8(src + col + i));
}
vsum = voffset;
vsum = vmlaq_laneq_s16(vsum, (input[0]), vc, 0);
@@ -255,7 +252,7 @@ void interp_horiz_ps_neon(const uint8_t *src, intptr_t srcStride, int16_t *dst,
}
vsum = vshlq_s16(vsum, vhr);
- *(int16x8_t *)&dst[col] = vsum;
+ vst1q_s16(dst + col, vsum);
}
src += srcStride;
@@ -272,8 +269,7 @@ void interp_vert_ss_neon(const int16_t *src, intptr_t srcStride, int16_t *dst, i
const int16_t *c = (N == 8 ? g_lumaFilter[coeffIdx] : g_chromaFilter[coeffIdx]);
int shift = IF_FILTER_PREC;
src -= (N / 2 - 1) * srcStride;
- int16x8_t vc;
- vc = *(int16x8_t *)c;
+ int16x8_t vc = vld1q_s16(c);
int16x4_t low_vc = vget_low_s16(vc);
int16x4_t high_vc = vget_high_s16(vc);
@@ -290,7 +286,7 @@ void interp_vert_ss_neon(const int16_t *src, intptr_t srcStride, int16_t *dst, i
for (int i = 0; i < N; i++)
{
- input[i] = *(int16x8_t *)&src[col + i * srcStride];
+ input[i] = vld1q_s16(src + col + i * srcStride);
}
vsum1 = vmull_lane_s16(vget_low_s16(input[0]), low_vc, 0);
@@ -322,7 +318,7 @@ void interp_vert_ss_neon(const int16_t *src, intptr_t srcStride, int16_t *dst, i
vsum2 = vshlq_s32(vsum2, vhr);
int16x8_t vsum = vuzp1q_s16(vsum1, vsum2);
- *(int16x8_t *)&dst[col] = vsum;
+ vst1q_s16(dst + col, vsum);
}
src += srcStride;
@@ -343,8 +339,7 @@ void interp_vert_pp_neon(const uint16_t *src, intptr_t srcStride, uint16_t *dst,
const uint16_t maxVal = (1 << X265_DEPTH) - 1;
src -= (N / 2 - 1) * srcStride;
- int16x8_t vc;
- vc = *(int16x8_t *)c;
+ int16x8_t vc = vld1q_s16(c);
int32x4_t low_vc = vmovl_s16(vget_low_s16(vc));
int32x4_t high_vc = vmovl_s16(vget_high_s16(vc));
@@ -361,7 +356,7 @@ void interp_vert_pp_neon(const uint16_t *src, intptr_t srcStride, uint16_t *dst,
for (int i = 0; i < N; i++)
{
- input[i] = vmovl_u16(*(uint16x4_t *)&src[col + i * srcStride]);
+ input[i] = vmovl_u16(vld1_u16(src + col + i * srcStride));
}
vsum = voffset;
@@ -400,8 +395,7 @@ void interp_vert_pp_neon(const uint8_t *src, intptr_t srcStride, uint8_t *dst, i
int offset = 1 << (IF_FILTER_PREC - 1);
src -= (N / 2 - 1) * srcStride;
- int16x8_t vc;
- vc = *(int16x8_t *)c;
+ int16x8_t vc = vld1q_s16(c);
const int16x8_t voffset = vdupq_n_s16(offset);
@@ -416,7 +410,7 @@ void interp_vert_pp_neon(const uint8_t *src, intptr_t srcStride, uint8_t *dst, i
for (int i = 0; i < N; i++)
{
- input[i] = vmovl_u8(*(uint8x8_t *)&src[col + i * srcStride]);
+ input[i] = vmovl_u8(vld1_u8(src + col + i * srcStride));
}
vsum = voffset;
@@ -455,8 +449,7 @@ void interp_vert_ps_neon(const uint16_t *src, intptr_t srcStride, int16_t *dst,
int offset = (unsigned) - IF_INTERNAL_OFFS << SHIFT_INTERP_PS;
src -= (N / 2 - 1) * srcStride;
- int16x8_t vc;
- vc = *(int16x8_t *)c;
+ int16x8_t vc = vld1q_s16(c);
int32x4_t low_vc = vmovl_s16(vget_low_s16(vc));
int32x4_t high_vc = vmovl_s16(vget_high_s16(vc));
@@ -473,7 +466,7 @@ void interp_vert_ps_neon(const uint16_t *src, intptr_t srcStride, int16_t *dst,
for (int i = 0; i < N; i++)
{
- input[i] = vmovl_u16(*(uint16x4_t *)&src[col + i * srcStride]);
+ input[i] = vmovl_u16(vld1_u16(src + col + i * srcStride));
}
vsum = voffset;
@@ -510,8 +503,7 @@ void interp_vert_ps_neon(const uint8_t *src, intptr_t srcStride, int16_t *dst, i
int offset = (unsigned) - IF_INTERNAL_OFFS << shift;
src -= (N / 2 - 1) * srcStride;
- int16x8_t vc;
- vc = *(int16x8_t *)c;
+ int16x8_t vc = vld1q_s16(c);
const int16x8_t voffset = vdupq_n_s16(offset);
const int16x8_t vhr = vdupq_n_s16(-shift);
@@ -527,7 +519,7 @@ void interp_vert_ps_neon(const uint8_t *src, intptr_t srcStride, int16_t *dst, i
for (int i = 0; i < N; i++)
{
- input[i] = vmovl_u8(*(uint8x8_t *)&src[col + i * srcStride]);
+ input[i] = vmovl_u8(vld1_u8(src + col + i * srcStride));
}
vsum = voffset;
@@ -546,7 +538,7 @@ void interp_vert_ps_neon(const uint8_t *src, intptr_t srcStride, int16_t *dst, i
}
vsum = vshlq_s32(vsum, vhr);
- *(int16x8_t *)&dst[col] = vsum;
+ vst1q_s16(dst + col, vsum);
}
src += srcStride;
@@ -569,8 +561,7 @@ void interp_vert_sp_neon(const int16_t *src, intptr_t srcStride, pixel *dst, int
src -= (N / 2 - 1) * srcStride;
- int16x8_t vc;
- vc = *(int16x8_t *)coeff;
+ int16x8_t vc = vld1q_s16(coeff);
int16x4_t low_vc = vget_low_s16(vc);
int16x4_t high_vc = vget_high_s16(vc);
@@ -588,7 +579,7 @@ void interp_vert_sp_neon(const int16_t *src, intptr_t srcStride, pixel *dst, int
for (int i = 0; i < N; i++)
{
- input[i] = *(int16x8_t *)&src[col + i * srcStride];
+ input[i] = vld1q_s16(src + col + i * srcStride);
}
vsum1 = voffset;
vsum2 = voffset;
@@ -627,10 +618,10 @@ void interp_vert_sp_neon(const int16_t *src, intptr_t srcStride, pixel *dst, int
vsum = vminq_s16(vsum, vdupq_n_s16(maxVal));
vsum = vmaxq_s16(vsum, vdupq_n_s16(0));
#if HIGH_BIT_DEPTH
- *(int16x8_t *)&dst[col] = vsum;
+ vst1q_u16(dst + col, vsum);
#else
uint8x16_t usum = vuzp1q_u8(vsum, vsum);
- *(uint8x8_t *)&dst[col] = vget_low_u8(usum);
+ vst1_u8(dst + col, vget_low_u8(usum));
#endif
}
--
2.42.1
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0009-AArch64-Use-proper-load-store-intrinsics-in-filter-p.patch
Type: text/x-patch
Size: 9835 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20240813/a3c97119/attachment-0001.bin>
More information about the x265-devel mailing list