[x265] [PATCH 10/18] AArch64: Refactor types and conversions in filter-prim.cpp
Hari Limaye
hari.limaye at arm.com
Tue Aug 13 15:20:41 UTC 2024
Use the correct vector types for all variables and operations, and make
all vector conversions explicit (using vreinterpret intrinsics) in
source/common/aarch64/filter-prim.cpp.
---
source/common/aarch64/filter-prim.cpp | 79 +++++++++++++++------------
1 file changed, 45 insertions(+), 34 deletions(-)
diff --git a/source/common/aarch64/filter-prim.cpp b/source/common/aarch64/filter-prim.cpp
index 1898d362c..23bb6990e 100644
--- a/source/common/aarch64/filter-prim.cpp
+++ b/source/common/aarch64/filter-prim.cpp
@@ -23,7 +23,7 @@ void filterPixelToShort_neon(const pixel *src, intptr_t srcStride, int16_t *dst,
for (col = 0; col < width; col += 8)
{
- int16x8_t in;
+ uint16x8_t in;
#if HIGH_BIT_DEPTH
in = vld1q_u16(src + col);
@@ -31,7 +31,7 @@ void filterPixelToShort_neon(const pixel *src, intptr_t srcStride, int16_t *dst,
in = vmovl_u8(vld1_u8(src + col));
#endif
- int16x8_t tmp = vshlq_n_s16(in, shift);
+ int16x8_t tmp = vreinterpretq_s16_u16(vshlq_n_u16(in, shift));
tmp = vsubq_s16(tmp, off);
vst1q_s16(dst + col, tmp);
}
@@ -71,9 +71,10 @@ void interp_horiz_pp_neon(const pixel *src, intptr_t srcStride, pixel *dst, intp
for (int i = 0; i < N; i++)
{
#if HIGH_BIT_DEPTH
- input[i] = vld1q_u16(src + col + i);
+ input[i] = vreinterpretq_s16_u16(vld1q_u16(src + col + i));
#else
- input[i] = vmovl_u8(vld1_u8(src + col + i));
+ uint8x8_t in_tmp = vld1_u8(src + col + i);
+ input[i] = vreinterpretq_s16_u16(vmovl_u8(in_tmp));
#endif
}
vsum1 = voffset;
@@ -107,14 +108,14 @@ void interp_horiz_pp_neon(const pixel *src, intptr_t srcStride, pixel *dst, intp
vsum1 = vshlq_s32(vsum1, vhr);
vsum2 = vshlq_s32(vsum2, vhr);
- int16x8_t vsum = vuzp1q_s16(vsum1, vsum2);
+ int16x8_t vsum = vuzp1q_s16(vreinterpretq_s16_s32(vsum1),
+ vreinterpretq_s16_s32(vsum2));
vsum = vminq_s16(vsum, vdupq_n_s16(maxVal));
vsum = vmaxq_s16(vsum, vdupq_n_s16(0));
#if HIGH_BIT_DEPTH
- vst1q_u16(dst + col, vsum);
+ vst1q_u16(dst + col, vreinterpretq_u16_s16(vsum));
#else
- uint8x16_t usum = vuzp1q_u8(vsum, vsum);
- vst1_u8(dst + col, vget_low_u8(usum));
+ vst1_u8(dst + col, vmovn_u16(vreinterpretq_u16_s16(vsum)));
#endif
}
@@ -154,22 +155,26 @@ void interp_horiz_ps_neon(const uint16_t *src, intptr_t srcStride, int16_t *dst,
int16x8_t input[N];
for (int i = 0; i < N; i++)
{
- input[i] = vld1q_u16(src + col + i);
+ input[i] = vreinterpretq_s16_u16(vld1q_u16(src + col + i));
}
vsum = voffset;
vsum2 = voffset;
- vsum = vmlal_lane_s16(vsum, vget_low_u16(input[0]), vget_low_s16(vc3), 0);
+ vsum = vmlal_lane_s16(vsum, vget_low_s16(input[0]),
+ vget_low_s16(vc3), 0);
vsum2 = vmlal_high_lane_s16(vsum2, input[0], vget_low_s16(vc3), 0);
- vsum = vmlal_lane_s16(vsum, vget_low_u16(input[1]), vget_low_s16(vc3), 1);
+ vsum = vmlal_lane_s16(vsum, vget_low_s16(input[1]),
+ vget_low_s16(vc3), 1);
vsum2 = vmlal_high_lane_s16(vsum2, input[1], vget_low_s16(vc3), 1);
- vsum = vmlal_lane_s16(vsum, vget_low_u16(input[2]), vget_low_s16(vc3), 2);
+ vsum = vmlal_lane_s16(vsum, vget_low_s16(input[2]),
+ vget_low_s16(vc3), 2);
vsum2 = vmlal_high_lane_s16(vsum2, input[2], vget_low_s16(vc3), 2);
- vsum = vmlal_lane_s16(vsum, vget_low_u16(input[3]), vget_low_s16(vc3), 3);
+ vsum = vmlal_lane_s16(vsum, vget_low_s16(input[3]),
+ vget_low_s16(vc3), 3);
vsum2 = vmlal_high_lane_s16(vsum2, input[3], vget_low_s16(vc3), 3);
if (N == 8)
@@ -233,7 +238,8 @@ void interp_horiz_ps_neon(const uint8_t *src, intptr_t srcStride, int16_t *dst,
for (int i = 0; i < N; i++)
{
- input[i] = vmovl_u8(vld1_u8(src + col + i));
+ uint8x8_t in_tmp = vld1_u8(src + col + i);
+ input[i] = vreinterpretq_s16_u16(vmovl_u8(in_tmp));
}
vsum = voffset;
vsum = vmlaq_laneq_s16(vsum, (input[0]), vc, 0);
@@ -317,7 +323,8 @@ void interp_vert_ss_neon(const int16_t *src, intptr_t srcStride, int16_t *dst, i
vsum1 = vshlq_s32(vsum1, vhr);
vsum2 = vshlq_s32(vsum2, vhr);
- int16x8_t vsum = vuzp1q_s16(vsum1, vsum2);
+ int16x8_t vsum = vuzp1q_s16(vreinterpretq_s16_s32(vsum1),
+ vreinterpretq_s16_s32(vsum2));
vst1q_s16(dst + col, vsum);
}
@@ -356,7 +363,8 @@ void interp_vert_pp_neon(const uint16_t *src, intptr_t srcStride, uint16_t *dst,
for (int i = 0; i < N; i++)
{
- input[i] = vmovl_u16(vld1_u16(src + col + i * srcStride));
+ uint16x4_t in_tmp = vld1_u16(src + col + i * srcStride);
+ input[i] = vreinterpretq_s32_u32(vmovl_u16(in_tmp));
}
vsum = voffset;
@@ -410,7 +418,8 @@ void interp_vert_pp_neon(const uint8_t *src, intptr_t srcStride, uint8_t *dst, i
for (int i = 0; i < N; i++)
{
- input[i] = vmovl_u8(vld1_u8(src + col + i * srcStride));
+ uint8x8_t in_tmp = vld1_u8(src + col + i * srcStride);
+ input[i] = vreinterpretq_s16_u16(vmovl_u8(in_tmp));
}
vsum = voffset;
@@ -460,27 +469,28 @@ void interp_vert_ps_neon(const uint16_t *src, intptr_t srcStride, int16_t *dst,
{
for (col = 0; col < width; col += 4)
{
- int16x8_t vsum;
+ int32x4_t vsum;
- int16x8_t input[N];
+ int32x4_t input[N];
for (int i = 0; i < N; i++)
{
- input[i] = vmovl_u16(vld1_u16(src + col + i * srcStride));
+ uint16x4_t in_tmp = vld1_u16(src + col + i * srcStride);
+ input[i] = vreinterpretq_s32_u32(vmovl_u16(in_tmp));
}
vsum = voffset;
- vsum = vmlaq_laneq_s32(vsum, (input[0]), low_vc, 0);
- vsum = vmlaq_laneq_s32(vsum, (input[1]), low_vc, 1);
- vsum = vmlaq_laneq_s32(vsum, (input[2]), low_vc, 2);
- vsum = vmlaq_laneq_s32(vsum, (input[3]), low_vc, 3);
+ vsum = vmlaq_laneq_s32(vsum, input[0], low_vc, 0);
+ vsum = vmlaq_laneq_s32(vsum, input[1], low_vc, 1);
+ vsum = vmlaq_laneq_s32(vsum, input[2], low_vc, 2);
+ vsum = vmlaq_laneq_s32(vsum, input[3], low_vc, 3);
if (N == 8)
{
- int16x8_t vsum1 = vmulq_laneq_s32((input[4]), high_vc, 0);
- vsum1 = vmlaq_laneq_s32(vsum1, (input[5]), high_vc, 1);
- vsum1 = vmlaq_laneq_s32(vsum1, (input[6]), high_vc, 2);
- vsum1 = vmlaq_laneq_s32(vsum1, (input[7]), high_vc, 3);
+ int32x4_t vsum1 = vmulq_laneq_s32(input[4], high_vc, 0);
+ vsum1 = vmlaq_laneq_s32(vsum1, input[5], high_vc, 1);
+ vsum1 = vmlaq_laneq_s32(vsum1, input[6], high_vc, 2);
+ vsum1 = vmlaq_laneq_s32(vsum1, input[7], high_vc, 3);
vsum = vaddq_s32(vsum, vsum1);
}
@@ -519,7 +529,8 @@ void interp_vert_ps_neon(const uint8_t *src, intptr_t srcStride, int16_t *dst, i
for (int i = 0; i < N; i++)
{
- input[i] = vmovl_u8(vld1_u8(src + col + i * srcStride));
+ uint8x8_t in_tmp = vld1_u8(src + col + i * srcStride);
+ input[i] = vreinterpretq_s16_u16(vmovl_u8(in_tmp));
}
vsum = voffset;
@@ -537,7 +548,7 @@ void interp_vert_ps_neon(const uint8_t *src, intptr_t srcStride, int16_t *dst, i
vsum = vaddq_s16(vsum, vsum1);
}
- vsum = vshlq_s32(vsum, vhr);
+ vsum = vshlq_s16(vsum, vhr);
vst1q_s16(dst + col, vsum);
}
@@ -614,14 +625,14 @@ void interp_vert_sp_neon(const int16_t *src, intptr_t srcStride, pixel *dst, int
vsum1 = vshlq_s32(vsum1, vhr);
vsum2 = vshlq_s32(vsum2, vhr);
- int16x8_t vsum = vuzp1q_s16(vsum1, vsum2);
+ int16x8_t vsum = vuzp1q_s16(vreinterpretq_s16_s32(vsum1),
+ vreinterpretq_s16_s32(vsum2));
vsum = vminq_s16(vsum, vdupq_n_s16(maxVal));
vsum = vmaxq_s16(vsum, vdupq_n_s16(0));
#if HIGH_BIT_DEPTH
- vst1q_u16(dst + col, vsum);
+ vst1q_u16(dst + col, vreinterpretq_u16_s16(vsum));
#else
- uint8x16_t usum = vuzp1q_u8(vsum, vsum);
- vst1_u8(dst + col, vget_low_u8(usum));
+ vst1_u8(dst + col, vmovn_u16(vreinterpretq_u16_s16(vsum)));
#endif
}
--
2.42.1
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0010-AArch64-Refactor-types-and-conversions-in-filter-pri.patch
Type: text/x-patch
Size: 9687 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20240813/a40a1ea0/attachment.bin>
More information about the x265-devel mailing list