[x265] [PATCH 03/10] AArch64: Optimise HBD interp_horiz_ps_neon
Gerda Zsejke More
gerdazsejke.more at arm.com
Fri Feb 21 16:06:27 UTC 2025
Optimise the HBD 4-tap and 8-tap Neon implementations of
interp_horiz_ps_neon and extend these functions to support all CHROMA
and LUMA block sizes respectively.
The new 4-tap filter implementation is up to 34% faster when
coeffIdx==4 and up to 11% faster for the other filter values,
compared to the existing Neon implementation.
The new 8-tap filter implementation is up to 34% faster when
coeffIdx==1, up to 48% faster when coeffIdx==2, and up to 40% faster
when coeffIdx==3, compared to the existing Neon implementation.
---
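For context, the "ps" (pixel-to-short) filters keep unclipped 16-bit
intermediates rather than pixel values. A scalar model of the 4-tap
path (an illustrative sketch only; ref4_ps is hypothetical and not
part of this patch):

    static inline int16_t ref4_ps(const uint16_t *s, const int16_t *f)
    {
        // The Neon code folds the offset in up front as
        // offset = -IF_INTERNAL_OFFS << SHIFT_INTERP_PS.
        int32_t sum = f[0] * s[0] + f[1] * s[1] + f[2] * s[2] + f[3] * s[3];
        return (int16_t)((sum - (IF_INTERNAL_OFFS << SHIFT_INTERP_PS))
                         >> SHIFT_INTERP_PS);
    }

When coeffIdx==4 the taps { -4, 36, 36, -4 } are divided by 4 and the
right shift reduced by 2; the coeff4 template paths below implement
exactly that.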
source/common/aarch64/filter-prim.cpp | 497 +++++++++++++++++++++-----
1 file changed, 413 insertions(+), 84 deletions(-)
diff --git a/source/common/aarch64/filter-prim.cpp b/source/common/aarch64/filter-prim.cpp
index f691b2c36..0ed3fb78c 100644
--- a/source/common/aarch64/filter-prim.cpp
+++ b/source/common/aarch64/filter-prim.cpp
@@ -32,6 +32,11 @@
#include <arm_neon.h>
namespace {
+
+#if HIGH_BIT_DEPTH
+#define SHIFT_INTERP_PS (IF_FILTER_PREC - (IF_INTERNAL_PREC - X265_DEPTH))
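+// (IF_FILTER_PREC == 6 and IF_INTERNAL_PREC == 14, so this is e.g. 2
+// for 10-bit builds.)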
+#endif
+
void inline filter4_s16x8(int coeffIdx, const int16x8_t *s, const int16x4_t f,
const int32x4_t c, int32x4_t &d0, int32x4_t &d1)
{
@@ -2707,121 +2712,378 @@ void inline interp8_horiz_pp_neon(const pixel *src, intptr_t srcStride,
}
}
-#endif // !HIGH_BIT_DEPTH
+template<bool coeff4>
+void inline filter4_ps_u16x4(const uint16x4_t *s, const uint16x4_t f,
+ const uint32x4_t offset, int16x4_t &d)
+{
+ if (coeff4)
+ {
+ // { -4, 36, 36, -4 }
+ // Filter values are divisible by 4, factor that out in order to only
+ // need a multiplication by 9 and a subtraction (which is a
+ // multiplication by -1).
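+ // In full precision this computes:
+ // ((-4*s0 + 36*s1 + 36*s2 - 4*s3) + off) >> SHIFT_INTERP_PS
+ // == ((9*(s1 + s2) - (s0 + s3)) + off/4) >> (SHIFT_INTERP_PS - 2);
+ // the caller passes the offset already divided by 4.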
+ uint16x4_t sum03 = vadd_u16(s[0], s[3]);
+ uint16x4_t sum12 = vadd_u16(s[1], s[2]);
+
+ int32x4_t sum = vreinterpretq_s32_u32(vmlal_n_u16(offset, sum12, 9));
+ sum = vsubw_s16(sum, vreinterpret_s16_u16(sum03));
+
+ // We divided filter values by 4 so -2 from right shift.
+ d = vshrn_n_s32(sum, SHIFT_INTERP_PS - 2);
+ }
+ else
+ {
+ uint32x4_t sum = vmlsl_lane_u16(offset, s[0], f, 0);
+ sum = vmlal_lane_u16(sum, s[1], f, 1);
+ sum = vmlal_lane_u16(sum, s[2], f, 2);
+ sum = vmlsl_lane_u16(sum, s[3], f, 3);
+
+ d = vshrn_n_s32(vreinterpretq_s32_u32(sum), SHIFT_INTERP_PS);
+ }
}
-namespace X265_NS
+template<bool coeff4>
+void inline filter4_ps_u16x8(const uint16x8_t *s, const uint16x4_t f,
+ const uint32x4_t offset, int16x8_t &d)
{
+ if (coeff4)
+ {
+ // { -4, 36, 36, -4 }
+ // Filter values are divisible by 4, factor that out in order to only
+ // need a multiplication by 9 and a subtraction (which is a
+ // multiplication by -1).
+ uint16x8_t sum03 = vaddq_u16(s[0], s[3]);
+ uint16x8_t sum12 = vaddq_u16(s[1], s[2]);
-#if HIGH_BIT_DEPTH
-#define SHIFT_INTERP_PS (IF_FILTER_PREC - (IF_INTERNAL_PREC - X265_DEPTH))
-#endif
+ int32x4_t sum_lo = vreinterpretq_s32_u32(
+ vmlal_n_u16(offset, vget_low_u16(sum12), 9));
+ int32x4_t sum_hi = vreinterpretq_s32_u32(
+ vmlal_n_u16(offset, vget_high_u16(sum12), 9));
+ sum_lo = vsubw_s16(sum_lo, vreinterpret_s16_u16(vget_low_u16(sum03)));
+ sum_hi = vsubw_s16(sum_hi, vreinterpret_s16_u16(vget_high_u16(sum03)));
-template<int N, int width, int height>
-void interp_horiz_pp_neon(const pixel *src, intptr_t srcStride, pixel *dst,
- intptr_t dstStride, int coeffIdx)
+ // We divided filter values by 4 so -2 from right shift.
+ int16x4_t d0 = vshrn_n_s32(sum_lo, SHIFT_INTERP_PS - 2);
+ int16x4_t d1 = vshrn_n_s32(sum_hi, SHIFT_INTERP_PS - 2);
+ d = vcombine_s16(d0, d1);
+ }
+ else
+ {
+ uint32x4_t sum_lo = vmlsl_lane_u16(offset, vget_low_u16(s[0]), f, 0);
+ sum_lo = vmlal_lane_u16(sum_lo, vget_low_u16(s[1]), f, 1);
+ sum_lo = vmlal_lane_u16(sum_lo, vget_low_u16(s[2]), f, 2);
+ sum_lo = vmlsl_lane_u16(sum_lo, vget_low_u16(s[3]), f, 3);
+
+ uint32x4_t sum_hi = vmlsl_lane_u16(offset, vget_high_u16(s[0]), f, 0);
+ sum_hi = vmlal_lane_u16(sum_hi, vget_high_u16(s[1]), f, 1);
+ sum_hi = vmlal_lane_u16(sum_hi, vget_high_u16(s[2]), f, 2);
+ sum_hi = vmlsl_lane_u16(sum_hi, vget_high_u16(s[3]), f, 3);
+
+ int16x4_t d0 = vshrn_n_s32(vreinterpretq_s32_u32(sum_lo),
+ SHIFT_INTERP_PS);
+ int16x4_t d1 = vshrn_n_s32(vreinterpretq_s32_u32(sum_hi),
+ SHIFT_INTERP_PS);
+ d = vcombine_s16(d0, d1);
+ }
+}
+
+template<bool coeff4, int width, int height>
+void interp4_horiz_ps_neon(const pixel *src, intptr_t srcStride, int16_t *dst,
+ intptr_t dstStride, int coeffIdx, int isRowExt)
{
- if (N == 8)
+ const int N_TAPS = 4;
+ int blkheight = height;
+ const uint16x4_t filter = vreinterpret_u16_s16(
+ vabs_s16(vld1_s16(x265::g_chromaFilter[coeffIdx])));
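+ // Take absolute filter values; negative taps are applied with
+ // multiply-subtract in the filter4_ps_* helpers.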
+ uint32x4_t offset;
+
+ if (coeff4)
{
- switch (coeffIdx)
- {
- case 1:
- return interp8_horiz_pp_neon<1, width, height>(src, srcStride, dst,
- dstStride);
- case 2:
- return interp8_horiz_pp_neon<2, width, height>(src, srcStride, dst,
- dstStride);
- case 3:
- return interp8_horiz_pp_neon<3, width, height>(src, srcStride, dst,
- dstStride);
- }
+ // The -2 is needed because we will divide the filter values by 4.
+ offset = vdupq_n_u32((unsigned)-IF_INTERNAL_OFFS << (SHIFT_INTERP_PS - 2));
}
else
{
- switch (coeffIdx)
+ offset = vdupq_n_u32((unsigned)-IF_INTERNAL_OFFS << SHIFT_INTERP_PS);
+ }
+
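+ // Row extension: produce N_TAPS - 1 extra rows of intermediate data
+ // so that a following vertical interpolation pass has the margin
+ // rows it needs.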
+ if (isRowExt)
+ {
+ src -= (N_TAPS / 2 - 1) * srcStride;
+ blkheight += N_TAPS - 1;
+ }
+
+ src -= N_TAPS / 2 - 1;
+
+ for (int row = 0; row < blkheight; row++)
+ {
+ if (width % 16 == 0)
{
- case 4:
- return interp4_horiz_pp_neon<true, width, height>(src, srcStride,
- dst, dstStride,
- coeffIdx);
- default:
- return interp4_horiz_pp_neon<false, width, height>(src, srcStride,
- dst, dstStride,
- coeffIdx);
+ for (int col = 0; col < width; col += 16)
+ {
+ uint16x8_t s0[N_TAPS], s1[N_TAPS];
+ load_u16x8xn<4>(src + col + 0, 1, s0);
+ load_u16x8xn<4>(src + col + 8, 1, s1);
+
+ int16x8_t d0, d1;
+ filter4_ps_u16x8<coeff4>(s0, filter, offset, d0);
+ filter4_ps_u16x8<coeff4>(s1, filter, offset, d1);
+
+ vst1q_s16(dst + col + 0, d0);
+ vst1q_s16(dst + col + 8, d1);
+ }
+ }
+ else
+ {
+ int col = 0;
+ for (; col + 8 <= width; col += 8)
+ {
+ uint16x8_t s0[N_TAPS];
+ load_u16x8xn<4>(src + col, 1, s0);
+
+ int16x8_t d0;
+ filter4_ps_u16x8<coeff4>(s0, filter, offset, d0);
+
+ vst1q_s16(dst + col, d0);
+ }
+
+ if (width == 6)
+ {
+ uint16x8_t s0[N_TAPS];
+ load_u16x8xn<4>(src, 1, s0);
+
+ int16x8_t d0;
+ filter4_ps_u16x8<coeff4>(s0, filter, offset, d0);
+
+ store_s16x6xn<1>(dst, dstStride, &d0);
+ }
+ else if (width % 8 != 0)
+ {
+ uint16x4_t s0[N_TAPS];
+ load_u16x4xn<4>(src + col, 1, s0);
+
+ int16x4_t d0;
+ filter4_ps_u16x4<coeff4>(s0, filter, offset, d0);
+
+ if (width == 2)
+ {
+ store_s16x2xn<1>(dst + col, dstStride, &d0);
+ }
+ else
+ {
+ vst1_s16(dst + col, d0);
+ }
+ }
}
+
+ src += srcStride;
+ dst += dstStride;
}
}
-#if HIGH_BIT_DEPTH
+template<int coeffIdx>
+void inline filter8_ps_u16x4(const uint16x4_t *s, int16x4_t &d,
+ uint32x4_t offset, uint16x8_t filter)
+{
+ if (coeffIdx == 1)
+ {
+ // { -1, 4, -10, 58, 17, -5, 1, 0 }
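+ // Taps 0, 1, 5 and 6 have magnitudes small enough to accumulate in
+ // 16-bit lanes; any wrap-around cancels when the partial sum is
+ // reinterpreted as signed and widened into the 32-bit sum below.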
+ uint16x4_t sum0156 = vsub_u16(s[6], s[0]);
+ sum0156 = vmla_laneq_u16(sum0156, s[1], filter, 1);
+ sum0156 = vmls_laneq_u16(sum0156, s[5], filter, 5);
-template<int N, int width, int height>
-void interp_horiz_ps_neon(const uint16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx,
- int isRowExt)
+ uint32x4_t sum234 = vmlal_laneq_u16(offset, s[3], filter, 3);
+ sum234 = vmlsl_laneq_u16(sum234, s[2], filter, 2);
+ sum234 = vmlal_laneq_u16(sum234, s[4], filter, 4);
+
+ int32x4_t sum = vaddw_s16(vreinterpretq_s32_u32(sum234),
+ vreinterpret_s16_u16(sum0156));
+
+ d = vshrn_n_s32(sum, SHIFT_INTERP_PS);
+ }
+ else if (coeffIdx == 2)
+ {
+ // { -1, 4, -11, 40, 40, -11, 4, -1 }
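+ // The filter is symmetric, so sum the sample pairs that share a
+ // coefficient before multiplying; the *4 tap is applied as a left
+ // shift by 2.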
+ uint16x4_t sum07 = vadd_u16(s[0], s[7]);
+ uint16x4_t sum16 = vadd_u16(s[1], s[6]);
+ uint16x4_t sum25 = vadd_u16(s[2], s[5]);
+ uint16x4_t sum34 = vadd_u16(s[3], s[4]);
+
+ uint16x4_t sum0167 = vshl_n_u16(sum16, 2);
+ sum0167 = vsub_u16(sum0167, sum07);
+
+ uint32x4_t sum2345 = vmlal_laneq_u16(offset, sum34, filter, 3);
+ sum2345 = vmlsl_laneq_u16(sum2345, sum25, filter, 2);
+
+ int32x4_t sum = vaddw_s16(vreinterpretq_s32_u32(sum2345),
+ vreinterpret_s16_u16(sum0167));
+
+ d = vshrn_n_s32(sum, SHIFT_INTERP_PS);
+ }
+ else
+ {
+ // { 0, 1, -5, 17, 58, -10, 4, -1 }
+ uint16x4_t sum1267 = vsub_u16(s[1], s[7]);
+ sum1267 = vmls_laneq_u16(sum1267, s[2], filter, 2);
+ sum1267 = vmla_laneq_u16(sum1267, s[6], filter, 6);
+
+ uint32x4_t sum345 = vmlal_laneq_u16(offset, s[3], filter, 3);
+ sum345 = vmlal_laneq_u16(sum345, s[4], filter, 4);
+ sum345 = vmlsl_laneq_u16(sum345, s[5], filter, 5);
+
+ int32x4_t sum = vaddw_s16(vreinterpretq_s32_u32(sum345),
+ vreinterpret_s16_u16(sum1267));
+
+ d = vshrn_n_s32(sum, SHIFT_INTERP_PS);
+ }
+}
+
+template<int coeffIdx>
+void inline filter8_ps_u16x8(const uint16x8_t *s, int16x8_t &d,
+ uint32x4_t offset, uint16x8_t filter)
{
- const int16_t *coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
- const int offset = (unsigned) - IF_INTERNAL_OFFS << SHIFT_INTERP_PS;
+ if (coeffIdx == 1)
+ {
+ // { -1, 4, -10, 58, 17, -5, 1, 0 }
+ uint16x8_t sum0156 = vsubq_u16(s[6], s[0]);
+ sum0156 = vmlaq_laneq_u16(sum0156, s[1], filter, 1);
+ sum0156 = vmlsq_laneq_u16(sum0156, s[5], filter, 5);
+
+ uint32x4_t sum234_lo = vmlal_laneq_u16(offset, vget_low_u16(s[3]), filter, 3);
+ sum234_lo = vmlsl_laneq_u16(sum234_lo, vget_low_u16(s[2]), filter, 2);
+ sum234_lo = vmlal_laneq_u16(sum234_lo, vget_low_u16(s[4]), filter, 4);
+ uint32x4_t sum234_hi = vmlal_laneq_u16(offset, vget_high_u16(s[3]), filter, 3);
+ sum234_hi = vmlsl_laneq_u16(sum234_hi, vget_high_u16(s[2]), filter, 2);
+ sum234_hi = vmlal_laneq_u16(sum234_hi, vget_high_u16(s[4]), filter, 4);
+
+ int32x4_t sum_lo = vaddw_s16(vreinterpretq_s32_u32(sum234_lo),
+ vget_low_s16(vreinterpretq_s16_u16(sum0156)));
+ int32x4_t sum_hi = vaddw_s16(vreinterpretq_s32_u32(sum234_hi),
+ vget_high_s16(vreinterpretq_s16_u16(sum0156)));
+
+ int16x4_t d_lo = vshrn_n_s32(sum_lo, SHIFT_INTERP_PS);
+ int16x4_t d_hi = vshrn_n_s32(sum_hi, SHIFT_INTERP_PS);
+ d = vcombine_s16(d_lo, d_hi);
+ }
+ else if (coeffIdx == 2)
+ {
+ // { -1, 4, -11, 40, 40, -11, 4, -1 }
+ uint16x8_t sum07 = vaddq_u16(s[0], s[7]);
+ uint16x8_t sum16 = vaddq_u16(s[1], s[6]);
+ uint16x8_t sum25 = vaddq_u16(s[2], s[5]);
+ uint16x8_t sum34 = vaddq_u16(s[3], s[4]);
+
+ uint16x8_t sum0167 = vshlq_n_u16(sum16, 2);
+ sum0167 = vsubq_u16(sum0167, sum07);
+
+ uint32x4_t sum2345_lo = vmlal_laneq_u16(offset, vget_low_u16(sum34),
+ filter, 3);
+ sum2345_lo = vmlsl_laneq_u16(sum2345_lo, vget_low_u16(sum25),
+ filter, 2);
+
+ uint32x4_t sum2345_hi = vmlal_laneq_u16(offset, vget_high_u16(sum34),
+ filter, 3);
+ sum2345_hi = vmlsl_laneq_u16(sum2345_hi, vget_high_u16(sum25),
+ filter, 2);
+
+ int32x4_t sum_lo = vaddw_s16(vreinterpretq_s32_u32(sum2345_lo),
+ vget_low_s16(vreinterpretq_s16_u16(sum0167)));
+ int32x4_t sum_hi = vaddw_s16(vreinterpretq_s32_u32(sum2345_hi),
+ vget_high_s16(vreinterpretq_s16_u16(sum0167)));
+
+ int16x4_t d_lo = vshrn_n_s32(sum_lo, SHIFT_INTERP_PS);
+ int16x4_t d_hi = vshrn_n_s32(sum_hi, SHIFT_INTERP_PS);
+ d = vcombine_s16(d_lo, d_hi);
+ }
+ else
+ {
+ // { 0, 1, -5, 17, 58, -10, 4, -1 }
+ uint16x8_t sum1267 = vsubq_u16(s[1], s[7]);
+ sum1267 = vmlsq_laneq_u16(sum1267, s[2], filter, 2);
+ sum1267 = vmlaq_laneq_u16(sum1267, s[6], filter, 6);
+
+ uint32x4_t sum345_lo = vmlal_laneq_u16(offset, vget_low_u16(s[3]), filter, 3);
+ sum345_lo = vmlal_laneq_u16(sum345_lo, vget_low_u16(s[4]), filter, 4);
+ sum345_lo = vmlsl_laneq_u16(sum345_lo, vget_low_u16(s[5]), filter, 5);
+
+ uint32x4_t sum345_hi = vmlal_laneq_u16(offset, vget_high_u16(s[3]), filter, 3);
+ sum345_hi = vmlal_laneq_u16(sum345_hi, vget_high_u16(s[4]), filter, 4);
+ sum345_hi = vmlsl_laneq_u16(sum345_hi, vget_high_u16(s[5]), filter, 5);
+
+ int32x4_t sum_lo = vaddw_s16(vreinterpretq_s32_u32(sum345_lo),
+ vget_low_s16(vreinterpretq_s16_u16(sum1267)));
+ int32x4_t sum_hi = vaddw_s16(vreinterpretq_s32_u32(sum345_hi),
+ vget_high_s16(vreinterpretq_s16_u16(sum1267)));
+
+ int16x4_t d_lo = vshrn_n_s32(sum_lo, SHIFT_INTERP_PS);
+ int16x4_t d_hi = vshrn_n_s32(sum_hi, SHIFT_INTERP_PS);
+
+ d = vcombine_s16(d_lo, d_hi);
+ }
+}
+
+template<int coeffIdx, int width, int height>
+void interp8_horiz_ps_neon(const pixel *src, intptr_t srcStride, int16_t *dst,
+ intptr_t dstStride, int isRowExt)
+{
+ const int N_TAPS = 8;
int blkheight = height;
- src -= N / 2 - 1;
+ const uint16x8_t filter =
+ vreinterpretq_u16_s16(vabsq_s16(vld1q_s16(x265::g_lumaFilter[coeffIdx])));
+ uint32x4_t offset =
+ vdupq_n_u32((unsigned)-IF_INTERNAL_OFFS << SHIFT_INTERP_PS);
if (isRowExt)
{
- src -= (N / 2 - 1) * srcStride;
- blkheight += N - 1;
+ src -= (N_TAPS / 2 - 1) * srcStride;
+ blkheight += N_TAPS - 1;
}
- int16x8_t vc3 = vld1q_s16(coeff);
- const int32x4_t voffset = vdupq_n_s32(offset);
- int row, col;
- for (row = 0; row < blkheight; row++)
+ src -= N_TAPS / 2 - 1;
+
+ for (int row = 0; row < blkheight; row++)
{
- for (col = 0; col < width; col += 8)
+ if (width % 16 == 0)
{
- int32x4_t vsum, vsum2;
-
- int16x8_t input[N];
- for (int i = 0; i < N; i++)
+ for (int col = 0; col < width; col += 16)
{
- input[i] = vreinterpretq_s16_u16(vld1q_u16(src + col + i));
- }
-
- vsum = voffset;
- vsum2 = voffset;
+ uint16x8_t s0[N_TAPS], s1[N_TAPS];
+ load_u16x8xn<8>(src + col + 0, 1, s0);
+ load_u16x8xn<8>(src + col + 8, 1, s1);
- vsum = vmlal_lane_s16(vsum, vget_low_s16(input[0]),
- vget_low_s16(vc3), 0);
- vsum2 = vmlal_high_lane_s16(vsum2, input[0], vget_low_s16(vc3), 0);
+ int16x8_t d0, d1;
+ filter8_ps_u16x8<coeffIdx>(s0, d0, offset, filter);
+ filter8_ps_u16x8<coeffIdx>(s1, d1, offset, filter);
- vsum = vmlal_lane_s16(vsum, vget_low_s16(input[1]),
- vget_low_s16(vc3), 1);
- vsum2 = vmlal_high_lane_s16(vsum2, input[1], vget_low_s16(vc3), 1);
+ vst1q_s16(dst + col + 0, d0);
+ vst1q_s16(dst + col + 8, d1);
+ }
+ }
+ else
+ {
+ int col = 0;
+ for (; col + 8 <= width; col += 8)
+ {
+ uint16x8_t s0[N_TAPS];
+ load_u16x8xn<8>(src + col, 1, s0);
- vsum = vmlal_lane_s16(vsum, vget_low_s16(input[2]),
- vget_low_s16(vc3), 2);
- vsum2 = vmlal_high_lane_s16(vsum2, input[2], vget_low_s16(vc3), 2);
+ int16x8_t d0;
+ filter8_ps_u16x8<coeffIdx>(s0, d0, offset, filter);
- vsum = vmlal_lane_s16(vsum, vget_low_s16(input[3]),
- vget_low_s16(vc3), 3);
- vsum2 = vmlal_high_lane_s16(vsum2, input[3], vget_low_s16(vc3), 3);
+ vst1q_s16(dst + col, d0);
+ }
- if (N == 8)
+ if (width % 8 == 4)
{
- vsum = vmlal_lane_s16(vsum, vget_low_s16(input[4]), vget_high_s16(vc3), 0);
- vsum2 = vmlal_high_lane_s16(vsum2, input[4], vget_high_s16(vc3), 0);
-
- vsum = vmlal_lane_s16(vsum, vget_low_s16(input[5]), vget_high_s16(vc3), 1);
- vsum2 = vmlal_high_lane_s16(vsum2, input[5], vget_high_s16(vc3), 1);
+ uint16x4_t s0[N_TAPS];
+ load_u16x4xn<8>(src + col, 1, s0);
- vsum = vmlal_lane_s16(vsum, vget_low_s16(input[6]), vget_high_s16(vc3), 2);
- vsum2 = vmlal_high_lane_s16(vsum2, input[6], vget_high_s16(vc3), 2);
+ int16x4_t d0;
+ filter8_ps_u16x4<coeffIdx>(s0, d0, offset, filter);
- vsum = vmlal_lane_s16(vsum, vget_low_s16(input[7]), vget_high_s16(vc3), 3);
- vsum2 = vmlal_high_lane_s16(vsum2, input[7], vget_high_s16(vc3), 3);
+ vst1_s16(dst + col, d0);
}
-
- int16x4_t res_lo = vshrn_n_s32(vsum, SHIFT_INTERP_PS);
- int16x4_t res_hi = vshrn_n_s32(vsum2, SHIFT_INTERP_PS);
- vst1q_s16(dst + col, vcombine_s16(res_lo, res_hi));
}
src += srcStride;
@@ -2829,10 +3091,50 @@ void interp_horiz_ps_neon(const uint16_t *src, intptr_t srcStride, int16_t *dst,
}
}
-#else // HIGH_BIT_DEPTH
+#endif // !HIGH_BIT_DEPTH
+}
+
+namespace X265_NS
+{
+
+template<int N, int width, int height>
+void interp_horiz_pp_neon(const pixel *src, intptr_t srcStride, pixel *dst,
+ intptr_t dstStride, int coeffIdx)
+{
+ if (N == 8)
+ {
+ switch (coeffIdx)
+ {
+ case 1:
+ return interp8_horiz_pp_neon<1, width, height>(src, srcStride, dst,
+ dstStride);
+ case 2:
+ return interp8_horiz_pp_neon<2, width, height>(src, srcStride, dst,
+ dstStride);
+ case 3:
+ return interp8_horiz_pp_neon<3, width, height>(src, srcStride, dst,
+ dstStride);
+ }
+ }
+ else
+ {
+ switch (coeffIdx)
+ {
+ case 4:
+ return interp4_horiz_pp_neon<true, width, height>(src, srcStride,
+ dst, dstStride,
+ coeffIdx);
+ default:
+ return interp4_horiz_pp_neon<false, width, height>(src, srcStride,
+ dst, dstStride,
+ coeffIdx);
+ }
+ }
+}
+
template<int N, int width, int height>
-void interp_horiz_ps_neon(const uint8_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx,
- int isRowExt)
+void interp_horiz_ps_neon(const pixel *src, intptr_t srcStride, int16_t *dst,
+ intptr_t dstStride, int coeffIdx, int isRowExt)
{
if (N == 8)
{
@@ -2867,8 +3169,6 @@ void interp_horiz_ps_neon(const uint8_t *src, intptr_t srcStride, int16_t *dst,
}
}
-#endif // HIGH_BIT_DEPTH
-
template<int N, int width, int height>
void interp_vert_ss_neon(const int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
{
@@ -3440,6 +3740,35 @@ void setupFilterPrimitives_neon(EncoderPrimitives &p)
p.pu[LUMA_4x8].luma_hpp = interp_horiz_pp_neon<8, 4, 8>;
p.pu[LUMA_4x16].luma_hpp = interp_horiz_pp_neon<8, 4, 16>;
p.pu[LUMA_12x16].luma_hpp = interp_horiz_pp_neon<8, 12, 16>;
+
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].filter_hps = interp_horiz_ps_neon<4, 2, 4>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].filter_hps = interp_horiz_ps_neon<4, 2, 8>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].filter_hps = interp_horiz_ps_neon<4, 4, 2>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].filter_hps = interp_horiz_ps_neon<4, 4, 4>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].filter_hps = interp_horiz_ps_neon<4, 4, 8>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].filter_hps = interp_horiz_ps_neon<4, 4, 16>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].filter_hps = interp_horiz_ps_neon<4, 6, 8>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].filter_hps = interp_horiz_ps_neon<4, 12, 16>;
+
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].filter_hps = interp_horiz_ps_neon<4, 2, 8>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].filter_hps = interp_horiz_ps_neon<4, 2, 16>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].filter_hps = interp_horiz_ps_neon<4, 4, 4>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].filter_hps = interp_horiz_ps_neon<4, 4, 8>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].filter_hps = interp_horiz_ps_neon<4, 4, 16>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].filter_hps = interp_horiz_ps_neon<4, 4, 32>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].filter_hps = interp_horiz_ps_neon<4, 6, 16>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].filter_hps = interp_horiz_ps_neon<4, 12, 32>;
+
+ p.chroma[X265_CSP_I444].pu[LUMA_4x4].filter_hps = interp_horiz_ps_neon<4, 4, 4>;
+ p.chroma[X265_CSP_I444].pu[LUMA_4x8].filter_hps = interp_horiz_ps_neon<4, 4, 8>;
+ p.chroma[X265_CSP_I444].pu[LUMA_4x16].filter_hps = interp_horiz_ps_neon<4, 4, 16>;
+ p.chroma[X265_CSP_I444].pu[LUMA_12x16].filter_hps = interp_horiz_ps_neon<4, 12, 16>;
+
+ p.pu[LUMA_4x4].luma_hps = interp_horiz_ps_neon<8, 4, 4>;
+ p.pu[LUMA_4x8].luma_hps = interp_horiz_ps_neon<8, 4, 8>;
+ p.pu[LUMA_4x16].luma_hps = interp_horiz_ps_neon<8, 4, 16>;
+ p.pu[LUMA_12x16].luma_hps = interp_horiz_ps_neon<8, 12, 16>;
+
#endif // HIGH_BIT_DEPTH
}
--
2.39.5 (Apple Git-154)