[x265] [PATCH v2 01/10] AArch64: Optimise HBD interp_horiz_pp_neon
Gerda Zsejke More
gerdazsejke.more at arm.com
Tue Mar 11 19:45:11 UTC 2025
Optimise the HBD 4-tap and 8-tap Neon implementations of
interp_horiz_pp_neon and extend these functions to support all CHROMA
and LUMA block sizes, respectively.
The new 4-tap filter implementation is up to 37% faster when
coeffIdx==4 and up to 20% faster for the other filter values, compared
to the existing Neon implementation.
The new 8-tap filter implementation is up to 42% faster when
coeffIdx==1, up to 51% when it is 2, and up to 44% when it is 3,
compared to the existing Neon implementation.
---
source/common/aarch64/filter-prim.cpp | 546 +++++++++++++++++++++-----
source/common/aarch64/mem-neon.h | 48 ++-
2 files changed, 497 insertions(+), 97 deletions(-)
diff --git a/source/common/aarch64/filter-prim.cpp b/source/common/aarch64/filter-prim.cpp
index 71dfc0d63..ecf0dc141 100644
--- a/source/common/aarch64/filter-prim.cpp
+++ b/source/common/aarch64/filter-prim.cpp
@@ -1,3 +1,29 @@
+/*****************************************************************************
+ * Copyright (C) 2021-2025 MulticoreWare, Inc
+ *
+ * Authors: Liwei Wang <liwei at multicorewareinc.com>
+ * Jonathan Swinney <jswinney at amazon.com>
+ * Hari Limaye <hari.limaye at arm.com>
+ * Gerda Zsejke More <gerdazsejke.more at arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
#if HAVE_NEON
#include "filter-prim.h"
@@ -2049,66 +2075,172 @@ void interp8_vert_sp_neon(const int16_t *src, intptr_t srcStride, pixel *dst,
}
}
-#endif // !HIGH_BIT_DEPTH
+#else // !HIGH_BIT_DEPTH
+
+template<bool coeff4>
+void inline filter4_u16x4(const uint16x4_t *s, const uint16x4_t f,
+ const uint32x4_t offset, const uint16x4_t maxVal,
+ uint16x4_t &d)
+{
+ if (coeff4)
+ {
+ // { -4, 36, 36, -4 }
+ // Filter values are divisible by 4, factor that out in order to only
+ // need a multiplication by 9 and a subtraction (which is a
+ // multiplication by -1).
+ uint16x4_t sum03 = vadd_u16(s[0], s[3]);
+ uint16x4_t sum12 = vadd_u16(s[1], s[2]);
+
+ int32x4_t sum = vreinterpretq_s32_u32(vmlal_n_u16(offset, sum12, 9));
+ sum = vsubw_s16(sum, vreinterpret_s16_u16(sum03));
+
+ // We divided filter values by 4 so -2 from right shift.
+ d = vqshrun_n_s32(sum, IF_FILTER_PREC - 2);
+ d = vmin_u16(d, maxVal);
+ }
+ else
+ {
+ uint32x4_t sum = vmlsl_lane_u16(offset, s[0], f, 0);
+ sum = vmlal_lane_u16(sum, s[1], f, 1);
+ sum = vmlal_lane_u16(sum, s[2], f, 2);
+ sum = vmlsl_lane_u16(sum, s[3], f, 3);
+
+ d = vqshrun_n_s32(vreinterpretq_s32_u32(sum), IF_FILTER_PREC);
+ d = vmin_u16(d, maxVal);
+ }
}
-namespace X265_NS
+template<bool coeff4>
+void inline filter4_u16x8(const uint16x8_t *s, const uint16x4_t f,
+ const uint32x4_t offset, const uint16x8_t maxVal,
+ uint16x8_t &d)
{
+ if (coeff4)
+ {
+ // { -4, 36, 36, -4 }
+ // Filter values are divisible by 4, factor that out in order to only
+ // need a multiplication by 9 and a subtraction (which is a
+ // multiplication by -1).
+ uint16x8_t sum03 = vaddq_u16(s[0], s[3]);
+ uint16x8_t sum12 = vaddq_u16(s[1], s[2]);
+
+ int32x4_t sum_lo = vreinterpretq_s32_u32(
+ vmlal_n_u16(offset, vget_low_u16(sum12), 9));
+ int32x4_t sum_hi = vreinterpretq_s32_u32(
+ vmlal_n_u16(offset, vget_high_u16(sum12), 9));
+ sum_lo = vsubw_s16(sum_lo, vreinterpret_s16_u16(vget_low_u16(sum03)));
+ sum_hi = vsubw_s16(sum_hi, vreinterpret_s16_u16(vget_high_u16(sum03)));
+
+ // We divided filter values by 4 so -2 from right shift.
+ uint16x4_t d0 = vqshrun_n_s32(sum_lo, IF_FILTER_PREC - 2);
+ uint16x4_t d1 = vqshrun_n_s32(sum_hi, IF_FILTER_PREC - 2);
+ d = vminq_u16(vcombine_u16(d0, d1), maxVal);
+ }
+ else
+ {
+ uint32x4_t sum_lo = vmlsl_lane_u16(offset, vget_low_u16(s[0]), f, 0);
+ sum_lo = vmlal_lane_u16(sum_lo, vget_low_u16(s[1]), f, 1);
+ sum_lo = vmlal_lane_u16(sum_lo, vget_low_u16(s[2]), f, 2);
+ sum_lo = vmlsl_lane_u16(sum_lo, vget_low_u16(s[3]), f, 3);
-#if HIGH_BIT_DEPTH
-#define SHIFT_INTERP_PS (IF_FILTER_PREC - (IF_INTERNAL_PREC - X265_DEPTH))
-#endif
+ uint32x4_t sum_hi = vmlsl_lane_u16(offset, vget_high_u16(s[0]), f, 0);
+ sum_hi = vmlal_lane_u16(sum_hi, vget_high_u16(s[1]), f, 1);
+ sum_hi = vmlal_lane_u16(sum_hi, vget_high_u16(s[2]), f, 2);
+ sum_hi = vmlsl_lane_u16(sum_hi, vget_high_u16(s[3]), f, 3);
-template<int width, int height>
-void filterPixelToShort_neon(const pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride)
+ uint16x4_t d0 = vqshrun_n_s32(vreinterpretq_s32_u32(sum_lo),
+ IF_FILTER_PREC);
+ uint16x4_t d1 = vqshrun_n_s32(vreinterpretq_s32_u32(sum_hi),
+ IF_FILTER_PREC);
+ d = vminq_u16(vcombine_u16(d0, d1), maxVal);
+ }
+}
+
+template<bool coeff4, int width, int height>
+void inline interp4_horiz_pp_neon(const pixel *src, intptr_t srcStride,
+ pixel *dst, intptr_t dstStride,
+ const int16_t coeffIdx)
{
- const int shift = IF_INTERNAL_PREC - X265_DEPTH;
- const int16x8_t off = vdupq_n_s16(IF_INTERNAL_OFFS);
- for (int row = 0; row < height; row++)
+ const int N_TAPS = 4;
+ const uint16x8_t maxVal = vdupq_n_u16((1 << X265_DEPTH) - 1);
+ const uint16x4_t filter = vreinterpret_u16_s16(
+ vabs_s16(vld1_s16(X265_NS::g_chromaFilter[coeffIdx])));
+ uint32x4_t offset;
+
+ // A shim of 1 << (IF_FILTER_PREC - 1) enables us to use non-rounding
+ // shifts - which are generally faster than rounding shifts on modern CPUs.
+ if (coeff4)
{
+ // The outermost -2 is needed because we will divide the filter values by 4.
+ offset = vdupq_n_u32(1 << (IF_FILTER_PREC - 1 - 2));
+ }
+ else
+ {
+ offset = vdupq_n_u32(1 << (IF_FILTER_PREC - 1));
+ }
- int col = 0;
- for (; col + 8 <= width; col += 8)
+ src -= N_TAPS / 2 - 1;
+
+ for (int row = 0; row < height; row++)
+ {
+ if (width % 16 == 0)
{
- uint16x8_t in;
+ for (int col = 0; col < width; col += 16)
+ {
+ uint16x8_t s0[N_TAPS], s1[N_TAPS];
+ load_u16x8xn<4>(src + col + 0, 1, s0);
+ load_u16x8xn<4>(src + col + 8, 1, s1);
-#if HIGH_BIT_DEPTH
- in = vld1q_u16(src + col);
-#else
- in = vmovl_u8(vld1_u8(src + col));
-#endif
+ uint16x8_t d0, d1;
+ filter4_u16x8<coeff4>(s0, filter, offset, maxVal, d0);
+ filter4_u16x8<coeff4>(s1, filter, offset, maxVal, d1);
- int16x8_t tmp = vreinterpretq_s16_u16(vshlq_n_u16(in, shift));
- tmp = vsubq_s16(tmp, off);
- vst1q_s16(dst + col, tmp);
+ vst1q_u16(dst + col + 0, d0);
+ vst1q_u16(dst + col + 8, d1);
+ }
}
- for (; col + 4 <= width; col += 4)
+ else
{
- uint16x4_t in;
+ int col = 0;
+ for (; col + 8 <= width; col += 8)
+ {
+ uint16x8_t s0[N_TAPS];
+ load_u16x8xn<4>(src + col, 1, s0);
-#if HIGH_BIT_DEPTH
- in = vld1_u16(src + col);
-#else
- in = vget_low_u16(vmovl_u8(vld1_u8(src + col)));
-#endif
+ uint16x8_t d0;
+ filter4_u16x8<coeff4>(s0, filter, offset, maxVal, d0);
- int16x4_t tmp = vreinterpret_s16_u16(vshl_n_u16(in, shift));
- tmp = vsub_s16(tmp, vget_low_s16(off));
- vst1_s16(dst + col, tmp);
- }
- for (; col < width; col += 2)
- {
- uint16x4_t in;
+ vst1q_u16(dst + col, d0);
+ }
-#if HIGH_BIT_DEPTH
- in = vld1_u16(src + col);
-#else
- in = vget_low_u16(vmovl_u8(vld1_u8(src + col)));
-#endif
+ if (width == 6)
+ {
+ uint16x8_t s0[N_TAPS];
+ load_u16x8xn<4>(src, 1, s0);
- int16x4_t tmp = vreinterpret_s16_u16(vshl_n_u16(in, shift));
- tmp = vsub_s16(tmp, vget_low_s16(off));
- store_s16x2xn<1>(dst + col, dstStride, &tmp);
+ uint16x8_t d0;
+ filter4_u16x8<coeff4>(s0, filter, offset, maxVal, d0);
+
+ store_u16x6xn<1>(dst, dstStride, &d0);
+ }
+ else if (width % 8 != 0)
+ {
+ uint16x4_t s0[N_TAPS];
+ load_u16x4xn<4>(src + col, 1, s0);
+
+ uint16x4_t d0;
+ filter4_u16x4<coeff4>(s0, filter, offset,
+ vget_low_u16(maxVal), d0);
+
+ if (width == 2)
+ {
+ store_u16x2xn<1>(dst + col, dstStride, &d0);
+ }
+ else
+ {
+ vst1_u16(dst + col, d0);
+ }
+ }
}
src += srcStride;
@@ -2116,73 +2248,205 @@ void filterPixelToShort_neon(const pixel *src, intptr_t srcStride, int16_t *dst,
}
}
-#if HIGH_BIT_DEPTH
-template<int N, int width, int height>
-void interp_horiz_pp_neon(const pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+template<int coeffIdx>
+void inline filter8_u16x4(const uint16x4_t *s, uint16x4_t &d,
+ uint16x8_t filter, uint16x4_t maxVal)
{
- const int16_t *coeff = (N == 4) ? g_chromaFilter[coeffIdx] : g_lumaFilter[coeffIdx];
- int headRoom = IF_FILTER_PREC;
- int offset = (1 << (headRoom - 1));
- uint16_t maxVal = (1 << X265_DEPTH) - 1;
- int cStride = 1;
+ if (coeffIdx == 1)
+ {
+ // { -1, 4, -10, 58, 17, -5, 1, 0 }
+ uint16x4_t sum0156 = vsub_u16(s[6], s[0]);
+ sum0156 = vmla_laneq_u16(sum0156, s[1], filter, 1);
+ sum0156 = vmls_laneq_u16(sum0156, s[5], filter, 5);
- src -= (N / 2 - 1) * cStride;
- int16x8_t vc = vld1q_s16(coeff);
- int16x4_t low_vc = vget_low_s16(vc);
- int16x4_t high_vc = vget_high_s16(vc);
+ uint32x4_t sum234 = vmull_laneq_u16(s[3], filter, 3);
+ sum234 = vmlsl_laneq_u16(sum234, s[2], filter, 2);
+ sum234 = vmlal_laneq_u16(sum234, s[4], filter, 4);
- const int32x4_t voffset = vdupq_n_s32(offset);
- const int32x4_t vhr = vdupq_n_s32(-headRoom);
+ int32x4_t sum = vaddw_s16(vreinterpretq_s32_u32(sum234),
+ vreinterpret_s16_u16(sum0156));
- int row, col;
- for (row = 0; row < height; row++)
+ d = vqrshrun_n_s32(sum, IF_FILTER_PREC);
+ d = vmin_u16(d, maxVal);
+ }
+ else if (coeffIdx == 2)
{
- for (col = 0; col < width; col += 8)
- {
- int32x4_t vsum1, vsum2;
+ // { -1, 4, -11, 40, 40, -11, 4, -1 }
+ uint16x4_t sum07 = vadd_u16(s[0], s[7]);
+ uint16x4_t sum16 = vadd_u16(s[1], s[6]);
+ uint16x4_t sum25 = vadd_u16(s[2], s[5]);
+ uint16x4_t sum34 = vadd_u16(s[3], s[4]);
- int16x8_t input[N];
+ uint16x4_t sum0167 = vshl_n_u16(sum16, 2);
+ sum0167 = vsub_u16(sum0167, sum07);
- for (int i = 0; i < N; i++)
- {
- input[i] = vreinterpretq_s16_u16(vld1q_u16(src + col + i));
- }
- vsum1 = voffset;
- vsum2 = voffset;
+ uint32x4_t sum2345 = vmull_laneq_u16(sum34, filter, 3);
+ sum2345 = vmlsl_laneq_u16(sum2345, sum25, filter, 2);
- vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[0]), low_vc, 0);
- vsum2 = vmlal_high_lane_s16(vsum2, input[0], low_vc, 0);
+ int32x4_t sum = vaddw_s16(vreinterpretq_s32_u32(sum2345),
+ vreinterpret_s16_u16(sum0167));
- vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[1]), low_vc, 1);
- vsum2 = vmlal_high_lane_s16(vsum2, input[1], low_vc, 1);
+ d = vqrshrun_n_s32(sum, IF_FILTER_PREC);
+ d = vmin_u16(d, maxVal);
+ }
+ else
+ {
+ // { 0, 1, -5, 17, 58, -10, 4, -1 }
+ uint16x4_t sum1267 = vsub_u16(s[1], s[7]);
+ sum1267 = vmls_laneq_u16(sum1267, s[2], filter, 2);
+ sum1267 = vmla_laneq_u16(sum1267, s[6], filter, 6);
- vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[2]), low_vc, 2);
- vsum2 = vmlal_high_lane_s16(vsum2, input[2], low_vc, 2);
+ uint32x4_t sum345 = vmull_laneq_u16(s[3], filter, 3);
+ sum345 = vmlal_laneq_u16(sum345, s[4], filter, 4);
+ sum345 = vmlsl_laneq_u16(sum345, s[5], filter, 5);
- vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[3]), low_vc, 3);
- vsum2 = vmlal_high_lane_s16(vsum2, input[3], low_vc, 3);
+ int32x4_t sum = vaddw_s16(vreinterpretq_s32_u32(sum345),
+ vreinterpret_s16_u16(sum1267));
- if (N == 8)
+ d = vqrshrun_n_s32(sum, IF_FILTER_PREC);
+ d = vmin_u16(d, maxVal);
+ }
+}
+
+template<int coeffIdx>
+void inline filter8_u16x8(const uint16x8_t *s, uint16x8_t &d, uint16x8_t filter,
+ uint16x8_t maxVal)
+{
+ if (coeffIdx == 1)
+ {
+ // { -1, 4, -10, 58, 17, -5, 1, 0 }
+ uint16x8_t sum0156 = vsubq_u16(s[6], s[0]);
+ sum0156 = vmlaq_laneq_u16(sum0156, s[1], filter, 1);
+ sum0156 = vmlsq_laneq_u16(sum0156, s[5], filter, 5);
+
+ uint32x4_t sum234_lo = vmull_laneq_u16(vget_low_u16(s[3]), filter, 3);
+ sum234_lo = vmlsl_laneq_u16(sum234_lo, vget_low_u16(s[2]), filter, 2);
+ sum234_lo = vmlal_laneq_u16(sum234_lo, vget_low_u16(s[4]), filter, 4);
+
+ uint32x4_t sum234_hi = vmull_laneq_u16(vget_high_u16(s[3]), filter, 3);
+ sum234_hi = vmlsl_laneq_u16(sum234_hi, vget_high_u16(s[2]), filter, 2);
+ sum234_hi = vmlal_laneq_u16(sum234_hi, vget_high_u16(s[4]), filter, 4);
+
+ int32x4_t sum_lo = vaddw_s16(vreinterpretq_s32_u32(sum234_lo),
+ vget_low_s16(vreinterpretq_s16_u16(sum0156)));
+ int32x4_t sum_hi = vaddw_s16(vreinterpretq_s32_u32(sum234_hi),
+ vget_high_s16(vreinterpretq_s16_u16(sum0156)));
+
+ uint16x4_t d_lo = vqrshrun_n_s32(sum_lo, IF_FILTER_PREC);
+ uint16x4_t d_hi = vqrshrun_n_s32(sum_hi, IF_FILTER_PREC);
+ d = vminq_u16(vcombine_u16(d_lo, d_hi), maxVal);
+ }
+ else if (coeffIdx == 2)
+ {
+ // { -1, 4, -11, 40, 40, -11, 4, -1 }
+ uint16x8_t sum07 = vaddq_u16(s[0], s[7]);
+ uint16x8_t sum16 = vaddq_u16(s[1], s[6]);
+ uint16x8_t sum25 = vaddq_u16(s[2], s[5]);
+ uint16x8_t sum34 = vaddq_u16(s[3], s[4]);
+
+ uint16x8_t sum0167 = vshlq_n_u16(sum16, 2);
+ sum0167 = vsubq_u16(sum0167, sum07);
+
+ uint32x4_t sum2345_lo = vmull_laneq_u16(vget_low_u16(sum34),
+ filter, 3);
+ sum2345_lo = vmlsl_laneq_u16(sum2345_lo, vget_low_u16(sum25),
+ filter, 2);
+
+ uint32x4_t sum2345_hi = vmull_laneq_u16(vget_high_u16(sum34),
+ filter, 3);
+ sum2345_hi = vmlsl_laneq_u16(sum2345_hi, vget_high_u16(sum25),
+ filter, 2);
+
+ int32x4_t sum_lo = vaddw_s16(vreinterpretq_s32_u32(sum2345_lo),
+ vget_low_s16(vreinterpretq_s16_u16(sum0167)));
+ int32x4_t sum_hi = vaddw_s16(vreinterpretq_s32_u32(sum2345_hi),
+ vget_high_s16(vreinterpretq_s16_u16(sum0167)));
+
+ uint16x4_t d_lo = vqrshrun_n_s32(sum_lo, IF_FILTER_PREC);
+ uint16x4_t d_hi = vqrshrun_n_s32(sum_hi, IF_FILTER_PREC);
+ d = vminq_u16(vcombine_u16(d_lo, d_hi), maxVal);
+ }
+ else
+ {
+ // { 0, 1, -5, 17, 58, -10, 4, -1 }
+ uint16x8_t sum1267 = vsubq_u16(s[1], s[7]);
+ sum1267 = vmlsq_laneq_u16(sum1267, s[2], filter, 2);
+ sum1267 = vmlaq_laneq_u16(sum1267, s[6], filter, 6);
+
+ uint32x4_t sum345_lo = vmull_laneq_u16(vget_low_u16(s[3]), filter, 3);
+ sum345_lo = vmlal_laneq_u16(sum345_lo, vget_low_u16(s[4]), filter, 4);
+ sum345_lo = vmlsl_laneq_u16(sum345_lo, vget_low_u16(s[5]), filter, 5);
+
+ uint32x4_t sum345_hi = vmull_laneq_u16(vget_high_u16(s[3]), filter, 3);
+ sum345_hi = vmlal_laneq_u16(sum345_hi, vget_high_u16(s[4]), filter, 4);
+ sum345_hi = vmlsl_laneq_u16(sum345_hi, vget_high_u16(s[5]), filter, 5);
+
+ int32x4_t sum_lo = vaddw_s16(vreinterpretq_s32_u32(sum345_lo),
+ vget_low_s16(vreinterpretq_s16_u16(sum1267)));
+ int32x4_t sum_hi = vaddw_s16(vreinterpretq_s32_u32(sum345_hi),
+ vget_high_s16(vreinterpretq_s16_u16(sum1267)));
+
+ uint16x4_t d_lo = vqrshrun_n_s32(sum_lo, IF_FILTER_PREC);
+ uint16x4_t d_hi = vqrshrun_n_s32(sum_hi, IF_FILTER_PREC);
+
+ d = vminq_u16(vcombine_u16(d_lo, d_hi), maxVal);
+ }
+}
+
+template<int coeffIdx, int width, int height>
+void inline interp8_horiz_pp_neon(const pixel *src, intptr_t srcStride,
+ pixel *dst, intptr_t dstStride)
+{
+ const int N_TAPS = 8;
+ const uint16x8_t maxVal = vdupq_n_u16((1 << X265_DEPTH) - 1);
+
+ const uint16x8_t filter =
+ vreinterpretq_u16_s16(vabsq_s16(vld1q_s16(X265_NS::g_lumaFilter[coeffIdx])));
+
+ src -= N_TAPS / 2 - 1;
+
+ for (int row = 0; row < height; row++)
+ {
+ if (width % 16 == 0)
+ {
+ for (int col = 0; col < width; col += 16)
{
- vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[4]), high_vc, 0);
- vsum2 = vmlal_high_lane_s16(vsum2, input[4], high_vc, 0);
- vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[5]), high_vc, 1);
- vsum2 = vmlal_high_lane_s16(vsum2, input[5], high_vc, 1);
- vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[6]), high_vc, 2);
- vsum2 = vmlal_high_lane_s16(vsum2, input[6], high_vc, 2);
- vsum1 = vmlal_lane_s16(vsum1, vget_low_s16(input[7]), high_vc, 3);
- vsum2 = vmlal_high_lane_s16(vsum2, input[7], high_vc, 3);
+ uint16x8_t s0[N_TAPS], s1[N_TAPS];
+ load_u16x8xn<8>(src + col + 0, 1, s0);
+ load_u16x8xn<8>(src + col + 8, 1, s1);
+
+ uint16x8_t d0, d1;
+ filter8_u16x8<coeffIdx>(s0, d0, filter, maxVal);
+ filter8_u16x8<coeffIdx>(s1, d1, filter, maxVal);
+
+ vst1q_u16(dst + col + 0, d0);
+ vst1q_u16(dst + col + 8, d1);
+ }
+ }
+ else
+ {
+ int col = 0;
+ for (; col + 8 <= width; col += 8)
+ {
+ uint16x8_t s0[N_TAPS];
+ load_u16x8xn<8>(src + col, 1, s0);
+
+ uint16x8_t d0;
+ filter8_u16x8<coeffIdx>(s0, d0, filter, maxVal);
+ vst1q_u16(dst + col, d0);
}
- vsum1 = vshlq_s32(vsum1, vhr);
- vsum2 = vshlq_s32(vsum2, vhr);
+ if (width % 8 == 4)
+ {
+ uint16x4_t s0[N_TAPS];
+ load_u16x4xn<8>(src + col, 1, s0);
- int16x8_t vsum = vuzp1q_s16(vreinterpretq_s16_s32(vsum1),
- vreinterpretq_s16_s32(vsum2));
- vsum = vminq_s16(vsum, vdupq_n_s16(maxVal));
- vsum = vmaxq_s16(vsum, vdupq_n_s16(0));
- vst1q_u16(dst + col, vreinterpretq_u16_s16(vsum));
+ uint16x4_t d0;
+ filter8_u16x4<coeffIdx>(s0, d0, filter, vget_low_u16(maxVal));
+
+ vst1_u16(dst + col, d0);
+ }
}
src += srcStride;
@@ -2190,7 +2454,16 @@ void interp_horiz_pp_neon(const pixel *src, intptr_t srcStride, pixel *dst, intp
}
}
-#else // HIGH_BIT_DEPTH
+#endif // !HIGH_BIT_DEPTH
+}
+
+namespace X265_NS
+{
+
+#if HIGH_BIT_DEPTH
+#define SHIFT_INTERP_PS (IF_FILTER_PREC - (IF_INTERNAL_PREC - X265_DEPTH))
+#endif
+
template<int N, int width, int height>
void interp_horiz_pp_neon(const pixel *src, intptr_t srcStride, pixel *dst,
intptr_t dstStride, int coeffIdx)
@@ -2226,8 +2499,6 @@ void interp_horiz_pp_neon(const pixel *src, intptr_t srcStride, pixel *dst,
}
}
-#endif // HIGH_BIT_DEPTH
-
#if HIGH_BIT_DEPTH
template<int N, int width, int height>
@@ -2676,11 +2947,64 @@ void interp_hv_pp_neon(const pixel *src, intptr_t srcStride, pixel *dst, intptr_
interp_vert_sp_neon<N, width, height>(immed + (N / 2 - 1) * width, width, dst, dstStride, idxY);
}
+template<int width, int height>
+void filterPixelToShort_neon(const pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride)
+{
+ const int shift = IF_INTERNAL_PREC - X265_DEPTH;
+ const int16x8_t off = vdupq_n_s16(IF_INTERNAL_OFFS);
+ for (int row = 0; row < height; row++)
+ {
+ int col = 0;
+ for (; col + 8 <= width; col += 8)
+ {
+ uint16x8_t in;
+#if HIGH_BIT_DEPTH
+ in = vld1q_u16(src + col);
+#else
+ in = vmovl_u8(vld1_u8(src + col));
+#endif
+ int16x8_t tmp = vreinterpretq_s16_u16(vshlq_n_u16(in, shift));
+ tmp = vsubq_s16(tmp, off);
+ vst1q_s16(dst + col, tmp);
+ }
+ for (; col + 4 <= width; col += 4)
+ {
+ uint16x4_t in;
+
+#if HIGH_BIT_DEPTH
+ in = vld1_u16(src + col);
+#else
+ in = vget_low_u16(vmovl_u8(vld1_u8(src + col)));
+#endif
+
+ int16x4_t tmp = vreinterpret_s16_u16(vshl_n_u16(in, shift));
+ tmp = vsub_s16(tmp, vget_low_s16(off));
+ vst1_s16(dst + col, tmp);
+ }
+
+ for (; col < width; col += 2)
+ {
+ uint16x4_t in;
+
+#if HIGH_BIT_DEPTH
+ in = vld1_u16(src + col);
+#else
+ in = vget_low_u16(vmovl_u8(vld1_u8(src + col)));
+#endif
+ int16x4_t tmp = vreinterpret_s16_u16(vshl_n_u16(in, shift));
+ tmp = vsub_s16(tmp, vget_low_s16(off));
+ store_s16x2xn<1>(dst + col, dstStride, &tmp);
+ }
+
+ src += srcStride;
+ dst += dstStride;
+ }
+}
#define CHROMA_420(W, H) \
p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].filter_hpp = interp_horiz_pp_neon<4, W, H>; \
@@ -2834,6 +3158,36 @@ void setupFilterPrimitives_neon(EncoderPrimitives &p)
CHROMA_444(64, 32);
CHROMA_444(64, 48);
CHROMA_444(64, 64);
+
+#if HIGH_BIT_DEPTH
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].filter_hpp = interp_horiz_pp_neon<4, 2, 4>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].filter_hpp = interp_horiz_pp_neon<4, 2, 8>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].filter_hpp = interp_horiz_pp_neon<4, 4, 2>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].filter_hpp = interp_horiz_pp_neon<4, 4, 4>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].filter_hpp = interp_horiz_pp_neon<4, 4, 8>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].filter_hpp = interp_horiz_pp_neon<4, 4, 16>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].filter_hpp = interp_horiz_pp_neon<4, 6, 8>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].filter_hpp = interp_horiz_pp_neon<4, 12, 16>;
+
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].filter_hpp = interp_horiz_pp_neon<4, 2, 8>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].filter_hpp = interp_horiz_pp_neon<4, 2, 16>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].filter_hpp = interp_horiz_pp_neon<4, 4, 4>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].filter_hpp = interp_horiz_pp_neon<4, 4, 8>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].filter_hpp = interp_horiz_pp_neon<4, 4, 16>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].filter_hpp = interp_horiz_pp_neon<4, 4, 32>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].filter_hpp = interp_horiz_pp_neon<4, 6, 16>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].filter_hpp = interp_horiz_pp_neon<4, 12, 32>;
+
+ p.chroma[X265_CSP_I444].pu[LUMA_4x4].filter_hpp = interp_horiz_pp_neon<4, 4, 4>;
+ p.chroma[X265_CSP_I444].pu[LUMA_4x8].filter_hpp = interp_horiz_pp_neon<4, 4, 8>;
+ p.chroma[X265_CSP_I444].pu[LUMA_4x16].filter_hpp = interp_horiz_pp_neon<4, 4, 16>;
+ p.chroma[X265_CSP_I444].pu[LUMA_12x16].filter_hpp = interp_horiz_pp_neon<4, 12, 16>;
+
+ p.pu[LUMA_4x4].luma_hpp = interp_horiz_pp_neon<8, 4, 4>;
+ p.pu[LUMA_4x8].luma_hpp = interp_horiz_pp_neon<8, 4, 8>;
+ p.pu[LUMA_4x16].luma_hpp = interp_horiz_pp_neon<8, 4, 16>;
+ p.pu[LUMA_12x16].luma_hpp = interp_horiz_pp_neon<8, 12, 16>;
+#endif // HIGH_BIT_DEPTH
}
};
diff --git a/source/common/aarch64/mem-neon.h b/source/common/aarch64/mem-neon.h
index 90788a938..2c6edfccb 100644
--- a/source/common/aarch64/mem-neon.h
+++ b/source/common/aarch64/mem-neon.h
@@ -1,7 +1,8 @@
/*****************************************************************************
- * Copyright (C) 2024 MulticoreWare, Inc
+ * Copyright (C) 2024-2025 MulticoreWare, Inc
*
* Authors: Hari Limaye <hari.limaye at arm.com>
+ * Gerda Zsejke More <gerdazsejke.more at arm.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -185,6 +186,51 @@ static void inline load_s16x8xn(const int16_t *src, const intptr_t stride,
}
}
+template<int N>
+static void inline load_u16x4xn(const uint16_t *src, const intptr_t stride,
+ uint16x4_t *dst)
+{
+ for (int i = 0; i < N; ++i)
+ {
+ dst[i] = vld1_u16(src);
+ src += stride;
+ }
+}
+
+template<int N>
+static void inline load_u16x8xn(const uint16_t *src, const intptr_t stride,
+ uint16x8_t *dst)
+{
+ for (int i = 0; i < N; ++i)
+ {
+ dst[i] = vld1q_u16(src);
+ src += stride;
+ }
+}
+
+template<int N>
+static void inline store_u16x2xn(uint16_t *dst, intptr_t dst_stride,
+ const uint16x4_t *src)
+{
+ for (int i = 0; i < N; ++i)
+ {
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u16(src[i]), 0);
+ dst += dst_stride;
+ }
+}
+
+template<int N>
+static void inline store_u16x6xn(uint16_t *dst, intptr_t dst_stride,
+ const uint16x8_t *src)
+{
+ for (int i = 0; i < N; ++i)
+ {
+ vst1_u16(dst, vget_low_u16(src[i]));
+ vst1q_lane_u32((uint32_t *)(dst + 4), vreinterpretq_u32_u16(src[i]), 2);
+ dst += dst_stride;
+ }
+}
+
template<int N>
static void inline store_s16x2xn(int16_t *dst, intptr_t dst_stride,
const int16x4_t *src)
--
2.39.5 (Apple Git-154)