[x265] [PATCH 03/14] AArch64: Add Armv8.4 Neon DotProd implementations of luma_hps
Hari Limaye
hari.limaye at arm.com
Fri Sep 6 13:33:12 UTC 2024
Add implementations of luma_hps primitives using Neon DotProd
instructions, which are mandatory from Armv8.4.
Geomean uplift across all block sizes for the luma filters, relative to
the Armv8.0 Neon implementations:
Neoverse N1: 1.421x
Neoverse N2: 1.275x
Neoverse V1: 1.375x
Neoverse V2: 1.387x
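
For context: these kernels feed unsigned 8-bit samples to the signed SDOT
instruction by subtracting 128 from every sample. That biases each 8-tap
dot product by -128 * 64 = -8192 (the coefficients of every HEVC luma
filter sum to 64), which is exactly the -IF_INTERNAL_OFFS offset that the
'ps' primitives apply at 8-bit depth, so no correction constant is needed.
A minimal scalar model of that arithmetic, for illustration only
(luma_hps_model is not part of the patch):

    #include <stdint.h>

    enum { IF_FILTER_PREC = 6, IF_INTERNAL_PREC = 14 };
    enum { IF_INTERNAL_OFFS = 1 << (IF_INTERNAL_PREC - 1) }; // 8192

    // One 8-tap 'ps' output at 8-bit depth, where the shift
    // IF_FILTER_PREC - (IF_INTERNAL_PREC - 8) is 0.
    static int16_t luma_hps_model(const uint8_t *s, const int8_t f[8])
    {
        int sum = 0;
        for (int i = 0; i < 8; i++)
            sum += (s[i] - 128) * f[i]; // biased samples, as fed to SDOT
        // sum == dot(s, f) - 128 * 64 == dot(s, f) - IF_INTERNAL_OFFS:
        // the bias has already applied the required 'ps' offset.
        return (int16_t)sum;
    }
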
---
source/common/aarch64/filter-neon-dotprod.cpp | 198 +++++++++++++++++-
1 file changed, 197 insertions(+), 1 deletion(-)
diff --git a/source/common/aarch64/filter-neon-dotprod.cpp b/source/common/aarch64/filter-neon-dotprod.cpp
index 31269d300..4cd052fb0 100644
--- a/source/common/aarch64/filter-neon-dotprod.cpp
+++ b/source/common/aarch64/filter-neon-dotprod.cpp
@@ -108,6 +108,84 @@ uint8x8_t inline filter8_8_pp_reuse(uint8x16_t samples, const int8x8_t filter,
                                      vmovn_s32(dotprod_hi));
     return vqrshrun_n_s16(dotprod, IF_FILTER_PREC);
 }
+
+int16x4_t inline filter8_4_ps(uint8x16_t samples, const int8x8_t filter,
+                              const uint8x16x3_t tbl)
+{
+    // Transform sample range from uint8_t to int8_t for signed dot product.
+    int8x16_t samples_s8 =
+        vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
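+    // The filter taps are signed, so SDOT is used: bias the unsigned
+    // samples into int8_t range here and fold the bias into the
+    // accumulator constant below.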
+
+    // Permute input samples for dot product.
+    // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+    int8x16_t perm_samples_0 = vqtbl1q_s8(samples_s8, tbl.val[0]);
+    // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+    int8x16_t perm_samples_1 = vqtbl1q_s8(samples_s8, tbl.val[1]);
+
+    // The -128 sample bias contributes -128 * 64 = -8192 to each dot
+    // product, exactly the -IF_INTERNAL_OFFS 'ps' offset at 8-bit depth,
+    // so the correction constant cancels to 0.
+    int32x4_t constant = vdupq_n_s32(0);
+    int32x4_t dotprod = vdotq_lane_s32(constant, perm_samples_0, filter, 0);
+    dotprod = vdotq_lane_s32(dotprod, perm_samples_1, filter, 1);
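+    // Lane i of dotprod now holds the full 8-tap sum for output i:
+    // dot(s[i..i+3], f[0..3]) + dot(s[i+4..i+7], f[4..7]).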
+
+    // Narrow.
+    return vmovn_s32(dotprod);
+}
+
+int16x8_t inline filter8_8_ps(uint8x16_t samples, const int8x8_t filter,
+                              const uint8x16x3_t tbl)
+{
+    // Transform sample range from uint8_t to int8_t for signed dot product.
+    int8x16_t samples_s8 =
+        vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
+
+    // Permute input samples for dot product.
+    // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+    int8x16_t perm_samples_0 = vqtbl1q_s8(samples_s8, tbl.val[0]);
+    // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+    int8x16_t perm_samples_1 = vqtbl1q_s8(samples_s8, tbl.val[1]);
+    // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+    int8x16_t perm_samples_2 = vqtbl1q_s8(samples_s8, tbl.val[2]);
+
+    // Correction accounting for sample range transform cancels to 0.
+    int32x4_t constant = vdupq_n_s32(0);
+    int32x4_t dotprod_lo = vdotq_lane_s32(constant, perm_samples_0, filter, 0);
+    int32x4_t dotprod_hi = vdotq_lane_s32(constant, perm_samples_1, filter, 0);
+    dotprod_lo = vdotq_lane_s32(dotprod_lo, perm_samples_1, filter, 1);
+    dotprod_hi = vdotq_lane_s32(dotprod_hi, perm_samples_2, filter, 1);
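+    // perm_samples_1 is shared: it supplies taps 4-7 for outputs 0-3 and
+    // taps 0-3 for outputs 4-7.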
+
+    // Narrow and combine.
+    return vcombine_s16(vmovn_s32(dotprod_lo), vmovn_s32(dotprod_hi));
+}
+
+int16x8_t inline filter8_8_ps_reuse(uint8x16_t samples, const int8x8_t filter,
+                                    const uint8x16x3_t tbl,
+                                    int8x16_t &perm_samples_0)
+{
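+    // perm_samples_0 arrives precomputed: the { 0..6 } windows here equal
+    // the { 8..14 } windows of the samples eight columns earlier, saving
+    // one TBL per call.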
+    // Transform sample range from uint8_t to int8_t for signed dot product.
+    int8x16_t samples_s8 =
+        vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
+
+    // Permute input samples for dot product.
+    // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+    // Already in perm_samples_0.
+    // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+    int8x16_t perm_samples_1 = vqtbl1q_s8(samples_s8, tbl.val[1]);
+    // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+    int8x16_t perm_samples_2 = vqtbl1q_s8(samples_s8, tbl.val[2]);
+
+    // Correction accounting for sample range transform cancels to 0.
+    int32x4_t constant = vdupq_n_s32(0);
+    int32x4_t dotprod_lo = vdotq_lane_s32(constant, perm_samples_0, filter, 0);
+    int32x4_t dotprod_hi = vdotq_lane_s32(constant, perm_samples_1, filter, 0);
+    dotprod_lo = vdotq_lane_s32(dotprod_lo, perm_samples_1, filter, 1);
+    dotprod_hi = vdotq_lane_s32(dotprod_hi, perm_samples_2, filter, 1);
+
+    // Save for re-use in next iteration.
+    perm_samples_0 = perm_samples_2;
+
+    // Narrow and combine.
+    return vcombine_s16(vmovn_s32(dotprod_lo), vmovn_s32(dotprod_hi));
+}
 } // Unnamed namespace.

 namespace X265_NS {
@@ -193,8 +271,126 @@ void interp8_horiz_pp_dotprod(const uint8_t *src, intptr_t srcStride,
     }
 }

+template<int width, int height>
+void interp8_horiz_ps_dotprod(const uint8_t *src, intptr_t srcStride,
+                              int16_t *dst, intptr_t dstStride, int coeffIdx,
+                              int isRowExt)
+{
+    const int N_TAPS = 8;
+    int blkheight = height;
+
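+    // isRowExt requests N_TAPS - 1 = 7 extra output rows (3 above, 4
+    // below) so that the result can feed a following vertical pass.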
+    src -= N_TAPS / 2 - 1;
+    if (isRowExt)
+    {
+        src -= (N_TAPS / 2 - 1) * srcStride;
+        blkheight += N_TAPS - 1;
+    }
+
+    const uint8x16x3_t tbl = vld1q_u8_x3(dotprod_permute_tbl);
+    const int8x8_t filter = vmovn_s16(vld1q_s16(g_lumaFilter[coeffIdx]));
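+    // Narrowing the coefficients is lossless: all luma taps fit in int8_t.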
+
+    for (int row = 0; row + 4 <= blkheight; row += 4)
+    {
+        int col = 0;
+        if (width >= 32)
+        {
+            // Peel first sample permute to enable passing between iterations.
+            uint8x8_t s0[4];
+            load_u8x8xn<4>(src, srcStride, s0);
+            int8x16_t ps0[4];
+            init_sample_permute(s0, tbl, ps0);
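+            // ps0 holds the { 0..6 } windows; each filter8_8_ps_reuse call
+            // below consumes it and hands back its { 8..14 } windows for
+            // the next 8 columns.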
+
+            for (; col + 16 <= width; col += 16)
+            {
+                uint8x16_t s_lo[4], s_hi[4];
+                load_u8x16xn<4>(src + col + 0, srcStride, s_lo);
+                load_u8x16xn<4>(src + col + 8, srcStride, s_hi);
+
+                int16x8_t d_lo[4];
+                d_lo[0] = filter8_8_ps_reuse(s_lo[0], filter, tbl, ps0[0]);
+                d_lo[1] = filter8_8_ps_reuse(s_lo[1], filter, tbl, ps0[1]);
+                d_lo[2] = filter8_8_ps_reuse(s_lo[2], filter, tbl, ps0[2]);
+                d_lo[3] = filter8_8_ps_reuse(s_lo[3], filter, tbl, ps0[3]);
+
+                int16x8_t d_hi[4];
+                d_hi[0] = filter8_8_ps_reuse(s_hi[0], filter, tbl, ps0[0]);
+                d_hi[1] = filter8_8_ps_reuse(s_hi[1], filter, tbl, ps0[1]);
+                d_hi[2] = filter8_8_ps_reuse(s_hi[2], filter, tbl, ps0[2]);
+                d_hi[3] = filter8_8_ps_reuse(s_hi[3], filter, tbl, ps0[3]);
+
+                store_s16x8xn<4>(dst + col + 0, dstStride, d_lo);
+                store_s16x8xn<4>(dst + col + 8, dstStride, d_hi);
+            }
+        }
+        else
+        {
+            for (; col + 8 <= width; col += 8)
+            {
+                uint8x16_t s[4];
+                load_u8x16xn<4>(src + col, srcStride, s);
+
+                int16x8_t d[4];
+                d[0] = filter8_8_ps(s[0], filter, tbl);
+                d[1] = filter8_8_ps(s[1], filter, tbl);
+                d[2] = filter8_8_ps(s[2], filter, tbl);
+                d[3] = filter8_8_ps(s[3], filter, tbl);
+
+                store_s16x8xn<4>(dst + col, dstStride, d);
+            }
+        }
+        for (; col < width; col += 4)
+        {
+            uint8x16_t s[4];
+            load_u8x16xn<4>(src + col, srcStride, s);
+
+            int16x4_t d[4];
+            d[0] = filter8_4_ps(s[0], filter, tbl);
+            d[1] = filter8_4_ps(s[1], filter, tbl);
+            d[2] = filter8_4_ps(s[2], filter, tbl);
+            d[3] = filter8_4_ps(s[3], filter, tbl);
+
+            store_s16x4xn<4>(dst + col, dstStride, d);
+        }
+
+        src += 4 * srcStride;
+        dst += 4 * dstStride;
+    }
+
+    if (isRowExt)
+    {
+        // Process final 3 rows.
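+        // (blkheight = height + 7 with height a multiple of 4, so the
+        // 4-row main loop above leaves exactly 3 rows here.)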
+        int col = 0;
+        for (; (col + 8) <= width; col += 8)
+        {
+            uint8x16_t s[3];
+            load_u8x16xn<3>(src + col, srcStride, s);
+
+            int16x8_t d[3];
+            d[0] = filter8_8_ps(s[0], filter, tbl);
+            d[1] = filter8_8_ps(s[1], filter, tbl);
+            d[2] = filter8_8_ps(s[2], filter, tbl);
+
+            store_s16x8xn<3>(dst + col, dstStride, d);
+        }
+
+        for (; col < width; col += 4)
+        {
+            uint8x16_t s[3];
+            load_u8x16xn<3>(src + col, srcStride, s);
+
+            int16x4_t d[3];
+            d[0] = filter8_4_ps(s[0], filter, tbl);
+            d[1] = filter8_4_ps(s[1], filter, tbl);
+            d[2] = filter8_4_ps(s[2], filter, tbl);
+
+            store_s16x4xn<3>(dst + col, dstStride, d);
+        }
+    }
+}
+
 #define LUMA_DOTPROD(W, H) \
-    p.pu[LUMA_ ## W ## x ## H].luma_hpp = interp8_horiz_pp_dotprod<W, H>;
+    p.pu[LUMA_ ## W ## x ## H].luma_hpp = interp8_horiz_pp_dotprod<W, H>; \
+    p.pu[LUMA_ ## W ## x ## H].luma_hps = interp8_horiz_ps_dotprod<W, H>;

 void setupFilterPrimitives_neon_dotprod(EncoderPrimitives &p)
 {
--
2.42.1