[x265] [PATCH 4/4] AArch64: Add SVE implementation of HBD interp_vert_pp
Gerda Zsejke More
gerdazsejke.more at arm.com
Tue Apr 15 09:38:22 UTC 2025
Add an SVE implementation of HBD interp_vert_pp for CHROMA filtering.
This implementation is enabled only for 12-bit builds, where it gives
up to a 5% uplift over the existing Neon implementation. For 10-bit
builds the existing Neon implementation is kept for these functions.
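
For reference, the computation these kernels perform is the standard
4-tap HEVC chroma vertical filter: each output sample is a weighted sum
of four vertically adjacent source samples, rounded, shifted down by
IF_FILTER_PREC and clamped to the pixel range. A minimal scalar sketch
of that behaviour (illustration only, not part of the patch; it assumes
the usual x265 definitions of g_chromaFilter, IF_FILTER_PREC,
X265_DEPTH and x265_clip3):

static void chroma_vpp_scalar_ref(const uint16_t *src, intptr_t srcStride,
                                  uint16_t *dst, intptr_t dstStride,
                                  int width, int height, int coeffIdx)
{
    const int16_t *taps = g_chromaFilter[coeffIdx];
    const int maxVal = (1 << X265_DEPTH) - 1;

    // The first tap reads one row above the block, matching the
    // src -= (N_TAPS / 2 - 1) * srcStride adjustment in the SVE kernel.
    src -= srcStride;

    for (int y = 0; y < height; y++)
    {
        for (int x = 0; x < width; x++)
        {
            int sum = 0;
            for (int k = 0; k < 4; k++)
                sum += taps[k] * src[x + k * srcStride];

            // Rounding shift, then clamp to [0, (1 << X265_DEPTH) - 1].
            int val = (sum + (1 << (IF_FILTER_PREC - 1))) >> IF_FILTER_PREC;
            dst[x] = (uint16_t)x265_clip3(0, maxVal, val);
        }
        src += srcStride;
        dst += dstStride;
    }
}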
---
source/common/aarch64/filter-prim-sve.cpp | 313 ++++++++++++++++++++++
1 file changed, 313 insertions(+)
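
Implementation note (not part of the change itself): after the
transpose_concat_* helpers, each 64-bit lane of a vector holds four
vertically adjacent 16-bit samples from one output column, and `filter`
carries the four chroma taps replicated into both 64-bit lanes, so one
16-bit to 64-bit dot product (SVE SDOT, wrapped here as x265_sdotq_s16)
yields one filtered column sum per lane. A per-lane model of that
reduction, using a hypothetical helper name purely for illustration:

// Illustration only: what a single 64-bit lane of x265_sdotq_s16() computes.
// `lane` holds rows r..r+3 of one column (packed by transpose_concat_*),
// `taps` holds g_chromaFilter[coeffIdx] as replicated into `filter`.
static inline int64_t sdot_lane_model(int64_t acc, const int16_t lane[4],
                                      const int16_t taps[4])
{
    for (int k = 0; k < 4; k++)
        acc += (int64_t)lane[k] * (int64_t)taps[k];
    return acc;
}

Between output rows, the insert_new_u16_elements_* helpers use TBL
lookups (merge_block_tbl) to slide each of these 4-sample windows down
by one row, so only a single new source row is loaded per output row.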
diff --git a/source/common/aarch64/filter-prim-sve.cpp b/source/common/aarch64/filter-prim-sve.cpp
index ba38dd960..03c041107 100644
--- a/source/common/aarch64/filter-prim-sve.cpp
+++ b/source/common/aarch64/filter-prim-sve.cpp
@@ -537,6 +537,241 @@ void inline interp4_vss_sve(const int16_t *src, intptr_t srcStride, int16_t *dst
}
}
+void inline transpose_concat_u16_4x4(const uint16x4_t s[4], uint16x8_t res[2])
+{
+ // Transpose 16-bit elements:
+ // s0: 00, 01, 02, 03
+ // s1: 10, 11, 12, 13
+ // s2: 20, 21, 22, 23
+ // s3: 30, 31, 32, 33
+ //
+ // res[0]: 00 10 20 30 01 11 21 31
+ // res[1]: 02 12 22 32 03 13 23 33
+
+ uint16x8_t s0q = vcombine_u16(s[0], vdup_n_u16(0));
+ uint16x8_t s1q = vcombine_u16(s[1], vdup_n_u16(0));
+ uint16x8_t s2q = vcombine_u16(s[2], vdup_n_u16(0));
+ uint16x8_t s3q = vcombine_u16(s[3], vdup_n_u16(0));
+
+ uint16x8_t s02 = vzip1q_u16(s0q, s2q);
+ uint16x8_t s13 = vzip1q_u16(s1q, s3q);
+
+ uint16x8x2_t s0123 = vzipq_u16(s02, s13);
+
+ res[0] = s0123.val[0];
+ res[1] = s0123.val[1];
+}
+
+void inline transpose_concat_u16_8x4(const uint16x8_t s[4], uint16x8_t res[4])
+{
+ // Transpose 16-bit elements:
+ // s0: 00, 01, 02, 03, 04, 05, 06, 07
+ // s1: 10, 11, 12, 13, 14, 15, 16, 17
+ // s2: 20, 21, 22, 23, 24, 25, 26, 27
+ // s3: 30, 31, 32, 33, 34, 35, 36, 37
+ //
+ // res[0]: 00 10 20 30 01 11 21 31
+ // res[1]: 02 12 22 32 03 13 23 33
+ // res[2]: 04 14 24 34 05 15 25 35
+ // res[3]: 06 16 26 36 07 17 27 37
+
+ uint16x8x2_t s02 = vzipq_u16(s[0], s[2]);
+ uint16x8x2_t s13 = vzipq_u16(s[1], s[3]);
+
+ uint16x8x2_t s0123_lo = vzipq_u16(s02.val[0], s13.val[0]);
+ uint16x8x2_t s0123_hi = vzipq_u16(s02.val[1], s13.val[1]);
+
+ res[0] = s0123_lo.val[0];
+ res[1] = s0123_lo.val[1];
+ res[2] = s0123_hi.val[0];
+ res[3] = s0123_hi.val[1];
+}
+
+void inline insert_new_u16_elements_x8(uint16x8_t *s, uint16x8_t s_new,
+ uint8x16_t *merge_block_tbl)
+{
+ uint8x16x2_t samples_tbl[4];
+
+ samples_tbl[0].val[0] = vreinterpretq_u8_u16(s[0]);
+ samples_tbl[0].val[1] = vreinterpretq_u8_u16(s_new);
+ s[0] = vreinterpretq_u16_u8(vqtbl2q_u8(samples_tbl[0], merge_block_tbl[0]));
+
+ samples_tbl[1].val[0] = vreinterpretq_u8_u16(s[1]);
+ samples_tbl[1].val[1] = vreinterpretq_u8_u16(s_new);
+ s[1] = vreinterpretq_u16_u8(vqtbl2q_u8(samples_tbl[1], merge_block_tbl[1]));
+
+ samples_tbl[2].val[0] = vreinterpretq_u8_u16(s[2]);
+ samples_tbl[2].val[1] = vreinterpretq_u8_u16(s_new);
+ s[2] = vreinterpretq_u16_u8(vqtbl2q_u8(samples_tbl[2], merge_block_tbl[2]));
+
+ samples_tbl[3].val[0] = vreinterpretq_u8_u16(s[3]);
+ samples_tbl[3].val[1] = vreinterpretq_u8_u16(s_new);
+ s[3] = vreinterpretq_u16_u8(vqtbl2q_u8(samples_tbl[3], merge_block_tbl[3]));
+}
+
+void inline insert_new_u16_elements_x4(uint16x8_t *s, uint16x8_t s_new,
+ uint8x16_t *merge_block_tbl)
+{
+ uint8x16x2_t samples_tbl[2];
+
+ samples_tbl[0].val[0] = vreinterpretq_u8_u16(s[0]);
+ samples_tbl[0].val[1] = vreinterpretq_u8_u16(s_new);
+ s[0] = vreinterpretq_u16_u8(vqtbl2q_u8(samples_tbl[0], merge_block_tbl[0]));
+
+ samples_tbl[1].val[0] = vreinterpretq_u8_u16(s[1]);
+ samples_tbl[1].val[1] = vreinterpretq_u8_u16(s_new);
+ s[1] = vreinterpretq_u16_u8(vqtbl2q_u8(samples_tbl[1], merge_block_tbl[1]));
+}
+
+void inline filter4_u16x4(const uint16x8_t *s, const int16x8_t f2,
+ const int64x2_t offset, const uint16x4_t maxVal,
+ uint16x4_t &d)
+{
+ int64x2_t sum0 = x265_sdotq_s16(offset, vreinterpretq_s16_u16(s[0]), f2);
+ int64x2_t sum1 = x265_sdotq_s16(offset, vreinterpretq_s16_u16(s[1]), f2);
+
+ int32x4_t sum = vcombine_s32(vmovn_s64(sum0), vmovn_s64(sum1));
+
+ d = vqrshrun_n_s32(sum, IF_FILTER_PREC);
+ d = vmin_u16(d, maxVal);
+}
+
+void inline filter4_u16x8(const uint16x8_t *s, const int16x8_t f2,
+ const int64x2_t offset, const uint16x8_t maxVal,
+ uint16x8_t &d)
+{
+ int64x2_t sum0 = x265_sdotq_s16(offset, vreinterpretq_s16_u16(s[0]), f2);
+ int64x2_t sum1 = x265_sdotq_s16(offset, vreinterpretq_s16_u16(s[1]), f2);
+ int64x2_t sum2 = x265_sdotq_s16(offset, vreinterpretq_s16_u16(s[2]), f2);
+ int64x2_t sum3 = x265_sdotq_s16(offset, vreinterpretq_s16_u16(s[3]), f2);
+
+ int32x4_t sum_lo = vcombine_s32(vmovn_s64(sum0), vmovn_s64(sum1));
+ int32x4_t sum_hi = vcombine_s32(vmovn_s64(sum2), vmovn_s64(sum3));
+
+ uint16x4_t d0 = vqrshrun_n_s32(sum_lo, IF_FILTER_PREC);
+ uint16x4_t d1 = vqrshrun_n_s32(sum_hi, IF_FILTER_PREC);
+
+ d = vminq_u16(vcombine_u16(d0, d1), maxVal);
+}
+
+template<int width, int height>
+void inline interp4_vpp_sve(const pixel *src, intptr_t srcStride, pixel *dst,
+ intptr_t dstStride, const int16_t coeffIdx)
+{
+ const int N_TAPS = 4;
+ const uint16x8_t maxVal = vdupq_n_u16((1 << X265_DEPTH) - 1);
+ int16x4_t f = vld1_s16(X265_NS::g_chromaFilter[coeffIdx]);
+ int16x8_t filter = vcombine_s16(f, f);
+ int64x2_t offset = vdupq_n_s64(0);
+ uint8x16_t merge_block_tbl[4];
+
+ merge_block_tbl[0] = vld1q_u8(dotprod_v_permute_tbl + 0);
+ merge_block_tbl[1] = vld1q_u8(dotprod_v_permute_tbl + 16);
+ merge_block_tbl[2] = vld1q_u8(dotprod_v_permute_tbl + 32);
+ merge_block_tbl[3] = vld1q_u8(dotprod_v_permute_tbl + 48);
+
+ src -= (N_TAPS / 2 - 1) * srcStride;
+
+ if (width % 8 != 0)
+ {
+ if (width == 12)
+ {
+ const int n_store = 8;
+ const uint16_t *s = src;
+ uint16_t *d = dst;
+
+ uint16x8_t in[4];
+ load_u16x8xn<4>(s, srcStride, in);
+ s += 4 * srcStride;
+
+ uint16x8_t ss[4];
+ transpose_concat_u16_8x4(in, ss);
+
+ for (int row = 0; row < height - 1; ++row)
+ {
+ uint16x8_t res[4];
+ filter4_u16x8(ss, filter, offset, maxVal, res[0]);
+
+ store_u16xnxm<n_store, 4>(d, dstStride, res);
+
+ uint16x8_t new_r = vld1q_u16(s);
+ insert_new_u16_elements_x8(ss, new_r, merge_block_tbl);
+
+ s += srcStride;
+ d += dstStride;
+ }
+
+ uint16x8_t res[4];
+ filter4_u16x8(ss, filter, offset, maxVal, res[0]);
+ store_u16xnxm<n_store, 4>(d, dstStride, res);
+
+ src += 8;
+ dst += 8;
+ }
+ const int n_store = width > 4 ? 4 : width;
+
+ uint16x4_t in[4];
+ load_u16x4xn<4>(src, srcStride, in);
+ src += 4 * srcStride;
+
+ uint16x8_t ss[4];
+ transpose_concat_u16_4x4(in, ss);
+
+ for (int row = 0; row < height - 1; ++row)
+ {
+ uint16x4_t res;
+ filter4_u16x4(ss, filter, offset, vget_low_u16(maxVal), res);
+
+ store_u16xnxm<n_store, 1>(dst, dstStride, &res);
+
+ uint16x8_t new_r = vld1q_u16(src);
+ insert_new_u16_elements_x4(ss, new_r, merge_block_tbl);
+
+ src += srcStride;
+ dst += dstStride;
+ }
+
+ uint16x4_t res;
+ filter4_u16x4(ss, filter, offset, vget_low_u16(maxVal), res);
+ store_u16xnxm<n_store, 1>(dst, dstStride, &res);
+ }
+ else
+ {
+ for (int col = 0; col < width; col += 8)
+ {
+ const uint16_t *s = src;
+ uint16_t *d = dst;
+
+ uint16x8_t in[4];
+ load_u16x8xn<4>(s, srcStride, in);
+ s += 4 * srcStride;
+
+ uint16x8_t ss[4];
+ transpose_concat_u16_8x4(in, ss);
+ for (int row = 0; row < height - 1; ++row)
+ {
+ uint16x8_t res;
+ filter4_u16x8(ss, filter, offset, maxVal, res);
+
+ vst1q_u16(d, res);
+
+ uint16x8_t new_r = vld1q_u16(s);
+ insert_new_u16_elements_x8(ss, new_r, merge_block_tbl);
+
+ s += srcStride;
+ d += dstStride;
+ }
+
+ uint16x8_t res;
+ filter4_u16x8(ss, filter, offset, maxVal, res);
+ vst1q_u16(d, res);
+
+ src += 8;
+ dst += 8;
+ }
+ }
+}
+
namespace X265_NS {
// Declaration for use in interp8_horiz_pp_sve().
template<int N, int width, int height>
@@ -620,6 +855,26 @@ void interp4_vert_ss_sve(const int16_t *src, intptr_t srcStride, int16_t *dst,
}
}
+// Declaration for use in interp4_vert_pp_sve().
+template<int N, int width, int height>
+void interp_vert_pp_neon(const pixel *src, intptr_t srcStride, pixel *dst,
+ intptr_t dstStride, int coeffIdx);
+
+template<int width, int height>
+void interp4_vert_pp_sve(const pixel *src, intptr_t srcStride, pixel *dst,
+ intptr_t dstStride, int coeffIdx)
+{
+ switch (coeffIdx)
+ {
+ case 4:
+ return interp_vert_pp_neon<4, width, height>(src, srcStride, dst,
+ dstStride, coeffIdx);
+ default:
+ return interp4_vpp_sve<width, height>(src, srcStride, dst,
+ dstStride, coeffIdx);
+ }
+}
+
void setupFilterPrimitives_sve(EncoderPrimitives &p)
{
p.pu[LUMA_4x4].luma_hpp = interp8_horiz_pp_sve<4, 4>;
@@ -698,6 +953,64 @@ void setupFilterPrimitives_sve(EncoderPrimitives &p)
p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vss = interp4_vert_ss_sve<64, 32>;
p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vss = interp4_vert_ss_sve<64, 48>;
p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vss = interp4_vert_ss_sve<64, 64>;
+
+#if X265_DEPTH == 12
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].filter_vpp = interp4_vert_pp_sve<4, 4>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_vpp = interp4_vert_pp_sve<8, 4>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_vpp = interp4_vert_pp_sve<8, 8>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_vpp = interp4_vert_pp_sve<8, 16>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_vpp = interp4_vert_pp_sve<8, 32>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vpp = interp4_vert_pp_sve<16, 4>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vpp = interp4_vert_pp_sve<16, 8>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vpp = interp4_vert_pp_sve<16, 12>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vpp = interp4_vert_pp_sve<16, 16>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vpp = interp4_vert_pp_sve<16, 32>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_vpp = interp4_vert_pp_sve<24, 32>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vpp = interp4_vert_pp_sve<32, 8>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vpp = interp4_vert_pp_sve<32, 16>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vpp = interp4_vert_pp_sve<32, 24>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vpp = interp4_vert_pp_sve<32, 32>;
+
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].filter_vpp = interp4_vert_pp_sve<4, 4>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_vpp = interp4_vert_pp_sve<8, 8>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].filter_vpp = interp4_vert_pp_sve<8, 12>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_vpp = interp4_vert_pp_sve<8, 16>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_vpp = interp4_vert_pp_sve<8, 32>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_vpp = interp4_vert_pp_sve<8, 64>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].filter_vpp = interp4_vert_pp_sve<12, 32>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vpp = interp4_vert_pp_sve<16, 8>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vpp = interp4_vert_pp_sve<16, 16>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vpp = interp4_vert_pp_sve<16, 24>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vpp = interp4_vert_pp_sve<16, 32>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_vpp = interp4_vert_pp_sve<16, 64>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].filter_vpp = interp4_vert_pp_sve<24, 64>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vpp = interp4_vert_pp_sve<32, 16>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vpp = interp4_vert_pp_sve<32, 32>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vpp = interp4_vert_pp_sve<32, 48>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vpp = interp4_vert_pp_sve<32, 64>;
+
+ p.chroma[X265_CSP_I444].pu[LUMA_4x4].filter_vpp = interp4_vert_pp_sve<4, 4>;
+ p.chroma[X265_CSP_I444].pu[LUMA_8x4].filter_vpp = interp4_vert_pp_sve<8, 4>;
+ p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_vpp = interp4_vert_pp_sve<8, 8>;
+ p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_vpp = interp4_vert_pp_sve<8, 16>;
+ p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vpp = interp4_vert_pp_sve<16, 4>;
+ p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vpp = interp4_vert_pp_sve<16, 8>;
+ p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vpp = interp4_vert_pp_sve<16, 12>;
+ p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vpp = interp4_vert_pp_sve<16, 16>;
+ p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vpp = interp4_vert_pp_sve<16, 32>;
+ p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vpp = interp4_vert_pp_sve<16, 64>;
+ p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_vpp = interp4_vert_pp_sve<24, 32>;
+ p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vpp = interp4_vert_pp_sve<32, 8>;
+ p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vpp = interp4_vert_pp_sve<32, 16>;
+ p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vpp = interp4_vert_pp_sve<32, 24>;
+ p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vpp = interp4_vert_pp_sve<32, 32>;
+ p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vpp = interp4_vert_pp_sve<32, 64>;
+ p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vpp = interp4_vert_pp_sve<48, 64>;
+ p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vpp = interp4_vert_pp_sve<64, 16>;
+ p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vpp = interp4_vert_pp_sve<64, 32>;
+ p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vpp = interp4_vert_pp_sve<64, 48>;
+ p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vpp = interp4_vert_pp_sve<64, 64>;
+#endif // if X265_DEPTH == 12
}
} // namespace X265_NS
#else // !HIGH_BIT_DEPTH
--
2.39.5 (Apple Git-154)