[x265] [PATCH 3/4] AArch64: Add SVE implementation of HBD interp_vert_ss
Gerda Zsejke More
gerdazsejke.more at arm.com
Tue Apr 15 09:37:46 UTC 2025
Add an SVE implementation of HBD interp_vert_ss for CHROMA filtering of
block sizes with width >= 8.

This implementation gives up to a 16% performance uplift over the
existing Neon implementation.
---
source/common/aarch64/filter-prim-sve.cpp | 308 ++++++++++++++++++++++
1 file changed, 308 insertions(+)
diff --git a/source/common/aarch64/filter-prim-sve.cpp b/source/common/aarch64/filter-prim-sve.cpp
index b710cd238..ba38dd960 100644
--- a/source/common/aarch64/filter-prim-sve.cpp
+++ b/source/common/aarch64/filter-prim-sve.cpp
@@ -39,6 +39,13 @@ static const uint16_t dotprod_h_permute_tbl[32] = {
// clang-format on
};
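+// TBL2 indices for merging an {old, new} pair of 16-bit sample vectors: each
+// 16-byte row of the table drops the oldest element from each half of the old
+// vector (bytes 0-15) and appends one element of the newly loaded row (bytes
+// 16-31). Rows 0-3 pick new elements 0-1, 2-3, 4-5 and 6-7 respectively.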
+static const uint8_t dotprod_v_permute_tbl[80] = {
+ 2, 3, 4, 5, 6, 7, 16, 17, 10, 11, 12, 13, 14, 15, 18, 19,
+ 2, 3, 4, 5, 6, 7, 20, 21, 10, 11, 12, 13, 14, 15, 22, 23,
+ 2, 3, 4, 5, 6, 7, 24, 25, 10, 11, 12, 13, 14, 15, 26, 27,
+ 2, 3, 4, 5, 6, 7, 28, 29, 10, 11, 12, 13, 14, 15, 30, 31,
+};
+
template<bool coeff2>
void inline filter8_u16x4(const uint16x8_t *s, uint16x4_t &d, int16x8_t filter,
uint16x4_t maxVal)
@@ -299,6 +306,237 @@ void inline interp8_hps_sve(const pixel *src, intptr_t srcStride,
}
}
+void inline transpose_concat_s16_4x4(const int16x4_t s[4], int16x8_t res[2])
+{
+ // Transpose 16-bit elements:
+ // s0: 00, 01, 02, 03
+ // s1: 10, 11, 12, 13
+ // s2: 20, 21, 22, 23
+ // s3: 30, 31, 32, 33
+ //
+ // res[0]: 00 10 20 30 01 11 21 31
+ // res[1]: 02 12 22 32 03 13 23 33
+
+ int16x8_t s0q = vcombine_s16(s[0], vdup_n_s16(0));
+ int16x8_t s1q = vcombine_s16(s[1], vdup_n_s16(0));
+ int16x8_t s2q = vcombine_s16(s[2], vdup_n_s16(0));
+ int16x8_t s3q = vcombine_s16(s[3], vdup_n_s16(0));
+
+ int16x8_t s02 = vzip1q_s16(s0q, s2q);
+ int16x8_t s13 = vzip1q_s16(s1q, s3q);
+
+ int16x8x2_t s0123 = vzipq_s16(s02, s13);
+
+ res[0] = s0123.val[0];
+ res[1] = s0123.val[1];
+}
+
+void inline transpose_concat_s16_8x4(const int16x8_t s[4], int16x8_t res[4])
+{
+ // Transpose 16-bit elements:
+ // s0: 00, 01, 02, 03, 04, 05, 06, 07
+ // s1: 10, 11, 12, 13, 14, 15, 16, 17
+ // s2: 20, 21, 22, 23, 24, 25, 26, 27
+ // s3: 30, 31, 32, 33, 34, 35, 36, 37
+ //
+ // res[0]: 00 10 20 30 01 11 21 31
+ // res[1]: 02 12 22 32 03 13 23 33
+ // res[2]: 04 14 24 34 05 15 25 35
+ // res[3]: 06 16 26 36 07 17 27 37
+
+ int16x8x2_t s02 = vzipq_s16(s[0], s[2]);
+ int16x8x2_t s13 = vzipq_s16(s[1], s[3]);
+
+ int16x8x2_t s0123_lo = vzipq_s16(s02.val[0], s13.val[0]);
+ int16x8x2_t s0123_hi = vzipq_s16(s02.val[1], s13.val[1]);
+
+ res[0] = s0123_lo.val[0];
+ res[1] = s0123_lo.val[1];
+ res[2] = s0123_hi.val[0];
+ res[3] = s0123_hi.val[1];
+}
+
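+// Slide the transposed 4-tap sample windows in s down by one row: drop the
+// oldest sample of each window and insert the matching element of s_new,
+// using the TBL2 indices in merge_block_tbl.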
+void inline insert_new_s16_elements_x8(int16x8_t *s, int16x8_t s_new,
+ uint8x16_t *merge_block_tbl)
+{
+ int8x16x2_t samples_tbl[4];
+
+ samples_tbl[0].val[0] = vreinterpretq_s8_s16(s[0]);
+ samples_tbl[0].val[1] = vreinterpretq_s8_s16(s_new);
+ s[0] = vreinterpretq_s16_s8(vqtbl2q_s8(samples_tbl[0], merge_block_tbl[0]));
+
+ samples_tbl[1].val[0] = vreinterpretq_s8_s16(s[1]);
+ samples_tbl[1].val[1] = vreinterpretq_s8_s16(s_new);
+ s[1] = vreinterpretq_s16_s8(vqtbl2q_s8(samples_tbl[1], merge_block_tbl[1]));
+
+ samples_tbl[2].val[0] = vreinterpretq_s8_s16(s[2]);
+ samples_tbl[2].val[1] = vreinterpretq_s8_s16(s_new);
+ s[2] = vreinterpretq_s16_s8(vqtbl2q_s8(samples_tbl[2], merge_block_tbl[2]));
+
+ samples_tbl[3].val[0] = vreinterpretq_s8_s16(s[3]);
+ samples_tbl[3].val[1] = vreinterpretq_s8_s16(s_new);
+ s[3] = vreinterpretq_s16_s8(vqtbl2q_s8(samples_tbl[3], merge_block_tbl[3]));
+}
+
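+// As above, but for the two-vector window used by the four-wide column path.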
+void inline insert_new_s16_elements_x4(int16x8_t *s, int16x8_t s_new,
+ uint8x16_t *merge_block_tbl)
+{
+ int8x16x2_t samples_tbl[2];
+
+ samples_tbl[0].val[0] = vreinterpretq_s8_s16(s[0]);
+ samples_tbl[0].val[1] = vreinterpretq_s8_s16(s_new);
+ s[0] = vreinterpretq_s16_s8(vqtbl2q_s8(samples_tbl[0], merge_block_tbl[0]));
+
+ samples_tbl[1].val[0] = vreinterpretq_s8_s16(s[1]);
+ samples_tbl[1].val[1] = vreinterpretq_s8_s16(s_new);
+ s[1] = vreinterpretq_s16_s8(vqtbl2q_s8(samples_tbl[1], merge_block_tbl[1]));
+}
+
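+// Apply the 4-tap filter with SVE SDOT: each 64-bit lane accumulates one
+// 4-tap dot product, so the two sample vectors yield four results, which are
+// narrowed and shifted right by IF_FILTER_PREC.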
+void inline filter4_s16x4(const int16x8_t *ss, const int16x8_t filter,
+ const int64x2_t offset, int16x4_t &d)
+{
+ int64x2_t sum0 = x265_sdotq_s16(offset, ss[0], filter);
+ int64x2_t sum1 = x265_sdotq_s16(offset, ss[1], filter);
+ int32x4_t sum = vcombine_s32(vmovn_s64(sum0), vmovn_s64(sum1));
+
+ d = vshrn_n_s32(sum, IF_FILTER_PREC);
+}
+
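+// As filter4_s16x4, but filters four sample vectors to produce eight results.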
+void inline filter4_s16x8(const int16x8_t *ss, const int16x8_t filter,
+ const int64x2_t offset, int16x8_t &d)
+{
+ int64x2_t sum0 = x265_sdotq_s16(offset, ss[0], filter);
+ int64x2_t sum1 = x265_sdotq_s16(offset, ss[1], filter);
+ int64x2_t sum2 = x265_sdotq_s16(offset, ss[2], filter);
+ int64x2_t sum3 = x265_sdotq_s16(offset, ss[3], filter);
+
+ int32x4_t sum_lo = vcombine_s32(vmovn_s64(sum0), vmovn_s64(sum1));
+ int32x4_t sum_hi = vcombine_s32(vmovn_s64(sum2), vmovn_s64(sum3));
+
+ int16x4_t d0 = vshrn_n_s32(sum_lo, IF_FILTER_PREC);
+ int16x4_t d1 = vshrn_n_s32(sum_hi, IF_FILTER_PREC);
+
+ d = vcombine_s16(d0, d1);
+}
+
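+// Vertical 4-tap chroma ss filter (16-bit input, 16-bit output). Sample rows
+// are kept in transposed form so that each output row costs only one row
+// load, one TBL2 merge and one SDOT per sample vector.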
+template<int width, int height>
+void inline interp4_vss_sve(const int16_t *src, intptr_t srcStride, int16_t *dst,
+ intptr_t dstStride, const int16_t coeffIdx)
+{
+ const int N_TAPS = 4;
+ int16x4_t f = vld1_s16(X265_NS::g_chromaFilter[coeffIdx]);
+ int16x8_t filter = vcombine_s16(f, f);
+ int64x2_t offset = vdupq_n_s64(0);
+ uint8x16_t merge_block_tbl[4];
+
+ merge_block_tbl[0] = vld1q_u8(dotprod_v_permute_tbl + 0);
+ merge_block_tbl[1] = vld1q_u8(dotprod_v_permute_tbl + 16);
+ merge_block_tbl[2] = vld1q_u8(dotprod_v_permute_tbl + 32);
+ merge_block_tbl[3] = vld1q_u8(dotprod_v_permute_tbl + 48);
+
+ src -= (N_TAPS / 2 - 1) * srcStride;
+
+ if (width % 8 != 0)
+ {
+ if (width == 12)
+ {
+ const int n_store = 8;
+ const int16_t *s = src;
+ int16_t *d = dst;
+
+ int16x8_t in[4];
+ load_s16x8xn<4>(s, srcStride, in);
+ s += 4 * srcStride;
+
+ int16x8_t ss[4];
+ transpose_concat_s16_8x4(in, ss);
+
+ for (int row = 0; row < height - 1; ++row)
+ {
+ int16x8_t res;
+ filter4_s16x8(ss, filter, offset, res);
+
+ store_s16xnxm<n_store, 1>(&res, d, dstStride);
+
+ int16x8_t new_r = vld1q_s16(s);
+ insert_new_s16_elements_x8(ss, new_r, merge_block_tbl);
+
+ s += srcStride;
+ d += dstStride;
+ }
+
+ int16x8_t res;
+ filter4_s16x8(ss, filter, offset, res);
+ store_s16xnxm<n_store, 1>(&res, d, dstStride);
+
+ src += 8;
+ dst += 8;
+ }
+ const int n_store = width > 4 ? 4 : width;
+
+ int16x4_t in[4];
+ load_s16x4xn<4>(src, srcStride, in);
+ src += 4 * srcStride;
+
+ int16x8_t ss[2];
+ transpose_concat_s16_4x4(in, ss);
+
+ for (int row = 0; row < height - 1; ++row)
+ {
+ int16x4_t res;
+ filter4_s16x4(ss, filter, offset, res);
+
+ store_s16xnxm<n_store, 1>(&res, dst, dstStride);
+
+ int16x8_t new_r = vld1q_s16(src);
+ insert_new_s16_elements_x4(ss, new_r, merge_block_tbl);
+
+ src += srcStride;
+ dst += dstStride;
+ }
+
+ int16x4_t res;
+ filter4_s16x4(ss, filter, offset, res);
+ store_s16xnxm<n_store, 1>(&res, dst, dstStride);
+ }
+ else
+ {
+ for (int col = 0; col < width; col += 8)
+ {
+ const int16_t *s = src;
+ int16_t *d = dst;
+
+ int16x8_t in[4];
+ load_s16x8xn<4>(s, srcStride, in);
+ s += 4 * srcStride;
+
+ int16x8_t ss[4];
+ transpose_concat_s16_8x4(in, ss);
+
+ for (int row = 0; row < height - 1; ++row)
+ {
+ int16x8_t res;
+ filter4_s16x8(ss, filter, offset, res);
+
+ vst1q_s16(d, res);
+
+ int16x8_t new_r = vld1q_s16(s);
+ insert_new_s16_elements_x8(ss, new_r, merge_block_tbl);
+
+ s += srcStride;
+ d += dstStride;
+ }
+
+ int16x8_t res;
+ filter4_s16x8(ss, filter, offset, res);
+ vst1q_s16(d, res);
+
+ src += 8;
+ dst += 8;
+ }
+ }
+}
+
namespace X265_NS {
// Declaration for use in interp8_horiz_pp_sve().
template<int N, int width, int height>
@@ -362,6 +600,26 @@ void interp8_horiz_ps_sve(const pixel *src, intptr_t srcStride, int16_t *dst,
}
}
+// Declaration for use in interp4_vert_ss_sve().
+template<int N, int width, int height>
+void interp_vert_ss_neon(const int16_t *src, intptr_t srcStride, int16_t *dst,
+ intptr_t dstStride, int coeffIdx);
+
+template<int width, int height>
+void interp4_vert_ss_sve(const int16_t *src, intptr_t srcStride, int16_t *dst,
+ intptr_t dstStride, int coeffIdx)
+{
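+ // Keep the existing Neon implementation for coeffIdx 4; the SVE
+ // dot-product path handles the remaining filters.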
+ switch (coeffIdx)
+ {
+ case 4:
+ return interp_vert_ss_neon<4, width, height>(src, srcStride, dst, dstStride,
+ coeffIdx);
+ default:
+ return interp4_vss_sve<width, height>(src, srcStride, dst, dstStride,
+ coeffIdx);
+ }
+}
+
void setupFilterPrimitives_sve(EncoderPrimitives &p)
{
p.pu[LUMA_4x4].luma_hpp = interp8_horiz_pp_sve<4, 4>;
@@ -390,6 +648,56 @@ void setupFilterPrimitives_sve(EncoderPrimitives &p)
p.pu[LUMA_4x4].luma_hps = interp8_horiz_ps_sve<4, 4>;
p.pu[LUMA_4x8].luma_hps = interp8_horiz_ps_sve<4, 8>;
p.pu[LUMA_4x16].luma_hps = interp8_horiz_ps_sve<4, 16>;
+
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_vss = interp4_vert_ss_sve<8, 16>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_vss = interp4_vert_ss_sve<8, 32>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].filter_vss = interp4_vert_ss_sve<12, 16>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vss = interp4_vert_ss_sve<16, 4>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vss = interp4_vert_ss_sve<16, 8>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vss = interp4_vert_ss_sve<16, 12>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vss = interp4_vert_ss_sve<16, 16>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vss = interp4_vert_ss_sve<16, 32>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_vss = interp4_vert_ss_sve<24, 32>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vss = interp4_vert_ss_sve<32, 8>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vss = interp4_vert_ss_sve<32, 16>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vss = interp4_vert_ss_sve<32, 24>;
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vss = interp4_vert_ss_sve<32, 32>;
+
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_vss = interp4_vert_ss_sve<8, 16>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_vss = interp4_vert_ss_sve<8, 32>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_vss = interp4_vert_ss_sve<8, 64>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].filter_vss = interp4_vert_ss_sve<12, 32>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vss = interp4_vert_ss_sve<16, 8>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vss = interp4_vert_ss_sve<16, 16>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vss = interp4_vert_ss_sve<16, 24>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vss = interp4_vert_ss_sve<16, 32>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_vss = interp4_vert_ss_sve<16, 64>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].filter_vss = interp4_vert_ss_sve<24, 64>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vss = interp4_vert_ss_sve<32, 16>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vss = interp4_vert_ss_sve<32, 32>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vss = interp4_vert_ss_sve<32, 48>;
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vss = interp4_vert_ss_sve<32, 64>;
+
+ p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_vss = interp4_vert_ss_sve<8, 16>;
+ p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_vss = interp4_vert_ss_sve<8, 32>;
+ p.chroma[X265_CSP_I444].pu[LUMA_12x16].filter_vss = interp4_vert_ss_sve<12, 16>;
+ p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vss = interp4_vert_ss_sve<16, 4>;
+ p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vss = interp4_vert_ss_sve<16, 8>;
+ p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vss = interp4_vert_ss_sve<16, 12>;
+ p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vss = interp4_vert_ss_sve<16, 16>;
+ p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vss = interp4_vert_ss_sve<16, 32>;
+ p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vss = interp4_vert_ss_sve<16, 64>;
+ p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_vss = interp4_vert_ss_sve<24, 32>;
+ p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vss = interp4_vert_ss_sve<32, 8>;
+ p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vss = interp4_vert_ss_sve<32, 16>;
+ p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vss = interp4_vert_ss_sve<32, 24>;
+ p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vss = interp4_vert_ss_sve<32, 32>;
+ p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vss = interp4_vert_ss_sve<32, 64>;
+ p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vss = interp4_vert_ss_sve<48, 64>;
+ p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vss = interp4_vert_ss_sve<64, 16>;
+ p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vss = interp4_vert_ss_sve<64, 32>;
+ p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vss = interp4_vert_ss_sve<64, 48>;
+ p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vss = interp4_vert_ss_sve<64, 64>;
}
} // namespace X265_NS
#else // !HIGH_BIT_DEPTH
--
2.39.5 (Apple Git-154)