<div data-ntes="ntes_mail_body_root" style="line-height:1.7;color:#000000;font-size:14px;font-family:Arial"><div id="spnEditorContent"><p style="margin: 0;"><span style="font-family: Courier;">Thanks for the patches.</span></p><p style="margin: 0;"><br></p><p style="margin: 0;"><span style="font-family: Courier;">My understanding of the algorithm kernel is below. There are two </span><span style="font-family: Courier;">vcreate_u8 calls; they could be merged into the constant array c[2] to avoid compiler performance issues. The other parts look good to me.</span></p><p style="margin: 0;"><br></p><p style="margin: 0;"><span style="font-family: Courier;">+ uint16x8_t t = vmull_u8(topRight, vget_high_u8(c0)); ; topRight *[4 3 2 1 4 3 2 1]</span></p><p style="margin: 0;"><span style="font-family: Courier;">+ t = vmlal_u8(t, above, vget_low_u8(c1)); ; +above *[1 1 1 1 3 3 3 3]</span></p><p style="margin: 0;"><span style="font-family: Courier;">+ t = vmlal_u8(t, bottomLeft, vget_high_u8(c1)); ; +bottomLeft *[3 3 3 3 1 1 1 1]</span></p><p style="margin: 0;"><span style="font-family: Courier;">+</span></p><p style="margin: 0;"><span style="font-family: Courier;">+ uint8x8_t index02 = vcreate_u8(0x0A0A0A0A08080808);</span></p><p style="margin: 0;"><span style="font-family: Courier;">+ uint8x8_t left02 = vqtbl1_u8(src, index02); ; left02 = [L2 L2 L2 L2 L0 L0 L0 L0]</span></p><p style="margin: 0;"><span style="font-family: Courier;">+ uint16x8_t t02 = vmlal_u8(t, left02, vget_low_u8(c0)); ; t02 = [ L2*[0 1 2 3] L0*[0 1 2 3] ]</span></p><p style="margin: 0;"><span style="font-family: Courier;">+ uint8x8_t d02 = vrshrn_n_u16(t02, log2Size + 1);</span></p><div><br></div><p style="margin: 0;"><br></p></div><pre>At 2025-01-23 22:15:30, "Micro Daryl Robles" <microdaryl.robles@arm.com> wrote:
>Relative performance compared to scalar C:
>
> Neoverse N1: 2.65x
> Neoverse N2: 2.27x
> Neoverse V1: 2.67x
> Neoverse V2: 2.82x
>---
> source/common/aarch64/intrapred-prim.cpp | 47 ++++++++++++++++++++++++
> 1 file changed, 47 insertions(+)
>
>diff --git a/source/common/aarch64/intrapred-prim.cpp b/source/common/aarch64/intrapred-prim.cpp
>index 68a9f26ac..c5d47fe0d 100644
>--- a/source/common/aarch64/intrapred-prim.cpp
>+++ b/source/common/aarch64/intrapred-prim.cpp
>@@ -4,6 +4,7 @@
>
> #if HAVE_NEON
> #include "arm64-utils.h"
>+#include "mem-neon.h"
> #include <arm_neon.h>
>
> using namespace X265_NS;
>@@ -399,6 +400,51 @@ void planar_pred_neon(pixel * dst, intptr_t dstStride, const pixel * srcPix, int
> }
> }
>
>+#if !HIGH_BIT_DEPTH
>+void intra_pred_planar4_neon(pixel *dst, intptr_t dstStride, const pixel *srcPix,
>+ int /*dirMode*/, int /*bFilter*/)
>+{
>+ const int log2Size = 2;
>+ const int blkSize = 1 << log2Size;
>+
>+ uint8x16_t src = vld1q_u8(srcPix + 1);
>+
>+ uint8x8_t above =
>+ vreinterpret_u8_u32(vdup_laneq_u32(vreinterpretq_u32_u8(src), 0));
>+
>+ uint8x8_t topRight = vdup_laneq_u8(src, blkSize);
>+ uint8x8_t bottomLeft = vdup_laneq_u8(src, 3 * blkSize);
>+
>+ const uint8_t c[2][16] =
>+ {
>+ {3, 2, 1, 0, 3, 2, 1, 0, 1, 2, 3, 4, 1, 2, 3, 4},
>+ {3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3}
>+ };
>+
>+ const uint8x16_t c0 = vld1q_u8(c[0]);
>+ const uint8x16_t c1 = vld1q_u8(c[1]);
>+
>+ uint16x8_t t = vmull_u8(topRight, vget_high_u8(c0));
>+ t = vmlal_u8(t, above, vget_low_u8(c1));
>+ t = vmlal_u8(t, bottomLeft, vget_high_u8(c1));
>+
>+ uint8x8_t index02 = vcreate_u8(0x0A0A0A0A08080808);
>+ uint8x8_t left02 = vqtbl1_u8(src, index02);
>+ uint16x8_t t02 = vmlal_u8(t, left02, vget_low_u8(c0));
>+ uint8x8_t d02 = vrshrn_n_u16(t02, log2Size + 1);
>+
>+ uint8x8_t index13 = vcreate_u8(0x0B0B0B0B09090909);
>+ uint8x8_t left13 = vqtbl1_u8(src, index13);
>+ uint16x8_t t13 = vmlal_u8(t, left13, vget_low_u8(c0));
>+ uint16x8_t sub_bottomLeft_above = vsubl_u8(bottomLeft, above);
>+ t13 = vaddq_u16(t13, sub_bottomLeft_above);
>+ uint8x8_t d13 = vrshrn_n_u16(t13, log2Size + 1);
>+
>+ store_u8x4_strided_xN<2>(dst + 0 * dstStride, 2 * dstStride, &d02);
>+ store_u8x4_strided_xN<2>(dst + 1 * dstStride, 2 * dstStride, &d13);
>+}
>+#endif
>+
> static void dcPredFilter(const pixel* above, const pixel* left, pixel* dst, intptr_t dststride, int size)
> {
> // boundary pixels processing
>@@ -576,6 +622,7 @@ void setupIntraPrimitives_neon(EncoderPrimitives &p)
> p.cu[BLOCK_32x32].intra_pred_allangs = all_angs_pred_neon<5>;
>
> #if !HIGH_BIT_DEPTH
>+ p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = intra_pred_planar4_neon;
> p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar8_neon);
> p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_neon);
> #endif
>--
>2.34.1
>
</pre></div>