<div data-ntes="ntes_mail_body_root" style="line-height:1.7;color:#000000;font-size:14px;font-family:Arial"><div id="spnEditorContent"><p style="margin: 0;">Thanks for the patch; my comments are inline.</p></div><pre>At 2024-12-04 23:37:53, "Micro Daryl Robles" <microdaryl.robles@arm.com> wrote:
>Also optimize transpose_4x4_s16 implementation.
>
>diff --git a/source/common/aarch64/dct-prim.cpp b/source/common/aarch64/dct-prim.cpp
>index 8b523ceb0..fcc86f6d6 100644
>--- a/source/common/aarch64/dct-prim.cpp
>+++ b/source/common/aarch64/dct-prim.cpp
>@@ -21,21 +21,23 @@ namespace
> {
> using namespace X265_NS;

>-static void transpose_4x4x16(int16x4_t &x0, int16x4_t &x1, int16x4_t &x2, int16x4_t &x3)
>+static inline void transpose_4x4_s16(int16x4_t &s0, int16x4_t &s1, int16x4_t &s2, int16x4_t &s3)
> {
>-    int32x2_t s0, s1, s2, s3;
>+    int16x8_t s0q = vcombine_s16(s0, vdup_n_s16(0));
>+    int16x8_t s1q = vcombine_s16(s1, vdup_n_s16(0));
>+    int16x8_t s2q = vcombine_s16(s2, vdup_n_s16(0));
<div>>+    int16x8_t s3q = vcombine_s16(s3, vdup_n_s16(0));

</div><div><u>Why clear the high 64 bits? They will be overwritten by the ZIP1 below.</u></div><div><br></div>
>+    int16x8x2_t s0123 = vzipq_s16(s02, s13);
<div><br></div><div><br></div><div>>+void dst4_neon(const int16_t *src, int16_t *dst, intptr_t srcStride)</div>>+{
>+    const int shift_pass1 = 1 + X265_DEPTH - 8;
>+    const int shift_pass2 = 8;
>+
>+    ALIGN_VAR_32(int16_t, coef[4 * 4]);
>+    ALIGN_VAR_32(int16_t, block[4 * 4]);
>+
>+    for (int i = 0; i < 4; i++)
>+    {
>+        memcpy(&block[i * 4], &src[i * srcStride], 4 * sizeof(int16_t));
>+    }
<div><u>We do not need this loop to copy data from the input buffer.</u></div><div><br></div></pre></div>