[x265] [PATCH v2 5/7] AArch64: Add Neon implementation of 8x8 IDCT

Fri Dec 6 04:54:33 UTC 2024



At 2024-12-04 23:39:00, "Micro Daryl Robles" <microdaryl.robles at arm.com> wrote:
>Also add a new helper function transpose_4x8_s16.

>+static inline void transpose_4x8_s16(int16x4_t s0, int16x4_t s1, int16x4_t s2, int16x4_t s3,
>+                                     int16x4_t s4, int16x4_t s5, int16x4_t s6, int16x4_t s7,
>+                                     int16x8_t &d0, int16x8_t &d1, int16x8_t &d2, int16x8_t &d3)
>+{
>+    int16x8_t s0q = vcombine_s16(s0, vdup_n_s16(0));
>+    int16x8_t s1q = vcombine_s16(s1, vdup_n_s16(0));
>+    int16x8_t s2q = vcombine_s16(s2, vdup_n_s16(0));
>+    int16x8_t s3q = vcombine_s16(s3, vdup_n_s16(0));
>+    int16x8_t s4q = vcombine_s16(s4, vdup_n_s16(0));
>+    int16x8_t s5q = vcombine_s16(s5, vdup_n_s16(0));
>+    int16x8_t s6q = vcombine_s16(s6, vdup_n_s16(0));

>+    int16x8_t s7q = vcombine_s16(s7, vdup_n_s16(0));
Same as the previous comment: it is unnecessary to clear the high 64 bits.


>+template<int shift>
>+static inline void partialButterflyInverse8_neon(const int16_t *src, int16_t *dst, intptr_t dstStride)
>+    if (vget_lane_u64(vreinterpret_u64_s16(vget_low_s16(s3)), 0) != 0)

Detecting zeros is a good idea; however, 4 instructions are not enough to hide the pipeline flush cost. I suggest combining each pair of if-sections below (O_lo & O_hi) into one.


>+    {
>+        O_lo[0] = vmlal_lane_s16(O_lo[0], vget_low_s16(s3), c_odd, 1); //  75
>+        O_lo[1] = vmlsl_lane_s16(O_lo[1], vget_low_s16(s3), c_odd, 3); // -18
>+        O_lo[2] = vmlsl_lane_s16(O_lo[2], vget_low_s16(s3), c_odd, 0); // -89
>+        O_lo[3] = vmlsl_lane_s16(O_lo[3], vget_low_s16(s3), c_odd, 2); // -50
>+    }
>+    if (vget_lane_u64(vreinterpret_u64_s16(vget_high_s16(s3)), 0) != 0)
>+    {
>+        O_hi[0] = vmlal_lane_s16(O_hi[0], vget_high_s16(s3), c_odd, 1); //  75
>+        O_hi[1] = vmlsl_lane_s16(O_hi[1], vget_high_s16(s3), c_odd, 3); // -18
>+        O_hi[2] = vmlsl_lane_s16(O_hi[2], vget_high_s16(s3), c_odd, 0); // -89
>+        O_hi[3] = vmlsl_lane_s16(O_hi[3], vget_high_s16(s3), c_odd, 2); // -50
>+    }


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20241206/b18f286f/attachment.htm>