[x265] [PATCH 12/18] AArch64: Refactor types and conversions in intrapred-prim.cpp

Tue Aug 13 15:20:59 UTC 2024

Use correct types for all variables/operations and make all vector
conversions explicit in common/aarch64/intrapred-prim.cpp.

Also switch to widening multiply-accumulate (vmlal) and narrowing shift
(vshrn) intrinsics, which removes the separate widening (vmovl) and
narrowing (vmovn) instructions.
---
 source/common/aarch64/intrapred-prim.cpp | 34 +++++++++++++-----------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/source/common/aarch64/intrapred-prim.cpp b/source/common/aarch64/intrapred-prim.cpp
index beadd865f..9bf50c4aa 100644
--- a/source/common/aarch64/intrapred-prim.cpp
+++ b/source/common/aarch64/intrapred-prim.cpp
@@ -110,16 +110,17 @@ void intra_pred_ang_neon(pixel *dst, intptr_t dstStride, const pixel *srcPix0, i
                     const uint8_t *ref_u8 = (const uint8_t *)ref + offset;
                     uint8_t *dst_u8 = (uint8_t *)dst;
 
-                    const int16x8_t f0 = vdupq_n_s16(32 - fraction);
-                    const int16x8_t f1 = vdupq_n_s16(fraction);
+                    // f0 and f1 are unsigned (fraction is in range [0, 31]).
+                    const uint8x8_t f0 = vdup_n_u8(32 - fraction);
+                    const uint8x8_t f1 = vdup_n_u8(fraction);
                     for (int x = 0; x < width; x += 8)
                     {
-                        uint16x8_t in0 = vmovl_u8(vld1_u8(ref_u8 + x));
-                        uint16x8_t in1 = vmovl_u8(vld1_u8(ref_u8 + x + 1));
-                        int16x8_t lo = vmlaq_s16(vdupq_n_s16(16), in0, f0);
-                        lo = vmlaq_s16(lo, in1, f1);
-                        lo = vshrq_n_s16(lo, 5);
-                        vst1_u8(dst_u8 + y * dstStride + x, vmovn_u16(lo));
+                        uint8x8_t in0 = vld1_u8(ref_u8 + x);
+                        uint8x8_t in1 = vld1_u8(ref_u8 + x + 1);
+                        uint16x8_t lo = vmlal_u8(vdupq_n_u16(16), in0, f0);
+                        lo = vmlal_u8(lo, in1, f1);
+                        uint8x8_t res = vshrn_n_u16(lo, 5);
+                        vst1_u8(dst_u8 + y * dstStride + x, res);
                     }
                 }
                 else if (width >= 4 && sizeof(pixel) == 2)
@@ -129,16 +130,17 @@ void intra_pred_ang_neon(pixel *dst, intptr_t dstStride, const pixel *srcPix0, i
                     const uint16_t *ref_u16 = (const uint16_t *)ref + offset;
                     uint16_t *dst_u16 = (uint16_t *)dst;
 
-                    const int32x4_t f0 = vdupq_n_s32(32 - fraction);
-                    const int32x4_t f1 = vdupq_n_s32(fraction);
+                    // f0 and f1 are unsigned (fraction is in range [0, 31]).
+                    const uint16x4_t f0 = vdup_n_u16(32 - fraction);
+                    const uint16x4_t f1 = vdup_n_u16(fraction);
                     for (int x = 0; x < width; x += 4)
                     {
-                        uint32x4_t in0 = vmovl_u16(vld1_u16(ref_u16 + x));
-                        uint32x4_t in1 = vmovl_u16(vld1_u16(ref_u16 + x + 1));
-                        int32x4_t lo = vmlaq_s32(vdupq_n_s32(16), in0, f0);
-                        lo = vmlaq_s32(lo, in1, f1);
-                        lo = vshrq_n_s32(lo, 5);
-                        vst1_u16(dst_u16 + y * dstStride + x, vmovn_u32(lo));
+                        uint16x4_t in0 = vld1_u16(ref_u16 + x);
+                        uint16x4_t in1 = vld1_u16(ref_u16 + x + 1);
+                        uint32x4_t lo = vmlal_u16(vdupq_n_u32(16), in0, f0);
+                        lo = vmlal_u16(lo, in1, f1);
+                        uint16x4_t res = vshrn_n_u32(lo, 5);
+                        vst1_u16(dst_u16 + y * dstStride + x, res);
                     }
                 }
                 else
-- 
2.42.1

-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0012-AArch64-Refactor-types-and-conversions-in-intrapred-.patch
Type: text/x-patch
Size: 4151 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20240813/aec3c7be/attachment.bin>