[x265] [PATCH 14/18] AArch64: Use proper load/store intrinsics in loopfilter-prim.cpp
Hari Limaye
hari.limaye at arm.com
Tue Aug 13 15:21:15 UTC 2024
Use proper load/store intrinsics instead of casts in
source/common/aarch64/loopfilter-prim.cpp.
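Dereferencing buffers through casts to NEON vector types, as in
*(int8x8_t *)&dst[x], is not guaranteed to be well-defined: it sidesteps
the aliasing and alignment guarantees that the ACLE load/store intrinsics
provide. vld1/vst1 express the same 64-bit accesses explicitly and are
valid for any element alignment. A minimal sketch of the before/after
pattern (the helper and buffer names are illustrative, not code from this
patch):

    #include <arm_neon.h>

    /* Hypothetical helper, not from this patch: adds 1 to eight bytes. */
    static void add_one(int8_t *buf)
    {
        /* Before: int8x8_t v = *(int8x8_t *)&buf[0];  (type-punned load) */
        int8x8_t v = vld1_s8(buf);      /* explicit 8x8-bit load */
        v = vadd_s8(v, vdup_n_s8(1));
        /* Before: *(int8x8_t *)&buf[0] = v;  (type-punned store) */
        vst1_s8(buf, v);                /* explicit 8x8-bit store */
    }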
---
source/common/aarch64/loopfilter-prim.cpp | 79 +++++++++++++----------
1 file changed, 44 insertions(+), 35 deletions(-)
diff --git a/source/common/aarch64/loopfilter-prim.cpp b/source/common/aarch64/loopfilter-prim.cpp
index 581968b06..5e912d76a 100644
--- a/source/common/aarch64/loopfilter-prim.cpp
+++ b/source/common/aarch64/loopfilter-prim.cpp
@@ -23,7 +23,8 @@ static void calSign_neon(int8_t *dst, const pixel *src1, const pixel *src2, cons
int x = 0;
for (; (x + 8) <= endX; x += 8)
{
- *(int8x8_t *)&dst[x] = sign_diff_neon(*(uint8x8_t *)&src1[x], *(uint8x8_t *)&src2[x]);
+ int8x8_t sign = sign_diff_neon(vld1_u8(src1 + x), vld1_u8(src2 + x));
+ vst1_s8(dst + x, sign);
    }

    for (; x < endX; x++)
@@ -51,11 +52,11 @@ static void processSaoCUE0_neon(pixel *rec, int8_t *offsetEo, int width, int8_t
int8x8x2_t shifter;
shifter.val[1][0] = signLeft0;
static const int8x8_t index = {8, 0, 1, 2, 3, 4, 5, 6};
- int8x8_t tbl = *(int8x8_t *)offsetEo;
+ int8x8_t tbl = vld1_s8(offsetEo);
for (; (x + 8) <= width; x += 8)
{
- uint8x8_t in = *(uint8x8_t *)&rec[x];
- vsignRight = sign_diff_neon(in, *(uint8x8_t *)&rec[x + 1]);
+ uint8x8_t in = vld1_u8(rec + x);
+ vsignRight = sign_diff_neon(in, vld1_u8(rec + x + 1));
shifter.val[0] = vneg_s8(vsignRight);
int8x8_t tmp = shifter.val[0];
int8x8_t edge = vtbl2_s8(shifter, index);
@@ -63,7 +64,7 @@ static void processSaoCUE0_neon(pixel *rec, int8_t *offsetEo, int width, int8_t
shifter.val[1][0] = tmp[7];
int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
t1 = vaddw_u8(t1, in);
- *(uint8x8_t *)&rec[x] = vqmovun_s16(t1);
+ vst1_u8(rec + x, vqmovun_s16(t1));
}
signLeft0 = shifter.val[1][0];
}
@@ -86,17 +87,20 @@ static void processSaoCUE1_neon(pixel *rec, int8_t *upBuff1, int8_t *offsetEo, i
if (width >= 8)
{
- int8x8_t tbl = *(int8x8_t *)offsetEo;
+ int8x8_t tbl = vld1_s8(offsetEo);
+ const int8x8_t c = vdup_n_s8(2);
+
for (; (x + 8) <= width; x += 8)
{
- uint8x8_t in0 = *(uint8x8_t *)&rec[x];
- uint8x8_t in1 = *(uint8x8_t *)&rec[x + stride];
+ uint8x8_t in0 = vld1_u8(rec + x);
+ uint8x8_t in1 = vld1_u8(rec + x + stride);
int8x8_t vsignDown = sign_diff_neon(in0, in1);
- int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, *(int8x8_t *)&upBuff1[x]), vdup_n_s8(2));
- *(int8x8_t *)&upBuff1[x] = vneg_s8(vsignDown);
+ int8x8_t vsignUp = vld1_s8(upBuff1 + x);
+ int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, vsignUp), c);
+ vst1_s8(upBuff1 + x, vneg_s8(vsignDown));
int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
t1 = vaddw_u8(t1, in0);
- *(uint8x8_t *)&rec[x] = vqmovun_s16(t1);
+ vst1_u8(rec + x, vqmovun_s16(t1));
}
}
for (; x < width; x++)
@@ -119,18 +123,20 @@ static void processSaoCUE1_2Rows_neon(pixel *rec, int8_t *upBuff1, int8_t *offse
int x = 0;
if (width >= 8)
{
- int8x8_t tbl = *(int8x8_t *)offsetEo;
+ int8x8_t tbl = vld1_s8(offsetEo);
+ const int8x8_t c = vdup_n_s8(2);
+
for (; (x + 8) <= width; x += 8)
{
- uint8x8_t in0 = *(uint8x8_t *)&rec[x];
- uint8x8_t in1 = *(uint8x8_t *)&rec[x + stride];
+ uint8x8_t in0 = vld1_u8(rec + x);
+ uint8x8_t in1 = vld1_u8(rec + x + stride);
int8x8_t vsignDown = sign_diff_neon(in0, in1);
- int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, *(int8x8_t *)&upBuff1[x]), vdup_n_s8(2));
- *(int8x8_t *)&upBuff1[x] = vneg_s8(vsignDown);
+ int8x8_t vsignUp = vld1_s8(upBuff1 + x);
+ int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, vsignUp), c);
+ vst1_s8(upBuff1 + x, vneg_s8(vsignDown));
int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
t1 = vaddw_u8(t1, in0);
- *(uint8x8_t *)&rec[x] = vqmovun_s16(t1);
-
+ vst1_u8(rec + x, vqmovun_s16(t1));
}
}
for (; x < width; x++)
@@ -160,18 +166,21 @@ static void processSaoCUE2_neon(pixel *rec, int8_t *bufft, int8_t *buff1, int8_t
}
else
{
- int8x8_t tbl = *(int8x8_t *)offsetEo;
+ int8x8_t tbl = vld1_s8(offsetEo);
+ const int8x8_t c = vdup_n_s8(2);
+
x = 0;
for (; (x + 8) <= width; x += 8)
{
- uint8x8_t in0 = *(uint8x8_t *)&rec[x];
- uint8x8_t in1 = *(uint8x8_t *)&rec[x + stride + 1];
+ uint8x8_t in0 = vld1_u8(rec + x);
+ uint8x8_t in1 = vld1_u8(rec + x + stride + 1);
int8x8_t vsignDown = sign_diff_neon(in0, in1);
- int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, *(int8x8_t *)&buff1[x]), vdup_n_s8(2));
- *(int8x8_t *)&bufft[x + 1] = vneg_s8(vsignDown);
+ int8x8_t vsignUp = vld1_s8(buff1 + x);
+ int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, vsignUp), c);
+ vst1_s8(bufft + x + 1, vneg_s8(vsignDown));
int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
t1 = vaddw_u8(t1, in0);
- *(uint8x8_t *)&rec[x] = vqmovun_s16(t1);
+ vst1_u8(rec + x, vqmovun_s16(t1));
}
for (; x < width; x++)
{
@@ -189,20 +198,21 @@ static void processSaoCUE3_neon(pixel *rec, int8_t *upBuff1, int8_t *offsetEo, i
{
int8_t signDown;
int8_t edgeType;
- int8x8_t tbl = *(int8x8_t *)offsetEo;
+ int8x8_t tbl = vld1_s8(offsetEo);
+ const int8x8_t c = vdup_n_s8(2);
int x = startX + 1;
for (; (x + 8) <= endX; x += 8)
{
- uint8x8_t in0 = *(uint8x8_t *)&rec[x];
- uint8x8_t in1 = *(uint8x8_t *)&rec[x + stride];
+ uint8x8_t in0 = vld1_u8(rec + x);
+ uint8x8_t in1 = vld1_u8(rec + x + stride);
int8x8_t vsignDown = sign_diff_neon(in0, in1);
- int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, *(int8x8_t *)&upBuff1[x]), vdup_n_s8(2));
- *(int8x8_t *)&upBuff1[x - 1] = vneg_s8(vsignDown);
+ int8x8_t vsignUp = vld1_s8(upBuff1 + x);
+ int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, vsignUp), c);
+ vst1_s8(upBuff1 + x - 1, vneg_s8(vsignDown));
int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
t1 = vaddw_u8(t1, in0);
- *(uint8x8_t *)&rec[x] = vqmovun_s16(t1);
-
+ vst1_u8(rec + x, vqmovun_s16(t1));
}
for (; x < endX; x++)
{
@@ -218,19 +228,18 @@ static void processSaoCUB0_neon(pixel *rec, const int8_t *offset, int ctuWidth,
#define SAO_BO_BITS 5
const int boShift = X265_DEPTH - SAO_BO_BITS;
int x, y;
- int8x8x4_t table;
- table = *(int8x8x4_t *)offset;
+    int8x8x4_t table = vld1_s8_x4(offset);

     for (y = 0; y < ctuHeight; y++)
{
for (x = 0; (x + 8) <= ctuWidth; x += 8)
{
- int8x8_t in = *(int8x8_t *)&rec[x];
+            uint8x8_t in = vld1_u8(rec + x);
int8x8_t offsets = vtbl4_s8(table, vshr_n_u8(in, boShift));
int16x8_t tmp = vmovl_s8(offsets);
tmp = vaddw_u8(tmp, in);
- *(uint8x8_t *)&rec[x] = vqmovun_s16(tmp);
+ vst1_u8(rec + x, vqmovun_s16(tmp));
}
for (; x < ctuWidth; x++)
{
--
2.42.1