[x265] [PATCH 14/18] AArch64: Use proper load/store intrinsics in loopfilter-prim.cpp
Hari Limaye
hari.limaye at arm.com
Tue Aug 13 15:21:15 UTC 2024
Use proper load/store intrinsics instead of casts in
source/common/aarch64/loopfilter-prim.cpp.
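Dereferencing buffers through casts to NEON vector types, as in
*(int8x8_t *)&dst[x], is not guaranteed to be well-defined: it sidesteps
the aliasing and alignment guarantees that the ACLE load/store intrinsics
provide. vld1/vst1 express the same 64-bit accesses explicitly and are
valid for any element alignment. A minimal sketch of the before/after
pattern (the helper and buffer names are illustrative, not code from this
patch):

    #include <arm_neon.h>

    /* Hypothetical helper, not from this patch: adds 1 to eight bytes. */
    static void add_one(int8_t *buf)
    {
        /* Before: int8x8_t v = *(int8x8_t *)&buf[0];  (type-punned load) */
        int8x8_t v = vld1_s8(buf);      /* explicit 8x8-bit load */
        v = vadd_s8(v, vdup_n_s8(1));
        /* Before: *(int8x8_t *)&buf[0] = v;  (type-punned store) */
        vst1_s8(buf, v);                /* explicit 8x8-bit store */
    }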
---
source/common/aarch64/loopfilter-prim.cpp | 79 +++++++++++++----------
1 file changed, 44 insertions(+), 35 deletions(-)
diff --git a/source/common/aarch64/loopfilter-prim.cpp b/source/common/aarch64/loopfilter-prim.cpp
index 581968b06..5e912d76a 100644
--- a/source/common/aarch64/loopfilter-prim.cpp
+++ b/source/common/aarch64/loopfilter-prim.cpp
@@ -23,7 +23,8 @@ static void calSign_neon(int8_t *dst, const pixel *src1, const pixel *src2, cons
int x = 0;
for (; (x + 8) <= endX; x += 8)
{
- *(int8x8_t *)&dst[x] = sign_diff_neon(*(uint8x8_t *)&src1[x], *(uint8x8_t *)&src2[x]);
+ int8x8_t sign = sign_diff_neon(vld1_u8(src1 + x), vld1_u8(src2 + x));
+ vst1_s8(dst + x, sign);
    }

    for (; x < endX; x++)
@@ -51,11 +52,11 @@ static void processSaoCUE0_neon(pixel *rec, int8_t *offsetEo, int width, int8_t
int8x8x2_t shifter;
shifter.val[1][0] = signLeft0;
static const int8x8_t index = {8, 0, 1, 2, 3, 4, 5, 6};
- int8x8_t tbl = *(int8x8_t *)offsetEo;
+ int8x8_t tbl = vld1_s8(offsetEo);
for (; (x + 8) <= width; x += 8)
{
- uint8x8_t in = *(uint8x8_t *)&rec[x];
- vsignRight = sign_diff_neon(in, *(uint8x8_t *)&rec[x + 1]);
+ uint8x8_t in = vld1_u8(rec + x);
+ vsignRight = sign_diff_neon(in, vld1_u8(rec + x + 1));
shifter.val[0] = vneg_s8(vsignRight);
int8x8_t tmp = shifter.val[0];
int8x8_t edge = vtbl2_s8(shifter, index);
@@ -63,7 +64,7 @@ static void processSaoCUE0_neon(pixel *rec, int8_t *offsetEo, int width, int8_t
shifter.val[1][0] = tmp[7];
int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
t1 = vaddw_u8(t1, in);
- *(uint8x8_t *)&rec[x] = vqmovun_s16(t1);
+ vst1_u8(rec + x, vqmovun_s16(t1));
}
signLeft0 = shifter.val[1][0];
}
@@ -86,17 +87,20 @@ static void processSaoCUE1_neon(pixel *rec, int8_t *upBuff1, int8_t *offsetEo, i
if (width >= 8)
{
- int8x8_t tbl = *(int8x8_t *)offsetEo;
+ int8x8_t tbl = vld1_s8(offsetEo);
+ const int8x8_t c = vdup_n_s8(2);
+
for (; (x + 8) <= width; x += 8)
{
- uint8x8_t in0 = *(uint8x8_t *)&rec[x];
- uint8x8_t in1 = *(uint8x8_t *)&rec[x + stride];
+ uint8x8_t in0 = vld1_u8(rec + x);
+ uint8x8_t in1 = vld1_u8(rec + x + stride);
int8x8_t vsignDown = sign_diff_neon(in0, in1);
- int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, *(int8x8_t *)&upBuff1[x]), vdup_n_s8(2));
- *(int8x8_t *)&upBuff1[x] = vneg_s8(vsignDown);
+ int8x8_t vsignUp = vld1_s8(upBuff1 + x);
+ int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, vsignUp), c);
+ vst1_s8(upBuff1 + x, vneg_s8(vsignDown));
int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
t1 = vaddw_u8(t1, in0);
- *(uint8x8_t *)&rec[x] = vqmovun_s16(t1);
+ vst1_u8(rec + x, vqmovun_s16(t1));
}
}
for (; x < width; x++)
@@ -119,18 +123,20 @@ static void processSaoCUE1_2Rows_neon(pixel *rec, int8_t *upBuff1, int8_t *offse
int x = 0;
if (width >= 8)
{
- int8x8_t tbl = *(int8x8_t *)offsetEo;
+ int8x8_t tbl = vld1_s8(offsetEo);
+ const int8x8_t c = vdup_n_s8(2);
+
for (; (x + 8) <= width; x += 8)
{
- uint8x8_t in0 = *(uint8x8_t *)&rec[x];
- uint8x8_t in1 = *(uint8x8_t *)&rec[x + stride];
+ uint8x8_t in0 = vld1_u8(rec + x);
+ uint8x8_t in1 = vld1_u8(rec + x + stride);
int8x8_t vsignDown = sign_diff_neon(in0, in1);
- int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, *(int8x8_t *)&upBuff1[x]), vdup_n_s8(2));
- *(int8x8_t *)&upBuff1[x] = vneg_s8(vsignDown);
+ int8x8_t vsignUp = vld1_s8(upBuff1 + x);
+ int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, vsignUp), c);
+ vst1_s8(upBuff1 + x, vneg_s8(vsignDown));
int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
t1 = vaddw_u8(t1, in0);
- *(uint8x8_t *)&rec[x] = vqmovun_s16(t1);
-
+ vst1_u8(rec + x, vqmovun_s16(t1));
}
}
for (; x < width; x++)
@@ -160,18 +166,21 @@ static void processSaoCUE2_neon(pixel *rec, int8_t *bufft, int8_t *buff1, int8_t
}
else
{
- int8x8_t tbl = *(int8x8_t *)offsetEo;
+ int8x8_t tbl = vld1_s8(offsetEo);
+ const int8x8_t c = vdup_n_s8(2);
+
x = 0;
for (; (x + 8) <= width; x += 8)
{
- uint8x8_t in0 = *(uint8x8_t *)&rec[x];
- uint8x8_t in1 = *(uint8x8_t *)&rec[x + stride + 1];
+ uint8x8_t in0 = vld1_u8(rec + x);
+ uint8x8_t in1 = vld1_u8(rec + x + stride + 1);
int8x8_t vsignDown = sign_diff_neon(in0, in1);
- int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, *(int8x8_t *)&buff1[x]), vdup_n_s8(2));
- *(int8x8_t *)&bufft[x + 1] = vneg_s8(vsignDown);
+ int8x8_t vsignUp = vld1_s8(buff1 + x);
+ int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, vsignUp), c);
+ vst1_s8(bufft + x + 1, vneg_s8(vsignDown));
int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
t1 = vaddw_u8(t1, in0);
- *(uint8x8_t *)&rec[x] = vqmovun_s16(t1);
+ vst1_u8(rec + x, vqmovun_s16(t1));
}
for (; x < width; x++)
{
@@ -189,20 +198,21 @@ static void processSaoCUE3_neon(pixel *rec, int8_t *upBuff1, int8_t *offsetEo, i
{
int8_t signDown;
int8_t edgeType;
- int8x8_t tbl = *(int8x8_t *)offsetEo;
+ int8x8_t tbl = vld1_s8(offsetEo);
+ const int8x8_t c = vdup_n_s8(2);
int x = startX + 1;
for (; (x + 8) <= endX; x += 8)
{
- uint8x8_t in0 = *(uint8x8_t *)&rec[x];
- uint8x8_t in1 = *(uint8x8_t *)&rec[x + stride];
+ uint8x8_t in0 = vld1_u8(rec + x);
+ uint8x8_t in1 = vld1_u8(rec + x + stride);
int8x8_t vsignDown = sign_diff_neon(in0, in1);
- int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, *(int8x8_t *)&upBuff1[x]), vdup_n_s8(2));
- *(int8x8_t *)&upBuff1[x - 1] = vneg_s8(vsignDown);
+ int8x8_t vsignUp = vld1_s8(upBuff1 + x);
+ int8x8_t vedgeType = vadd_s8(vadd_s8(vsignDown, vsignUp), c);
+ vst1_s8(upBuff1 + x - 1, vneg_s8(vsignDown));
int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
t1 = vaddw_u8(t1, in0);
- *(uint8x8_t *)&rec[x] = vqmovun_s16(t1);
-
+ vst1_u8(rec + x, vqmovun_s16(t1));
}
for (; x < endX; x++)
{
@@ -218,19 +228,18 @@ static void processSaoCUB0_neon(pixel *rec, const int8_t *offset, int ctuWidth,
#define SAO_BO_BITS 5
const int boShift = X265_DEPTH - SAO_BO_BITS;
int x, y;
- int8x8x4_t table;
- table = *(int8x8x4_t *)offset;
+    int8x8x4_t table = vld1_s8_x4(offset);

     for (y = 0; y < ctuHeight; y++)
{
for (x = 0; (x + 8) <= ctuWidth; x += 8)
{
- int8x8_t in = *(int8x8_t *)&rec[x];
+            uint8x8_t in = vld1_u8(rec + x);
int8x8_t offsets = vtbl4_s8(table, vshr_n_u8(in, boShift));
int16x8_t tmp = vmovl_s8(offsets);
tmp = vaddw_u8(tmp, in);
- *(uint8x8_t *)&rec[x] = vqmovun_s16(tmp);
+ vst1_u8(rec + x, vqmovun_s16(tmp));
}
for (; x < ctuWidth; x++)
{
--
2.42.1