[x265] [PATCH 13/18] AArch64: Refactor narrowing in loopfilter-prim.cpp
Hari Limaye
hari.limaye at arm.com
Tue Aug 13 15:21:07 UTC 2024
Use the SQXTUN instruction (vqmovun_s16) to saturate and narrow signed
16-bit values to unsigned 8-bit in a single step, instead of clamping
with separate SMAX and SMIN instructions before narrowing, where
applicable in common/aarch64/loopfilter-prim.cpp.
---
source/common/aarch64/loopfilter-prim.cpp | 20 +++++---------------
1 file changed, 5 insertions(+), 15 deletions(-)
diff --git a/source/common/aarch64/loopfilter-prim.cpp b/source/common/aarch64/loopfilter-prim.cpp
index 37679c0b6..581968b06 100644
--- a/source/common/aarch64/loopfilter-prim.cpp
+++ b/source/common/aarch64/loopfilter-prim.cpp
@@ -63,9 +63,7 @@ static void processSaoCUE0_neon(pixel *rec, int8_t *offsetEo, int width, int8_t
shifter.val[1][0] = tmp[7];
int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
t1 = vaddw_u8(t1, in);
- t1 = vmaxq_s16(t1, vdupq_n_s16(0));
- t1 = vminq_s16(t1, vdupq_n_s16(255));
- *(uint8x8_t *)&rec[x] = vmovn_u16(t1);
+ *(uint8x8_t *)&rec[x] = vqmovun_s16(t1);
}
signLeft0 = shifter.val[1][0];
}
@@ -131,9 +129,7 @@ static void processSaoCUE1_2Rows_neon(pixel *rec, int8_t *upBuff1, int8_t *offse
*(int8x8_t *)&upBuff1[x] = vneg_s8(vsignDown);
int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
t1 = vaddw_u8(t1, in0);
- t1 = vmaxq_s16(t1, vdupq_n_s16(0));
- t1 = vminq_s16(t1, vdupq_n_s16(255));
- *(uint8x8_t *)&rec[x] = vmovn_u16(t1);
+ *(uint8x8_t *)&rec[x] = vqmovun_s16(t1);
}
}
@@ -175,9 +171,7 @@ static void processSaoCUE2_neon(pixel *rec, int8_t *bufft, int8_t *buff1, int8_t
*(int8x8_t *)&bufft[x + 1] = vneg_s8(vsignDown);
int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
t1 = vaddw_u8(t1, in0);
- t1 = vmaxq_s16(t1, vdupq_n_s16(0));
- t1 = vminq_s16(t1, vdupq_n_s16(255));
- *(uint8x8_t *)&rec[x] = vmovn_u16(t1);
+ *(uint8x8_t *)&rec[x] = vqmovun_s16(t1);
}
for (; x < width; x++)
{
@@ -207,9 +201,7 @@ static void processSaoCUE3_neon(pixel *rec, int8_t *upBuff1, int8_t *offsetEo, i
*(int8x8_t *)&upBuff1[x - 1] = vneg_s8(vsignDown);
int16x8_t t1 = vmovl_s8(vtbl1_s8(tbl, vedgeType));
t1 = vaddw_u8(t1, in0);
- t1 = vmaxq_s16(t1, vdupq_n_s16(0));
- t1 = vminq_s16(t1, vdupq_n_s16(255));
- *(uint8x8_t *)&rec[x] = vmovn_u16(t1);
+ *(uint8x8_t *)&rec[x] = vqmovun_s16(t1);
}
for (; x < endX; x++)
@@ -238,9 +230,7 @@ static void processSaoCUB0_neon(pixel *rec, const int8_t *offset, int ctuWidth,
int8x8_t offsets = vtbl4_s8(table, vshr_n_u8(in, boShift));
int16x8_t tmp = vmovl_s8(offsets);
tmp = vaddw_u8(tmp, in);
- tmp = vmaxq_s16(tmp, vdupq_n_s16(0));
- tmp = vminq_s16(tmp, vdupq_n_s16(255));
- *(uint8x8_t *)&rec[x] = vmovn_u16(tmp);
+ *(uint8x8_t *)&rec[x] = vqmovun_s16(tmp);
}
for (; x < ctuWidth; x++)
{
--
2.42.1
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0013-AArch64-Refactor-narrowing-in-loopfilter-prim.cpp.patch
Type: text/x-patch
Size: 3555 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20240813/32e6c7bd/attachment.bin>
More information about the x265-devel
mailing list