[x265] [PATCH 6/6] pixel-util-sve2.S: Fix normFact/ssimDist64 for longer SVE vectors
George Steed
george.steed at arm.com
Mon Jan 6 17:18:01 UTC 2025
The existing code for vectors of at least 1024 bits uses predication to
ensure that only the low 1024 bits of the vector are operated on.
However, the address arithmetic for the second load in each loop
iteration is performed relative to the vector length, so the offset is
incorrect for 2048-bit vectors. Since we only ever operate on the fixed
low 1024 bits of the vectors here (32 .s lanes, one byte loaded per
lane), hard-code the address offset to 32 bytes to fix this.
---
source/common/aarch64/pixel-util-sve2.S | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/source/common/aarch64/pixel-util-sve2.S b/source/common/aarch64/pixel-util-sve2.S
index 5b872f437..c7ff0b35e 100644
--- a/source/common/aarch64/pixel-util-sve2.S
+++ b/source/common/aarch64/pixel-util-sve2.S
@@ -1339,12 +1339,13 @@ function PFX(ssimDist64_sve2)
.vl_gt_112_ssimDist64:
ssimDist_start_sve2
ptrue p0.s, vl32
+ mov x5, #32
.vl_gt_112_loop_ssimDist64_sve2:
sub w12, w12, #1
ld1b {z2.s}, p0/z, [x0]
- ld1b {z3.s}, p0/z, [x0, #1, mul vl]
+ ld1b {z3.s}, p0/z, [x0, x5]
ld1b {z23.s}, p0/z, [x2]
- ld1b {z24.s}, p0/z, [x2, #1, mul vl]
+ ld1b {z24.s}, p0/z, [x2, x5]
ssimDist_1_sve2 z2, z3, z23, z24
add x0, x0, x1
add x2, x2, x3
@@ -1596,10 +1597,11 @@ function PFX(normFact64_sve2)
.vl_gt_112_normFact64:
normFact_start_sve2
ptrue p0.s, vl32
+ mov x4, #32
.vl_gt_112_loop_normFact64_sve2:
sub w12, w12, #1
ld1b {z4.s}, p0/z, [x0]
- ld1b {z5.s}, p0/z, [x0, #1, mul vl]
+ ld1b {z5.s}, p0/z, [x0, x4]
normFact_1_sve2 z4, z5
add x0, x0, x1
cbnz w12, .vl_gt_112_loop_normFact64_sve2
--
2.34.1
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0006-pixel-util-sve2.S-Fix-normFact-ssimDist64-for-longer.patch
Type: text/x-diff
Size: 2256 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20250106/dc6dcae6/attachment.patch>
More information about the x265-devel
mailing list