[x265] [PATCH] p2s-sve.S: Fix filterPixelToShort_48x64 for longer SVE vectors
George Steed
george.steed at arm.com
Tue Mar 11 16:11:17 UTC 2025
The existing filterPixelToShort_48x64 high bit-depth code for vectors of
at least 256 bits uses predication to ensure that only the low 256 bits
of the vector are operated on. However, the address arithmetic is
performed relative to the vector length (via "mul vl" addressing), so for
vectors of 512 bits or longer the second and third loads and stores
access the wrong addresses. Since we are operating on the fixed low 256
bits of the vectors here, fix the code by hard-coding the address
offsets to multiples of 32 bytes.
---
source/common/aarch64/p2s-sve.S | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/source/common/aarch64/p2s-sve.S b/source/common/aarch64/p2s-sve.S
index 85bb14b3d..11e63ddab 100644
--- a/source/common/aarch64/p2s-sve.S
+++ b/source/common/aarch64/p2s-sve.S
@@ -401,10 +401,12 @@ function PFX(filterPixelToShort_48x64_sve)
ret
.vl_gt_16_filterPixelToShort_high_48x64:
ptrue p0.h, vl16
+ mov x4, #16
+ mov x5, #32
.rept 64
ld1h {z0.h}, p0/z, [x0]
- ld1h {z1.h}, p0/z, [x0, #1, mul vl]
- ld1h {z2.h}, p0/z, [x0, #2, mul vl]
+ ld1h {z1.h}, p0/z, [x0, x4, lsl #1]
+ ld1h {z2.h}, p0/z, [x0, x5, lsl #1]
add x0, x0, x1
lsl z0.h, p0/m, z0.h, #P2S_SHIFT
lsl z1.h, p0/m, z1.h, #P2S_SHIFT
@@ -413,8 +415,8 @@ function PFX(filterPixelToShort_48x64_sve)
add z1.h, p0/m, z1.h, z31.h
add z2.h, p0/m, z2.h, z31.h
st1h {z0.h}, p0, [x2]
- st1h {z1.h}, p0, [x2, #1, mul vl]
- st1h {z2.h}, p0, [x2, #2, mul vl]
+ st1h {z1.h}, p0, [x2, x4, lsl #1]
+ st1h {z2.h}, p0, [x2, x5, lsl #1]
add x2, x2, x3
.endr
ret
--
2.34.1
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0001-p2s-sve.S-Fix-filterPixelToShort_48x64-for-longer-SV.patch
Type: text/x-diff
Size: 2021 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20250311/aab32156/attachment.patch>
More information about the x265-devel
mailing list