[x265] [PATCH 1/6] mc-a-sve2.S: Fix addAvg_{16, 32}xh_sve2 for longer SVE vectors
George Steed
george.steed at arm.com
Mon Jan 6 17:16:26 UTC 2025
The 256-bit SVE vector length loop for 16xh already only processes one
full vector per iteration, so a further specialization for even longer
vectors cannot be useful. In particular the implementation here for
vector lengths of at least 512 bits appears to be incorrect and
unreachable, so simply delete it and use the 256-bit implementation
instead.
The existing code for 32xh applies the add before the sqrshrnb for the
second vector (z1), leading to an incorrect result, so reorder the two
instructions to fix it.
Co-authored-by: Hari Limaye <hari.limaye at arm.com>
---
source/common/aarch64/mc-a-sve2.S | 24 +++---------------------
1 file changed, 3 insertions(+), 21 deletions(-)
diff --git a/source/common/aarch64/mc-a-sve2.S b/source/common/aarch64/mc-a-sve2.S
index e4540ce9b..00fb0048f 100644
--- a/source/common/aarch64/mc-a-sve2.S
+++ b/source/common/aarch64/mc-a-sve2.S
@@ -511,8 +511,6 @@ function PFX(addAvg_16x\h\()_sve2)
cbnz w12, .Loop_eq_16_sve2_addavg_16x\h
ret
.vl_gt_16_addAvg_16x\h\():
- cmp x9, #32
- bgt .vl_gt_32_addAvg_16x\h
ptrue p0.b, vl32
.Loop_gt_16_sve2_addavg_16x\h\():
sub w12, w12, #1
@@ -523,25 +521,9 @@ function PFX(addAvg_16x\h\()_sve2)
add z0.h, p0/m, z0.h, z1.h
sqrshrnb z0.b, z0.h, #7
add z0.b, z0.b, #0x80
- st1b {z0.h}, p1, [x2]
- add x2, x2, x5
- cbnz w12, .Loop_gt_16_sve2_addavg_16x\h
- ret
-.vl_gt_32_addAvg_16x\h\():
- mov x10, #48
- mov x11, #0
- whilelt p0.b, x11, x10
-.Loop_gt_32_sve2_addavg_16x\h\():
- sub w12, w12, #1
- ld1b {z0.b}, p0/z, [x0]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- add z0.h, p0/m, z0.h, z1.h
- sqrshrnb z0.b, z0.h, #7
- add z0.b, z0.b, #0x80
st1b {z0.h}, p0, [x2]
add x2, x2, x5
- cbnz w12, .Loop_gt_32_sve2_addavg_16x\h
+ cbnz w12, .Loop_gt_16_sve2_addavg_16x\h
ret
endfunc
.endm
@@ -674,9 +656,9 @@ function PFX(addAvg_32x\h\()_sve2)
add z0.h, p0/m, z0.h, z2.h
add z1.h, p0/m, z1.h, z3.h
sqrshrnb z0.b, z0.h, #7
- add z1.b, z1.b, #0x80
- sqrshrnb z1.b, z1.h, #7
add z0.b, z0.b, #0x80
+ sqrshrnb z1.b, z1.h, #7
+ add z1.b, z1.b, #0x80
st1b {z0.h}, p0, [x2]
st1b {z1.h}, p0, [x2, #1, mul vl]
add x2, x2, x5
--
2.34.1
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0001-mc-a-sve2.S-Fix-addAvg_-16-32-xh_sve2-for-longer-SVE.patch
Type: text/x-diff
Size: 3053 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20250106/1129b2f2/attachment.patch>
More information about the x265-devel mailing list