[x265] [PATCH 1/6] mc-a-sve2.S: Fix addAvg_{16, 32}xh_sve2 for longer SVE vectors

Mon Jan 6 17:16:26 UTC 2025

The 256-bit SVE vector length loop for 16xh already only processes one
full vector per iteration, so a further specialization for even longer
vectors cannot be useful. In particular the implementation here for
vector lengths of at least 512 bits appears to be incorrect and
unreachable, so simply delete it and use the 256-bit implementation
instead.

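The existing code for 32xh flips the add and sqrshrnb instructions for
the second vector (z1), applying the #0x80 add before the narrowing
shift instead of after it. This leads to an incorrect result, so
reorder them to fix it.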

Co-authored-by: Hari Limaye <hari.limaye at arm.com>
---
 source/common/aarch64/mc-a-sve2.S | 24 +++---------------------
 1 file changed, 3 insertions(+), 21 deletions(-)

diff --git a/source/common/aarch64/mc-a-sve2.S b/source/common/aarch64/mc-a-sve2.S
index e4540ce9b..00fb0048f 100644
--- a/source/common/aarch64/mc-a-sve2.S
+++ b/source/common/aarch64/mc-a-sve2.S
@@ -511,8 +511,6 @@ function PFX(addAvg_16x\h\()_sve2)
     cbnz            w12, .Loop_eq_16_sve2_addavg_16x\h
     ret
 .vl_gt_16_addAvg_16x\h\():
-    cmp             x9, #32
-    bgt             .vl_gt_32_addAvg_16x\h
     ptrue           p0.b, vl32
 .Loop_gt_16_sve2_addavg_16x\h\():
     sub             w12, w12, #1
@@ -523,25 +521,9 @@ function PFX(addAvg_16x\h\()_sve2)
     add             z0.h, p0/m, z0.h, z1.h
     sqrshrnb        z0.b, z0.h, #7
     add             z0.b, z0.b, #0x80
-    st1b            {z0.h}, p1, [x2]
-    add             x2, x2, x5
-    cbnz            w12, .Loop_gt_16_sve2_addavg_16x\h
-    ret
-.vl_gt_32_addAvg_16x\h\():
-    mov             x10, #48
-    mov             x11, #0
-    whilelt         p0.b, x11, x10
-.Loop_gt_32_sve2_addavg_16x\h\():
-    sub             w12, w12, #1
-    ld1b            {z0.b}, p0/z, [x0]
-    add             x0, x0, x3, lsl #1
-    add             x1, x1, x4, lsl #1
-    add             z0.h, p0/m, z0.h, z1.h
-    sqrshrnb        z0.b, z0.h, #7
-    add             z0.b, z0.b, #0x80
     st1b            {z0.h}, p0, [x2]
     add             x2, x2, x5
-    cbnz            w12, .Loop_gt_32_sve2_addavg_16x\h
+    cbnz            w12, .Loop_gt_16_sve2_addavg_16x\h
     ret
 endfunc
 .endm
@@ -674,9 +656,9 @@ function PFX(addAvg_32x\h\()_sve2)
     add             z0.h, p0/m, z0.h, z2.h
     add             z1.h, p0/m, z1.h, z3.h
     sqrshrnb        z0.b, z0.h, #7
-    add             z1.b, z1.b, #0x80
-    sqrshrnb        z1.b, z1.h, #7
     add             z0.b, z0.b, #0x80
+    sqrshrnb        z1.b, z1.h, #7
+    add             z1.b, z1.b, #0x80
     st1b            {z0.h}, p0, [x2]
     st1b            {z1.h}, p0, [x2, #1, mul vl]
     add             x2, x2, x5
-- 
2.34.1

-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0001-mc-a-sve2.S-Fix-addAvg_-16-32-xh_sve2-for-longer-SVE.patch
Type: text/x-diff
Size: 3053 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20250106/1129b2f2/attachment.patch>