[x265] [PATCH] AArch64: Optimize pixel_avg_pp_4xh
Li Zhang
li.zhang2 at arm.com
Thu Jun 19 14:58:53 UTC 2025
Use LDR and STR instead of the single-lane forms of LD1 and ST1 in the
pixel_avg_pp_4xh assembly implementation. Unlike LD1 to a lane, LDR is a
wholly destructive operation: it overwrites the entire destination
register, which removes the false dependency on the existing register
contents.
The change provides up to a 2.5x speedup.
---
source/common/aarch64/mc-a.S | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/source/common/aarch64/mc-a.S b/source/common/aarch64/mc-a.S
index 130bf1a4a..ff18713fa 100644
--- a/source/common/aarch64/mc-a.S
+++ b/source/common/aarch64/mc-a.S
@@ -38,10 +38,13 @@
.macro pixel_avg_pp_4xN_neon h
function PFX(pixel_avg_pp_4x\h\()_neon)
.rept \h
- ld1 {v0.s}[0], [x2], x3
- ld1 {v1.s}[0], [x4], x5
+ ldr s0, [x2]
+ ldr s1, [x4]
+ add x2, x2, x3
+ add x4, x4, x5
urhadd v2.8b, v0.8b, v1.8b
- st1 {v2.s}[0], [x0], x1
+ str s2, [x0]
+ add x0, x0, x1
.endr
ret
endfunc
--
2.39.5 (Apple Git-154)
-------------- next part --------------
>From 1868974d4b481cfa2166f681a1ed89deb5b47122 Mon Sep 17 00:00:00 2001
Message-Id: <1868974d4b481cfa2166f681a1ed89deb5b47122.1750344966.git.li.zhang2 at arm.com>
From: Li Zhang <li.zhang2 at arm.com>
Date: Mon, 16 Jun 2025 16:35:28 +0200
Subject: [PATCH] AArch64: Optimize pixel_avg_pp_4xh
Use LDR and STR instead of the single-lane forms of LD1 and ST1 in the
pixel_avg_pp_4xh assembly implementation. Unlike LD1 to a lane, LDR is a
wholly destructive operation: it overwrites the entire destination
register, which removes the false dependency on the existing register
contents.
The change provides up to a 2.5x speedup.
---
source/common/aarch64/mc-a.S | 9 ++++++---
1 file changed, 6 insertions(+), 3 deletions(-)
diff --git a/source/common/aarch64/mc-a.S b/source/common/aarch64/mc-a.S
index 130bf1a4a..ff18713fa 100644
--- a/source/common/aarch64/mc-a.S
+++ b/source/common/aarch64/mc-a.S
@@ -38,10 +38,13 @@
.macro pixel_avg_pp_4xN_neon h
function PFX(pixel_avg_pp_4x\h\()_neon)
.rept \h
- ld1 {v0.s}[0], [x2], x3
- ld1 {v1.s}[0], [x4], x5
+ ldr s0, [x2]
+ ldr s1, [x4]
+ add x2, x2, x3
+ add x4, x4, x5
urhadd v2.8b, v0.8b, v1.8b
- st1 {v2.s}[0], [x0], x1
+ str s2, [x0]
+ add x0, x0, x1
.endr
ret
endfunc
--
2.39.5 (Apple Git-154)
More information about the x265-devel
mailing list