[x265] [PATCH] AArch64: Optimize pixel_avg_pp_4xh

Thu Jun 19 14:58:53 UTC 2025

Use LDR and STR instead of LD1/ST1 to a single lane in the
pixel_avg_pp_4xh assembly implementation. LDR overwrites the whole
destination register (it is wholly destructive), which removes the
false dependency on the existing register contents that LD1 to lane
incurs.

The change provides up to a 2.5x speedup.
---
 source/common/aarch64/mc-a.S | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/source/common/aarch64/mc-a.S b/source/common/aarch64/mc-a.S
index 130bf1a4a..ff18713fa 100644
--- a/source/common/aarch64/mc-a.S
+++ b/source/common/aarch64/mc-a.S
@@ -38,10 +38,13 @@
 .macro pixel_avg_pp_4xN_neon h
 function PFX(pixel_avg_pp_4x\h\()_neon)
 .rept \h
-    ld1             {v0.s}[0], [x2], x3
-    ld1             {v1.s}[0], [x4], x5
+    ldr             s0, [x2]
+    ldr             s1, [x4]
+    add             x2, x2, x3
+    add             x4, x4, x5
     urhadd          v2.8b, v0.8b, v1.8b
-    st1             {v2.s}[0], [x0], x1
+    str             s2, [x0]
+    add             x0, x0, x1
 .endr
     ret
 endfunc
-- 
2.39.5 (Apple Git-154)

-------------- next part --------------
From 1868974d4b481cfa2166f681a1ed89deb5b47122 Mon Sep 17 00:00:00 2001
Message-Id: <1868974d4b481cfa2166f681a1ed89deb5b47122.1750344966.git.li.zhang2 at arm.com>
From: Li Zhang <li.zhang2 at arm.com>
Date: Mon, 16 Jun 2025 16:35:28 +0200
Subject: [PATCH] AArch64: Optimize pixel_avg_pp_4xh

Use LDR and STR instead of LD1/ST1 to a single lane in the
pixel_avg_pp_4xh assembly implementation. LDR overwrites the whole
destination register (it is wholly destructive), which removes the
false dependency on the existing register contents that LD1 to lane
incurs.

The change provides up to a 2.5x speedup.
---
 source/common/aarch64/mc-a.S | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/source/common/aarch64/mc-a.S b/source/common/aarch64/mc-a.S
index 130bf1a4a..ff18713fa 100644
--- a/source/common/aarch64/mc-a.S
+++ b/source/common/aarch64/mc-a.S
@@ -38,10 +38,13 @@
 .macro pixel_avg_pp_4xN_neon h
 function PFX(pixel_avg_pp_4x\h\()_neon)
 .rept \h
-    ld1             {v0.s}[0], [x2], x3
-    ld1             {v1.s}[0], [x4], x5
+    ldr             s0, [x2]
+    ldr             s1, [x4]
+    add             x2, x2, x3
+    add             x4, x4, x5
     urhadd          v2.8b, v0.8b, v1.8b
-    st1             {v2.s}[0], [x0], x1
+    str             s2, [x0]
+    add             x0, x0, x1
 .endr
     ret
 endfunc
-- 
2.39.5 (Apple Git-154)