[x265] [PATCH 1/8] AArch64: Optimise Neon assembly implementations of SAD
Hari Limaye
hari.limaye at arm.com
Thu May 23 17:12:33 UTC 2024
Optimise the Neon assembly implementations of SAD primitives, replacing
UABAL, UABAL2 sequences with UABD, UADALP sequences, which have twice
the throughput on modern Arm cores.
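For reference, the core transformation on one 16-byte row looks like
this (a sketch using the register numbering from the patch below):

    // Before: widening absolute-difference-accumulate, split into
    // low/high halves and serialised on the two accumulators:
    uabal   v16.8h, v0.8b,  v1.8b
    uabal2  v17.8h, v0.16b, v1.16b
    // After: full-width absolute difference into a temporary, then a
    // pairwise add-accumulate of adjacent bytes into 16-bit lanes:
    uabd    v20.16b, v0.16b, v1.16b
    uadalp  v16.8h,  v20.16b

The lane arrangement of the partial sums differs between the two forms,
but the final horizontal reduction is unaffected.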
Also refactor the load instructions for block sizes of width 4 to use
LDR for the first partial load of a vector register, making that load a
destructive operation rather than a merging one, so it carries no false
dependency on the previous contents of the register.
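A sketch of the width-4 load change (as in SAD_START_4 below):

    // Before: LD1 to lane 0 merges with the old contents of v0,
    // creating a false dependency on the previous value:
    ld1     {v0.s}[0], [x0], x1
    // After: LDR of an S register writes the whole vector register
    // (zeroing the upper bits), so no such dependency exists. LDR
    // post-index takes only an immediate, hence the separate add:
    ldr     s0, [x0]
    add     x0, x0, x1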
As this patch moves some block sizes (16xh) from the fully unrolled
macro to the LOOP macro, the SVE2 implementations that reuse these Neon
macros are updated accordingly.
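The looping form processes four rows per iteration: the new SAD_16
macro body handles two rows, and the loop repeats it twice, hence the
counter of h/4 (a sketch of the structure used in the patch below):

    mov     w9, #\h/4           // one iteration covers 4 rows
.Loop_\w\()x\h:
    sub     w9, w9, #1
.rept 2                         // SAD_\w processes 2 rows per call
    SAD_\w
.endr
    cbnz    w9, .Loop_\w\()x\h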
Change-Id: I2ddba7eb729a71e5d275f37affc4cd5c3aa88bb3
---
source/common/aarch64/sad-a-common.S | 161 +++++++++++++--------------
source/common/aarch64/sad-a-sve2.S | 21 ++--
source/common/aarch64/sad-a.S | 21 ++--
3 files changed, 97 insertions(+), 106 deletions(-)
diff --git a/source/common/aarch64/sad-a-common.S b/source/common/aarch64/sad-a-common.S
index 572484a06..d8644f694 100644
--- a/source/common/aarch64/sad-a-common.S
+++ b/source/common/aarch64/sad-a-common.S
@@ -1,7 +1,8 @@
/*****************************************************************************
- * Copyright (C) 2022-2023 MulticoreWare, Inc
+ * Copyright (C) 2022-2024 MulticoreWare, Inc
*
* Authors: David Chen <david.chen at myais.com.cn>
+ Hari Limaye <hari.limaye at arm.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -37,9 +38,11 @@
.align 4
.macro SAD_START_4 f
- ld1 {v0.s}[0], [x0], x1
+ ldr s0, [x0]
+ ldr s1, [x2]
+ add x0, x0, x1
+ add x2, x2, x3
ld1 {v0.s}[1], [x0], x1
- ld1 {v1.s}[0], [x2], x3
ld1 {v1.s}[1], [x2], x3
\f v16.8h, v0.8b, v1.8b
.endm
@@ -65,21 +68,27 @@
.endr
.endm
-.macro SAD_START_16 f
+.macro SAD_START_16
+ movi v16.16b, #0
+ movi v17.16b, #0
+.endm
+
+.macro SAD_16
ld1 {v0.16b}, [x0], x1
ld1 {v1.16b}, [x2], x3
ld1 {v2.16b}, [x0], x1
ld1 {v3.16b}, [x2], x3
- \f v16.8h, v0.8b, v1.8b
- \f\()2 v17.8h, v0.16b, v1.16b
- uabal v16.8h, v2.8b, v3.8b
- uabal2 v17.8h, v2.16b, v3.16b
+ uabd v20.16b, v0.16b, v1.16b
+ uadalp v16.8h, v20.16b
+ uabd v21.16b, v2.16b, v3.16b
+ uadalp v17.8h, v21.16b
.endm
-.macro SAD_16 h
-.rept \h / 2 - 1
- SAD_START_16 uabal
-.endr
+.macro SAD_END_16
+ add v16.8h, v16.8h, v17.8h
+ uaddlv s0, v16.8h
+ fmov x0, d0
+ ret
.endm
.macro SAD_START_32
@@ -94,14 +103,14 @@
ld1 {v2.16b-v3.16b}, [x2], x3
ld1 {v4.16b-v5.16b}, [x0], x1
ld1 {v6.16b-v7.16b}, [x2], x3
- uabal v16.8h, v0.8b, v2.8b
- uabal2 v17.8h, v0.16b, v2.16b
- uabal v18.8h, v1.8b, v3.8b
- uabal2 v19.8h, v1.16b, v3.16b
- uabal v16.8h, v4.8b, v6.8b
- uabal2 v17.8h, v4.16b, v6.16b
- uabal v18.8h, v5.8b, v7.8b
- uabal2 v19.8h, v5.16b, v7.16b
+ uabd v20.16b, v0.16b, v2.16b
+ uadalp v16.8h, v20.16b
+ uabd v21.16b, v1.16b, v3.16b
+ uadalp v17.8h, v21.16b
+ uabd v22.16b, v4.16b, v6.16b
+ uadalp v18.8h, v22.16b
+ uabd v23.16b, v5.16b, v7.16b
+ uadalp v19.8h, v23.16b
.endm
.macro SAD_END_32
@@ -118,10 +127,6 @@
movi v17.16b, #0
movi v18.16b, #0
movi v19.16b, #0
- movi v20.16b, #0
- movi v21.16b, #0
- movi v22.16b, #0
- movi v23.16b, #0
.endm
.macro SAD_64
@@ -129,35 +134,29 @@
ld1 {v4.16b-v7.16b}, [x2], x3
ld1 {v24.16b-v27.16b}, [x0], x1
ld1 {v28.16b-v31.16b}, [x2], x3
- uabal v16.8h, v0.8b, v4.8b
- uabal2 v17.8h, v0.16b, v4.16b
- uabal v18.8h, v1.8b, v5.8b
- uabal2 v19.8h, v1.16b, v5.16b
- uabal v20.8h, v2.8b, v6.8b
- uabal2 v21.8h, v2.16b, v6.16b
- uabal v22.8h, v3.8b, v7.8b
- uabal2 v23.8h, v3.16b, v7.16b
-
- uabal v16.8h, v24.8b, v28.8b
- uabal2 v17.8h, v24.16b, v28.16b
- uabal v18.8h, v25.8b, v29.8b
- uabal2 v19.8h, v25.16b, v29.16b
- uabal v20.8h, v26.8b, v30.8b
- uabal2 v21.8h, v26.16b, v30.16b
- uabal v22.8h, v27.8b, v31.8b
- uabal2 v23.8h, v27.16b, v31.16b
+ uabd v20.16b, v0.16b, v4.16b
+ uadalp v16.8h, v20.16b
+ uabd v21.16b, v1.16b, v5.16b
+ uadalp v17.8h, v21.16b
+ uabd v22.16b, v2.16b, v6.16b
+ uadalp v18.8h, v22.16b
+ uabd v23.16b, v3.16b, v7.16b
+ uadalp v19.8h, v23.16b
+ uabd v20.16b, v24.16b, v28.16b
+ uadalp v16.8h, v20.16b
+ uabd v21.16b, v25.16b, v29.16b
+ uadalp v17.8h, v21.16b
+ uabd v22.16b, v26.16b, v30.16b
+ uadalp v18.8h, v22.16b
+ uabd v23.16b, v27.16b, v31.16b
+ uadalp v19.8h, v23.16b
.endm
.macro SAD_END_64
- add v16.8h, v16.8h, v17.8h
- add v17.8h, v18.8h, v19.8h
- add v16.8h, v16.8h, v17.8h
uaddlp v16.4s, v16.8h
- add v18.8h, v20.8h, v21.8h
- add v19.8h, v22.8h, v23.8h
- add v17.8h, v18.8h, v19.8h
- uaddlp v17.4s, v17.8h
- add v16.4s, v16.4s, v17.4s
+ uadalp v16.4s, v17.8h
+ uadalp v16.4s, v18.8h
+ uadalp v16.4s, v19.8h
uaddlv d0, v16.4s
fmov x0, d0
ret
@@ -179,10 +178,10 @@
and v2.16b, v2.16b, v31.16b
ld1 {v3.16b}, [x2], x3
and v3.16b, v3.16b, v31.16b
- uabal v16.8h, v0.8b, v1.8b
- uabal2 v17.8h, v0.16b, v1.16b
- uabal v16.8h, v2.8b, v3.8b
- uabal2 v17.8h, v2.16b, v3.16b
+ uabd v20.16b, v0.16b, v1.16b
+ uadalp v16.8h, v20.16b
+ uabd v21.16b, v2.16b, v3.16b
+ uadalp v17.8h, v21.16b
.endm
.macro SAD_END_12
@@ -195,7 +194,6 @@
.macro SAD_START_24
movi v16.16b, #0
movi v17.16b, #0
- movi v18.16b, #0
sub x1, x1, #16
sub x3, x3, #16
.endm
@@ -209,17 +207,16 @@
ld1 {v5.8b}, [x0], x1
ld1 {v6.16b}, [x2], #16
ld1 {v7.8b}, [x2], x3
- uabal v16.8h, v0.8b, v2.8b
- uabal2 v17.8h, v0.16b, v2.16b
- uabal v18.8h, v1.8b, v3.8b
- uabal v16.8h, v4.8b, v6.8b
- uabal2 v17.8h, v4.16b, v6.16b
- uabal v18.8h, v5.8b, v7.8b
+ uabd v20.16b, v0.16b, v2.16b
+ uadalp v16.8h, v20.16b
+ uabal v17.8h, v1.8b, v3.8b
+ uabd v20.16b, v4.16b, v6.16b
+ uadalp v16.8h, v20.16b
+ uabal v17.8h, v5.8b, v7.8b
.endm
.macro SAD_END_24
add v16.8h, v16.8h, v17.8h
- add v16.8h, v16.8h, v18.8h
uaddlv s0, v16.8h
fmov w0, s0
ret
@@ -229,9 +226,6 @@
movi v16.16b, #0
movi v17.16b, #0
movi v18.16b, #0
- movi v19.16b, #0
- movi v20.16b, #0
- movi v21.16b, #0
.endm
.macro SAD_48
@@ -239,31 +233,26 @@
ld1 {v4.16b-v6.16b}, [x2], x3
ld1 {v24.16b-v26.16b}, [x0], x1
ld1 {v28.16b-v30.16b}, [x2], x3
- uabal v16.8h, v0.8b, v4.8b
- uabal2 v17.8h, v0.16b, v4.16b
- uabal v18.8h, v1.8b, v5.8b
- uabal2 v19.8h, v1.16b, v5.16b
- uabal v20.8h, v2.8b, v6.8b
- uabal2 v21.8h, v2.16b, v6.16b
-
- uabal v16.8h, v24.8b, v28.8b
- uabal2 v17.8h, v24.16b, v28.16b
- uabal v18.8h, v25.8b, v29.8b
- uabal2 v19.8h, v25.16b, v29.16b
- uabal v20.8h, v26.8b, v30.8b
- uabal2 v21.8h, v26.16b, v30.16b
+ uabd v20.16b, v0.16b, v4.16b
+ uadalp v16.8h, v20.16b
+ uabd v21.16b, v1.16b, v5.16b
+ uadalp v17.8h, v21.16b
+ uabd v22.16b, v2.16b, v6.16b
+ uadalp v18.8h, v22.16b
+ uabd v20.16b, v24.16b, v28.16b
+ uadalp v16.8h, v20.16b
+ uabd v21.16b, v25.16b, v29.16b
+ uadalp v17.8h, v21.16b
+ uabd v22.16b, v26.16b, v30.16b
+ uadalp v18.8h, v22.16b
.endm
.macro SAD_END_48
- add v16.8h, v16.8h, v17.8h
- add v17.8h, v18.8h, v19.8h
- add v16.8h, v16.8h, v17.8h
- uaddlv s0, v16.8h
- fmov w0, s0
- add v18.8h, v20.8h, v21.8h
- uaddlv s1, v18.8h
- fmov w1, s1
- add w0, w0, w1
+ uaddlp v16.4s, v16.8h
+ uadalp v16.4s, v17.8h
+ uadalp v16.4s, v18.8h
+ uaddlv d0, v16.4s
+ fmov x0, d0
ret
.endm
diff --git a/source/common/aarch64/sad-a-sve2.S b/source/common/aarch64/sad-a-sve2.S
index 599a3719a..f98ecb137 100644
--- a/source/common/aarch64/sad-a-sve2.S
+++ b/source/common/aarch64/sad-a-sve2.S
@@ -1,7 +1,8 @@
/*****************************************************************************
- * Copyright (C) 2022-2023 MulticoreWare, Inc
+ * Copyright (C) 2022-2024 MulticoreWare, Inc
*
* Authors: David Chen <david.chen at myais.com.cn>
+ Hari Limaye <hari.limaye at arm.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -208,7 +209,7 @@ function PFX(pixel_sad_\w\()x\h\()_sve2)
endfunc
.endm
-// Loop unrolled 4.
+// Loop unrolled to process 4 rows per iteration.
.macro SAD_FUNC_LOOP_SVE2 w, h
function PFX(pixel_sad_\w\()x\h\()_sve2)
rdvl x9, #1
@@ -216,10 +217,10 @@ function PFX(pixel_sad_\w\()x\h\()_sve2)
bgt .vl_gt_16_pixel_sad_loop_\w\()x\h
SAD_START_\w
- mov w9, #\h/8
+ mov w9, #\h/4
.Loop_sve2_\w\()x\h:
sub w9, w9, #1
-.rept 4
+.rept 2
SAD_\w
.endr
cbnz w9, .Loop_sve2_\w\()x\h
@@ -252,13 +253,13 @@ SAD_FUNC_SVE2 8, 4
SAD_FUNC_SVE2 8, 8
SAD_FUNC_SVE2 8, 16
SAD_FUNC_SVE2 8, 32
-SAD_FUNC_SVE2 16, 4
-SAD_FUNC_SVE2 16, 8
-SAD_FUNC_SVE2 16, 12
-SAD_FUNC_SVE2 16, 16
-SAD_FUNC_SVE2 16, 32
-SAD_FUNC_SVE2 16, 64
+SAD_FUNC_LOOP_SVE2 16, 4
+SAD_FUNC_LOOP_SVE2 16, 8
+SAD_FUNC_LOOP_SVE2 16, 12
+SAD_FUNC_LOOP_SVE2 16, 16
+SAD_FUNC_LOOP_SVE2 16, 32
+SAD_FUNC_LOOP_SVE2 16, 64
SAD_FUNC_LOOP_SVE2 32, 8
SAD_FUNC_LOOP_SVE2 32, 16
SAD_FUNC_LOOP_SVE2 32, 24
diff --git a/source/common/aarch64/sad-a.S b/source/common/aarch64/sad-a.S
index 7460825f1..2fd2c2cc8 100644
--- a/source/common/aarch64/sad-a.S
+++ b/source/common/aarch64/sad-a.S
@@ -1,8 +1,9 @@
/*****************************************************************************
- * Copyright (C) 2020-2021 MulticoreWare, Inc
+ * Copyright (C) 2020-2024 MulticoreWare, Inc
*
* Authors: Hongbin Liu <liuhongbin1 at huawei.com>
* Sebastian Pop <spop at amazon.com>
+ Hari Limaye <hari.limaye at arm.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -49,15 +50,15 @@ function PFX(pixel_sad_\w\()x\h\()_neon)
endfunc
.endm
-// Loop unrolled 4.
+// Loop unrolled to process 4 rows per iteration.
.macro SAD_FUNC_LOOP w, h
function PFX(pixel_sad_\w\()x\h\()_neon)
SAD_START_\w
- mov w9, #\h/8
+ mov w9, #\h/4
.Loop_\w\()x\h:
sub w9, w9, #1
-.rept 4
+.rept 2
SAD_\w
.endr
cbnz w9, .Loop_\w\()x\h
@@ -73,13 +74,13 @@ SAD_FUNC 8, 4
SAD_FUNC 8, 8
SAD_FUNC 8, 16
SAD_FUNC 8, 32
-SAD_FUNC 16, 4
-SAD_FUNC 16, 8
-SAD_FUNC 16, 12
-SAD_FUNC 16, 16
-SAD_FUNC 16, 32
-SAD_FUNC 16, 64
+SAD_FUNC_LOOP 16, 4
+SAD_FUNC_LOOP 16, 8
+SAD_FUNC_LOOP 16, 12
+SAD_FUNC_LOOP 16, 16
+SAD_FUNC_LOOP 16, 32
+SAD_FUNC_LOOP 16, 64
SAD_FUNC_LOOP 32, 8
SAD_FUNC_LOOP 32, 16
SAD_FUNC_LOOP 32, 24
--
2.42.1