All the patches of this series have been pushed to the master branch.

__________________________
Karam Singh
Ph.D. IIT Guwahati
Senior Software (Video Coding) Engineer
Mobile: +91 8011279030
Block 9A, 6th floor, DLF Cyber City
Manapakkam, Chennai 600 089

On Tue, Jul 30, 2024 at 9:14 PM Hari Limaye <hari.limaye@arm.com> wrote:

Optimise the Neon assembly implementations of SAD primitives, replacing
UABAL, UABAL2 sequences with UABD, UADALP sequences, which have twice
the throughput on modern Arm cores.

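For one 16-byte row, the change has roughly this shape (an illustrative
sketch, not lifted verbatim from the patch; register numbers are arbitrary):

    // Before: widening absolute-difference-accumulate on the low and
    // high halves, keeping every instruction on the accumulator's
    // dependency chain.
    uabal   v16.8h, v0.8b,  v1.8b    // v16.8h += |v0.lo - v1.lo|
    uabal2  v17.8h, v0.16b, v1.16b   // v17.8h += |v0.hi - v1.hi|

    // After: full-width absolute difference, then pairwise widening
    // accumulate; only UADALP depends on the accumulator.
    uabd    v20.16b, v0.16b, v1.16b  // |v0 - v1| for all 16 bytes
    uadalp  v16.8h,  v20.16b         // v16.8h += pairwise byte sums
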
Also refactor the load instructions for block sizes of width 4 to use
LDR for the first partial load of a vector register, making the
operation completely destructive: the whole register is written, so the
first load no longer carries a false dependency on the register's
previous contents.

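The before/after for the first two rows of a 4-wide block looks roughly
like this (a sketch following the SAD_START_4 macro in the diff below):

    // Before: the first LD1 lane load merges into lane 0 of v0, so it
    // depends on the previous contents of v0.
    ld1     {v0.s}[0], [x0], x1
    ld1     {v0.s}[1], [x0], x1

    // After: LDR writes the whole register (zeroing the upper bits), so
    // the first load is destructive and the dependency disappears.
    ldr     s0, [x0]
    add     x0, x0, x1
    ld1     {v0.s}[1], [x0], x1      // the second row still merges
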
As this patch refactors some of the block sizes (16xh) to use the
LOOP macro (rather than the fully unrolled macro), the SVE2
implementations which make use of these Neon macros are updated as
required.
---
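For reference, the looping variant used for the 16xh sizes has roughly
this shape (a summary sketch of the SAD_START_16/SAD_16/SAD_END_16 and
SAD_FUNC_LOOP macros below; SAD_16 expands to two rows, so each loop
iteration covers four):

    SAD_START_16                 // zero the v16/v17 accumulators
    mov     w9, #\h / 4          // four rows per loop iteration
.Loop_16x\h:
    sub     w9, w9, #1
.rept 2
    SAD_16                       // two rows of UABD + UADALP each
.endr
    cbnz    w9, .Loop_16x\h
    SAD_END_16                   // add v16 + v17, UADDLV, return
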
 source/common/aarch64/sad-a-common.S | 172 +++++++++++++--------------
 source/common/aarch64/sad-a-sve2.S   |  25 ++--
 source/common/aarch64/sad-a.S        |  23 ++--
 3 files changed, 107 insertions(+), 113 deletions(-)

diff --git a/source/common/aarch64/sad-a-common.S b/source/common/aarch64/sad-a-common.S
index 572484a06..f7ce264a1 100644
--- a/source/common/aarch64/sad-a-common.S
+++ b/source/common/aarch64/sad-a-common.S
@@ -1,7 +1,8 @@
/*****************************************************************************
- * Copyright (C) 2022-2023 MulticoreWare, Inc
+ * Copyright (C) 2022-2024 MulticoreWare, Inc
*
* Authors: David Chen <david.chen@myais.com.cn>
+ Hari Limaye <hari.limaye@arm.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -37,9 +38,11 @@
.align 4

.macro SAD_START_4 f
- ld1 {v0.s}[0], [x0], x1
+ ldr s0, [x0]
+ ldr s1, [x2]
+ add x0, x0, x1
+ add x2, x2, x3
ld1 {v0.s}[1], [x0], x1
- ld1 {v1.s}[0], [x2], x3
ld1 {v1.s}[1], [x2], x3
\f v16.8h, v0.8b, v1.8b
.endm
@@ -53,33 +56,42 @@
.macro SAD_START_8 f
ld1 {v0.8b}, [x0], x1
ld1 {v1.8b}, [x2], x3
- ld1 {v2.8b}, [x0], x1
- ld1 {v3.8b}, [x2], x3
\f v16.8h, v0.8b, v1.8b
- \f v17.8h, v2.8b, v3.8b
.endm

.macro SAD_8 h
-.rept \h / 2 - 1
+.rept \h - 3
SAD_START_8 uabal
.endr
+ ldr d0, [x0]
+ ldr d1, [x2]
+ uabal v16.8h, v0.8b, v1.8b
+ ldr d0, [x0, x1]
+ ldr d1, [x2, x3]
+ uabal v16.8h, v0.8b, v1.8b
+.endm
+
+.macro SAD_START_16
+ movi v16.16b, #0
+ movi v17.16b, #0
.endm

-.macro SAD_START_16 f
+.macro SAD_16
ld1 {v0.16b}, [x0], x1
ld1 {v1.16b}, [x2], x3
ld1 {v2.16b}, [x0], x1
ld1 {v3.16b}, [x2], x3
- \f v16.8h, v0.8b, v1.8b
- \f\()2 v17.8h, v0.16b, v1.16b
- uabal v16.8h, v2.8b, v3.8b
- uabal2 v17.8h, v2.16b, v3.16b
+ uabd v20.16b, v0.16b, v1.16b
+ uadalp v16.8h, v20.16b
+ uabd v21.16b, v2.16b, v3.16b
+ uadalp v17.8h, v21.16b
.endm

-.macro SAD_16 h
-.rept \h / 2 - 1
- SAD_START_16 uabal
-.endr
+.macro SAD_END_16
+ add v16.8h, v16.8h, v17.8h
+ uaddlv s0, v16.8h
+ fmov x0, d0
+ ret
.endm

.macro SAD_START_32
@@ -94,14 +106,14 @@
ld1 {v2.16b-v3.16b}, [x2], x3
ld1 {v4.16b-v5.16b}, [x0], x1
ld1 {v6.16b-v7.16b}, [x2], x3
- uabal v16.8h, v0.8b, v2.8b
- uabal2 v17.8h, v0.16b, v2.16b
- uabal v18.8h, v1.8b, v3.8b
- uabal2 v19.8h, v1.16b, v3.16b
- uabal v16.8h, v4.8b, v6.8b
- uabal2 v17.8h, v4.16b, v6.16b
- uabal v18.8h, v5.8b, v7.8b
- uabal2 v19.8h, v5.16b, v7.16b
+ uabd v20.16b, v0.16b, v2.16b
+ uadalp v16.8h, v20.16b
+ uabd v21.16b, v1.16b, v3.16b
+ uadalp v17.8h, v21.16b
+ uabd v22.16b, v4.16b, v6.16b
+ uadalp v18.8h, v22.16b
+ uabd v23.16b, v5.16b, v7.16b
+ uadalp v19.8h, v23.16b
.endm

.macro SAD_END_32
@@ -118,10 +130,6 @@
movi v17.16b, #0
movi v18.16b, #0
movi v19.16b, #0
- movi v20.16b, #0
- movi v21.16b, #0
- movi v22.16b, #0
- movi v23.16b, #0
.endm

.macro SAD_64
@@ -129,35 +137,29 @@
ld1 {v4.16b-v7.16b}, [x2], x3
ld1 {v24.16b-v27.16b}, [x0], x1
ld1 {v28.16b-v31.16b}, [x2], x3
- uabal v16.8h, v0.8b, v4.8b
- uabal2 v17.8h, v0.16b, v4.16b
- uabal v18.8h, v1.8b, v5.8b
- uabal2 v19.8h, v1.16b, v5.16b
- uabal v20.8h, v2.8b, v6.8b
- uabal2 v21.8h, v2.16b, v6.16b
- uabal v22.8h, v3.8b, v7.8b
- uabal2 v23.8h, v3.16b, v7.16b
-
- uabal v16.8h, v24.8b, v28.8b
- uabal2 v17.8h, v24.16b, v28.16b
- uabal v18.8h, v25.8b, v29.8b
- uabal2 v19.8h, v25.16b, v29.16b
- uabal v20.8h, v26.8b, v30.8b
- uabal2 v21.8h, v26.16b, v30.16b
- uabal v22.8h, v27.8b, v31.8b
- uabal2 v23.8h, v27.16b, v31.16b
+ uabd v20.16b, v0.16b, v4.16b
+ uadalp v16.8h, v20.16b
+ uabd v21.16b, v1.16b, v5.16b
+ uadalp v17.8h, v21.16b
+ uabd v22.16b, v2.16b, v6.16b
+ uadalp v18.8h, v22.16b
+ uabd v23.16b, v3.16b, v7.16b
+ uadalp v19.8h, v23.16b
+ uabd v20.16b, v24.16b, v28.16b
+ uadalp v16.8h, v20.16b
+ uabd v21.16b, v25.16b, v29.16b
+ uadalp v17.8h, v21.16b
+ uabd v22.16b, v26.16b, v30.16b
+ uadalp v18.8h, v22.16b
+ uabd v23.16b, v27.16b, v31.16b
+ uadalp v19.8h, v23.16b
.endm

.macro SAD_END_64
- add v16.8h, v16.8h, v17.8h
- add v17.8h, v18.8h, v19.8h
- add v16.8h, v16.8h, v17.8h
uaddlp v16.4s, v16.8h
- add v18.8h, v20.8h, v21.8h
- add v19.8h, v22.8h, v23.8h
- add v17.8h, v18.8h, v19.8h
- uaddlp v17.4s, v17.8h
- add v16.4s, v16.4s, v17.4s
+ uadalp v16.4s, v17.8h
+ uadalp v16.4s, v18.8h
+ uadalp v16.4s, v19.8h
uaddlv d0, v16.4s
fmov x0, d0
ret
@@ -179,10 +181,10 @@
and v2.16b, v2.16b, v31.16b
ld1 {v3.16b}, [x2], x3
and v3.16b, v3.16b, v31.16b
- uabal v16.8h, v0.8b, v1.8b
- uabal2 v17.8h, v0.16b, v1.16b
- uabal v16.8h, v2.8b, v3.8b
- uabal2 v17.8h, v2.16b, v3.16b
+ uabd v20.16b, v0.16b, v1.16b
+ uadalp v16.8h, v20.16b
+ uabd v21.16b, v2.16b, v3.16b
+ uadalp v17.8h, v21.16b
.endm

.macro SAD_END_12
@@ -195,7 +197,6 @@
.macro SAD_START_24
movi v16.16b, #0
movi v17.16b, #0
- movi v18.16b, #0
sub x1, x1, #16
sub x3, x3, #16
.endm
@@ -209,17 +210,16 @@
ld1 {v5.8b}, [x0], x1
ld1 {v6.16b}, [x2], #16
ld1 {v7.8b}, [x2], x3
- uabal v16.8h, v0.8b, v2.8b
- uabal2 v17.8h, v0.16b, v2.16b
- uabal v18.8h, v1.8b, v3.8b
- uabal v16.8h, v4.8b, v6.8b
- uabal2 v17.8h, v4.16b, v6.16b
- uabal v18.8h, v5.8b, v7.8b
+ uabd v20.16b, v0.16b, v2.16b
+ uadalp v16.8h, v20.16b
+ uabal v17.8h, v1.8b, v3.8b
+ uabd v20.16b, v4.16b, v6.16b
+ uadalp v16.8h, v20.16b
+ uabal v17.8h, v5.8b, v7.8b
.endm

.macro SAD_END_24
add v16.8h, v16.8h, v17.8h
- add v16.8h, v16.8h, v18.8h
uaddlv s0, v16.8h
fmov w0, s0
ret
@@ -229,9 +229,6 @@
movi v16.16b, #0
movi v17.16b, #0
movi v18.16b, #0
- movi v19.16b, #0
- movi v20.16b, #0
- movi v21.16b, #0
.endm

.macro SAD_48
@@ -239,31 +236,26 @@
ld1 {v4.16b-v6.16b}, [x2], x3
ld1 {v24.16b-v26.16b}, [x0], x1
ld1 {v28.16b-v30.16b}, [x2], x3
- uabal v16.8h, v0.8b, v4.8b
- uabal2 v17.8h, v0.16b, v4.16b
- uabal v18.8h, v1.8b, v5.8b
- uabal2 v19.8h, v1.16b, v5.16b
- uabal v20.8h, v2.8b, v6.8b
- uabal2 v21.8h, v2.16b, v6.16b
-
- uabal v16.8h, v24.8b, v28.8b
- uabal2 v17.8h, v24.16b, v28.16b
- uabal v18.8h, v25.8b, v29.8b
- uabal2 v19.8h, v25.16b, v29.16b
- uabal v20.8h, v26.8b, v30.8b
- uabal2 v21.8h, v26.16b, v30.16b
+ uabd v20.16b, v0.16b, v4.16b
+ uadalp v16.8h, v20.16b
+ uabd v21.16b, v1.16b, v5.16b
+ uadalp v17.8h, v21.16b
+ uabd v22.16b, v2.16b, v6.16b
+ uadalp v18.8h, v22.16b
+ uabd v20.16b, v24.16b, v28.16b
+ uadalp v16.8h, v20.16b
+ uabd v21.16b, v25.16b, v29.16b
+ uadalp v17.8h, v21.16b
+ uabd v22.16b, v26.16b, v30.16b
+ uadalp v18.8h, v22.16b
.endm

.macro SAD_END_48
- add v16.8h, v16.8h, v17.8h
- add v17.8h, v18.8h, v19.8h
- add v16.8h, v16.8h, v17.8h
- uaddlv s0, v16.8h
- fmov w0, s0
- add v18.8h, v20.8h, v21.8h
- uaddlv s1, v18.8h
- fmov w1, s1
- add w0, w0, w1
+ uaddlp v16.4s, v16.8h
+ uadalp v16.4s, v17.8h
+ uadalp v16.4s, v18.8h
+ uaddlv d0, v16.4s
+ fmov x0, d0
ret
.endm

diff --git a/source/common/aarch64/sad-a-sve2.S b/source/common/aarch64/sad-a-sve2.S
index 599a3719a..325dc3f68 100644
--- a/source/common/aarch64/sad-a-sve2.S
+++ b/source/common/aarch64/sad-a-sve2.S
@@ -1,7 +1,8 @@
/*****************************************************************************
- * Copyright (C) 2022-2023 MulticoreWare, Inc
+ * Copyright (C) 2022-2024 MulticoreWare, Inc
*
* Authors: David Chen <david.chen@myais.com.cn>
+ Hari Limaye <hari.limaye@arm.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -186,7 +187,7 @@ function PFX(pixel_sad_\w\()x\h\()_sve2)
bgt .vl_gt_16_pixel_sad_\w\()x\h
SAD_START_\w uabdl
SAD_\w \h
-.if \w > 4
+.if \w > 8
add v16.8h, v16.8h, v17.8h
.endif
uaddlv s0, v16.8h
@@ -196,7 +197,7 @@ function PFX(pixel_sad_\w\()x\h\()_sve2)
.if \w == 4 || \w == 8 || \w == 12
SAD_START_\w uabdl
SAD_\w \h
-.if \w > 4
+.if \w > 8
add v16.8h, v16.8h, v17.8h
.endif
uaddlv s0, v16.8h
@@ -208,7 +209,7 @@ function PFX(pixel_sad_\w\()x\h\()_sve2)
endfunc
.endm

-// Loop unrolled 4.
+// Loop unrolled to process 4 rows per iteration.
.macro SAD_FUNC_LOOP_SVE2 w, h
function PFX(pixel_sad_\w\()x\h\()_sve2)
rdvl x9, #1
@@ -216,10 +217,10 @@ function PFX(pixel_sad_\w\()x\h\()_sve2)
bgt .vl_gt_16_pixel_sad_loop_\w\()x\h
SAD_START_\w

- mov w9, #\h/8
+ mov w9, #\h/4
.Loop_sve2_\w\()x\h:
sub w9, w9, #1
-.rept 4
+.rept 2
SAD_\w
.endr
cbnz w9, .Loop_sve2_\w\()x\h
@@ -252,13 +253,13 @@ SAD_FUNC_SVE2 8, 4
SAD_FUNC_SVE2 8, 8
SAD_FUNC_SVE2 8, 16
SAD_FUNC_SVE2 8, 32
-SAD_FUNC_SVE2 16, 4
-SAD_FUNC_SVE2 16, 8
-SAD_FUNC_SVE2 16, 12
-SAD_FUNC_SVE2 16, 16
-SAD_FUNC_SVE2 16, 32
-SAD_FUNC_SVE2 16, 64

+SAD_FUNC_LOOP_SVE2 16, 4
+SAD_FUNC_LOOP_SVE2 16, 8
+SAD_FUNC_LOOP_SVE2 16, 12
+SAD_FUNC_LOOP_SVE2 16, 16
+SAD_FUNC_LOOP_SVE2 16, 32
+SAD_FUNC_LOOP_SVE2 16, 64
SAD_FUNC_LOOP_SVE2 32, 8
SAD_FUNC_LOOP_SVE2 32, 16
SAD_FUNC_LOOP_SVE2 32, 24
diff --git a/source/common/aarch64/sad-a.S b/source/common/aarch64/sad-a.S
index 7460825f1..b4b8e4cd9 100644
--- a/source/common/aarch64/sad-a.S
+++ b/source/common/aarch64/sad-a.S
@@ -1,8 +1,9 @@
/*****************************************************************************
- * Copyright (C) 2020-2021 MulticoreWare, Inc
+ * Copyright (C) 2020-2024 MulticoreWare, Inc
*
* Authors: Hongbin Liu <liuhongbin1@huawei.com>
* Sebastian Pop <spop@amazon.com>
+ Hari Limaye <hari.limaye@arm.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -40,7 +41,7 @@
function PFX(pixel_sad_\w\()x\h\()_neon)
SAD_START_\w uabdl
SAD_\w \h
-.if \w > 4
+.if \w > 8
add v16.8h, v16.8h, v17.8h
.endif
uaddlv s0, v16.8h
@@ -49,15 +50,15 @@ function PFX(pixel_sad_\w\()x\h\()_neon)
endfunc
.endm

-// Loop unrolled 4.
+// Loop unrolled to process 4 rows per iteration.
.macro SAD_FUNC_LOOP w, h
function PFX(pixel_sad_\w\()x\h\()_neon)
SAD_START_\w

- mov w9, #\h/8
+ mov w9, #\h/4
.Loop_\w\()x\h:
sub w9, w9, #1
-.rept 4
+.rept 2
SAD_\w
.endr
cbnz w9, .Loop_\w\()x\h
@@ -73,13 +74,13 @@ SAD_FUNC 8, 4
SAD_FUNC 8, 8
SAD_FUNC 8, 16
SAD_FUNC 8, 32
-SAD_FUNC 16, 4
-SAD_FUNC 16, 8
-SAD_FUNC 16, 12
-SAD_FUNC 16, 16
-SAD_FUNC 16, 32
-SAD_FUNC 16, 64

+SAD_FUNC_LOOP 16, 4
+SAD_FUNC_LOOP 16, 8
+SAD_FUNC_LOOP 16, 12
+SAD_FUNC_LOOP 16, 16
+SAD_FUNC_LOOP 16, 32
+SAD_FUNC_LOOP 16, 64
SAD_FUNC_LOOP 32, 8
SAD_FUNC_LOOP 32, 16
SAD_FUNC_LOOP 32, 24
--
2.42.1

_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel