[x265] [PATCH 2/8] AArch64: Optimise Neon assembly implementations of SADxN
Hari Limaye
hari.limaye at arm.com
Thu May 23 17:17:58 UTC 2024
Optimise the Neon assembly implementations of the SADx3 and SADx4
primitives, replacing UABAL, UABAL2 sequences with UABD, UADALP, which
have twice the throughput on modern Arm cores.
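
For illustration, the core of the change has the following shape (a minimal
sketch; the register numbers follow the patterns used in the diff below):

    // Before: widening absolute-difference accumulates, needing separate
    // accumulators for the low and high halves (v16, v20).
    uabal   v16.8h, v1.8b,  v0.8b
    uabal2  v20.8h, v1.16b, v0.16b

    // After: a full-width absolute difference feeding a pairwise
    // add-and-accumulate into a single halfword accumulator (v16).
    uabd    v24.16b, v1.16b, v0.16b
    uadalp  v16.8h,  v24.16b

Both forms add up the same absolute differences; the benefit is the higher
per-instruction throughput of UABD and UADALP noted above.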
Also refactor the load instructions for block sizes of width 4 to use
LDR for the first partial load of each vector register, making the
operation completely destructive: the whole register is written, so
there is no false dependency on its previous contents.
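
A minimal sketch of that width-4 change (pointer and register names as in
SAD_X_START_4 below); note that LDR has no register post-index addressing
form, so the stride is applied with a separate ADD:

    // Before: a lane insert, i.e. a partial write that merges with (and
    // therefore depends on) the previous contents of v0.
    ld1     {v0.s}[0], [x0], x9

    // After: a destructive load that writes the whole register and so
    // carries no dependency on the old value of v0.
    ldr     s0, [x0]
    add     x0, x0, x9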
As this patch moves some block sizes (12xh, 16xh) from the fully
unrolled macro to the LOOP macro, the SVE2 implementations that reuse
these Neon macros are updated accordingly.
---
source/common/aarch64/sad-a-common.S | 229 ++++++++++++---------------
source/common/aarch64/sad-a-sve2.S | 35 ++--
source/common/aarch64/sad-a.S | 46 +++---
3 files changed, 147 insertions(+), 163 deletions(-)
diff --git a/source/common/aarch64/sad-a-common.S b/source/common/aarch64/sad-a-common.S
index d8644f694..a94607369 100644
--- a/source/common/aarch64/sad-a-common.S
+++ b/source/common/aarch64/sad-a-common.S
@@ -257,19 +257,24 @@
.endm
.macro SAD_X_START_4 h, x, f
- ld1 {v0.s}[0], [x0], x9
+ ldr s0, [x0]
+ ldr s1, [x1]
+ ldr s2, [x2]
+ ldr s3, [x3]
+ add x0, x0, x9
+ add x1, x1, x5
+ add x2, x2, x5
+ add x3, x3, x5
ld1 {v0.s}[1], [x0], x9
- ld1 {v1.s}[0], [x1], x5
ld1 {v1.s}[1], [x1], x5
- ld1 {v2.s}[0], [x2], x5
ld1 {v2.s}[1], [x2], x5
- ld1 {v3.s}[0], [x3], x5
ld1 {v3.s}[1], [x3], x5
\f v16.8h, v0.8b, v1.8b
\f v17.8h, v0.8b, v2.8b
\f v18.8h, v0.8b, v3.8b
.if \x == 4
- ld1 {v4.s}[0], [x4], x5
+ ldr s4, [x4]
+ add x4, x4, x5
ld1 {v4.s}[1], [x4], x5
\f v19.8h, v0.8b, v4.8b
.endif
@@ -319,76 +324,83 @@
SAD_X_END_4 \x
.endm
-.macro SAD_X_START_12 h, x, f
- ld1 {v0.16b}, [x0], x9
- and v0.16b, v0.16b, v31.16b
- ld1 {v1.16b}, [x1], x5
- and v1.16b, v1.16b, v31.16b
- ld1 {v2.16b}, [x2], x5
- and v2.16b, v2.16b, v31.16b
- ld1 {v3.16b}, [x3], x5
- and v3.16b, v3.16b, v31.16b
- \f v16.8h, v1.8b, v0.8b
- \f\()2 v20.8h, v1.16b, v0.16b
- \f v17.8h, v2.8b, v0.8b
- \f\()2 v21.8h, v2.16b, v0.16b
- \f v18.8h, v3.8b, v0.8b
- \f\()2 v22.8h, v3.16b, v0.16b
-.if \x == 4
- ld1 {v4.16b}, [x4], x5
- and v4.16b, v4.16b, v31.16b
- \f v19.8h, v4.8b, v0.8b
- \f\()2 v23.8h, v4.16b, v0.16b
-.endif
+.macro SAD_X_START_12 x
+ SAD_X_START_16 \x
.endm
-.macro SAD_X_12 h x
-.rept \h - 1
- SAD_X_START_12 \h, \x, uabal
-.endr
+.macro SAD_X_12 base v1 v2
+ // v2: unused
+ // v31: bitmask for 12xh blocks
+ ld1 {v0.16b}, [ \base ], x5
+ and v0.16b, v0.16b, v31.16b
+
+ uabd v24.16b, v0.16b, v6.16b
+ uadalp \v1\().8h, v24.16b
.endm
.macro SAD_X_END_12 x
- SAD_X_END_16 \x
-.endm
-
-.macro SAD_X_START_16 h, x, f
- ld1 {v0.16b}, [x0], x9
- ld1 {v1.16b}, [x1], x5
- ld1 {v2.16b}, [x2], x5
- ld1 {v3.16b}, [x3], x5
- \f v16.8h, v1.8b, v0.8b
- \f\()2 v20.8h, v1.16b, v0.16b
- \f v17.8h, v2.8b, v0.8b
- \f\()2 v21.8h, v2.16b, v0.16b
- \f v18.8h, v3.8b, v0.8b
- \f\()2 v22.8h, v3.16b, v0.16b
+ SAD_X_END_4 \x
+.endm
+
+.macro SAD_X_START_16 x
+ movi v16.16b, #0
+ movi v17.16b, #0
+ movi v18.16b, #0
.if \x == 4
- ld1 {v4.16b}, [x4], x5
- \f v19.8h, v4.8b, v0.8b
- \f\()2 v23.8h, v4.16b, v0.16b
+ movi v19.16b, #0
.endif
.endm
-.macro SAD_X_16 h x
-.rept \h - 1
- SAD_X_START_16 \h, \x, uabal
-.endr
+.macro SAD_X_16 base v1 v2
+ // v2: unused
+ ld1 {v0.16b}, [ \base ], x5
+ uabd v24.16b, v0.16b, v6.16b
+ uadalp \v1\().8h, v24.16b
.endm
.macro SAD_X_END_16 x
- add v16.8h, v16.8h, v20.8h
- add v17.8h, v17.8h, v21.8h
- add v18.8h, v18.8h, v22.8h
+ SAD_X_END_4 \x
+.endm
+
+.macro SAD_X_START_LARGE x
+ movi v16.16b, #0
+ movi v17.16b, #0
+ movi v18.16b, #0
+ movi v20.16b, #0
+ movi v21.16b, #0
+ movi v22.16b, #0
.if \x == 4
- add v19.8h, v19.8h, v23.8h
+ movi v19.16b, #0
+ movi v23.16b, #0
.endif
+.endm
- SAD_X_END_4 \x
+.macro SAD_X_END_LARGE x
+ uaddlp v16.4s, v16.8h
+ uadalp v16.4s, v20.8h
+ uaddlp v17.4s, v17.8h
+ uadalp v17.4s, v21.8h
+ uaddlp v18.4s, v18.8h
+ uadalp v18.4s, v22.8h
+.if \x == 3
+ addv s0, v16.4s
+ addv s1, v17.4s
+ addv s2, v18.4s
+ stp s0, s1, [x6], #8
+ str s2, [x6]
+.elseif \x == 4
+ uaddlp v19.4s, v19.8h
+ uadalp v19.4s, v23.8h
+ addp v16.4s, v16.4s, v17.4s
+ addp v18.4s, v18.4s, v19.4s
+ addp v16.4s, v16.4s, v18.4s
+ str q16, [x6]
+.endif
+ ret
.endm
.macro SAD_X_START_24 x
- SAD_X_START_32 \x
+ SAD_X_START_LARGE \x
sub x5, x5, #16
sub x9, x9, #16
.endm
@@ -396,106 +408,67 @@
.macro SAD_X_24 base v1 v2
ld1 {v0.16b}, [ \base ], #16
ld1 {v1.8b}, [ \base ], x5
- uabal \v1\().8h, v0.8b, v6.8b
- uabal \v1\().8h, v1.8b, v7.8b
- uabal2 \v2\().8h, v0.16b, v6.16b
+ uabd v24.16b, v0.16b, v6.16b
+ uadalp \v1\().8h, v24.16b
+ uabal \v2\().8h, v1.8b, v7.8b
.endm
.macro SAD_X_END_24 x
- SAD_X_END_16 \x
+ SAD_X_END_LARGE \x
.endm
.macro SAD_X_START_32 x
- movi v16.16b, #0
- movi v17.16b, #0
- movi v18.16b, #0
- movi v20.16b, #0
- movi v21.16b, #0
- movi v22.16b, #0
-.if \x == 4
- movi v19.16b, #0
- movi v23.16b, #0
-.endif
+ SAD_X_START_LARGE \x
.endm
.macro SAD_X_32 base v1 v2
ld1 {v0.16b-v1.16b}, [ \base ], x5
- uabal \v1\().8h, v0.8b, v6.8b
- uabal \v1\().8h, v1.8b, v7.8b
- uabal2 \v2\().8h, v0.16b, v6.16b
- uabal2 \v2\().8h, v1.16b, v7.16b
+ uabd v24.16b, v0.16b, v6.16b
+ uadalp \v1\().8h, v24.16b
+ uabd v25.16b, v1.16b, v7.16b
+ uadalp \v2\().8h, v25.16b
.endm
.macro SAD_X_END_32 x
- SAD_X_END_16 \x
+ SAD_X_END_LARGE \x
.endm
.macro SAD_X_START_48 x
- SAD_X_START_32 \x
+ SAD_X_START_LARGE \x
.endm
-.macro SAD_X_48 x1 v1 v2
- ld1 {v0.16b-v2.16b}, [ \x1 ], x5
- uabal \v1\().8h, v0.8b, v4.8b
- uabal \v1\().8h, v1.8b, v5.8b
- uabal \v1\().8h, v2.8b, v6.8b
- uabal2 \v2\().8h, v0.16b, v4.16b
- uabal2 \v2\().8h, v1.16b, v5.16b
- uabal2 \v2\().8h, v2.16b, v6.16b
+.macro SAD_X_48 base v1 v2
+ ld1 {v0.16b-v2.16b}, [ \base ], x5
+ uabd v24.16b, v0.16b, v4.16b
+ uadalp \v1\().8h, v24.16b
+ uabd v25.16b, v1.16b, v5.16b
+ uadalp \v2\().8h, v25.16b
+ uabd v26.16b, v2.16b, v6.16b
+ uadalp \v1\().8h, v26.16b
.endm
.macro SAD_X_END_48 x
- SAD_X_END_64 \x
+ SAD_X_END_LARGE \x
.endm
.macro SAD_X_START_64 x
- SAD_X_START_32 \x
+ SAD_X_START_LARGE \x
.endm
-.macro SAD_X_64 x1 v1 v2
- ld1 {v0.16b-v3.16b}, [ \x1 ], x5
- uabal \v1\().8h, v0.8b, v4.8b
- uabal \v1\().8h, v1.8b, v5.8b
- uabal \v1\().8h, v2.8b, v6.8b
- uabal \v1\().8h, v3.8b, v7.8b
- uabal2 \v2\().8h, v0.16b, v4.16b
- uabal2 \v2\().8h, v1.16b, v5.16b
- uabal2 \v2\().8h, v2.16b, v6.16b
- uabal2 \v2\().8h, v3.16b, v7.16b
+.macro SAD_X_64 base v1 v2
+ ld1 {v0.16b-v3.16b}, [ \base ], x5
+ uabd v24.16b, v0.16b, v4.16b
+ uadalp \v1\().8h, v24.16b
+ uabd v25.16b, v1.16b, v5.16b
+ uadalp \v2\().8h, v25.16b
+ uabd v26.16b, v2.16b, v6.16b
+ uadalp \v1\().8h, v26.16b
+ uabd v27.16b, v3.16b, v7.16b
+ uadalp \v2\().8h, v27.16b
.endm
.macro SAD_X_END_64 x
- uaddlp v16.4s, v16.8h
- uaddlp v17.4s, v17.8h
- uaddlp v18.4s, v18.8h
- uaddlp v20.4s, v20.8h
- uaddlp v21.4s, v21.8h
- uaddlp v22.4s, v22.8h
- add v16.4s, v16.4s, v20.4s
- add v17.4s, v17.4s, v21.4s
- add v18.4s, v18.4s, v22.4s
- trn2 v20.2d, v16.2d, v16.2d
- trn2 v21.2d, v17.2d, v17.2d
- trn2 v22.2d, v18.2d, v18.2d
- add v16.2s, v16.2s, v20.2s
- add v17.2s, v17.2s, v21.2s
- add v18.2s, v18.2s, v22.2s
- uaddlp v16.1d, v16.2s
- uaddlp v17.1d, v17.2s
- uaddlp v18.1d, v18.2s
- stp s16, s17, [x6], #8
-.if \x == 3
- str s18, [x6]
-.elseif \x == 4
- uaddlp v19.4s, v19.8h
- uaddlp v23.4s, v23.8h
- add v19.4s, v19.4s, v23.4s
- trn2 v23.2d, v19.2d, v19.2d
- add v19.2s, v19.2s, v23.2s
- uaddlp v19.1d, v19.2s
- stp s18, s19, [x6]
-.endif
- ret
+ SAD_X_END_LARGE \x
.endm
const sad12_mask, align=8
diff --git a/source/common/aarch64/sad-a-sve2.S b/source/common/aarch64/sad-a-sve2.S
index f98ecb137..72432055d 100644
--- a/source/common/aarch64/sad-a-sve2.S
+++ b/source/common/aarch64/sad-a-sve2.S
@@ -406,7 +406,12 @@ function PFX(sad_x\x\()_\w\()x\h\()_sve2)
.Loop_sad_sve2_x\x\()_\w\()x\h:
sub w12, w12, #1
.rept 4
- .if \w == 24
+ .if \w == 12
+ ld1 {v6.16b}, [x0], x9
+ and v6.16b, v6.16b, v31.16b
+ .elseif \w == 16
+ ld1 {v6.16b}, [x0], x9
+ .elseif \w == 24
ld1 {v6.16b}, [x0], #16
ld1 {v7.8b}, [x0], x9
.elseif \w == 32
@@ -466,13 +471,13 @@ SAD_X_FUNC_SVE2 3, 8, 4
SAD_X_FUNC_SVE2 3, 8, 8
SAD_X_FUNC_SVE2 3, 8, 16
SAD_X_FUNC_SVE2 3, 8, 32
-SAD_X_FUNC_SVE2 3, 12, 16
-SAD_X_FUNC_SVE2 3, 16, 4
-SAD_X_FUNC_SVE2 3, 16, 8
-SAD_X_FUNC_SVE2 3, 16, 12
-SAD_X_FUNC_SVE2 3, 16, 16
-SAD_X_FUNC_SVE2 3, 16, 32
-SAD_X_FUNC_SVE2 3, 16, 64
+SAD_X_LOOP_SVE2 3, 12, 16
+SAD_X_LOOP_SVE2 3, 16, 4
+SAD_X_LOOP_SVE2 3, 16, 8
+SAD_X_LOOP_SVE2 3, 16, 12
+SAD_X_LOOP_SVE2 3, 16, 16
+SAD_X_LOOP_SVE2 3, 16, 32
+SAD_X_LOOP_SVE2 3, 16, 64
SAD_X_LOOP_SVE2 3, 24, 32
SAD_X_LOOP_SVE2 3, 32, 8
SAD_X_LOOP_SVE2 3, 32, 16
@@ -492,13 +497,13 @@ SAD_X_FUNC_SVE2 4, 8, 4
SAD_X_FUNC_SVE2 4, 8, 8
SAD_X_FUNC_SVE2 4, 8, 16
SAD_X_FUNC_SVE2 4, 8, 32
-SAD_X_FUNC_SVE2 4, 12, 16
-SAD_X_FUNC_SVE2 4, 16, 4
-SAD_X_FUNC_SVE2 4, 16, 8
-SAD_X_FUNC_SVE2 4, 16, 12
-SAD_X_FUNC_SVE2 4, 16, 16
-SAD_X_FUNC_SVE2 4, 16, 32
-SAD_X_FUNC_SVE2 4, 16, 64
+SAD_X_LOOP_SVE2 4, 12, 16
+SAD_X_LOOP_SVE2 4, 16, 4
+SAD_X_LOOP_SVE2 4, 16, 8
+SAD_X_LOOP_SVE2 4, 16, 12
+SAD_X_LOOP_SVE2 4, 16, 16
+SAD_X_LOOP_SVE2 4, 16, 32
+SAD_X_LOOP_SVE2 4, 16, 64
SAD_X_LOOP_SVE2 4, 24, 32
SAD_X_LOOP_SVE2 4, 32, 8
SAD_X_LOOP_SVE2 4, 32, 16
diff --git a/source/common/aarch64/sad-a.S b/source/common/aarch64/sad-a.S
index 2fd2c2cc8..4fef9e24c 100644
--- a/source/common/aarch64/sad-a.S
+++ b/source/common/aarch64/sad-a.S
@@ -108,11 +108,6 @@ function PFX(sad_x\x\()_\w\()x\h\()_neon)
mov x5, x4
.endif
-.if \w == 12
- movrel x12, sad12_mask
- ld1 {v31.16b}, [x12]
-.endif
-
SAD_X_START_\w \h, \x, uabdl
SAD_X_\w \h, \x
SAD_X_END_\w \x
@@ -128,12 +123,23 @@ function PFX(sad_x\x\()_\w\()x\h\()_neon)
mov x6, x5
mov x5, x4
.endif
+
+.if \w == 12
+ movrel x12, sad12_mask
+ ld1 {v31.16b}, [x12]
+.endif
+
SAD_X_START_\w \x
mov w12, #\h/4
.Loop_sad_x\x\()_\w\()x\h:
sub w12, w12, #1
.rept 4
- .if \w == 24
+ .if \w == 12
+ ld1 {v6.16b}, [x0], x9
+ and v6.16b, v6.16b, v31.16b
+ .elseif \w == 16
+ ld1 {v6.16b}, [x0], x9
+ .elseif \w == 24
ld1 {v6.16b}, [x0], #16
ld1 {v7.8b}, [x0], x9
.elseif \w == 32
@@ -163,13 +169,13 @@ SAD_X_FUNC 3, 8, 4
SAD_X_FUNC 3, 8, 8
SAD_X_FUNC 3, 8, 16
SAD_X_FUNC 3, 8, 32
-SAD_X_FUNC 3, 12, 16
-SAD_X_FUNC 3, 16, 4
-SAD_X_FUNC 3, 16, 8
-SAD_X_FUNC 3, 16, 12
-SAD_X_FUNC 3, 16, 16
-SAD_X_FUNC 3, 16, 32
-SAD_X_FUNC 3, 16, 64
+SAD_X_LOOP 3, 12, 16
+SAD_X_LOOP 3, 16, 4
+SAD_X_LOOP 3, 16, 8
+SAD_X_LOOP 3, 16, 12
+SAD_X_LOOP 3, 16, 16
+SAD_X_LOOP 3, 16, 32
+SAD_X_LOOP 3, 16, 64
SAD_X_LOOP 3, 24, 32
SAD_X_LOOP 3, 32, 8
SAD_X_LOOP 3, 32, 16
@@ -189,13 +195,13 @@ SAD_X_FUNC 4, 8, 4
SAD_X_FUNC 4, 8, 8
SAD_X_FUNC 4, 8, 16
SAD_X_FUNC 4, 8, 32
-SAD_X_FUNC 4, 12, 16
-SAD_X_FUNC 4, 16, 4
-SAD_X_FUNC 4, 16, 8
-SAD_X_FUNC 4, 16, 12
-SAD_X_FUNC 4, 16, 16
-SAD_X_FUNC 4, 16, 32
-SAD_X_FUNC 4, 16, 64
+SAD_X_LOOP 4, 12, 16
+SAD_X_LOOP 4, 16, 4
+SAD_X_LOOP 4, 16, 8
+SAD_X_LOOP 4, 16, 12
+SAD_X_LOOP 4, 16, 16
+SAD_X_LOOP 4, 16, 32
+SAD_X_LOOP 4, 16, 64
SAD_X_LOOP 4, 24, 32
SAD_X_LOOP 4, 32, 8
SAD_X_LOOP 4, 32, 16
--
2.42.1