[x265] [PATCH v2 1/8] AArch64: Optimise Neon assembly implementations of SAD
Karam Singh
karam.singh at multicorewareinc.com
Fri Aug 16 08:09:48 UTC 2024
All the patches in this series have been pushed to the master branch.
__________________________
Karam Singh
Ph.D. IIT Guwahati
Senior Software (Video Coding) Engineer
Mobile: +91 8011279030
Block 9A, 6th floor, DLF Cyber City
Manapakkam, Chennai 600 089
On Tue, Jul 30, 2024 at 9:14 PM Hari Limaye <hari.limaye at arm.com> wrote:
> Optimise the Neon assembly implementations of SAD primitives, replacing
> UABAL, UABAL2 sequences with UABD, UADALP, which have twice the
> throughput on modern Arm cores.
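>
> For illustration only (not a hunk from this patch, register names as in
> the hunks below), the essence of the change for a single 16-byte row is:
>
>     // Before: two widening absolute-difference accumulations per row
>     uabal   v16.8h, v0.8b, v1.8b      // accumulate |v0 - v1| for the low 8 bytes
>     uabal2  v17.8h, v0.16b, v1.16b    // accumulate |v0 - v1| for the high 8 bytes
>
>     // After: one UABD feeding one pairwise accumulate
>     uabd    v20.16b, v0.16b, v1.16b   // 16 byte-wise absolute differences
>     uadalp  v16.8h, v20.16b           // add adjacent byte pairs into 8 halfword accumulators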
>
> Also refactor the load instructions for block sizes of width 4 to use
> LDR for the first partial load of a vector register, making the
> operation completely destructive (it no longer merges with, and so no
> longer depends on, the register's previous contents).
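>
> As a sketch of that load change (pointer and register names as in the
> SAD_START_4 hunk below), the lane insert
>
>     ld1     {v0.s}[0], [x0], x1       // merges into lane 0, so it must wait
>                                       // for the previous value of v0
>
> becomes
>
>     ldr     s0, [x0]                  // writes the whole register, no merge
>     add     x0, x0, x1                // advance the source pointer manually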
>
> As this patch refactors some of the block sizes (16xh) to use the
> LOOP macro (rather than the fully unrolled macro), the SVE2
> implementations which make use of these Neon macros are updated as
> required.
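>
> For reference, a sketch of the loop skeleton the 16xh functions now share
> with the wider block sizes (paraphrasing the SAD_FUNC_LOOP hunk in
> sad-a.S; each SAD_16 handles 2 rows, so each iteration covers 4 rows):
>
>     SAD_START_16              // zero the v16/v17 accumulators
>     mov     w9, #\h/4
> .Loop_16x\h:
>     sub     w9, w9, #1
>     .rept 2
>     SAD_16                    // load and accumulate 2 rows
>     .endr
>     cbnz    w9, .Loop_16x\h
>     SAD_END_16                // reduce v16 + v17 and return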
> ---
> source/common/aarch64/sad-a-common.S | 172 +++++++++++++--------------
> source/common/aarch64/sad-a-sve2.S | 25 ++--
> source/common/aarch64/sad-a.S | 23 ++--
> 3 files changed, 107 insertions(+), 113 deletions(-)
>
> diff --git a/source/common/aarch64/sad-a-common.S b/source/common/aarch64/sad-a-common.S
> index 572484a06..f7ce264a1 100644
> --- a/source/common/aarch64/sad-a-common.S
> +++ b/source/common/aarch64/sad-a-common.S
> @@ -1,7 +1,8 @@
>
> /*****************************************************************************
> - * Copyright (C) 2022-2023 MulticoreWare, Inc
> + * Copyright (C) 2022-2024 MulticoreWare, Inc
> *
> * Authors: David Chen <david.chen at myais.com.cn>
> + Hari Limaye <hari.limaye at arm.com>
> *
> * This program is free software; you can redistribute it and/or modify
> * it under the terms of the GNU General Public License as published by
> @@ -37,9 +38,11 @@
> .align 4
>
> .macro SAD_START_4 f
> - ld1 {v0.s}[0], [x0], x1
> + ldr s0, [x0]
> + ldr s1, [x2]
> + add x0, x0, x1
> + add x2, x2, x3
> ld1 {v0.s}[1], [x0], x1
> - ld1 {v1.s}[0], [x2], x3
> ld1 {v1.s}[1], [x2], x3
> \f v16.8h, v0.8b, v1.8b
> .endm
> @@ -53,33 +56,42 @@
> .macro SAD_START_8 f
> ld1 {v0.8b}, [x0], x1
> ld1 {v1.8b}, [x2], x3
> - ld1 {v2.8b}, [x0], x1
> - ld1 {v3.8b}, [x2], x3
> \f v16.8h, v0.8b, v1.8b
> - \f v17.8h, v2.8b, v3.8b
> .endm
>
> .macro SAD_8 h
> -.rept \h / 2 - 1
> +.rept \h - 3
> SAD_START_8 uabal
> .endr
> + ldr d0, [x0]
> + ldr d1, [x2]
> + uabal v16.8h, v0.8b, v1.8b
> + ldr d0, [x0, x1]
> + ldr d1, [x2, x3]
> + uabal v16.8h, v0.8b, v1.8b
> +.endm
> +
> +.macro SAD_START_16
> + movi v16.16b, #0
> + movi v17.16b, #0
> .endm
>
> -.macro SAD_START_16 f
> +.macro SAD_16
> ld1 {v0.16b}, [x0], x1
> ld1 {v1.16b}, [x2], x3
> ld1 {v2.16b}, [x0], x1
> ld1 {v3.16b}, [x2], x3
> - \f v16.8h, v0.8b, v1.8b
> - \f\()2 v17.8h, v0.16b, v1.16b
> - uabal v16.8h, v2.8b, v3.8b
> - uabal2 v17.8h, v2.16b, v3.16b
> + uabd v20.16b, v0.16b, v1.16b
> + uadalp v16.8h, v20.16b
> + uabd v21.16b, v2.16b, v3.16b
> + uadalp v17.8h, v21.16b
> .endm
>
> -.macro SAD_16 h
> -.rept \h / 2 - 1
> - SAD_START_16 uabal
> -.endr
> +.macro SAD_END_16
> + add v16.8h, v16.8h, v17.8h
> + uaddlv s0, v16.8h
> + fmov x0, d0
> + ret
> .endm
>
> .macro SAD_START_32
> @@ -94,14 +106,14 @@
> ld1 {v2.16b-v3.16b}, [x2], x3
> ld1 {v4.16b-v5.16b}, [x0], x1
> ld1 {v6.16b-v7.16b}, [x2], x3
> - uabal v16.8h, v0.8b, v2.8b
> - uabal2 v17.8h, v0.16b, v2.16b
> - uabal v18.8h, v1.8b, v3.8b
> - uabal2 v19.8h, v1.16b, v3.16b
> - uabal v16.8h, v4.8b, v6.8b
> - uabal2 v17.8h, v4.16b, v6.16b
> - uabal v18.8h, v5.8b, v7.8b
> - uabal2 v19.8h, v5.16b, v7.16b
> + uabd v20.16b, v0.16b, v2.16b
> + uadalp v16.8h, v20.16b
> + uabd v21.16b, v1.16b, v3.16b
> + uadalp v17.8h, v21.16b
> + uabd v22.16b, v4.16b, v6.16b
> + uadalp v18.8h, v22.16b
> + uabd v23.16b, v5.16b, v7.16b
> + uadalp v19.8h, v23.16b
> .endm
>
> .macro SAD_END_32
> @@ -118,10 +130,6 @@
> movi v17.16b, #0
> movi v18.16b, #0
> movi v19.16b, #0
> - movi v20.16b, #0
> - movi v21.16b, #0
> - movi v22.16b, #0
> - movi v23.16b, #0
> .endm
>
> .macro SAD_64
> @@ -129,35 +137,29 @@
> ld1 {v4.16b-v7.16b}, [x2], x3
> ld1 {v24.16b-v27.16b}, [x0], x1
> ld1 {v28.16b-v31.16b}, [x2], x3
> - uabal v16.8h, v0.8b, v4.8b
> - uabal2 v17.8h, v0.16b, v4.16b
> - uabal v18.8h, v1.8b, v5.8b
> - uabal2 v19.8h, v1.16b, v5.16b
> - uabal v20.8h, v2.8b, v6.8b
> - uabal2 v21.8h, v2.16b, v6.16b
> - uabal v22.8h, v3.8b, v7.8b
> - uabal2 v23.8h, v3.16b, v7.16b
> -
> - uabal v16.8h, v24.8b, v28.8b
> - uabal2 v17.8h, v24.16b, v28.16b
> - uabal v18.8h, v25.8b, v29.8b
> - uabal2 v19.8h, v25.16b, v29.16b
> - uabal v20.8h, v26.8b, v30.8b
> - uabal2 v21.8h, v26.16b, v30.16b
> - uabal v22.8h, v27.8b, v31.8b
> - uabal2 v23.8h, v27.16b, v31.16b
> + uabd v20.16b, v0.16b, v4.16b
> + uadalp v16.8h, v20.16b
> + uabd v21.16b, v1.16b, v5.16b
> + uadalp v17.8h, v21.16b
> + uabd v22.16b, v2.16b, v6.16b
> + uadalp v18.8h, v22.16b
> + uabd v23.16b, v3.16b, v7.16b
> + uadalp v19.8h, v23.16b
> + uabd v20.16b, v24.16b, v28.16b
> + uadalp v16.8h, v20.16b
> + uabd v21.16b, v25.16b, v29.16b
> + uadalp v17.8h, v21.16b
> + uabd v22.16b, v26.16b, v30.16b
> + uadalp v18.8h, v22.16b
> + uabd v23.16b, v27.16b, v31.16b
> + uadalp v19.8h, v23.16b
> .endm
>
> .macro SAD_END_64
> - add v16.8h, v16.8h, v17.8h
> - add v17.8h, v18.8h, v19.8h
> - add v16.8h, v16.8h, v17.8h
> uaddlp v16.4s, v16.8h
> - add v18.8h, v20.8h, v21.8h
> - add v19.8h, v22.8h, v23.8h
> - add v17.8h, v18.8h, v19.8h
> - uaddlp v17.4s, v17.8h
> - add v16.4s, v16.4s, v17.4s
> + uadalp v16.4s, v17.8h
> + uadalp v16.4s, v18.8h
> + uadalp v16.4s, v19.8h
> uaddlv d0, v16.4s
> fmov x0, d0
> ret
> @@ -179,10 +181,10 @@
> and v2.16b, v2.16b, v31.16b
> ld1 {v3.16b}, [x2], x3
> and v3.16b, v3.16b, v31.16b
> - uabal v16.8h, v0.8b, v1.8b
> - uabal2 v17.8h, v0.16b, v1.16b
> - uabal v16.8h, v2.8b, v3.8b
> - uabal2 v17.8h, v2.16b, v3.16b
> + uabd v20.16b, v0.16b, v1.16b
> + uadalp v16.8h, v20.16b
> + uabd v21.16b, v2.16b, v3.16b
> + uadalp v17.8h, v21.16b
> .endm
>
> .macro SAD_END_12
> @@ -195,7 +197,6 @@
> .macro SAD_START_24
> movi v16.16b, #0
> movi v17.16b, #0
> - movi v18.16b, #0
> sub x1, x1, #16
> sub x3, x3, #16
> .endm
> @@ -209,17 +210,16 @@
> ld1 {v5.8b}, [x0], x1
> ld1 {v6.16b}, [x2], #16
> ld1 {v7.8b}, [x2], x3
> - uabal v16.8h, v0.8b, v2.8b
> - uabal2 v17.8h, v0.16b, v2.16b
> - uabal v18.8h, v1.8b, v3.8b
> - uabal v16.8h, v4.8b, v6.8b
> - uabal2 v17.8h, v4.16b, v6.16b
> - uabal v18.8h, v5.8b, v7.8b
> + uabd v20.16b, v0.16b, v2.16b
> + uadalp v16.8h, v20.16b
> + uabal v17.8h, v1.8b, v3.8b
> + uabd v20.16b, v4.16b, v6.16b
> + uadalp v16.8h, v20.16b
> + uabal v17.8h, v5.8b, v7.8b
> .endm
>
> .macro SAD_END_24
> add v16.8h, v16.8h, v17.8h
> - add v16.8h, v16.8h, v18.8h
> uaddlv s0, v16.8h
> fmov w0, s0
> ret
> @@ -229,9 +229,6 @@
> movi v16.16b, #0
> movi v17.16b, #0
> movi v18.16b, #0
> - movi v19.16b, #0
> - movi v20.16b, #0
> - movi v21.16b, #0
> .endm
>
> .macro SAD_48
> @@ -239,31 +236,26 @@
> ld1 {v4.16b-v6.16b}, [x2], x3
> ld1 {v24.16b-v26.16b}, [x0], x1
> ld1 {v28.16b-v30.16b}, [x2], x3
> - uabal v16.8h, v0.8b, v4.8b
> - uabal2 v17.8h, v0.16b, v4.16b
> - uabal v18.8h, v1.8b, v5.8b
> - uabal2 v19.8h, v1.16b, v5.16b
> - uabal v20.8h, v2.8b, v6.8b
> - uabal2 v21.8h, v2.16b, v6.16b
> -
> - uabal v16.8h, v24.8b, v28.8b
> - uabal2 v17.8h, v24.16b, v28.16b
> - uabal v18.8h, v25.8b, v29.8b
> - uabal2 v19.8h, v25.16b, v29.16b
> - uabal v20.8h, v26.8b, v30.8b
> - uabal2 v21.8h, v26.16b, v30.16b
> + uabd v20.16b, v0.16b, v4.16b
> + uadalp v16.8h, v20.16b
> + uabd v21.16b, v1.16b, v5.16b
> + uadalp v17.8h, v21.16b
> + uabd v22.16b, v2.16b, v6.16b
> + uadalp v18.8h, v22.16b
> + uabd v20.16b, v24.16b, v28.16b
> + uadalp v16.8h, v20.16b
> + uabd v21.16b, v25.16b, v29.16b
> + uadalp v17.8h, v21.16b
> + uabd v22.16b, v26.16b, v30.16b
> + uadalp v18.8h, v22.16b
> .endm
>
> .macro SAD_END_48
> - add v16.8h, v16.8h, v17.8h
> - add v17.8h, v18.8h, v19.8h
> - add v16.8h, v16.8h, v17.8h
> - uaddlv s0, v16.8h
> - fmov w0, s0
> - add v18.8h, v20.8h, v21.8h
> - uaddlv s1, v18.8h
> - fmov w1, s1
> - add w0, w0, w1
> + uaddlp v16.4s, v16.8h
> + uadalp v16.4s, v17.8h
> + uadalp v16.4s, v18.8h
> + uaddlv d0, v16.4s
> + fmov x0, d0
> ret
> .endm
>
> diff --git a/source/common/aarch64/sad-a-sve2.S b/source/common/aarch64/sad-a-sve2.S
> index 599a3719a..325dc3f68 100644
> --- a/source/common/aarch64/sad-a-sve2.S
> +++ b/source/common/aarch64/sad-a-sve2.S
> @@ -1,7 +1,8 @@
>
> /*****************************************************************************
> - * Copyright (C) 2022-2023 MulticoreWare, Inc
> + * Copyright (C) 2022-2024 MulticoreWare, Inc
> *
> * Authors: David Chen <david.chen at myais.com.cn>
> + Hari Limaye <hari.limaye at arm.com>
> *
> * This program is free software; you can redistribute it and/or modify
> * it under the terms of the GNU General Public License as published by
> @@ -186,7 +187,7 @@ function PFX(pixel_sad_\w\()x\h\()_sve2)
> bgt .vl_gt_16_pixel_sad_\w\()x\h
> SAD_START_\w uabdl
> SAD_\w \h
> -.if \w > 4
> +.if \w > 8
> add v16.8h, v16.8h, v17.8h
> .endif
> uaddlv s0, v16.8h
> @@ -196,7 +197,7 @@ function PFX(pixel_sad_\w\()x\h\()_sve2)
> .if \w == 4 || \w == 8 || \w == 12
> SAD_START_\w uabdl
> SAD_\w \h
> -.if \w > 4
> +.if \w > 8
> add v16.8h, v16.8h, v17.8h
> .endif
> uaddlv s0, v16.8h
> @@ -208,7 +209,7 @@ function PFX(pixel_sad_\w\()x\h\()_sve2)
> endfunc
> .endm
>
> -// Loop unrolled 4.
> +// Loop unrolled to process 4 rows per iteration.
> .macro SAD_FUNC_LOOP_SVE2 w, h
> function PFX(pixel_sad_\w\()x\h\()_sve2)
> rdvl x9, #1
> @@ -216,10 +217,10 @@ function PFX(pixel_sad_\w\()x\h\()_sve2)
> bgt .vl_gt_16_pixel_sad_loop_\w\()x\h
> SAD_START_\w
>
> - mov w9, #\h/8
> + mov w9, #\h/4
> .Loop_sve2_\w\()x\h:
> sub w9, w9, #1
> -.rept 4
> +.rept 2
> SAD_\w
> .endr
> cbnz w9, .Loop_sve2_\w\()x\h
> @@ -252,13 +253,13 @@ SAD_FUNC_SVE2 8, 4
> SAD_FUNC_SVE2 8, 8
> SAD_FUNC_SVE2 8, 16
> SAD_FUNC_SVE2 8, 32
> -SAD_FUNC_SVE2 16, 4
> -SAD_FUNC_SVE2 16, 8
> -SAD_FUNC_SVE2 16, 12
> -SAD_FUNC_SVE2 16, 16
> -SAD_FUNC_SVE2 16, 32
> -SAD_FUNC_SVE2 16, 64
>
> +SAD_FUNC_LOOP_SVE2 16, 4
> +SAD_FUNC_LOOP_SVE2 16, 8
> +SAD_FUNC_LOOP_SVE2 16, 12
> +SAD_FUNC_LOOP_SVE2 16, 16
> +SAD_FUNC_LOOP_SVE2 16, 32
> +SAD_FUNC_LOOP_SVE2 16, 64
> SAD_FUNC_LOOP_SVE2 32, 8
> SAD_FUNC_LOOP_SVE2 32, 16
> SAD_FUNC_LOOP_SVE2 32, 24
> diff --git a/source/common/aarch64/sad-a.S b/source/common/aarch64/sad-a.S
> index 7460825f1..b4b8e4cd9 100644
> --- a/source/common/aarch64/sad-a.S
> +++ b/source/common/aarch64/sad-a.S
> @@ -1,8 +1,9 @@
>
> /*****************************************************************************
> - * Copyright (C) 2020-2021 MulticoreWare, Inc
> + * Copyright (C) 2020-2024 MulticoreWare, Inc
> *
> * Authors: Hongbin Liu <liuhongbin1 at huawei.com>
> * Sebastian Pop <spop at amazon.com>
> + Hari Limaye <hari.limaye at arm.com>
> *
> * This program is free software; you can redistribute it and/or modify
> * it under the terms of the GNU General Public License as published by
> @@ -40,7 +41,7 @@
> function PFX(pixel_sad_\w\()x\h\()_neon)
> SAD_START_\w uabdl
> SAD_\w \h
> -.if \w > 4
> +.if \w > 8
> add v16.8h, v16.8h, v17.8h
> .endif
> uaddlv s0, v16.8h
> @@ -49,15 +50,15 @@ function PFX(pixel_sad_\w\()x\h\()_neon)
> endfunc
> .endm
>
> -// Loop unrolled 4.
> +// Loop unrolled to process 4 rows per iteration.
> .macro SAD_FUNC_LOOP w, h
> function PFX(pixel_sad_\w\()x\h\()_neon)
> SAD_START_\w
>
> - mov w9, #\h/8
> + mov w9, #\h/4
> .Loop_\w\()x\h:
> sub w9, w9, #1
> -.rept 4
> +.rept 2
> SAD_\w
> .endr
> cbnz w9, .Loop_\w\()x\h
> @@ -73,13 +74,13 @@ SAD_FUNC 8, 4
> SAD_FUNC 8, 8
> SAD_FUNC 8, 16
> SAD_FUNC 8, 32
> -SAD_FUNC 16, 4
> -SAD_FUNC 16, 8
> -SAD_FUNC 16, 12
> -SAD_FUNC 16, 16
> -SAD_FUNC 16, 32
> -SAD_FUNC 16, 64
>
> +SAD_FUNC_LOOP 16, 4
> +SAD_FUNC_LOOP 16, 8
> +SAD_FUNC_LOOP 16, 12
> +SAD_FUNC_LOOP 16, 16
> +SAD_FUNC_LOOP 16, 32
> +SAD_FUNC_LOOP 16, 64
> SAD_FUNC_LOOP 32, 8
> SAD_FUNC_LOOP 32, 16
> SAD_FUNC_LOOP 32, 24
> --
> 2.42.1
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>