[x265] [PATCH v2 1/8] AArch64: Optimise Neon assembly implementations of SAD

Karam Singh karam.singh at multicorewareinc.com
Fri Aug 16 08:09:48 UTC 2024


All the patches of this series have been pushed to the master branch.
__________________________
Karam Singh
Ph.D. IIT Guwahati
Senior Software (Video Coding) Engineer
Mobile: +91 8011279030
Block 9A, 6th floor, DLF Cyber City
Manapakkam, Chennai 600 089


On Tue, Jul 30, 2024 at 9:14 PM Hari Limaye <hari.limaye at arm.com> wrote:

> Optimise the Neon assembly implementations of SAD primitives, replacing
> UABAL, UABAL2 sequences with UABD, UADALP sequences, which have twice
> the throughput on modern Arm cores.
>
> Also refactor the load instructions for block sizes of width 4 to use
> LDR for the first partial load of a vector register - making the
> operation completely destructive.
>
> As this patch refactors some of the block sizes (16xh) to use the
> LOOP macro (rather than the fully unrolled macro), the SVE2
> implementations which make use of these Neon macros are updated as
> required.
> ---
>  source/common/aarch64/sad-a-common.S | 172 +++++++++++++--------------
>  source/common/aarch64/sad-a-sve2.S   |  25 ++--
>  source/common/aarch64/sad-a.S        |  23 ++--
>  3 files changed, 107 insertions(+), 113 deletions(-)
>
> diff --git a/source/common/aarch64/sad-a-common.S b/source/common/aarch64/sad-a-common.S
> index 572484a06..f7ce264a1 100644
> --- a/source/common/aarch64/sad-a-common.S
> +++ b/source/common/aarch64/sad-a-common.S
> @@ -1,7 +1,8 @@
>  /*****************************************************************************
> - * Copyright (C) 2022-2023 MulticoreWare, Inc
> + * Copyright (C) 2022-2024 MulticoreWare, Inc
>   *
>   * Authors: David Chen <david.chen at myais.com.cn>
> +            Hari Limaye <hari.limaye at arm.com>
>   *
>   * This program is free software; you can redistribute it and/or modify
>   * it under the terms of the GNU General Public License as published by
> @@ -37,9 +38,11 @@
>  .align 4
>
>  .macro SAD_START_4 f
> -    ld1             {v0.s}[0], [x0], x1
> +    ldr             s0, [x0]
> +    ldr             s1, [x2]
> +    add             x0, x0, x1
> +    add             x2, x2, x3
>      ld1             {v0.s}[1], [x0], x1
> -    ld1             {v1.s}[0], [x2], x3
>      ld1             {v1.s}[1], [x2], x3
>      \f              v16.8h, v0.8b, v1.8b
>  .endm
> @@ -53,33 +56,42 @@
>  .macro SAD_START_8 f
>      ld1             {v0.8b}, [x0], x1
>      ld1             {v1.8b}, [x2], x3
> -    ld1             {v2.8b}, [x0], x1
> -    ld1             {v3.8b}, [x2], x3
>      \f              v16.8h, v0.8b, v1.8b
> -    \f              v17.8h, v2.8b, v3.8b
>  .endm
>
>  .macro SAD_8 h
> -.rept \h / 2 - 1
> +.rept \h - 3
>      SAD_START_8 uabal
>  .endr
> +    ldr             d0, [x0]
> +    ldr             d1, [x2]
> +    uabal           v16.8h, v0.8b, v1.8b
> +    ldr             d0, [x0, x1]
> +    ldr             d1, [x2, x3]
> +    uabal           v16.8h, v0.8b, v1.8b
> +.endm
> +
> +.macro SAD_START_16
> +    movi            v16.16b, #0
> +    movi            v17.16b, #0
>  .endm
>
> -.macro SAD_START_16 f
> +.macro SAD_16
>      ld1             {v0.16b}, [x0], x1
>      ld1             {v1.16b}, [x2], x3
>      ld1             {v2.16b}, [x0], x1
>      ld1             {v3.16b}, [x2], x3
> -    \f              v16.8h, v0.8b, v1.8b
> -    \f\()2          v17.8h, v0.16b, v1.16b
> -    uabal           v16.8h, v2.8b, v3.8b
> -    uabal2          v17.8h, v2.16b, v3.16b
> +    uabd            v20.16b, v0.16b, v1.16b
> +    uadalp          v16.8h, v20.16b
> +    uabd            v21.16b, v2.16b, v3.16b
> +    uadalp          v17.8h, v21.16b
>  .endm
>
> -.macro SAD_16 h
> -.rept \h / 2 - 1
> -    SAD_START_16 uabal
> -.endr
> +.macro SAD_END_16
> +    add             v16.8h, v16.8h, v17.8h
> +    uaddlv          s0, v16.8h
> +    fmov            x0, d0
> +    ret
>  .endm
>
>  .macro SAD_START_32
> @@ -94,14 +106,14 @@
>      ld1             {v2.16b-v3.16b}, [x2], x3
>      ld1             {v4.16b-v5.16b}, [x0], x1
>      ld1             {v6.16b-v7.16b}, [x2], x3
> -    uabal           v16.8h, v0.8b, v2.8b
> -    uabal2          v17.8h, v0.16b, v2.16b
> -    uabal           v18.8h, v1.8b, v3.8b
> -    uabal2          v19.8h, v1.16b, v3.16b
> -    uabal           v16.8h, v4.8b, v6.8b
> -    uabal2          v17.8h, v4.16b, v6.16b
> -    uabal           v18.8h, v5.8b, v7.8b
> -    uabal2          v19.8h, v5.16b, v7.16b
> +    uabd            v20.16b, v0.16b, v2.16b
> +    uadalp          v16.8h, v20.16b
> +    uabd            v21.16b, v1.16b, v3.16b
> +    uadalp          v17.8h, v21.16b
> +    uabd            v22.16b, v4.16b, v6.16b
> +    uadalp          v18.8h, v22.16b
> +    uabd            v23.16b, v5.16b, v7.16b
> +    uadalp          v19.8h, v23.16b
>  .endm
>
>  .macro SAD_END_32
> @@ -118,10 +130,6 @@
>      movi            v17.16b, #0
>      movi            v18.16b, #0
>      movi            v19.16b, #0
> -    movi            v20.16b, #0
> -    movi            v21.16b, #0
> -    movi            v22.16b, #0
> -    movi            v23.16b, #0
>  .endm
>
>  .macro SAD_64
> @@ -129,35 +137,29 @@
>      ld1             {v4.16b-v7.16b}, [x2], x3
>      ld1             {v24.16b-v27.16b}, [x0], x1
>      ld1             {v28.16b-v31.16b}, [x2], x3
> -    uabal           v16.8h, v0.8b, v4.8b
> -    uabal2          v17.8h, v0.16b, v4.16b
> -    uabal           v18.8h, v1.8b, v5.8b
> -    uabal2          v19.8h, v1.16b, v5.16b
> -    uabal           v20.8h, v2.8b, v6.8b
> -    uabal2          v21.8h, v2.16b, v6.16b
> -    uabal           v22.8h, v3.8b, v7.8b
> -    uabal2          v23.8h, v3.16b, v7.16b
> -
> -    uabal           v16.8h, v24.8b, v28.8b
> -    uabal2          v17.8h, v24.16b, v28.16b
> -    uabal           v18.8h, v25.8b, v29.8b
> -    uabal2          v19.8h, v25.16b, v29.16b
> -    uabal           v20.8h, v26.8b, v30.8b
> -    uabal2          v21.8h, v26.16b, v30.16b
> -    uabal           v22.8h, v27.8b, v31.8b
> -    uabal2          v23.8h, v27.16b, v31.16b
> +    uabd            v20.16b, v0.16b, v4.16b
> +    uadalp          v16.8h, v20.16b
> +    uabd            v21.16b, v1.16b, v5.16b
> +    uadalp          v17.8h, v21.16b
> +    uabd            v22.16b, v2.16b, v6.16b
> +    uadalp          v18.8h, v22.16b
> +    uabd            v23.16b, v3.16b, v7.16b
> +    uadalp          v19.8h, v23.16b
> +    uabd            v20.16b, v24.16b, v28.16b
> +    uadalp          v16.8h, v20.16b
> +    uabd            v21.16b, v25.16b, v29.16b
> +    uadalp          v17.8h, v21.16b
> +    uabd            v22.16b, v26.16b, v30.16b
> +    uadalp          v18.8h, v22.16b
> +    uabd            v23.16b, v27.16b, v31.16b
> +    uadalp          v19.8h, v23.16b
>  .endm
>
>  .macro SAD_END_64
> -    add             v16.8h, v16.8h, v17.8h
> -    add             v17.8h, v18.8h, v19.8h
> -    add             v16.8h, v16.8h, v17.8h
>      uaddlp          v16.4s, v16.8h
> -    add             v18.8h, v20.8h, v21.8h
> -    add             v19.8h, v22.8h, v23.8h
> -    add             v17.8h, v18.8h, v19.8h
> -    uaddlp          v17.4s, v17.8h
> -    add             v16.4s, v16.4s, v17.4s
> +    uadalp          v16.4s, v17.8h
> +    uadalp          v16.4s, v18.8h
> +    uadalp          v16.4s, v19.8h
>      uaddlv          d0, v16.4s
>      fmov            x0, d0
>      ret
> @@ -179,10 +181,10 @@
>      and             v2.16b, v2.16b, v31.16b
>      ld1             {v3.16b}, [x2], x3
>      and             v3.16b, v3.16b, v31.16b
> -    uabal           v16.8h, v0.8b, v1.8b
> -    uabal2          v17.8h, v0.16b, v1.16b
> -    uabal           v16.8h, v2.8b, v3.8b
> -    uabal2          v17.8h, v2.16b, v3.16b
> +    uabd            v20.16b, v0.16b, v1.16b
> +    uadalp          v16.8h, v20.16b
> +    uabd            v21.16b, v2.16b, v3.16b
> +    uadalp          v17.8h, v21.16b
>  .endm
>
>  .macro SAD_END_12
> @@ -195,7 +197,6 @@
>  .macro SAD_START_24
>      movi            v16.16b, #0
>      movi            v17.16b, #0
> -    movi            v18.16b, #0
>      sub             x1, x1, #16
>      sub             x3, x3, #16
>  .endm
> @@ -209,17 +210,16 @@
>      ld1             {v5.8b}, [x0], x1
>      ld1             {v6.16b}, [x2], #16
>      ld1             {v7.8b}, [x2], x3
> -    uabal           v16.8h, v0.8b, v2.8b
> -    uabal2          v17.8h, v0.16b, v2.16b
> -    uabal           v18.8h, v1.8b, v3.8b
> -    uabal           v16.8h, v4.8b, v6.8b
> -    uabal2          v17.8h, v4.16b, v6.16b
> -    uabal           v18.8h, v5.8b, v7.8b
> +    uabd            v20.16b, v0.16b, v2.16b
> +    uadalp          v16.8h, v20.16b
> +    uabal           v17.8h, v1.8b, v3.8b
> +    uabd            v20.16b, v4.16b, v6.16b
> +    uadalp          v16.8h, v20.16b
> +    uabal           v17.8h, v5.8b, v7.8b
>  .endm
>
>  .macro SAD_END_24
>      add             v16.8h, v16.8h, v17.8h
> -    add             v16.8h, v16.8h, v18.8h
>      uaddlv          s0, v16.8h
>      fmov            w0, s0
>      ret
> @@ -229,9 +229,6 @@
>      movi            v16.16b, #0
>      movi            v17.16b, #0
>      movi            v18.16b, #0
> -    movi            v19.16b, #0
> -    movi            v20.16b, #0
> -    movi            v21.16b, #0
>  .endm
>
>  .macro SAD_48
> @@ -239,31 +236,26 @@
>      ld1             {v4.16b-v6.16b}, [x2], x3
>      ld1             {v24.16b-v26.16b}, [x0], x1
>      ld1             {v28.16b-v30.16b}, [x2], x3
> -    uabal           v16.8h, v0.8b, v4.8b
> -    uabal2          v17.8h, v0.16b, v4.16b
> -    uabal           v18.8h, v1.8b, v5.8b
> -    uabal2          v19.8h, v1.16b, v5.16b
> -    uabal           v20.8h, v2.8b, v6.8b
> -    uabal2          v21.8h, v2.16b, v6.16b
> -
> -    uabal           v16.8h, v24.8b, v28.8b
> -    uabal2          v17.8h, v24.16b, v28.16b
> -    uabal           v18.8h, v25.8b, v29.8b
> -    uabal2          v19.8h, v25.16b, v29.16b
> -    uabal           v20.8h, v26.8b, v30.8b
> -    uabal2          v21.8h, v26.16b, v30.16b
> +    uabd            v20.16b, v0.16b, v4.16b
> +    uadalp          v16.8h, v20.16b
> +    uabd            v21.16b, v1.16b, v5.16b
> +    uadalp          v17.8h, v21.16b
> +    uabd            v22.16b, v2.16b, v6.16b
> +    uadalp          v18.8h, v22.16b
> +    uabd            v20.16b, v24.16b, v28.16b
> +    uadalp          v16.8h, v20.16b
> +    uabd            v21.16b, v25.16b, v29.16b
> +    uadalp          v17.8h, v21.16b
> +    uabd            v22.16b, v26.16b, v30.16b
> +    uadalp          v18.8h, v22.16b
>  .endm
>
>  .macro SAD_END_48
> -    add             v16.8h, v16.8h, v17.8h
> -    add             v17.8h, v18.8h, v19.8h
> -    add             v16.8h, v16.8h, v17.8h
> -    uaddlv          s0, v16.8h
> -    fmov            w0, s0
> -    add             v18.8h, v20.8h, v21.8h
> -    uaddlv          s1, v18.8h
> -    fmov            w1, s1
> -    add             w0, w0, w1
> +    uaddlp          v16.4s, v16.8h
> +    uadalp          v16.4s, v17.8h
> +    uadalp          v16.4s, v18.8h
> +    uaddlv          d0, v16.4s
> +    fmov            x0, d0
>      ret
>  .endm
>
> diff --git a/source/common/aarch64/sad-a-sve2.S b/source/common/aarch64/sad-a-sve2.S
> index 599a3719a..325dc3f68 100644
> --- a/source/common/aarch64/sad-a-sve2.S
> +++ b/source/common/aarch64/sad-a-sve2.S
> @@ -1,7 +1,8 @@
>  /*****************************************************************************
> - * Copyright (C) 2022-2023 MulticoreWare, Inc
> + * Copyright (C) 2022-2024 MulticoreWare, Inc
>   *
>   * Authors: David Chen <david.chen at myais.com.cn>
> +            Hari Limaye <hari.limaye at arm.com>
>   *
>   * This program is free software; you can redistribute it and/or modify
>   * it under the terms of the GNU General Public License as published by
> @@ -186,7 +187,7 @@ function PFX(pixel_sad_\w\()x\h\()_sve2)
>      bgt             .vl_gt_16_pixel_sad_\w\()x\h
>      SAD_START_\w uabdl
>      SAD_\w \h
> -.if \w > 4
> +.if \w > 8
>      add             v16.8h, v16.8h, v17.8h
>  .endif
>      uaddlv          s0, v16.8h
> @@ -196,7 +197,7 @@ function PFX(pixel_sad_\w\()x\h\()_sve2)
>  .if \w == 4 || \w == 8 || \w == 12
>      SAD_START_\w uabdl
>      SAD_\w \h
> -.if \w > 4
> +.if \w > 8
>      add             v16.8h, v16.8h, v17.8h
>  .endif
>      uaddlv          s0, v16.8h
> @@ -208,7 +209,7 @@ function PFX(pixel_sad_\w\()x\h\()_sve2)
>  endfunc
>  .endm
>
> -// Loop unrolled 4.
> +// Loop unrolled to process 4 rows per iteration.
>  .macro SAD_FUNC_LOOP_SVE2 w, h
>  function PFX(pixel_sad_\w\()x\h\()_sve2)
>      rdvl            x9, #1
> @@ -216,10 +217,10 @@ function PFX(pixel_sad_\w\()x\h\()_sve2)
>      bgt             .vl_gt_16_pixel_sad_loop_\w\()x\h
>      SAD_START_\w
>
> -    mov             w9, #\h/8
> +    mov             w9, #\h/4
>  .Loop_sve2_\w\()x\h:
>      sub             w9, w9, #1
> -.rept 4
> +.rept 2
>      SAD_\w
>  .endr
>      cbnz            w9, .Loop_sve2_\w\()x\h
> @@ -252,13 +253,13 @@ SAD_FUNC_SVE2  8,  4
>  SAD_FUNC_SVE2  8,  8
>  SAD_FUNC_SVE2  8,  16
>  SAD_FUNC_SVE2  8,  32
> -SAD_FUNC_SVE2  16, 4
> -SAD_FUNC_SVE2  16, 8
> -SAD_FUNC_SVE2  16, 12
> -SAD_FUNC_SVE2  16, 16
> -SAD_FUNC_SVE2  16, 32
> -SAD_FUNC_SVE2  16, 64
>
> +SAD_FUNC_LOOP_SVE2  16, 4
> +SAD_FUNC_LOOP_SVE2  16, 8
> +SAD_FUNC_LOOP_SVE2  16, 12
> +SAD_FUNC_LOOP_SVE2  16, 16
> +SAD_FUNC_LOOP_SVE2  16, 32
> +SAD_FUNC_LOOP_SVE2  16, 64
>  SAD_FUNC_LOOP_SVE2  32, 8
>  SAD_FUNC_LOOP_SVE2  32, 16
>  SAD_FUNC_LOOP_SVE2  32, 24
> diff --git a/source/common/aarch64/sad-a.S b/source/common/aarch64/sad-a.S
> index 7460825f1..b4b8e4cd9 100644
> --- a/source/common/aarch64/sad-a.S
> +++ b/source/common/aarch64/sad-a.S
> @@ -1,8 +1,9 @@
>  /*****************************************************************************
> - * Copyright (C) 2020-2021 MulticoreWare, Inc
> + * Copyright (C) 2020-2024 MulticoreWare, Inc
>   *
>   * Authors: Hongbin Liu <liuhongbin1 at huawei.com>
>   *          Sebastian Pop <spop at amazon.com>
> +            Hari Limaye <hari.limaye at arm.com>
>   *
>   * This program is free software; you can redistribute it and/or modify
>   * it under the terms of the GNU General Public License as published by
> @@ -40,7 +41,7 @@
>  function PFX(pixel_sad_\w\()x\h\()_neon)
>      SAD_START_\w uabdl
>      SAD_\w \h
> -.if \w > 4
> +.if \w > 8
>      add             v16.8h, v16.8h, v17.8h
>  .endif
>      uaddlv          s0, v16.8h
> @@ -49,15 +50,15 @@ function PFX(pixel_sad_\w\()x\h\()_neon)
>  endfunc
>  .endm
>
> -// Loop unrolled 4.
> +// Loop unrolled to process 4 rows per iteration.
>  .macro SAD_FUNC_LOOP w, h
>  function PFX(pixel_sad_\w\()x\h\()_neon)
>      SAD_START_\w
>
> -    mov             w9, #\h/8
> +    mov             w9, #\h/4
>  .Loop_\w\()x\h:
>      sub             w9, w9, #1
> -.rept 4
> +.rept 2
>      SAD_\w
>  .endr
>      cbnz            w9, .Loop_\w\()x\h
> @@ -73,13 +74,13 @@ SAD_FUNC  8,  4
>  SAD_FUNC  8,  8
>  SAD_FUNC  8,  16
>  SAD_FUNC  8,  32
> -SAD_FUNC  16, 4
> -SAD_FUNC  16, 8
> -SAD_FUNC  16, 12
> -SAD_FUNC  16, 16
> -SAD_FUNC  16, 32
> -SAD_FUNC  16, 64
>
> +SAD_FUNC_LOOP  16, 4
> +SAD_FUNC_LOOP  16, 8
> +SAD_FUNC_LOOP  16, 12
> +SAD_FUNC_LOOP  16, 16
> +SAD_FUNC_LOOP  16, 32
> +SAD_FUNC_LOOP  16, 64
>  SAD_FUNC_LOOP  32, 8
>  SAD_FUNC_LOOP  32, 16
>  SAD_FUNC_LOOP  32, 24
> --
> 2.42.1
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
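
In summary, the per-row SAD accumulation (extracted from the diff above)
changes from the widening UABAL/UABAL2 form:

    // Old form: one UABAL/UABAL2 pair per 16-byte row (two rows shown),
    // each instruction accumulating absolute differences for 8 bytes.
    uabal           v16.8h, v0.8b, v1.8b
    uabal2          v17.8h, v0.16b, v1.16b
    uabal           v16.8h, v2.8b, v3.8b
    uabal2          v17.8h, v2.16b, v3.16b

to an absolute difference followed by a pairwise widening accumulate
(register numbers as used in the patch):

    // New form: UABD computes |src - ref| for all 16 bytes at once and
    // UADALP pairwise-adds the byte differences into the 16-bit
    // accumulators. Per the commit message, this pairing has twice the
    // throughput of UABAL/UABAL2 on modern Arm cores.
    uabd            v20.16b, v0.16b, v1.16b
    uadalp          v16.8h, v20.16b
    uabd            v21.16b, v2.16b, v3.16b
    uadalp          v17.8h, v21.16b

For the width-4 blocks, the first partial load of each vector register now
uses LDR rather than a lane-inserting LD1:

    // LDR s0 writes the whole register (upper bits zeroed), so the first
    // load no longer merges into, and therefore no longer depends on, the
    // previous contents of v0/v1, unlike ld1 {v0.s}[0].
    ldr             s0, [x0]
    ldr             s1, [x2]
    add             x0, x0, x1
    add             x2, x2, x3
    ld1             {v0.s}[1], [x0], x1
    ld1             {v1.s}[1], [x2], x3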