[x265] [PATCH 1/3] AArch64: Add Neon asm implementation of HBD SAD

Gerda Zsejke More gerdazsejke.more at arm.com
Fri Nov 15 11:16:16 UTC 2024


Add a Neon asm implementation of high bit depth SAD functions for all
block sizes. This implementation is 13-20% faster on Neoverse
platforms than the existing sad_pp_neon<w,h> Neon intrinsics
implementation.
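
For reference, a minimal scalar sketch of the operation these kernels
implement (the function and parameter names below are illustrative,
not the exact upstream C primitive):

    #include <cstdint>
    #include <cstdlib>

    // In HIGH_BIT_DEPTH builds, pixel is uint16_t and strides count
    // pixels, not bytes.
    static int sad_ref(const uint16_t *pix1, intptr_t stride_pix1,
                       const uint16_t *pix2, intptr_t stride_pix2,
                       int w, int h)
    {
        int sum = 0;
        for (int y = 0; y < h; y++)
        {
            for (int x = 0; x < w; x++)
                sum += abs((int)pix1[x] - (int)pix2[x]);
            pix1 += stride_pix1;
            pix2 += stride_pix2;
        }
        return sum;
    }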
---
 source/common/aarch64/asm-primitives.cpp |   4 +-
 source/common/aarch64/sad-a.S            | 271 ++++++++++++++++++++++-
 2 files changed, 273 insertions(+), 2 deletions(-)

diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index dd3c2a4ba..4cab2d66f 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -524,6 +524,9 @@ void setupNeonPrimitives(EncoderPrimitives &p)
     p.cu[BLOCK_32x32].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_32x32_neon);
     p.cu[BLOCK_64x64].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_64x64_neon);
 
+    // sad
+    ALL_LUMA_PU(sad, pixel_sad, neon);
+
 #if !HIGH_BIT_DEPTH
     // pixel_avg_pp
     ALL_LUMA_PU(pixelavg_pp[NONALIGNED], pixel_avg_pp, neon);
@@ -538,7 +541,6 @@ void setupNeonPrimitives(EncoderPrimitives &p)
     ALL_CHROMA_422_PU(addAvg[ALIGNED], addAvg, neon);
 
     // sad
-    ALL_LUMA_PU(sad, pixel_sad, neon);
     ALL_LUMA_PU(sad_x3, sad_x3, neon);
     ALL_LUMA_PU(sad_x4, sad_x4, neon);
 
diff --git a/source/common/aarch64/sad-a.S b/source/common/aarch64/sad-a.S
index 0feffc7a9..80c8ffdcb 100644
--- a/source/common/aarch64/sad-a.S
+++ b/source/common/aarch64/sad-a.S
@@ -3,7 +3,8 @@
  *
  * Authors: Hongbin Liu <liuhongbin1 at huawei.com>
  *          Sebastian Pop <spop at amazon.com>
-            Hari Limaye <hari.limaye at arm.com>
+ *          Hari Limaye <hari.limaye at arm.com>
+ *          Gerda Zsejke More <gerdazsejke.more at arm.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -35,6 +36,7 @@
 
 .text
 
+#if !HIGH_BIT_DEPTH
 .macro SAD_START_4 f
     ldr             s0, [x0]
     ldr             s1, [x2]
@@ -653,3 +655,270 @@ SAD_X_LOOP  4, 64, 64
 const sad12_mask, align=8
 .byte 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0
 endconst
+
+#else // HIGH_BIT_DEPTH
+
+// int sad(const uint16_t* pix1, intptr_t stride_pix1, const uint16_t* pix2, intptr_t stride_pix2)
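+// Register arguments follow AAPCS64: x0 = pix1, x1 = stride_pix1,
+// x2 = pix2, x3 = stride_pix2 (strides count pixels); the 32-bit sum
+// is returned in w0.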
+.macro SAD_4 f
+    ld1             {v0.4h}, [x0], x1
+    ld1             {v1.4h}, [x2], x3
+    \f              v16.4s, v0.4h, v1.4h
+.endm
+
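+// uabdl computes the first row's absolute differences widened to 32-bit
+// lanes and uabal accumulates the remaining rows, so the 4-wide kernels
+// need no separate widening step before the final addv.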
+.macro SAD_4xH h
+    SAD_4 uabdl
+.rept \h - 1
+    SAD_4 uabal
+.endr
+    addv            s0, v16.4s
+.endm
+
+.macro SAD_8x2 f
+    ld1             {v0.8h}, [x0], x1
+    ld1             {v1.8h}, [x2], x3
+    \f              v16.8h, v0.8h, v1.8h
+
+    ld1             {v0.8h}, [x0], x1
+    ld1             {v1.8h}, [x2], x3
+    \f              v17.8h, v0.8h, v1.8h
+.endm
+
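+// Two 16-bit running sums (v16/v17) each cover h/2 rows. With h <= 32 a
+// lane accumulates at most 16 values of at most 4095 (12-bit depth),
+// i.e. at most 65520, so the 16-bit lanes cannot overflow.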
+.macro SAD_8xH h
+    SAD_8x2 uabd
+.rept \h / 2 - 1
+    SAD_8x2 uaba
+.endr
+    uaddlp          v16.4s, v16.8h
+    uadalp          v16.4s, v17.8h
+    addv            s0, v16.4s
+.endm
+
+.macro SAD_FUNC w, h
+function PFX(pixel_sad_\w\()x\h\()_neon)
+    // Stride is given in pixels, so double it to get the stride in bytes.
+    add             x1, x1, x1
+    add             x3, x3, x3
+
+    SAD_\w\()xH \h
+
+    fmov            w0, s0
+    ret
+endfunc
+.endm
+
+.macro SAD_12 f
+    ldr             q0, [x0]
+    ldr             q1, [x2]
+    ldr             d2, [x0, #16]
+    ldr             d3, [x2, #16]
+    \f              v16.8h, v0.8h, v1.8h
+    \f              v17.4h, v2.4h, v3.4h
+    add             x0, x0, x1
+    add             x2, x2, x3
+.endm
+
+.macro SAD_16 f
+    ld1             {v0.8h-v1.8h}, [x0], x1
+    ld1             {v2.8h-v3.8h}, [x2], x3
+    \f              v16.8h, v0.8h, v2.8h
+    \f              v17.8h, v1.8h, v3.8h
+.endm
+
+.macro SAD_32 f
+    ld1             {v0.8h-v3.8h}, [x0], x1
+    ld1             {v4.8h-v7.8h}, [x2], x3
+    \f              v16.8h, v0.8h, v4.8h
+    \f              v17.8h, v1.8h, v5.8h
+    \f              v18.8h, v2.8h, v6.8h
+    \f              v19.8h, v3.8h, v7.8h
+.endm
+
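+// Final reductions. SAD_END_2_ACCUM adds the two 16-bit accumulators and
+// widens once with uaddlv; this is only safe where the per-lane row sums
+// cannot overflow 16 bits (short blocks). The _WIDEN variants widen to
+// 32-bit lanes before the horizontal add.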
+.macro SAD_END_2_ACCUM
+    add             v16.8h, v16.8h, v17.8h
+    uaddlv          s0, v16.8h
+.endm
+
+.macro SAD_END_2_ACCUM_WIDEN
+    uaddlp          v16.4s, v16.8h
+    uadalp          v16.4s, v17.8h
+    addv            s0, v16.4s
+.endm
+
+.macro SAD_END_4_ACCUM_WIDEN
+    add             v16.8h, v16.8h, v17.8h
+    add             v18.8h, v18.8h, v19.8h
+    uaddlp          v16.4s, v16.8h
+    uadalp          v16.4s, v18.8h
+    addv            s0, v16.4s
+.endm
+
+.macro SAD_FUNC_LOOP w, h, end_type
+function PFX(pixel_sad_\w\()x\h\()_neon)
+    // Stride is given in pixels, so double it to get the stride in bytes.
+    add             x1, x1, x1
+    add             x3, x3, x3
+
+    SAD_\w uabd
+    SAD_\w uaba
+
+    mov             w9, #(\h - 2)/2
+
+.Loop_\w\()x\h:
+    sub             w9, w9, #1
+.rept 2
+    SAD_\w uaba
+.endr
+    cbnz            w9, .Loop_\w\()x\h
+
+    SAD_\end_type
+
+    fmov            w0, s0
+    ret
+endfunc
+.endm
+
+// SAD_<w>_WIDEN kernels widen into 32-bit accumulators.
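+// Each row's absolute differences are folded into the 32-bit accumulators
+// immediately (uaddlp for the first row, uadalp thereafter), so no 16-bit
+// lane ever accumulates across rows; the 32-bit totals stay far below
+// 2^32 (64x64 at 12-bit depth sums to at most 64*64*4095 < 2^24).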
+.macro SAD_16_WIDEN f
+    ld1             {v0.8h-v1.8h}, [x0], x1
+    ld1             {v2.8h-v3.8h}, [x2], x3
+    uabd            v18.8h, v0.8h, v2.8h
+    \f              v16.4s, v18.8h
+    uabd            v19.8h, v1.8h, v3.8h
+    \f              v17.4s, v19.8h
+.endm
+
+.macro SAD_24_WIDEN f
+    ld1             {v0.8h-v2.8h}, [x0], x1
+    ld1             {v3.8h-v5.8h}, [x2], x3
+    uabd            v19.8h, v0.8h, v3.8h
+    \f              v16.4s, v19.8h
+    uabd            v20.8h, v1.8h, v4.8h
+    \f              v17.4s, v20.8h
+    uabd            v21.8h, v2.8h, v5.8h
+    \f              v18.4s, v21.8h
+.endm
+
+.macro SAD_32_WIDEN f
+    ld1             {v0.8h-v3.8h}, [x0], x1
+    ld1             {v4.8h-v7.8h}, [x2], x3
+    uabd            v20.8h, v0.8h, v4.8h
+    \f              v16.4s, v20.8h
+    uabd            v21.8h, v1.8h, v5.8h
+    \f              v17.4s, v21.8h
+    uabd            v22.8h, v2.8h, v6.8h
+    \f              v18.4s, v22.8h
+    uabd            v23.8h, v3.8h, v7.8h
+    \f              v19.4s, v23.8h
+.endm
+
+.macro SAD_48_WIDEN f
+    ld1             {v0.8h-v3.8h}, [x0]
+    ld1             {v4.8h-v7.8h}, [x2]
+    uabd            v20.8h, v0.8h, v4.8h
+    \f              v16.4s, v20.8h
+    uabd            v21.8h, v1.8h, v5.8h
+    \f              v17.4s, v21.8h
+    uabd            v22.8h, v2.8h, v6.8h
+    \f              v18.4s, v22.8h
+    uabd            v23.8h, v3.8h, v7.8h
+    \f              v19.4s, v23.8h
+
+    ldp             q0, q1, [x0, #64]
+    ldp             q4, q5, [x2, #64]
+    uabd            v20.8h, v0.8h, v4.8h
+    uadalp          v16.4s, v20.8h
+    uabd            v21.8h, v1.8h, v5.8h
+    uadalp          v17.4s, v21.8h
+
+    add             x0, x0, x1
+    add             x2, x2, x3
+.endm
+
+.macro SAD_64_WIDEN f
+    ld1             {v0.8h-v3.8h}, [x0]
+    ld1             {v4.8h-v7.8h}, [x2]
+    uabd            v20.8h, v0.8h, v4.8h
+    \f              v16.4s, v20.8h
+    uabd            v21.8h, v1.8h, v5.8h
+    \f              v17.4s, v21.8h
+    uabd            v22.8h, v2.8h, v6.8h
+    \f              v18.4s, v22.8h
+    uabd            v23.8h, v3.8h, v7.8h
+    \f              v19.4s, v23.8h
+
+    ldp             q0, q1, [x0, #64]
+    ldp             q2, q3, [x0, #96]
+    ldp             q4, q5, [x2, #64]
+    ldp             q6, q7, [x2, #96]
+    uabd            v20.8h, v0.8h, v4.8h
+    uadalp          v16.4s, v20.8h
+    uabd            v21.8h, v1.8h, v5.8h
+    uadalp          v17.4s, v21.8h
+    uabd            v22.8h, v2.8h, v6.8h
+    uadalp          v18.4s, v22.8h
+    uabd            v23.8h, v3.8h, v7.8h
+    uadalp          v19.4s, v23.8h
+
+    add             x0, x0, x1
+    add             x2, x2, x3
+.endm
+
+
+function PFX(pixel_sad_\w\()x\h\()_neon)
+    // Stride is given in pixels, so double it to get the stride in bytes.
+    add             x1, x1, x1
+    add             x3, x3, x3
+
+    SAD_\w\()_WIDEN uaddlp
+    SAD_\w\()_WIDEN uadalp
+
+    mov             w9, #(\h - 2)/2
+.Loop_\w\()x\h:
+    sub             w9, w9, #1
+.rept 2
+    SAD_\w\()_WIDEN uadalp
+.endr
+    cbnz            w9, .Loop_\w\()x\h
+
+    add             v16.4s, v16.4s, v17.4s
+.if \w != 16
+.if \w != 24
+    add             v18.4s, v18.4s, v19.4s
+.endif
+    add             v16.4s, v16.4s, v18.4s
+.endif
+    addv            s0, v16.4s
+
+    fmov            w0, s0
+    ret
+endfunc
+.endm
+
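+// Instantiations: 4- and 8-wide blocks are fully unrolled (SAD_FUNC);
+// short 12-, 16- and 32-wide blocks loop with 16-bit row accumulators
+// (SAD_FUNC_LOOP); taller and wider blocks take the widening loop
+// (SAD_FUNC_LOOP_LARGE), which keeps the 16-bit intermediates from
+// overflowing across many rows.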
+SAD_FUNC  4, 4
+SAD_FUNC  4, 8
+SAD_FUNC  4, 16
+SAD_FUNC  8, 4
+SAD_FUNC  8, 8
+SAD_FUNC  8, 16
+SAD_FUNC  8, 32
+SAD_FUNC_LOOP  12, 16, END_2_ACCUM_WIDEN
+SAD_FUNC_LOOP  16, 4, END_2_ACCUM
+SAD_FUNC_LOOP  16, 8, END_2_ACCUM
+SAD_FUNC_LOOP  16, 12, END_2_ACCUM_WIDEN
+SAD_FUNC_LOOP  16, 16, END_2_ACCUM_WIDEN
+SAD_FUNC_LOOP  32, 8, END_4_ACCUM_WIDEN
+SAD_FUNC_LOOP_LARGE  16, 32
+SAD_FUNC_LOOP_LARGE  16, 64
+SAD_FUNC_LOOP_LARGE  24, 32
+SAD_FUNC_LOOP_LARGE  32, 16
+SAD_FUNC_LOOP_LARGE  32, 24
+SAD_FUNC_LOOP_LARGE  32, 32
+SAD_FUNC_LOOP_LARGE  32, 64
+SAD_FUNC_LOOP_LARGE  48, 64
+SAD_FUNC_LOOP_LARGE  64, 16
+SAD_FUNC_LOOP_LARGE  64, 32
+SAD_FUNC_LOOP_LARGE  64, 48
+SAD_FUNC_LOOP_LARGE  64, 64
+
+#endif // !HIGH_BIT_DEPTH
-- 
2.39.5 (Apple Git-154)
