[x265] [PATCH 1/3] AArch64: Add Neon asm implementation of HBD SAD
Gerda Zsejke More
gerdazsejke.more at arm.com
Fri Nov 15 11:16:16 UTC 2024
Add a Neon asm implementation of high bitdepth SAD functions for all
block sizes. This implementation is 13-20% faster on Neoverse
platforms than the existing Neon intrinsics implementation,
sad_pp_neon<w,h>.
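
For context, the scalar behaviour these kernels implement is sketched
below. This is an editorial illustration rather than part of the patch;
it assumes x265's HIGH_BIT_DEPTH pixel type (uint16_t) and mirrors the
generic C primitive, with strides given in pixels as in the asm. The
helper name sad_ref exists only for this sketch:

    #include <cstdint>
    #include <cstdlib>

    // Scalar reference for pixel_sad_<w>x<h> at high bit depth: the sum of
    // absolute differences over a w x h block of 16-bit samples. Strides are
    // in pixels; the asm doubles them to byte offsets on entry.
    template<int w, int h>
    static int sad_ref(const uint16_t *pix1, intptr_t stride_pix1,
                       const uint16_t *pix2, intptr_t stride_pix2)
    {
        int sum = 0;
        for (int y = 0; y < h; y++)
        {
            for (int x = 0; x < w; x++)
                sum += std::abs(pix1[x] - pix2[x]);
            pix1 += stride_pix1;
            pix2 += stride_pix2;
        }
        return sum;
    }

The asm below keeps these sums in 16-bit vector lanes where the block
height makes overflow impossible, and widens into 32-bit accumulators
(uaddlp/uadalp) for the larger block sizes.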
---
source/common/aarch64/asm-primitives.cpp | 4 +-
source/common/aarch64/sad-a.S | 271 ++++++++++++++++++++++-
2 files changed, 273 insertions(+), 2 deletions(-)
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index dd3c2a4ba..4cab2d66f 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -524,6 +524,9 @@ void setupNeonPrimitives(EncoderPrimitives &p)
p.cu[BLOCK_32x32].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_32x32_neon);
p.cu[BLOCK_64x64].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_64x64_neon);
+ // sad
+ ALL_LUMA_PU(sad, pixel_sad, neon);
+
#if !HIGH_BIT_DEPTH
// pixel_avg_pp
ALL_LUMA_PU(pixelavg_pp[NONALIGNED], pixel_avg_pp, neon);
@@ -538,7 +541,6 @@ void setupNeonPrimitives(EncoderPrimitives &p)
ALL_CHROMA_422_PU(addAvg[ALIGNED], addAvg, neon);
// sad
- ALL_LUMA_PU(sad, pixel_sad, neon);
ALL_LUMA_PU(sad_x3, sad_x3, neon);
ALL_LUMA_PU(sad_x4, sad_x4, neon);
diff --git a/source/common/aarch64/sad-a.S b/source/common/aarch64/sad-a.S
index 0feffc7a9..80c8ffdcb 100644
--- a/source/common/aarch64/sad-a.S
+++ b/source/common/aarch64/sad-a.S
@@ -3,7 +3,8 @@
*
* Authors: Hongbin Liu <liuhongbin1 at huawei.com>
* Sebastian Pop <spop at amazon.com>
- Hari Limaye <hari.limaye at arm.com>
+ * Hari Limaye <hari.limaye at arm.com>
+ * Gerda Zsejke More <gerdazsejke.more at arm.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -35,6 +36,7 @@
.text
+#if !HIGH_BIT_DEPTH
.macro SAD_START_4 f
ldr s0, [x0]
ldr s1, [x2]
@@ -653,3 +655,270 @@ SAD_X_LOOP 4, 64, 64
const sad12_mask, align=8
.byte 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0
endconst
+
+#else // HIGH_BIT_DEPTH
+
+// int sad(const uint16_t* pix1, intptr_t stride_pix1, const uint16_t* pix2, intptr_t stride_pix2)
+.macro SAD_4 f
+ ld1 {v0.4h}, [x0], x1
+ ld1 {v1.4h}, [x2], x3
+ \f v16.4s, v0.4h, v1.4h
+.endm
+
+.macro SAD_4xH h
+ SAD_4 uabdl
+.rept \h - 1
+ SAD_4 uabal
+.endr
+ addv s0, v16.4s
+.endm
+
+.macro SAD_8x2 f
+ ld1 {v0.8h}, [x0], x1
+ ld1 {v1.8h}, [x2], x3
+ \f v16.8h, v0.8h, v1.8h
+
+ ld1 {v0.8h}, [x0], x1
+ ld1 {v1.8h}, [x2], x3
+ \f v17.8h, v0.8h, v1.8h
+.endm
+
+.macro SAD_8xH h
+ SAD_8x2 uabd
+.rept \h / 2 - 1
+ SAD_8x2 uaba
+.endr
+ uaddlp v16.4s, v16.8h
+ uadalp v16.4s, v17.8h
+ addv s0, v16.4s
+.endm
+
+.macro SAD_FUNC w, h
+function PFX(pixel_sad_\w\()x\h\()_neon)
+ // Stride is given in terms of pixel channel size, so double to get number of bytes.
+ add x1, x1, x1
+ add x3, x3, x3
+
+ SAD_\w\()xH \h
+
+ fmov w0, s0
+ ret
+endfunc
+.endm
+
+.macro SAD_12 f
+ ldr q0, [x0]
+ ldr q1, [x2]
+ ldr d2, [x0, #16]
+ ldr d3, [x2, #16]
+ \f v16.8h, v0.8h, v1.8h
+ \f v17.4h, v2.4h, v3.4h
+ add x0, x0, x1
+ add x2, x2, x3
+.endm
+
+.macro SAD_16 f
+ ld1 {v0.8h-v1.8h}, [x0], x1
+ ld1 {v2.8h-v3.8h}, [x2], x3
+ \f v16.8h, v0.8h, v2.8h
+ \f v17.8h, v1.8h, v3.8h
+.endm
+
+.macro SAD_32 f
+ ld1 {v0.8h-v3.8h}, [x0], x1
+ ld1 {v4.8h-v7.8h}, [x2], x3
+ \f v16.8h, v0.8h, v4.8h
+ \f v17.8h, v1.8h, v5.8h
+ \f v18.8h, v2.8h, v6.8h
+ \f v19.8h, v3.8h, v7.8h
+.endm
+
+.macro SAD_END_2_ACCUM
+ add v16.8h, v16.8h, v17.8h
+ uaddlv s0, v16.8h
+.endm
+
+.macro SAD_END_2_ACCUM_WIDEN
+ uaddlp v16.4s, v16.8h
+ uadalp v16.4s, v17.8h
+ addv s0, v16.4s
+.endm
+
+.macro SAD_END_4_ACCUM_WIDEN
+ add v16.8h, v16.8h, v17.8h
+ add v18.8h, v18.8h, v19.8h
+ uaddlp v16.4s, v16.8h
+ uadalp v16.4s, v18.8h
+ addv s0, v16.4s
+.endm
+
+.macro SAD_FUNC_LOOP w, h, end_type
+function PFX(pixel_sad_\w\()x\h\()_neon)
+ // Stride is given in terms of pixel channel size, so double to get number of bytes.
+ add x1, x1, x1
+ add x3, x3, x3
+
+ SAD_\w uabd
+ SAD_\w uaba
+
+ mov w9, #(\h - 2)/2
+
+.Loop_\w\()x\h:
+ sub w9, w9, #1
+.rept 2
+ SAD_\w uaba
+.endr
+ cbnz w9, .Loop_\w\()x\h
+
+ SAD_\end_type
+
+ fmov w0, s0
+ ret
+endfunc
+.endm
+
+// SAD_<w>_WIDEN kernels widen into 32-bit accumulators.
+.macro SAD_16_WIDEN f
+ ld1 {v0.8h-v1.8h}, [x0], x1
+ ld1 {v2.8h-v3.8h}, [x2], x3
+ uabd v18.8h, v0.8h, v2.8h
+ \f v16.4s, v18.8h
+ uabd v19.8h, v1.8h, v3.8h
+ \f v17.4s, v19.8h
+.endm
+
+.macro SAD_24_WIDEN f
+ ld1 {v0.8h-v2.8h}, [x0], x1
+ ld1 {v3.8h-v5.8h}, [x2], x3
+ uabd v19.8h, v0.8h, v3.8h
+ \f v16.4s, v19.8h
+ uabd v20.8h, v1.8h, v4.8h
+ \f v17.4s, v20.8h
+ uabd v21.8h, v2.8h, v5.8h
+ \f v18.4s, v21.8h
+.endm
+
+.macro SAD_32_WIDEN f
+ ld1 {v0.8h-v3.8h}, [x0], x1
+ ld1 {v4.8h-v7.8h}, [x2], x3
+ uabd v20.8h, v0.8h, v4.8h
+ \f v16.4s, v20.8h
+ uabd v21.8h, v1.8h, v5.8h
+ \f v17.4s, v21.8h
+ uabd v22.8h, v2.8h, v6.8h
+ \f v18.4s, v22.8h
+ uabd v23.8h, v3.8h, v7.8h
+ \f v19.4s, v23.8h
+.endm
+
+.macro SAD_48_WIDEN f
+ ld1 {v0.8h-v3.8h}, [x0]
+ ld1 {v4.8h-v7.8h}, [x2]
+ uabd v20.8h, v0.8h, v4.8h
+ \f v16.4s, v20.8h
+ uabd v21.8h, v1.8h, v5.8h
+ \f v17.4s, v21.8h
+ uabd v22.8h, v2.8h, v6.8h
+ \f v18.4s, v22.8h
+ uabd v23.8h, v3.8h, v7.8h
+ \f v19.4s, v23.8h
+
+ ldp q0, q1, [x0, #64]
+ ldp q4, q5, [x2, #64]
+ uabd v20.8h, v0.8h, v4.8h
+ uadalp v16.4s, v20.8h
+ uabd v21.8h, v1.8h, v5.8h
+ uadalp v17.4s, v21.8h
+
+ add x0, x0, x1
+ add x2, x2, x3
+.endm
+
+.macro SAD_64_WIDEN f
+ ld1 {v0.8h-v3.8h}, [x0]
+ ld1 {v4.8h-v7.8h}, [x2]
+ uabd v20.8h, v0.8h, v4.8h
+ \f v16.4s, v20.8h
+ uabd v21.8h, v1.8h, v5.8h
+ \f v17.4s, v21.8h
+ uabd v22.8h, v2.8h, v6.8h
+ \f v18.4s, v22.8h
+ uabd v23.8h, v3.8h, v7.8h
+ \f v19.4s, v23.8h
+
+ ldp q0, q1, [x0, #64]
+ ldp q2, q3, [x0, #96]
+ ldp q4, q5, [x2, #64]
+ ldp q6, q7, [x2, #96]
+ uabd v20.8h, v0.8h, v4.8h
+ uadalp v16.4s, v20.8h
+ uabd v21.8h, v1.8h, v5.8h
+ uadalp v17.4s, v21.8h
+ uabd v22.8h, v2.8h, v6.8h
+ uadalp v18.4s, v22.8h
+ uabd v23.8h, v3.8h, v7.8h
+ uadalp v19.4s, v23.8h
+
+ add x0, x0, x1
+ add x2, x2, x3
+.endm
+
+
+.macro SAD_FUNC_LOOP_LARGE w, h
+function PFX(pixel_sad_\w\()x\h\()_neon)
+ // Stride is given in terms of pixel channel size, so double to get number of bytes.
+ add x1, x1, x1
+ add x3, x3, x3
+
+ SAD_\w\()_WIDEN uaddlp
+ SAD_\w\()_WIDEN uadalp
+
+ mov w9, #(\h - 2)/2
+.Loop_\w\()x\h:
+ sub w9, w9, #1
+.rept 2
+ SAD_\w\()_WIDEN uadalp
+.endr
+ cbnz w9, .Loop_\w\()x\h
+
+ add v16.4s, v16.4s, v17.4s
+.if \w != 16
+.if \w != 24
+ add v18.4s, v18.4s, v19.4s
+.endif
+ add v16.4s, v16.4s, v18.4s
+.endif
+ addv s0, v16.4s
+
+ fmov w0, s0
+ ret
+endfunc
+.endm
+
+SAD_FUNC 4, 4
+SAD_FUNC 4, 8
+SAD_FUNC 4, 16
+SAD_FUNC 8, 4
+SAD_FUNC 8, 8
+SAD_FUNC 8, 16
+SAD_FUNC 8, 32
+SAD_FUNC_LOOP 12, 16, END_2_ACCUM_WIDEN
+SAD_FUNC_LOOP 16, 4, END_2_ACCUM
+SAD_FUNC_LOOP 16, 8, END_2_ACCUM
+SAD_FUNC_LOOP 16, 12, END_2_ACCUM_WIDEN
+SAD_FUNC_LOOP 16, 16, END_2_ACCUM_WIDEN
+SAD_FUNC_LOOP 32, 8, END_4_ACCUM_WIDEN
+SAD_FUNC_LOOP_LARGE 16, 32
+SAD_FUNC_LOOP_LARGE 16, 64
+SAD_FUNC_LOOP_LARGE 24, 32
+SAD_FUNC_LOOP_LARGE 32, 16
+SAD_FUNC_LOOP_LARGE 32, 24
+SAD_FUNC_LOOP_LARGE 32, 32
+SAD_FUNC_LOOP_LARGE 32, 64
+SAD_FUNC_LOOP_LARGE 48, 64
+SAD_FUNC_LOOP_LARGE 64, 16
+SAD_FUNC_LOOP_LARGE 64, 32
+SAD_FUNC_LOOP_LARGE 64, 48
+SAD_FUNC_LOOP_LARGE 64, 64
+
+#endif // !HIGH_BIT_DEPTH
--
2.39.5 (Apple Git-154)