[x265] [PATCH 2/3] AArch64: Add Neon asm implementation of HBD SAD3D
Gerda Zsejke More
gerdazsejke.more at arm.com
Fri Nov 15 11:18:01 UTC 2024
Add a Neon asm implementation of high bitdepth SAD3D functions for
all block sizes. This implementation is 20%-27% faster on Neoverse
platforms than the existing sad_x3_neon<w,h> Neon intrinsics
implementation.
---
source/common/aarch64/asm-primitives.cpp | 2 +-
source/common/aarch64/sad-a.S | 381 +++++++++++++++++++++++
2 files changed, 382 insertions(+), 1 deletion(-)
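For reference, sad_x3 computes the SAD of one encoder block (fenc,
whose rows are FENC_STRIDE apart) against three candidate reference
blocks sharing a stride, writing the three sums to res[0..2]. A
minimal scalar sketch of that contract (illustrative only: the real
primitives are generated per partition size rather than taking w/h,
and the pixel/FENC_STRIDE definitions are assumed from HBD builds):

    #include <cstdint>
    #include <cstdlib>

    typedef uint16_t pixel;          // high-bitdepth builds use 16-bit pixels
    const intptr_t FENC_STRIDE = 64; // fixed row stride of the fenc buffer

    // Scalar model of the sad_x3 contract: one SAD per reference block,
    // all three sharing the fenc loads.
    static void sad_x3_c(const pixel* fenc, const pixel* fref0,
                         const pixel* fref1, const pixel* fref2,
                         intptr_t frefstride, int32_t* res, int w, int h)
    {
        res[0] = res[1] = res[2] = 0;
        for (int y = 0; y < h; y++)
        {
            for (int x = 0; x < w; x++)
            {
                res[0] += abs(fenc[x] - fref0[x]);
                res[1] += abs(fenc[x] - fref1[x]);
                res[2] += abs(fenc[x] - fref2[x]);
            }
            fenc  += FENC_STRIDE;
            fref0 += frefstride;
            fref1 += frefstride;
            fref2 += frefstride;
        }
    }

The asm below follows the same shape: one function per partition size,
with all three accumulations kept in flight so each fenc row is loaded
once.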
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index 4cab2d66f..283256679 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -526,6 +526,7 @@ void setupNeonPrimitives(EncoderPrimitives &p)
// sad
ALL_LUMA_PU(sad, pixel_sad, neon);
+ ALL_LUMA_PU(sad_x3, sad_x3, neon);
#if !HIGH_BIT_DEPTH
// pixel_avg_pp
@@ -541,7 +542,6 @@ void setupNeonPrimitives(EncoderPrimitives &p)
ALL_CHROMA_422_PU(addAvg[ALIGNED], addAvg, neon);
// sad
- ALL_LUMA_PU(sad_x3, sad_x3, neon);
ALL_LUMA_PU(sad_x4, sad_x4, neon);
// sse_pp
diff --git a/source/common/aarch64/sad-a.S b/source/common/aarch64/sad-a.S
index 80c8ffdcb..642fd29f3 100644
--- a/source/common/aarch64/sad-a.S
+++ b/source/common/aarch64/sad-a.S
@@ -921,4 +921,385 @@ SAD_FUNC_LOOP_LARGE 64, 32
SAD_FUNC_LOOP_LARGE 64, 48
SAD_FUNC_LOOP_LARGE 64, 64
+// void sad_x3(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res)
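+// Register use in the kernels below: x0 = pix1 (the fenc block, rows
+// FENC_STRIDE apart), x1-x3 = the three reference blocks (frefstride
+// in x4, given in pixels on entry), x5 = res, x6 = fenc row step in
+// bytes.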
+.macro SAD_x3_4 f
+ ld1 {v0.4h}, [x0], x6
+ ld1 {v1.4h}, [x1], x4
+ ld1 {v2.4h}, [x2], x4
+ ld1 {v3.4h}, [x3], x4
+ \f v16.4s, v0.4h, v1.4h
+ \f v17.4s, v0.4h, v2.4h
+ \f v18.4s, v0.4h, v3.4h
+.endm
+
+.macro SAD_x3_4xH h
+ SAD_x3_4 uabdl
+.rept \h - 1
+ SAD_x3_4 uabal
+.endr
+.endm
+
+.macro SAD_x3_8x2 f
+ ld1 {v0.8h}, [x0], x6
+ ld1 {v1.8h}, [x1], x4
+ ld1 {v2.8h}, [x2], x4
+ ld1 {v3.8h}, [x3], x4
+ \f v16.8h, v0.8h, v1.8h
+ \f v17.8h, v0.8h, v2.8h
+ \f v18.8h, v0.8h, v3.8h
+
+ ld1 {v0.8h}, [x0], x6
+ ld1 {v1.8h}, [x1], x4
+ ld1 {v2.8h}, [x2], x4
+ ld1 {v3.8h}, [x3], x4
+ \f v19.8h, v0.8h, v1.8h
+ \f v20.8h, v0.8h, v2.8h
+ \f v21.8h, v0.8h, v3.8h
+.endm
+
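+// Accumulate the per-row absolute differences in 16-bit lanes and
+// widen to 32 bits only once at the end: each lane receives at most
+// h/2 <= 16 differences of at most 4095 (12-bit depth), and
+// 16 * 4095 = 65520 still fits in a u16 lane.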
+.macro SAD_x3_8xH h
+ SAD_x3_8x2 uabd
+.rept \h/2 - 1
+ SAD_x3_8x2 uaba
+.endr
+ uaddlp v16.4s, v16.8h
+ uadalp v16.4s, v19.8h
+ uaddlp v17.4s, v17.8h
+ uadalp v17.4s, v20.8h
+ uaddlp v18.4s, v18.8h
+ uadalp v18.4s, v21.8h
+.endm
+
+.macro SAD_x3_FUNC w, h
+function PFX(sad_x3_\w\()x\h\()_neon)
+ // Stride is given in terms of pixel channel size, so double to get number of bytes.
+ add x4, x4, x4
+ mov x6, #(FENC_STRIDE << 1)
+
+ SAD_x3_\w\()xH \h
+
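+ // Reduce the three 32-bit accumulators to scalars:
+ // v0.s[0..2] = {res[0], res[1], res[2]}.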
+ addp v0.4s, v16.4s, v17.4s
+ addp v1.4s, v18.4s, v18.4s
+ addp v0.4s, v0.4s, v1.4s
+ str d0, [x5]
+ add x5, x5, #8
+ st1 {v0.s}[2], [x5]
+
+ ret
+endfunc
+.endm
+
+.macro SAD_x3_12 f
+ ldr q0, [x0]
+ ldr q1, [x1]
+ ldr q2, [x2]
+ ldr q3, [x3]
+ ldr d4, [x0, #16]
+ ldr d5, [x1, #16]
+ ldr d6, [x2, #16]
+ ldr d7, [x3, #16]
+ \f v16.8h, v0.8h, v1.8h
+ \f v18.8h, v0.8h, v2.8h
+ \f v20.8h, v0.8h, v3.8h
+ \f v17.8h, v4.8h, v5.8h
+ \f v19.8h, v4.8h, v6.8h
+ \f v21.8h, v4.8h, v7.8h
+ add x0, x0, x6
+ add x1, x1, x4
+ add x2, x2, x4
+ add x3, x3, x4
+.endm
+
+.macro SAD_x3_16 f
+ ld1 {v0.8h-v1.8h}, [x0], x6
+ ld1 {v2.8h-v3.8h}, [x1], x4
+ \f v16.8h, v0.8h, v2.8h
+ \f v17.8h, v1.8h, v3.8h
+ ld1 {v4.8h-v5.8h}, [x2], x4
+ \f v18.8h, v0.8h, v4.8h
+ \f v19.8h, v1.8h, v5.8h
+ ld1 {v6.8h-v7.8h}, [x3], x4
+ \f v20.8h, v0.8h, v6.8h
+ \f v21.8h, v1.8h, v7.8h
+.endm
+
+.macro SAD_x3_32 f
+ ld1 {v0.8h-v3.8h}, [x0], x6
+ ld1 {v4.8h-v7.8h}, [x1], x4
+ \f v16.8h, v0.8h, v4.8h
+ uaba v16.8h, v1.8h, v5.8h
+ \f v17.8h, v2.8h, v6.8h
+ uaba v17.8h, v3.8h, v7.8h
+ ld1 {v4.8h-v7.8h}, [x2], x4
+ \f v18.8h, v0.8h, v4.8h
+ uaba v18.8h, v1.8h, v5.8h
+ \f v19.8h, v2.8h, v6.8h
+ uaba v19.8h, v3.8h, v7.8h
+ ld1 {v4.8h-v7.8h}, [x3], x4
+ \f v20.8h, v0.8h, v4.8h
+ uaba v20.8h, v1.8h, v5.8h
+ \f v21.8h, v2.8h, v6.8h
+ uaba v21.8h, v3.8h, v7.8h
+.endm
+
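+// Loop variant for the sizes where 16-bit accumulators remain safe:
+// each lane receives at most 16 absolute differences (h per lane for
+// w <= 16, 2*h for w == 32), and 16 * 4095 = 65520 fits in a u16 lane
+// even at 12-bit depth.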
+.macro SAD_x3_FUNC_LOOP w, h
+function PFX(sad_x3_\w\()x\h\()_neon)
+ // Stride is given in terms of pixel channel size, so double to get number of bytes.
+ add x4, x4, x4
+ mov x6, #(FENC_STRIDE << 1)
+
+ SAD_x3_\w uabd
+
+ mov w9, #\h - 1
+.Loop_x_\w\()x\h:
+ sub w9, w9, #1
+ SAD_x3_\w uaba
+ cbnz w9, .Loop_x_\w\()x\h
+
+ uaddlp v16.4s, v16.8h
+ uadalp v16.4s, v17.8h
+ uaddlp v18.4s, v18.8h
+ uadalp v18.4s, v19.8h
+ uaddlp v20.4s, v20.8h
+ uadalp v20.4s, v21.8h
+ addp v0.4s, v16.4s, v18.4s
+ addp v1.4s, v20.4s, v20.4s
+ addp v0.4s, v0.4s, v1.4s
+ str d0, [x5]
+ add x5, x5, #8
+ st1 {v0.s}[2], [x5]
+
+ ret
+endfunc
+.endm
+
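+// For the remaining (larger) sizes a u16 lane could receive more than
+// 16 differences and overflow, so every row is widened into the
+// 32-bit accumulators v16-v21 straight away: uaddlp on the first row
+// to initialise them, uadalp afterwards to accumulate.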
+.macro SAD_x3_16_WIDEN f
+ ld1 {v0.8h-v1.8h}, [x0], x6
+ ld1 {v2.8h-v3.8h}, [x1], x4
+ uabd v22.8h, v0.8h, v2.8h
+ \f v16.4s, v22.8h
+ uabd v23.8h, v1.8h, v3.8h
+ \f v17.4s, v23.8h
+ ld1 {v4.8h-v5.8h}, [x2], x4
+ uabd v24.8h, v0.8h, v4.8h
+ \f v18.4s, v24.8h
+ uabd v25.8h, v1.8h, v5.8h
+ \f v19.4s, v25.8h
+ ld1 {v6.8h-v7.8h}, [x3], x4
+ uabd v26.8h, v0.8h, v6.8h
+ \f v20.4s, v26.8h
+ uabd v27.8h, v1.8h, v7.8h
+ \f v21.4s, v27.8h
+.endm
+
+.macro SAD_x3_24_WIDEN f
+ ld1 {v0.8h-v2.8h}, [x0], x6
+ ld1 {v3.8h-v5.8h}, [x1], x4
+ uabd v22.8h, v0.8h, v3.8h
+ uaba v22.8h, v1.8h, v4.8h
+ \f v16.4s, v22.8h
+ uabd v23.8h, v2.8h, v5.8h
+ \f v17.4s, v23.8h
+ ld1 {v28.8h-v30.8h}, [x2], x4
+ uabd v24.8h, v0.8h, v28.8h
+ uaba v24.8h, v1.8h, v29.8h
+ \f v18.4s, v24.8h
+ uabd v25.8h, v2.8h, v30.8h
+ \f v19.4s, v25.8h
+ ld1 {v3.8h-v5.8h}, [x3], x4
+ uabd v26.8h, v0.8h, v3.8h
+ uaba v26.8h, v1.8h, v4.8h
+ \f v20.4s, v26.8h
+ uabd v27.8h, v2.8h, v5.8h
+ \f v21.4s, v27.8h
+.endm
+
+.macro SAD_x3_32_WIDEN f
+ ld1 {v0.8h-v3.8h}, [x0], x6
+ ld1 {v4.8h-v7.8h}, [x1], x4
+ uabd v22.8h, v0.8h, v4.8h
+ uaba v22.8h, v1.8h, v5.8h
+ \f v16.4s, v22.8h
+ uabd v23.8h, v2.8h, v6.8h
+ uaba v23.8h, v3.8h, v7.8h
+ \f v17.4s, v23.8h
+
+ ld1 {v4.8h-v7.8h}, [x2], x4
+ uabd v24.8h, v0.8h, v4.8h
+ uaba v24.8h, v1.8h, v5.8h
+ \f v18.4s, v24.8h
+ uabd v25.8h, v2.8h, v6.8h
+ uaba v25.8h, v3.8h, v7.8h
+ \f v19.4s, v25.8h
+
+ ld1 {v4.8h-v7.8h}, [x3], x4
+ uabd v26.8h, v0.8h, v4.8h
+ uaba v26.8h, v1.8h, v5.8h
+ \f v20.4s, v26.8h
+ uabd v27.8h, v2.8h, v6.8h
+ uaba v27.8h, v3.8h, v7.8h
+ \f v21.4s, v27.8h
+.endm
+
+.macro SAD_x3_48_WIDEN f
+ ld1 {v0.8h-v3.8h}, [x0]
+ ld1 {v28.8h-v31.8h}, [x1]
+ uabd v6.8h, v0.8h, v28.8h
+ uaba v6.8h, v1.8h, v29.8h
+ \f v16.4s, v6.8h
+ uabd v7.8h, v2.8h, v30.8h
+ uaba v7.8h, v3.8h, v31.8h
+ \f v17.4s, v7.8h
+ ldp q4, q5, [x0, #64]
+ ldp q28, q29, [x1, #64]
+ uabd v22.8h, v4.8h, v28.8h
+ uaba v22.8h, v5.8h, v29.8h
+ uadalp v16.4s, v22.8h
+
+ ld1 {v28.8h-v31.8h}, [x2]
+ uabd v23.8h, v0.8h, v28.8h
+ uaba v23.8h, v1.8h, v29.8h
+ \f v18.4s, v23.8h
+ uabd v24.8h, v2.8h, v30.8h
+ uaba v24.8h, v3.8h, v31.8h
+ \f v19.4s, v24.8h
+ ldp q28, q29, [x2, #64]
+ uabd v25.8h, v4.8h, v28.8h
+ uaba v25.8h, v5.8h, v29.8h
+ uadalp v18.4s, v25.8h
+
+ ld1 {v28.8h-v31.8h}, [x3]
+ uabd v26.8h, v0.8h, v28.8h
+ uaba v26.8h, v1.8h, v29.8h
+ \f v20.4s, v26.8h
+ uabd v27.8h, v2.8h, v30.8h
+ uaba v27.8h, v3.8h, v31.8h
+ \f v21.4s, v27.8h
+ ldp q28, q29, [x3, #64]
+ uabd v6.8h, v4.8h, v28.8h
+ uaba v6.8h, v5.8h, v29.8h
+ uadalp v20.4s, v6.8h
+
+ add x0, x0, x6
+ add x1, x1, x4
+ add x2, x2, x4
+ add x3, x3, x4
+.endm
+
+.macro SAD_x3_64_WIDEN f
+ ld1 {v0.8h-v3.8h}, [x0]
+ ld1 {v28.8h-v31.8h}, [x1]
+ uabd v22.8h, v0.8h, v28.8h
+ uaba v22.8h, v1.8h, v29.8h
+ \f v16.4s, v22.8h
+ uabd v23.8h, v2.8h, v30.8h
+ uaba v23.8h, v3.8h, v31.8h
+ \f v17.4s, v23.8h
+ ldp q4, q5, [x0, #64]
+ ldp q6, q7, [x0, #96]
+ ldp q28, q29, [x1, #64]
+ ldp q30, q31, [x1, #96]
+ uabd v24.8h, v4.8h, v28.8h
+ uaba v24.8h, v5.8h, v29.8h
+ uadalp v16.4s, v24.8h
+ uabd v25.8h, v6.8h, v30.8h
+ uaba v25.8h, v7.8h, v31.8h
+ uadalp v17.4s, v25.8h
+
+ ld1 {v28.8h-v31.8h}, [x2]
+ uabd v26.8h, v0.8h, v28.8h
+ uaba v26.8h, v1.8h, v29.8h
+ \f v18.4s, v26.8h
+ uabd v27.8h, v2.8h, v30.8h
+ uaba v27.8h, v3.8h, v31.8h
+ \f v19.4s, v27.8h
+ ldp q28, q29, [x2, #64]
+ ldp q30, q31, [x2, #96]
+ uabd v22.8h, v4.8h, v28.8h
+ uaba v22.8h, v5.8h, v29.8h
+ uadalp v18.4s, v22.8h
+ uabd v23.8h, v6.8h, v30.8h
+ uaba v23.8h, v7.8h, v31.8h
+ uadalp v19.4s, v23.8h
+
+ ld1 {v28.8h-v31.8h}, [x3]
+ uabd v24.8h, v0.8h, v28.8h
+ uaba v24.8h, v1.8h, v29.8h
+ \f v20.4s, v24.8h
+ uabd v25.8h, v2.8h, v30.8h
+ uaba v25.8h, v3.8h, v31.8h
+ \f v21.4s, v25.8h
+ ldp q28, q29, [x3, #64]
+ ldp q30, q31, [x3, #96]
+ uabd v26.8h, v4.8h, v28.8h
+ uaba v26.8h, v5.8h, v29.8h
+ uadalp v20.4s, v26.8h
+ uabd v27.8h, v6.8h, v30.8h
+ uaba v27.8h, v7.8h, v31.8h
+ uadalp v21.4s, v27.8h
+
+ add x0, x0, x6
+ add x1, x1, x4
+ add x2, x2, x4
+ add x3, x3, x4
+.endm
+
+.macro SAD_x3_FUNC_LOOP_LARGE w, h
+function PFX(sad_x3_\w\()x\h\()_neon)
+ // Stride is given in terms of pixel channel size, so double to get number of bytes.
+ add x4, x4, x4
+ mov x6, #(FENC_STRIDE << 1)
+
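+ // The first row initialises the 32-bit accumulators (uaddlp); every
+ // later row accumulates into them (uadalp).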
+ SAD_x3_\w\()_WIDEN uaddlp
+ SAD_x3_\w\()_WIDEN uadalp
+
+ mov w9, #(\h - 2)/2
+.Loop_x_\w\()x\h:
+ sub w9, w9, #1
+.rept 2
+ SAD_x3_\w\()_WIDEN uadalp
+.endr
+ cbnz w9, .Loop_x_\w\()x\h
+
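+ // Merge each pair of accumulators, then reduce:
+ // v0.s[0..2] = {res[0], res[1], res[2]}.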
+ add v16.4s, v16.4s, v17.4s
+ add v17.4s, v18.4s, v19.4s
+ add v18.4s, v20.4s, v21.4s
+ addp v0.4s, v16.4s, v17.4s
+ addp v1.4s, v18.4s, v18.4s
+ addp v0.4s, v0.4s, v1.4s
+
+ str d0, [x5]
+ add x5, x5, #8
+ st1 {v0.s}[2], [x5]
+
+ ret
+endfunc
+.endm
+
+SAD_x3_FUNC 4, 4
+SAD_x3_FUNC 4, 8
+SAD_x3_FUNC 4, 16
+SAD_x3_FUNC 8, 4
+SAD_x3_FUNC 8, 8
+SAD_x3_FUNC 8, 16
+SAD_x3_FUNC 8, 32
+SAD_x3_FUNC_LOOP 12, 16
+SAD_x3_FUNC_LOOP 16, 4
+SAD_x3_FUNC_LOOP 16, 8
+SAD_x3_FUNC_LOOP 16, 12
+SAD_x3_FUNC_LOOP 16, 16
+SAD_x3_FUNC_LOOP 32, 8
+SAD_x3_FUNC_LOOP_LARGE 16, 32
+SAD_x3_FUNC_LOOP_LARGE 16, 64
+SAD_x3_FUNC_LOOP_LARGE 24, 32
+SAD_x3_FUNC_LOOP_LARGE 32, 16
+SAD_x3_FUNC_LOOP_LARGE 32, 24
+SAD_x3_FUNC_LOOP_LARGE 32, 32
+SAD_x3_FUNC_LOOP_LARGE 32, 64
+SAD_x3_FUNC_LOOP_LARGE 48, 64
+SAD_x3_FUNC_LOOP_LARGE 64, 16
+SAD_x3_FUNC_LOOP_LARGE 64, 32
+SAD_x3_FUNC_LOOP_LARGE 64, 48
+SAD_x3_FUNC_LOOP_LARGE 64, 64
+
#endif // !HIGH_BIT_DEPTH
--
2.39.5 (Apple Git-154)