[x265] [PATCH 08/11] AArch64: Add Neon asm implementation of HBD SSD_S
Gerda Zsejke More
gerdazsejke.more at arm.com
Tue Dec 10 16:03:25 UTC 2024
Add a Neon asm implementation of high bitdepth SSD_S functions for
all block sizes. This implementation is up to 50% faster on Neoverse
platforms compared to the existing C implementation.
Change-Id: Iac8c1d5a00b21bb696c5532b71890f963ad2ffc4
---
source/common/aarch64/asm-primitives.cpp | 22 ++--
source/common/aarch64/ssd-a.S | 141 +++++++++++++++++++++++
2 files changed, 152 insertions(+), 11 deletions(-)
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index d920adeb4..a9076509c 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -552,6 +552,17 @@ void setupNeonPrimitives(EncoderPrimitives &p)
p.cu[BLOCK_32x32].sse_ss = PFX(pixel_sse_ss_32x32_neon);
p.cu[BLOCK_64x64].sse_ss = PFX(pixel_sse_ss_64x64_neon);
+ // ssd_s
+ p.cu[BLOCK_4x4].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_4x4_neon);
+ p.cu[BLOCK_8x8].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_8x8_neon);
+ p.cu[BLOCK_16x16].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_16x16_neon);
+ p.cu[BLOCK_32x32].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_32x32_neon);
+
+ p.cu[BLOCK_4x4].ssd_s[ALIGNED] = PFX(pixel_ssd_s_4x4_neon);
+ p.cu[BLOCK_8x8].ssd_s[ALIGNED] = PFX(pixel_ssd_s_8x8_neon);
+ p.cu[BLOCK_16x16].ssd_s[ALIGNED] = PFX(pixel_ssd_s_16x16_neon);
+ p.cu[BLOCK_32x32].ssd_s[ALIGNED] = PFX(pixel_ssd_s_32x32_neon);
+
#if !HIGH_BIT_DEPTH
// pixel_avg_pp
ALL_LUMA_PU(pixelavg_pp[NONALIGNED], pixel_avg_pp, neon);
@@ -565,17 +576,6 @@ void setupNeonPrimitives(EncoderPrimitives &p)
ALL_CHROMA_420_PU(addAvg[ALIGNED], addAvg, neon);
ALL_CHROMA_422_PU(addAvg[ALIGNED], addAvg, neon);
- // ssd_s
- p.cu[BLOCK_4x4].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_4x4_neon);
- p.cu[BLOCK_8x8].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_8x8_neon);
- p.cu[BLOCK_16x16].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_16x16_neon);
- p.cu[BLOCK_32x32].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_32x32_neon);
-
- p.cu[BLOCK_4x4].ssd_s[ALIGNED] = PFX(pixel_ssd_s_4x4_neon);
- p.cu[BLOCK_8x8].ssd_s[ALIGNED] = PFX(pixel_ssd_s_8x8_neon);
- p.cu[BLOCK_16x16].ssd_s[ALIGNED] = PFX(pixel_ssd_s_16x16_neon);
- p.cu[BLOCK_32x32].ssd_s[ALIGNED] = PFX(pixel_ssd_s_32x32_neon);
-
// pixel_var
p.cu[BLOCK_8x8].var = PFX(pixel_var_8x8_neon);
p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_neon);
diff --git a/source/common/aarch64/ssd-a.S b/source/common/aarch64/ssd-a.S
index 5c521c2c3..f4d612137 100644
--- a/source/common/aarch64/ssd-a.S
+++ b/source/common/aarch64/ssd-a.S
@@ -786,4 +786,145 @@ function PFX(pixel_sse_ss_64x64_neon)
ret
endfunc
+// Sum of squared signed 16-bit values over a 4x4 block.
+// In:  x0 = pointer to the first row
+//      x1 = row stride (doubled below, so presumably passed in 16-bit
+//           elements rather than bytes - confirm against the C prototype)
+// Out: produced by the ret_v0_w0 macro, which is defined outside this patch
+//      (presumably reduces the four lanes of v0 into w0 and returns - confirm).
+function PFX(pixel_ssd_s_4x4_neon)
+ movi v0.4s, #0 // v0 = four 32-bit partial sums
+ add x1, x1, x1 // x1 *= 2, used as the row offset from here on
+
+ ldr d16, [x0] // row 0: four halfwords
+ ldr d17, [x0, x1] // row 1
+ smlal v0.4s, v16.4h, v16.4h // v0 += row0 * row0 (signed 16x16 -> 32)
+ smlal v0.4s, v17.4h, v17.4h // v0 += row1 * row1
+ add x0, x0, x1, lsl 1 // advance x0 by two rows
+ ldr d16, [x0] // row 2
+ ldr d17, [x0, x1] // row 3
+ smlal v0.4s, v16.4h, v16.4h // v0 += row2 * row2
+ smlal v0.4s, v17.4h, v17.4h // v0 += row3 * row3
+
+ ret_v0_w0
+endfunc
+
+// Sum of squared signed 16-bit values over an 8x8 block.
+// In:  x0 = pointer to the first row
+//      x1 = row stride (doubled below, so presumably passed in 16-bit
+//           elements rather than bytes - confirm against the C prototype)
+// Out: produced by the ret_v0_w0 macro, which is defined outside this patch
+//      (presumably reduces the four lanes of v0 into w0 and returns - confirm).
+function PFX(pixel_ssd_s_8x8_neon)
+ movi v0.4s, #0 // partial sums for elements 0-3 of every row
+ movi v1.4s, #0 // partial sums for elements 4-7 of every row
+ add x1, x1, x1 // x1 *= 2, used as the post-increment for x0
+// Fully unrolled: each repetition consumes one row of eight halfwords.
+.rept 8
+ ld1 {v16.8h}, [x0], x1 // load one row, then step x0 to the next row
+ smlal v0.4s, v16.4h, v16.4h // v0 += low half squared (signed 16x16 -> 32)
+ smlal2 v1.4s, v16.8h, v16.8h // v1 += high half squared
+.endr
+ add v0.4s, v0.4s, v1.4s // merge both accumulators lane by lane
+ ret_v0_w0
+endfunc
+
+// Sum of squared signed 16-bit values over a 16x16 block.
+// In:  x0 = pointer to the first row
+//      x1 = row stride (doubled below, so presumably passed in 16-bit
+//           elements rather than bytes - confirm against the C prototype)
+// Out: x0 = 64-bit sum of squares
+// Scratch: w12 (row counter), v0, v1, v16, v17
+function PFX(pixel_ssd_s_16x16_neon)
+ movi v0.4s, #0 // accumulator fed by the low halves of each vector
+ movi v1.4s, #0 // accumulator fed by the high halves of each vector
+ add x1, x1, x1 // x1 *= 2, used as the post-increment for x0
+
+ mov w12, #16 // w12 = rows remaining
+.Loop_ssd_s_16:
+ sub w12, w12, #1
+ ld1 {v16.8h-v17.8h}, [x0], x1 // load one row of 16 halfwords, step to next row
+ smlal v0.4s, v16.4h, v16.4h // signed 16x16 -> 32 multiply-accumulate
+ smlal2 v1.4s, v16.8h, v16.8h
+ smlal v0.4s, v17.4h, v17.4h
+ smlal2 v1.4s, v17.8h, v17.8h
+ cbnz w12, .Loop_ssd_s_16 // loop until all 16 rows are consumed
+
+ add v0.4s, v0.4s, v1.4s // merge both accumulators lane by lane
+ uaddlv d0, v0.4s // widen and add the four 32-bit lanes into 64 bits
+ fmov x0, d0 // move the scalar result into the return register
+ ret
+endfunc
+
+// Sum of squared signed 16-bit values over a 32x32 block.
+// In:  x0 = pointer to the first row
+//      x1 = row stride (doubled below, so presumably passed in 16-bit
+//           elements rather than bytes - confirm against the C prototype)
+// Out: x0 = 64-bit sum of squares
+// Scratch: w12 (row counter), v0-v3, v16, v17
+function PFX(pixel_ssd_s_32x32_neon)
+ movi v0.4s, #0 // v0/v1: partial sums for the first 16 elements of a row
+ movi v1.4s, #0
+ movi v2.4s, #0 // v2/v3: partial sums for the last 16 elements of a row
+ movi v3.4s, #0
+ add x1, x1, x1 // x1 *= 2, added to x0 once per row
+
+ mov w12, #32 // w12 = rows remaining
+.Loop_ssd_s_32:
+ sub w12, w12, #1
+
+ ldp q16, q17, [x0] // bytes 0-31 of the row (16 halfwords)
+ smlal v0.4s, v16.4h, v16.4h // signed 16x16 -> 32 multiply-accumulate
+ smlal2 v1.4s, v16.8h, v16.8h
+ smlal v0.4s, v17.4h, v17.4h
+ smlal2 v1.4s, v17.8h, v17.8h
+
+ ldp q16, q17, [x0, #32] // bytes 32-63 of the row (16 halfwords)
+ smlal v2.4s, v16.4h, v16.4h
+ smlal2 v3.4s, v16.8h, v16.8h
+ smlal v2.4s, v17.4h, v17.4h
+ smlal2 v3.4s, v17.8h, v17.8h
+
+ add x0, x0, x1 // step to the next row
+ cbnz w12, .Loop_ssd_s_32 // loop until all 32 rows are consumed
+
+ // Reduce: widen the 32-bit lanes pairwise into two 64-bit lanes before
+ // adding the accumulators together, then add those two lanes.
+ uaddlp v0.2d, v0.4s
+ uadalp v0.2d, v1.4s
+ uadalp v0.2d, v2.4s
+ uadalp v0.2d, v3.4s
+ addp d0, v0.2d
+ fmov x0, d0 // move the scalar result into the return register
+ ret
+endfunc
+
+// Sum of squared signed 16-bit values over a 64x64 block.
+// In:  x0 = pointer to the first row
+//      x1 = row stride (doubled below, so presumably passed in 16-bit
+//           elements rather than bytes - confirm against the C prototype)
+// Out: x0 = 64-bit sum of squares
+// Scratch: w11 (inner row counter), w12 (outer pass counter), v0-v4, v16, v17
+//
+// The block is processed as two passes of 32 rows. Within a pass each 32-bit
+// lane of v1-v4 receives 4 products per row, i.e. 128 products, before being
+// widened into the 64-bit lanes of v0. Presumably this split keeps the 32-bit
+// partial sums from overflowing for the supported input range - confirm.
+//
+// NOTE(review): no dispatch-table entry for this 64x64 variant is visible in
+// the accompanying asm-primitives.cpp change - confirm it is wired up.
+function PFX(pixel_ssd_s_64x64_neon)
+ movi v0.4s, #0 // v0 = running total, used as two 64-bit lanes
+ add x1, x1, x1 // x1 *= 2, added to x0 once per row
+
+ mov w12, #2 // w12 = 32-row passes remaining
+.Loop_ssd_s_64x32:
+ sub w12, w12, #1
+ movi v1.4s, #0 // restart the 32-bit partial sums for this pass
+ movi v2.4s, #0
+ movi v3.4s, #0
+ movi v4.4s, #0
+ mov w11, #32 // w11 = rows remaining in this pass
+.Loop_ssd_s_64x1:
+ sub w11, w11, #1
+
+ ldp q16, q17, [x0] // bytes 0-31 of the row
+ smlal v1.4s, v16.4h, v16.4h // signed 16x16 -> 32 multiply-accumulate
+ smlal2 v2.4s, v16.8h, v16.8h
+ smlal v1.4s, v17.4h, v17.4h
+ smlal2 v2.4s, v17.8h, v17.8h
+
+ ldp q16, q17, [x0, #32] // bytes 32-63 of the row
+ smlal v3.4s, v16.4h, v16.4h
+ smlal2 v4.4s, v16.8h, v16.8h
+ smlal v3.4s, v17.4h, v17.4h
+ smlal2 v4.4s, v17.8h, v17.8h
+
+ ldp q16, q17, [x0, #64] // bytes 64-95 of the row
+ smlal v1.4s, v16.4h, v16.4h
+ smlal2 v2.4s, v16.8h, v16.8h
+ smlal v1.4s, v17.4h, v17.4h
+ smlal2 v2.4s, v17.8h, v17.8h
+
+ ldp q16, q17, [x0, #96] // bytes 96-127 of the row
+ smlal v3.4s, v16.4h, v16.4h
+ smlal2 v4.4s, v16.8h, v16.8h
+ smlal v3.4s, v17.4h, v17.4h
+ smlal2 v4.4s, v17.8h, v17.8h
+
+ add x0, x0, x1 // step to the next row
+ cbnz w11, .Loop_ssd_s_64x1
+
+ // Fold this pass into the 64-bit total: pairwise-widen each 32-bit
+ // accumulator and add it to v0.
+ uadalp v0.2d, v1.4s
+ uadalp v0.2d, v2.4s
+ uadalp v0.2d, v3.4s
+ uadalp v0.2d, v4.4s
+ cbnz w12, .Loop_ssd_s_64x32
+
+ addp d0, v0.2d // add the two 64-bit lanes
+ fmov x0, d0 // move the scalar result into the return register
+ ret
+endfunc
+
#endif // !HIGH_BIT_DEPTH
--
2.39.5 (Apple Git-154)
-------------- next part --------------
>From ba95df30aca3af8ba5536e1654d5c6e7d6a3d118 Mon Sep 17 00:00:00 2001
Message-Id: <ba95df30aca3af8ba5536e1654d5c6e7d6a3d118.1733846134.git.gerdazsejke.more at arm.com>
In-Reply-To: <cover.1733846134.git.gerdazsejke.more at arm.com>
References: <cover.1733846134.git.gerdazsejke.more at arm.com>
From: Gerda Zsejke More <gerdazsejke.more at arm.com>
Date: Sat, 7 Dec 2024 11:00:03 +0100
Subject: [PATCH 08/11] AArch64: Add Neon asm implementation of HBD SSD_S
Add a Neon asm implementation of high bitdepth SSD_S functions for
all block sizes. This implementation is up to 50% faster on Neoverse
platforms compared to the existing C implementation.
Change-Id: Iac8c1d5a00b21bb696c5532b71890f963ad2ffc4
---
source/common/aarch64/asm-primitives.cpp | 22 ++--
source/common/aarch64/ssd-a.S | 141 +++++++++++++++++++++++
2 files changed, 152 insertions(+), 11 deletions(-)
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index d920adeb4..a9076509c 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -552,6 +552,17 @@ void setupNeonPrimitives(EncoderPrimitives &p)
p.cu[BLOCK_32x32].sse_ss = PFX(pixel_sse_ss_32x32_neon);
p.cu[BLOCK_64x64].sse_ss = PFX(pixel_sse_ss_64x64_neon);
+ // ssd_s
+ p.cu[BLOCK_4x4].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_4x4_neon);
+ p.cu[BLOCK_8x8].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_8x8_neon);
+ p.cu[BLOCK_16x16].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_16x16_neon);
+ p.cu[BLOCK_32x32].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_32x32_neon);
+
+ p.cu[BLOCK_4x4].ssd_s[ALIGNED] = PFX(pixel_ssd_s_4x4_neon);
+ p.cu[BLOCK_8x8].ssd_s[ALIGNED] = PFX(pixel_ssd_s_8x8_neon);
+ p.cu[BLOCK_16x16].ssd_s[ALIGNED] = PFX(pixel_ssd_s_16x16_neon);
+ p.cu[BLOCK_32x32].ssd_s[ALIGNED] = PFX(pixel_ssd_s_32x32_neon);
+
#if !HIGH_BIT_DEPTH
// pixel_avg_pp
ALL_LUMA_PU(pixelavg_pp[NONALIGNED], pixel_avg_pp, neon);
@@ -565,17 +576,6 @@ void setupNeonPrimitives(EncoderPrimitives &p)
ALL_CHROMA_420_PU(addAvg[ALIGNED], addAvg, neon);
ALL_CHROMA_422_PU(addAvg[ALIGNED], addAvg, neon);
- // ssd_s
- p.cu[BLOCK_4x4].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_4x4_neon);
- p.cu[BLOCK_8x8].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_8x8_neon);
- p.cu[BLOCK_16x16].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_16x16_neon);
- p.cu[BLOCK_32x32].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_32x32_neon);
-
- p.cu[BLOCK_4x4].ssd_s[ALIGNED] = PFX(pixel_ssd_s_4x4_neon);
- p.cu[BLOCK_8x8].ssd_s[ALIGNED] = PFX(pixel_ssd_s_8x8_neon);
- p.cu[BLOCK_16x16].ssd_s[ALIGNED] = PFX(pixel_ssd_s_16x16_neon);
- p.cu[BLOCK_32x32].ssd_s[ALIGNED] = PFX(pixel_ssd_s_32x32_neon);
-
// pixel_var
p.cu[BLOCK_8x8].var = PFX(pixel_var_8x8_neon);
p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_neon);
diff --git a/source/common/aarch64/ssd-a.S b/source/common/aarch64/ssd-a.S
index 5c521c2c3..f4d612137 100644
--- a/source/common/aarch64/ssd-a.S
+++ b/source/common/aarch64/ssd-a.S
@@ -786,4 +786,145 @@ function PFX(pixel_sse_ss_64x64_neon)
ret
endfunc
+// Sum of squared signed 16-bit values over a 4x4 block.
+// In:  x0 = pointer to the first row
+//      x1 = row stride (doubled below, so presumably passed in 16-bit
+//           elements rather than bytes - confirm against the C prototype)
+// Out: produced by the ret_v0_w0 macro, which is defined outside this patch
+//      (presumably reduces the four lanes of v0 into w0 and returns - confirm).
+function PFX(pixel_ssd_s_4x4_neon)
+ movi v0.4s, #0 // v0 = four 32-bit partial sums
+ add x1, x1, x1 // x1 *= 2, used as the row offset from here on
+
+ ldr d16, [x0] // row 0: four halfwords
+ ldr d17, [x0, x1] // row 1
+ smlal v0.4s, v16.4h, v16.4h // v0 += row0 * row0 (signed 16x16 -> 32)
+ smlal v0.4s, v17.4h, v17.4h // v0 += row1 * row1
+ add x0, x0, x1, lsl 1 // advance x0 by two rows
+ ldr d16, [x0] // row 2
+ ldr d17, [x0, x1] // row 3
+ smlal v0.4s, v16.4h, v16.4h // v0 += row2 * row2
+ smlal v0.4s, v17.4h, v17.4h // v0 += row3 * row3
+
+ ret_v0_w0
+endfunc
+
+// Sum of squared signed 16-bit values over an 8x8 block.
+// In:  x0 = pointer to the first row
+//      x1 = row stride (doubled below, so presumably passed in 16-bit
+//           elements rather than bytes - confirm against the C prototype)
+// Out: produced by the ret_v0_w0 macro, which is defined outside this patch
+//      (presumably reduces the four lanes of v0 into w0 and returns - confirm).
+function PFX(pixel_ssd_s_8x8_neon)
+ movi v0.4s, #0 // partial sums for elements 0-3 of every row
+ movi v1.4s, #0 // partial sums for elements 4-7 of every row
+ add x1, x1, x1 // x1 *= 2, used as the post-increment for x0
+// Fully unrolled: each repetition consumes one row of eight halfwords.
+.rept 8
+ ld1 {v16.8h}, [x0], x1 // load one row, then step x0 to the next row
+ smlal v0.4s, v16.4h, v16.4h // v0 += low half squared (signed 16x16 -> 32)
+ smlal2 v1.4s, v16.8h, v16.8h // v1 += high half squared
+.endr
+ add v0.4s, v0.4s, v1.4s // merge both accumulators lane by lane
+ ret_v0_w0
+endfunc
+
+// Sum of squared signed 16-bit values over a 16x16 block.
+// In:  x0 = pointer to the first row
+//      x1 = row stride (doubled below, so presumably passed in 16-bit
+//           elements rather than bytes - confirm against the C prototype)
+// Out: x0 = 64-bit sum of squares
+// Scratch: w12 (row counter), v0, v1, v16, v17
+function PFX(pixel_ssd_s_16x16_neon)
+ movi v0.4s, #0 // accumulator fed by the low halves of each vector
+ movi v1.4s, #0 // accumulator fed by the high halves of each vector
+ add x1, x1, x1 // x1 *= 2, used as the post-increment for x0
+
+ mov w12, #16 // w12 = rows remaining
+.Loop_ssd_s_16:
+ sub w12, w12, #1
+ ld1 {v16.8h-v17.8h}, [x0], x1 // load one row of 16 halfwords, step to next row
+ smlal v0.4s, v16.4h, v16.4h // signed 16x16 -> 32 multiply-accumulate
+ smlal2 v1.4s, v16.8h, v16.8h
+ smlal v0.4s, v17.4h, v17.4h
+ smlal2 v1.4s, v17.8h, v17.8h
+ cbnz w12, .Loop_ssd_s_16 // loop until all 16 rows are consumed
+
+ add v0.4s, v0.4s, v1.4s // merge both accumulators lane by lane
+ uaddlv d0, v0.4s // widen and add the four 32-bit lanes into 64 bits
+ fmov x0, d0 // move the scalar result into the return register
+ ret
+endfunc
+
+// Sum of squared signed 16-bit values over a 32x32 block.
+// In:  x0 = pointer to the first row
+//      x1 = row stride (doubled below, so presumably passed in 16-bit
+//           elements rather than bytes - confirm against the C prototype)
+// Out: x0 = 64-bit sum of squares
+// Scratch: w12 (row counter), v0-v3, v16, v17
+function PFX(pixel_ssd_s_32x32_neon)
+ movi v0.4s, #0 // v0/v1: partial sums for the first 16 elements of a row
+ movi v1.4s, #0
+ movi v2.4s, #0 // v2/v3: partial sums for the last 16 elements of a row
+ movi v3.4s, #0
+ add x1, x1, x1 // x1 *= 2, added to x0 once per row
+
+ mov w12, #32 // w12 = rows remaining
+.Loop_ssd_s_32:
+ sub w12, w12, #1
+
+ ldp q16, q17, [x0] // bytes 0-31 of the row (16 halfwords)
+ smlal v0.4s, v16.4h, v16.4h // signed 16x16 -> 32 multiply-accumulate
+ smlal2 v1.4s, v16.8h, v16.8h
+ smlal v0.4s, v17.4h, v17.4h
+ smlal2 v1.4s, v17.8h, v17.8h
+
+ ldp q16, q17, [x0, #32] // bytes 32-63 of the row (16 halfwords)
+ smlal v2.4s, v16.4h, v16.4h
+ smlal2 v3.4s, v16.8h, v16.8h
+ smlal v2.4s, v17.4h, v17.4h
+ smlal2 v3.4s, v17.8h, v17.8h
+
+ add x0, x0, x1 // step to the next row
+ cbnz w12, .Loop_ssd_s_32 // loop until all 32 rows are consumed
+
+ // Reduce: widen the 32-bit lanes pairwise into two 64-bit lanes before
+ // adding the accumulators together, then add those two lanes.
+ uaddlp v0.2d, v0.4s
+ uadalp v0.2d, v1.4s
+ uadalp v0.2d, v2.4s
+ uadalp v0.2d, v3.4s
+ addp d0, v0.2d
+ fmov x0, d0 // move the scalar result into the return register
+ ret
+endfunc
+
+// Sum of squared signed 16-bit values over a 64x64 block.
+// In:  x0 = pointer to the first row
+//      x1 = row stride (doubled below, so presumably passed in 16-bit
+//           elements rather than bytes - confirm against the C prototype)
+// Out: x0 = 64-bit sum of squares
+// Scratch: w11 (inner row counter), w12 (outer pass counter), v0-v4, v16, v17
+//
+// The block is processed as two passes of 32 rows. Within a pass each 32-bit
+// lane of v1-v4 receives 4 products per row, i.e. 128 products, before being
+// widened into the 64-bit lanes of v0. Presumably this split keeps the 32-bit
+// partial sums from overflowing for the supported input range - confirm.
+//
+// NOTE(review): no dispatch-table entry for this 64x64 variant is visible in
+// the accompanying asm-primitives.cpp change - confirm it is wired up.
+function PFX(pixel_ssd_s_64x64_neon)
+ movi v0.4s, #0 // v0 = running total, used as two 64-bit lanes
+ add x1, x1, x1 // x1 *= 2, added to x0 once per row
+
+ mov w12, #2 // w12 = 32-row passes remaining
+.Loop_ssd_s_64x32:
+ sub w12, w12, #1
+ movi v1.4s, #0 // restart the 32-bit partial sums for this pass
+ movi v2.4s, #0
+ movi v3.4s, #0
+ movi v4.4s, #0
+ mov w11, #32 // w11 = rows remaining in this pass
+.Loop_ssd_s_64x1:
+ sub w11, w11, #1
+
+ ldp q16, q17, [x0] // bytes 0-31 of the row
+ smlal v1.4s, v16.4h, v16.4h // signed 16x16 -> 32 multiply-accumulate
+ smlal2 v2.4s, v16.8h, v16.8h
+ smlal v1.4s, v17.4h, v17.4h
+ smlal2 v2.4s, v17.8h, v17.8h
+
+ ldp q16, q17, [x0, #32] // bytes 32-63 of the row
+ smlal v3.4s, v16.4h, v16.4h
+ smlal2 v4.4s, v16.8h, v16.8h
+ smlal v3.4s, v17.4h, v17.4h
+ smlal2 v4.4s, v17.8h, v17.8h
+
+ ldp q16, q17, [x0, #64] // bytes 64-95 of the row
+ smlal v1.4s, v16.4h, v16.4h
+ smlal2 v2.4s, v16.8h, v16.8h
+ smlal v1.4s, v17.4h, v17.4h
+ smlal2 v2.4s, v17.8h, v17.8h
+
+ ldp q16, q17, [x0, #96] // bytes 96-127 of the row
+ smlal v3.4s, v16.4h, v16.4h
+ smlal2 v4.4s, v16.8h, v16.8h
+ smlal v3.4s, v17.4h, v17.4h
+ smlal2 v4.4s, v17.8h, v17.8h
+
+ add x0, x0, x1 // step to the next row
+ cbnz w11, .Loop_ssd_s_64x1
+
+ // Fold this pass into the 64-bit total: pairwise-widen each 32-bit
+ // accumulator and add it to v0.
+ uadalp v0.2d, v1.4s
+ uadalp v0.2d, v2.4s
+ uadalp v0.2d, v3.4s
+ uadalp v0.2d, v4.4s
+ cbnz w12, .Loop_ssd_s_64x32
+
+ addp d0, v0.2d // add the two 64-bit lanes
+ fmov x0, d0 // move the scalar result into the return register
+ ret
+endfunc
+
#endif // !HIGH_BIT_DEPTH
--
2.39.5 (Apple Git-154)
More information about the x265-devel
mailing list