[x265] [PATCH 09/11] AArch64: Add SVE asm implementation of HBD SSD_S
Gerda Zsejke More
gerdazsejke.more at arm.com
Tue Dec 10 16:03:46 UTC 2024
Add an SVE asm implementation of high bitdepth SSD_S functions for
all block sizes. This implementation is 42-45% faster on Neoverse
platforms compared to the existing Neon asm implementation.
Change-Id: Ibedb5fa7f30c88523fb0388ccaf24a8f3ae87a06
---
source/common/aarch64/asm-primitives.cpp | 14 +++
source/common/aarch64/ssd-a-sve.S | 112 +++++++++++++++++++++++
2 files changed, 126 insertions(+)
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index a9076509c..f88fdc000 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -879,6 +879,20 @@ void setupSvePrimitives(EncoderPrimitives &p)
p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sse_pp = PFX(pixel_sse_pp_8x16_sve);
p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sse_pp = PFX(pixel_sse_pp_16x32_sve);
p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = PFX(pixel_sse_pp_32x64_sve);
+
+ // ssd_s
+ p.cu[BLOCK_4x4].ssd_s[ALIGNED] = PFX(pixel_ssd_s_4x4_sve);
+ p.cu[BLOCK_8x8].ssd_s[ALIGNED] = PFX(pixel_ssd_s_8x8_sve);
+ p.cu[BLOCK_16x16].ssd_s[ALIGNED] = PFX(pixel_ssd_s_16x16_sve);
+ p.cu[BLOCK_32x32].ssd_s[ALIGNED] = PFX(pixel_ssd_s_32x32_sve);
+ p.cu[BLOCK_64x64].ssd_s[ALIGNED] = PFX(pixel_ssd_s_64x64_sve);
+
+ p.cu[BLOCK_4x4].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_4x4_sve);
+ p.cu[BLOCK_8x8].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_8x8_sve);
+ p.cu[BLOCK_16x16].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_16x16_sve);
+ p.cu[BLOCK_32x32].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_32x32_sve);
+ p.cu[BLOCK_64x64].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_64x64_sve);
+
#endif // !HIGH_BIT_DEPTH
}
#endif // defined(HAVE_SVE2) || defined(HAVE_SVE)
diff --git a/source/common/aarch64/ssd-a-sve.S b/source/common/aarch64/ssd-a-sve.S
index c1f745947..dbb750e17 100644
--- a/source/common/aarch64/ssd-a-sve.S
+++ b/source/common/aarch64/ssd-a-sve.S
@@ -213,6 +213,118 @@ function PFX(pixel_sse_pp_64x64_sve)
ret
endfunc
+function PFX(pixel_ssd_s_4x4_sve) // Sum of squares of a 4x4 block of signed 16-bit values. In: x0 = block pointer, x1 = row stride. Out: x0 = sum.
+ movi v0.4s, #0 // z0 = accumulator, cleared
+ add x1, x1, x1 // double the stride; presumably converts a stride in 16-bit elements to bytes - confirm against callers
+
+ ldr d16, [x0] // row 0: four halfwords (64 bits)
+ ldr d17, [x0, x1] // row 1
+ sdot z0.d, z16.h, z16.h // each 64-bit lane += sum of four halfword squares
+ sdot z0.d, z17.h, z17.h
+ add x0, x0, x1, lsl #1 // step down two rows
+ ldr d16, [x0] // row 2
+ ldr d17, [x0, x1] // row 3
+ sdot z0.d, z16.h, z16.h
+ sdot z0.d, z17.h, z17.h
+
+ fmov x0, d0 // d-register loads fill only the low 64 bits (the rest is zeroed), so the low lane holds the whole sum; return all 64 bits instead of truncating to 32 (assumes a 64-bit return type in HBD builds; harmless if it is 32-bit)
+ ret
+endfunc
+
+function PFX(pixel_ssd_s_8x8_sve) // Sum of squares of an 8x8 block of signed 16-bit values. In: x0 = block pointer, x1 = row stride. Out: x0 = sum.
+ movi v0.4s, #0 // z0 and z1 are two independent accumulators, both cleared
+ movi v1.4s, #0
+ add x1, x1, x1 // double the stride; presumably converts a stride in 16-bit elements to bytes - confirm against callers
+
+.rept 4
+ ld1 {v16.8h}, [x0], x1 // load one row of eight halfwords, then advance x0 by the stride
+ sdot z0.d, z16.h, z16.h // each 64-bit lane of z0 += sum of four halfword squares
+ ld1 {v17.8h}, [x0], x1 // next row goes to the second accumulator
+ sdot z1.d, z17.h, z17.h
+.endr
+
+ add v0.2d, v0.2d, v1.2d // merge the two accumulators
+ addp d0, v0.2d // add the two 64-bit lanes into d0
+ fmov x0, d0 // return all 64 bits of the total instead of truncating to 32 (assumes a 64-bit return type in HBD builds; harmless if it is 32-bit)
+ ret
+endfunc
+
+function PFX(pixel_ssd_s_16x16_sve) // Sum of squares of a 16x16 block of signed 16-bit values. In: x0 = block pointer, x1 = row stride. Out: x0 = 64-bit sum.
+ movi v0.4s, #0 // z0 and z1 are two independent accumulators, both cleared
+ movi v1.4s, #0
+ add x1, x1, x1 // double the stride; presumably converts a stride in 16-bit elements to bytes - confirm against callers
+
+ mov w12, #16 // w12 = rows remaining
+.Loop_ssd_s_16:
+ sub w12, w12, #1
+
+ ld1 {v16.8h-v17.8h}, [x0], x1 // load one row of 16 halfwords into v16/v17, then advance x0 by the stride
+ sdot z0.d, z16.h, z16.h // each 64-bit lane += sum of four halfword squares
+ sdot z1.d, z17.h, z17.h
+ cbnz w12, .Loop_ssd_s_16 // repeat until every row has been consumed
+
+ add v0.2d, v0.2d, v1.2d // merge the two accumulators
+ addp d0, v0.2d // add the two 64-bit lanes into d0
+ fmov x0, d0 // return the 64-bit total
+ ret
+endfunc
+
+function PFX(pixel_ssd_s_32x32_sve) // Sum of squares of a 32x32 block of signed 16-bit values. In: x0 = block pointer, x1 = row stride. Out: x0 = 64-bit sum.
+ movi v0.4s, #0 // z0 and z1 are two independent accumulators, both cleared
+ movi v1.4s, #0
+ add x1, x1, x1 // double the stride; presumably converts a stride in 16-bit elements to bytes - confirm against callers
+
+ mov w12, #32 // w12 = rows remaining
+.Loop_ssd_s_32:
+ sub w12, w12, #1
+
+ ldp q16, q17, [x0] // halfwords 0..15 of the current row
+ sdot z0.d, z16.h, z16.h // each 64-bit lane += sum of four halfword squares
+ sdot z1.d, z17.h, z17.h
+ ldp q16, q17, [x0, #32] // halfwords 16..31 of the current row
+ sdot z0.d, z16.h, z16.h
+ sdot z1.d, z17.h, z17.h
+
+ add x0, x0, x1 // move to the next row
+ cbnz w12, .Loop_ssd_s_32 // repeat until every row has been consumed
+
+ add v0.2d, v0.2d, v1.2d // merge the two accumulators
+ addp d0, v0.2d // add the two 64-bit lanes into d0
+ fmov x0, d0 // return the 64-bit total
+ ret
+endfunc
+
+function PFX(pixel_ssd_s_64x64_sve) // Sum of squares of a 64x64 block of signed 16-bit values. In: x0 = block pointer, x1 = row stride. Out: x0 = 64-bit sum.
+ movi v0.4s, #0 // z0 and z1 are two independent accumulators, both cleared
+ movi v1.4s, #0
+ add x1, x1, x1 // double the stride; presumably converts a stride in 16-bit elements to bytes - confirm against callers
+
+ mov w12, #64 // w12 = rows remaining
+.Loop_ssd_s_64:
+ sub w12, w12, #1
+
+ ldp q16, q17, [x0] // halfwords 0..15 of the current row
+ sdot z0.d, z16.h, z16.h // each 64-bit lane += sum of four halfword squares
+ sdot z1.d, z17.h, z17.h
+ ldp q16, q17, [x0, #32] // halfwords 16..31
+ sdot z0.d, z16.h, z16.h
+ sdot z1.d, z17.h, z17.h
+ ldp q16, q17, [x0, #64] // halfwords 32..47
+ sdot z0.d, z16.h, z16.h
+ sdot z1.d, z17.h, z17.h
+ ldp q16, q17, [x0, #96] // halfwords 48..63
+ sdot z0.d, z16.h, z16.h
+ sdot z1.d, z17.h, z17.h
+
+ add x0, x0, x1 // move to the next row
+ cbnz w12, .Loop_ssd_s_64 // repeat until every row has been consumed
+
+ add v0.2d, v0.2d, v1.2d // merge the two accumulators
+ addp d0, v0.2d // add the two 64-bit lanes into d0
+ fmov x0, d0 // return the 64-bit total
+ ret
+endfunc
+
#endif // HIGH_BIT_DEPTH
.macro SSE_SS_4x2
--
2.39.5 (Apple Git-154)
-------------- next part --------------
From 61fb770f867aa547b41ad0adafa82e3135e19017 Mon Sep 17 00:00:00 2001
Message-Id: <61fb770f867aa547b41ad0adafa82e3135e19017.1733846134.git.gerdazsejke.more at arm.com>
In-Reply-To: <cover.1733846134.git.gerdazsejke.more at arm.com>
References: <cover.1733846134.git.gerdazsejke.more at arm.com>
From: Gerda Zsejke More <gerdazsejke.more at arm.com>
Date: Sat, 7 Dec 2024 13:05:13 +0100
Subject: [PATCH 09/11] AArch64: Add SVE asm implementation of HBD SSD_S
Add an SVE asm implementation of high bitdepth SSD_S functions for
all block sizes. This implementation is 42-45% faster on Neoverse
platforms compared to the existing Neon asm implementation.
Change-Id: Ibedb5fa7f30c88523fb0388ccaf24a8f3ae87a06
---
source/common/aarch64/asm-primitives.cpp | 14 +++
source/common/aarch64/ssd-a-sve.S | 112 +++++++++++++++++++++++
2 files changed, 126 insertions(+)
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index a9076509c..f88fdc000 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -879,6 +879,20 @@ void setupSvePrimitives(EncoderPrimitives &p)
p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sse_pp = PFX(pixel_sse_pp_8x16_sve);
p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sse_pp = PFX(pixel_sse_pp_16x32_sve);
p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = PFX(pixel_sse_pp_32x64_sve);
+
+ // ssd_s
+ p.cu[BLOCK_4x4].ssd_s[ALIGNED] = PFX(pixel_ssd_s_4x4_sve);
+ p.cu[BLOCK_8x8].ssd_s[ALIGNED] = PFX(pixel_ssd_s_8x8_sve);
+ p.cu[BLOCK_16x16].ssd_s[ALIGNED] = PFX(pixel_ssd_s_16x16_sve);
+ p.cu[BLOCK_32x32].ssd_s[ALIGNED] = PFX(pixel_ssd_s_32x32_sve);
+ p.cu[BLOCK_64x64].ssd_s[ALIGNED] = PFX(pixel_ssd_s_64x64_sve);
+
+ p.cu[BLOCK_4x4].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_4x4_sve);
+ p.cu[BLOCK_8x8].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_8x8_sve);
+ p.cu[BLOCK_16x16].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_16x16_sve);
+ p.cu[BLOCK_32x32].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_32x32_sve);
+ p.cu[BLOCK_64x64].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_64x64_sve);
+
#endif // !HIGH_BIT_DEPTH
}
#endif // defined(HAVE_SVE2) || defined(HAVE_SVE)
diff --git a/source/common/aarch64/ssd-a-sve.S b/source/common/aarch64/ssd-a-sve.S
index c1f745947..dbb750e17 100644
--- a/source/common/aarch64/ssd-a-sve.S
+++ b/source/common/aarch64/ssd-a-sve.S
@@ -213,6 +213,118 @@ function PFX(pixel_sse_pp_64x64_sve)
ret
endfunc
+function PFX(pixel_ssd_s_4x4_sve) // Sum of squares of a 4x4 block of signed 16-bit values. In: x0 = block pointer, x1 = row stride. Out: x0 = sum.
+ movi v0.4s, #0 // z0 = accumulator, cleared
+ add x1, x1, x1 // double the stride; presumably converts a stride in 16-bit elements to bytes - confirm against callers
+
+ ldr d16, [x0] // row 0: four halfwords (64 bits)
+ ldr d17, [x0, x1] // row 1
+ sdot z0.d, z16.h, z16.h // each 64-bit lane += sum of four halfword squares
+ sdot z0.d, z17.h, z17.h
+ add x0, x0, x1, lsl #1 // step down two rows
+ ldr d16, [x0] // row 2
+ ldr d17, [x0, x1] // row 3
+ sdot z0.d, z16.h, z16.h
+ sdot z0.d, z17.h, z17.h
+
+ fmov x0, d0 // d-register loads fill only the low 64 bits (the rest is zeroed), so the low lane holds the whole sum; return all 64 bits instead of truncating to 32 (assumes a 64-bit return type in HBD builds; harmless if it is 32-bit)
+ ret
+endfunc
+
+function PFX(pixel_ssd_s_8x8_sve) // Sum of squares of an 8x8 block of signed 16-bit values. In: x0 = block pointer, x1 = row stride. Out: x0 = sum.
+ movi v0.4s, #0 // z0 and z1 are two independent accumulators, both cleared
+ movi v1.4s, #0
+ add x1, x1, x1 // double the stride; presumably converts a stride in 16-bit elements to bytes - confirm against callers
+
+.rept 4
+ ld1 {v16.8h}, [x0], x1 // load one row of eight halfwords, then advance x0 by the stride
+ sdot z0.d, z16.h, z16.h // each 64-bit lane of z0 += sum of four halfword squares
+ ld1 {v17.8h}, [x0], x1 // next row goes to the second accumulator
+ sdot z1.d, z17.h, z17.h
+.endr
+
+ add v0.2d, v0.2d, v1.2d // merge the two accumulators
+ addp d0, v0.2d // add the two 64-bit lanes into d0
+ fmov x0, d0 // return all 64 bits of the total instead of truncating to 32 (assumes a 64-bit return type in HBD builds; harmless if it is 32-bit)
+ ret
+endfunc
+
+function PFX(pixel_ssd_s_16x16_sve) // Sum of squares of a 16x16 block of signed 16-bit values. In: x0 = block pointer, x1 = row stride. Out: x0 = 64-bit sum.
+ movi v0.4s, #0 // z0 and z1 are two independent accumulators, both cleared
+ movi v1.4s, #0
+ add x1, x1, x1 // double the stride; presumably converts a stride in 16-bit elements to bytes - confirm against callers
+
+ mov w12, #16 // w12 = rows remaining
+.Loop_ssd_s_16:
+ sub w12, w12, #1
+
+ ld1 {v16.8h-v17.8h}, [x0], x1 // load one row of 16 halfwords into v16/v17, then advance x0 by the stride
+ sdot z0.d, z16.h, z16.h // each 64-bit lane += sum of four halfword squares
+ sdot z1.d, z17.h, z17.h
+ cbnz w12, .Loop_ssd_s_16 // repeat until every row has been consumed
+
+ add v0.2d, v0.2d, v1.2d // merge the two accumulators
+ addp d0, v0.2d // add the two 64-bit lanes into d0
+ fmov x0, d0 // return the 64-bit total
+ ret
+endfunc
+
+function PFX(pixel_ssd_s_32x32_sve) // Sum of squares of a 32x32 block of signed 16-bit values. In: x0 = block pointer, x1 = row stride. Out: x0 = 64-bit sum.
+ movi v0.4s, #0 // z0 and z1 are two independent accumulators, both cleared
+ movi v1.4s, #0
+ add x1, x1, x1 // double the stride; presumably converts a stride in 16-bit elements to bytes - confirm against callers
+
+ mov w12, #32 // w12 = rows remaining
+.Loop_ssd_s_32:
+ sub w12, w12, #1
+
+ ldp q16, q17, [x0] // halfwords 0..15 of the current row
+ sdot z0.d, z16.h, z16.h // each 64-bit lane += sum of four halfword squares
+ sdot z1.d, z17.h, z17.h
+ ldp q16, q17, [x0, #32] // halfwords 16..31 of the current row
+ sdot z0.d, z16.h, z16.h
+ sdot z1.d, z17.h, z17.h
+
+ add x0, x0, x1 // move to the next row
+ cbnz w12, .Loop_ssd_s_32 // repeat until every row has been consumed
+
+ add v0.2d, v0.2d, v1.2d // merge the two accumulators
+ addp d0, v0.2d // add the two 64-bit lanes into d0
+ fmov x0, d0 // return the 64-bit total
+ ret
+endfunc
+
+function PFX(pixel_ssd_s_64x64_sve) // Sum of squares of a 64x64 block of signed 16-bit values. In: x0 = block pointer, x1 = row stride. Out: x0 = 64-bit sum.
+ movi v0.4s, #0 // z0 and z1 are two independent accumulators, both cleared
+ movi v1.4s, #0
+ add x1, x1, x1 // double the stride; presumably converts a stride in 16-bit elements to bytes - confirm against callers
+
+ mov w12, #64 // w12 = rows remaining
+.Loop_ssd_s_64:
+ sub w12, w12, #1
+
+ ldp q16, q17, [x0] // halfwords 0..15 of the current row
+ sdot z0.d, z16.h, z16.h // each 64-bit lane += sum of four halfword squares
+ sdot z1.d, z17.h, z17.h
+ ldp q16, q17, [x0, #32] // halfwords 16..31
+ sdot z0.d, z16.h, z16.h
+ sdot z1.d, z17.h, z17.h
+ ldp q16, q17, [x0, #64] // halfwords 32..47
+ sdot z0.d, z16.h, z16.h
+ sdot z1.d, z17.h, z17.h
+ ldp q16, q17, [x0, #96] // halfwords 48..63
+ sdot z0.d, z16.h, z16.h
+ sdot z1.d, z17.h, z17.h
+
+ add x0, x0, x1 // move to the next row
+ cbnz w12, .Loop_ssd_s_64 // repeat until every row has been consumed
+
+ add v0.2d, v0.2d, v1.2d // merge the two accumulators
+ addp d0, v0.2d // add the two 64-bit lanes into d0
+ fmov x0, d0 // return the 64-bit total
+ ret
+endfunc
+
#endif // HIGH_BIT_DEPTH
.macro SSE_SS_4x2
--
2.39.5 (Apple Git-154)
More information about the x265-devel
mailing list