[x265] [PATCH v2 8/8] AArch64: Add Armv8.4 Neon DotProd implementations of SADxN
Hari Limaye
hari.limaye at arm.com
Tue Jul 30 15:47:19 UTC 2024
Add implementations of sad_x3 and sad_x4 primitives using Neon DotProd
instructions, which are mandatory from Armv8.4.
The UABD/UDOT instruction sequences use wider (32-bit) accumulators,
which simplifies the reductions.
---
source/common/aarch64/asm-primitives.cpp | 2 +
source/common/aarch64/fun-decls.h | 2 +
source/common/aarch64/sad-neon-dotprod.S | 160 +++++++++++++++++++++++
3 files changed, 164 insertions(+)
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index 705881abf..825aa1c8b 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -1183,6 +1183,8 @@ void setupSve2Primitives(EncoderPrimitives &)
void setupNeonDotProdPrimitives(EncoderPrimitives &p)
{
LUMA_PU_MULTIPLE_16(sad, pixel_sad, neon_dotprod);
+ LUMA_PU_MULTIPLE_16(sad_x3, sad_x3, neon_dotprod); // New: presumably points the sad_x3 entries of p at the *_neon_dotprod kernels (macro is defined outside this hunk, confirm).
+ LUMA_PU_MULTIPLE_16(sad_x4, sad_x4, neon_dotprod); // New: same for the sad_x4 entries.
}
#else // !HIGH_BIT_DEPTH
void setupNeonDotProdPrimitives(EncoderPrimitives &)
diff --git a/source/common/aarch64/fun-decls.h b/source/common/aarch64/fun-decls.h
index ad357f245..ba5496032 100644
--- a/source/common/aarch64/fun-decls.h
+++ b/source/common/aarch64/fun-decls.h
@@ -173,6 +173,8 @@ DECLS(sve);
DECLS(sve2);
FUNCDEF_PU_MULT_16(int, pixel_sad, neon_dotprod, const pixel*, intptr_t, const pixel*, intptr_t);
+FUNCDEF_PU_MULT_16(void, sad_x3, neon_dotprod, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); // As used by the assembly: source block, three candidate blocks, step for the candidates, output for three 32-bit sums.
+FUNCDEF_PU_MULT_16(void, sad_x4, neon_dotprod, const pixel*, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); // Same with four candidate blocks and four 32-bit sums.
void PFX(pixel_planecopy_cp_neon(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift));
diff --git a/source/common/aarch64/sad-neon-dotprod.S b/source/common/aarch64/sad-neon-dotprod.S
index c51ddb527..baf90aa7c 100644
--- a/source/common/aarch64/sad-neon-dotprod.S
+++ b/source/common/aarch64/sad-neon-dotprod.S
@@ -168,3 +168,163 @@ SAD_NEON_DOTPROD_LOOP 64, 16
SAD_NEON_DOTPROD_LOOP 64, 32
SAD_NEON_DOTPROD_LOOP 64, 48
SAD_NEON_DOTPROD_LOOP 64, 64
+
+.macro PREP_ARGS_SAD_X_NEON_DOTPROD x // Common prologue: sets up x9, x5/x6 and v31 for the sad_x3 and sad_x4 bodies.
+ mov x9, #FENC_STRIDE // x9: post-increment applied to x0 after each row load.
+
+// Make function arguments for x == 3 look like x == 4 (sad_x3 has one pointer argument fewer, so its last two arguments are moved up one register).
+.if \x == 3
+ mov x6, x5 // x6: address the END macro stores the sums to. Copied first because the next mov overwrites x5.
+ mov x5, x4 // x5: post-increment applied to x1-x4 after each row load.
+.endif
+
+ // v31: 1 across all lanes for use in UDOT instructions.
+ movi v31.16b, #1 // UDOT against all ones adds each group of four bytes into one 32-bit lane.
+.endm
+
+.macro SAD_X_NEON_DOTPROD_START x // Zero the 32-bit accumulators: v16-v18 always, v19 only when x == 4.
+ movi v16.4s, #0 // accumulator for the rows read through x1
+ movi v17.4s, #0 // accumulator for the rows read through x2
+ movi v18.4s, #0 // accumulator for the rows read through x3
+.if \x == 4
+ movi v19.4s, #0 // accumulator for the rows read through x4
+.endif
+.endm
+
+.macro SAD_X_NEON_DOTPROD_END x // Epilogue: reduce each accumulator to one 32-bit sum, store the sums through x6, return.
+.if \x == 3
+ addv s0, v16.4s // s0 = sum of the four lanes of v16
+ addv s1, v17.4s // s1 = sum of the four lanes of v17
+ addv s2, v18.4s // s2 = sum of the four lanes of v18
+ stp s0, s1, [x6] // sums for v16 and v17 at byte offsets 0 and 4
+ str s2, [x6, #8] // sum for v18 at byte offset 8 (12 bytes written in total)
+.elseif \x == 4
+ addp v16.4s, v16.4s, v17.4s // pairwise add: lanes 0-1 hold partial sums of v16, lanes 2-3 partial sums of v17
+ addp v18.4s, v18.4s, v19.4s // likewise for v18 (lanes 0-1) and v19 (lanes 2-3)
+ addp v16.4s, v16.4s, v18.4s // second pairwise add: lanes 0-3 = totals of v16, v17, v18, v19 in that order
+ str q16, [x6] // all four 32-bit sums written with one 16-byte store
+.endif
+ ret
+.endm
+
+// Fully unrolled: defines one 16-wide function whose per-row body is repeated h times by .rept, so there is no loop counter or branch.
+.macro SAD_X_NEON_DOTPROD_16 x, h
+function PFX(sad_x\x\()_16x\h\()_neon_dotprod)
+ PREP_ARGS_SAD_X_NEON_DOTPROD \x
+ SAD_X_NEON_DOTPROD_START \x
+.rept \h
+ ld1 {v6.16b}, [x0], x9 // v6 = 16 bytes at x0 (compared against every candidate row below), then x0 += x9
+ ld1 {v0.16b}, [x1], x5 // v0-v2 (and v3 when x == 4) = 16 bytes from x1-x3 (and x4), each pointer then += x5
+ ld1 {v1.16b}, [x2], x5
+ ld1 {v2.16b}, [x3], x5
+.if \x == 4
+ ld1 {v3.16b}, [x4], x5
+.endif
+ uabd v20.16b, v0.16b, v6.16b // per-byte absolute difference
+ udot v16.4s, v20.16b, v31.16b // v31 is all ones, so each group of 4 difference bytes is added into a 32-bit lane of v16
+ uabd v21.16b, v1.16b, v6.16b
+ udot v17.4s, v21.16b, v31.16b // v17 accumulates for the x2 rows
+ uabd v22.16b, v2.16b, v6.16b
+ udot v18.4s, v22.16b, v31.16b // v18 accumulates for the x3 rows
+.if \x == 4
+ uabd v23.16b, v3.16b, v6.16b
+ udot v19.4s, v23.16b, v31.16b // v19 accumulates for the x4 rows
+.endif
+.endr
+ SAD_X_NEON_DOTPROD_END \x
+endfunc
+.endm
+
+.macro SAD_X_NEON_DOTPROD_32 base v1 // One 32-byte row for one candidate: base is the pointer register, the second argument names the accumulator. NOTE(review): that parameter is called v1 but only its escaped form is substituted, a bare v1 below is the real register.
+ ld1 {v0.16b-v1.16b}, [ \base ], x5 // v0-v1 = 32 bytes from base, then base += x5
+ uabd v24.16b, v0.16b, v6.16b // v6-v7 must already hold the x0 row (loaded by SAD_X_NEON_DOTPROD_LOOP)
+ udot \v1\().4s, v24.16b, v31.16b // add the 16 differences into the accumulator, 4 bytes per 32-bit lane
+ uabd v25.16b, v1.16b, v7.16b // second 16 bytes of the row
+ udot \v1\().4s, v25.16b, v31.16b
+.endm
+
+.macro SAD_X_NEON_DOTPROD_48 base v1 // One 48-byte row for one candidate: base is the pointer register, the second argument names the accumulator (the parameter is called v1, a bare v1 below is the real register).
+ ld1 {v0.16b-v2.16b}, [ \base ], x5 // v0-v2 = 48 bytes from base, then base += x5
+ uabd v24.16b, v0.16b, v4.16b // v4-v6 must already hold the x0 row (loaded by SAD_X_NEON_DOTPROD_LOOP)
+ udot \v1\().4s, v24.16b, v31.16b // add the 16 differences into the accumulator, 4 bytes per 32-bit lane
+ uabd v25.16b, v1.16b, v5.16b // bytes 16-31 of the row
+ udot \v1\().4s, v25.16b, v31.16b
+ uabd v26.16b, v2.16b, v6.16b // bytes 32-47 of the row
+ udot \v1\().4s, v26.16b, v31.16b
+.endm
+
+.macro SAD_X_NEON_DOTPROD_64 base v1 // One 64-byte row for one candidate: base is the pointer register, the second argument names the accumulator (the parameter is called v1, a bare v1 below is the real register).
+ ld1 {v0.16b-v3.16b}, [ \base ], x5 // v0-v3 = 64 bytes from base, then base += x5
+ uabd v24.16b, v0.16b, v4.16b // v4-v7 must already hold the x0 row (loaded by SAD_X_NEON_DOTPROD_LOOP)
+ udot \v1\().4s, v24.16b, v31.16b // add the 16 differences into the accumulator, 4 bytes per 32-bit lane
+ uabd v25.16b, v1.16b, v5.16b // bytes 16-31 of the row
+ udot \v1\().4s, v25.16b, v31.16b
+ uabd v26.16b, v2.16b, v6.16b // bytes 32-47 of the row
+ udot \v1\().4s, v26.16b, v31.16b
+ uabd v27.16b, v3.16b, v7.16b // bytes 48-63 of the row
+ udot \v1\().4s, v27.16b, v31.16b
+.endm
+
+// Loop unrolled to process 4 rows per iteration. Defines one function for width w and height h, with h/4 loop iterations.
+.macro SAD_X_NEON_DOTPROD_LOOP x, w, h
+function PFX(sad_x\x\()_\w\()x\h\()_neon_dotprod)
+ PREP_ARGS_SAD_X_NEON_DOTPROD \x
+ SAD_X_NEON_DOTPROD_START \x
+ mov w12, #\h/4 // w12: iterations remaining. Every instantiation in this patch passes an h that is a multiple of 4.
+.Loop_sad_x\x\()_\w\()x\h:
+ sub w12, w12, #1 // decremented at the top, tested by the cbnz at the bottom
+ .rept 4
+ .if \w == 16
+ ld1 {v6.16b}, [x0], x9 // NOTE(review): no instantiation in this patch uses w == 16 here, and SAD_X_NEON_DOTPROD_16 is the function-defining macro rather than a per-row helper, so this branch would not assemble as intended if selected.
+ .elseif \w == 32
+ ld1 {v6.16b-v7.16b}, [x0], x9 // x0 row in v6-v7, the registers SAD_X_NEON_DOTPROD_32 compares against
+ .elseif \w == 48
+ ld1 {v4.16b-v6.16b}, [x0], x9 // x0 row in v4-v6, the registers SAD_X_NEON_DOTPROD_48 compares against
+ .elseif \w == 64
+ ld1 {v4.16b-v7.16b}, [x0], x9 // x0 row in v4-v7, the registers SAD_X_NEON_DOTPROD_64 compares against
+ .endif
+ SAD_X_NEON_DOTPROD_\w x1, v16
+ SAD_X_NEON_DOTPROD_\w x2, v17
+ SAD_X_NEON_DOTPROD_\w x3, v18
+ .if \x == 4
+ SAD_X_NEON_DOTPROD_\w x4, v19
+ .endif
+ .endr
+ cbnz w12, .Loop_sad_x\x\()_\w\()x\h
+ SAD_X_NEON_DOTPROD_END \x
+endfunc
+.endm
+
+SAD_X_NEON_DOTPROD_16 3, 4 // sad_x3, width 16: fully unrolled variants for heights 4, 8, 12, 16, 32, 64
+SAD_X_NEON_DOTPROD_16 3, 8
+SAD_X_NEON_DOTPROD_16 3, 12
+SAD_X_NEON_DOTPROD_16 3, 16
+SAD_X_NEON_DOTPROD_16 3, 32
+SAD_X_NEON_DOTPROD_16 3, 64
+SAD_X_NEON_DOTPROD_LOOP 3, 32, 8 // sad_x3, widths 32, 48, 64: looped variants (4 rows per iteration)
+SAD_X_NEON_DOTPROD_LOOP 3, 32, 16
+SAD_X_NEON_DOTPROD_LOOP 3, 32, 24
+SAD_X_NEON_DOTPROD_LOOP 3, 32, 32
+SAD_X_NEON_DOTPROD_LOOP 3, 32, 64
+SAD_X_NEON_DOTPROD_LOOP 3, 48, 64
+SAD_X_NEON_DOTPROD_LOOP 3, 64, 16
+SAD_X_NEON_DOTPROD_LOOP 3, 64, 32
+SAD_X_NEON_DOTPROD_LOOP 3, 64, 48
+SAD_X_NEON_DOTPROD_LOOP 3, 64, 64
+
+SAD_X_NEON_DOTPROD_16 4, 4 // sad_x4, width 16: same set of heights as the sad_x3 list
+SAD_X_NEON_DOTPROD_16 4, 8
+SAD_X_NEON_DOTPROD_16 4, 12
+SAD_X_NEON_DOTPROD_16 4, 16
+SAD_X_NEON_DOTPROD_16 4, 32
+SAD_X_NEON_DOTPROD_16 4, 64
+SAD_X_NEON_DOTPROD_LOOP 4, 32, 8 // sad_x4, widths 32, 48, 64: same set of sizes as the sad_x3 list
+SAD_X_NEON_DOTPROD_LOOP 4, 32, 16
+SAD_X_NEON_DOTPROD_LOOP 4, 32, 24
+SAD_X_NEON_DOTPROD_LOOP 4, 32, 32
+SAD_X_NEON_DOTPROD_LOOP 4, 32, 64
+SAD_X_NEON_DOTPROD_LOOP 4, 48, 64
+SAD_X_NEON_DOTPROD_LOOP 4, 64, 16
+SAD_X_NEON_DOTPROD_LOOP 4, 64, 32
+SAD_X_NEON_DOTPROD_LOOP 4, 64, 48
+SAD_X_NEON_DOTPROD_LOOP 4, 64, 64
--
2.42.1
-------------- next part --------------
A non-text attachment was scrubbed...
Name: v2-0008-AArch64-Add-Armv8.4-Neon-DotProd-implementations-.patch
Type: text/x-patch
Size: 7384 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20240730/a61ac94a/attachment-0001.bin>
More information about the x265-devel
mailing list