[x265] [PATCH 8/8] AArch64: Add Armv8.4 Neon DotProd implementations of SADxN

Hari Limaye hari.limaye at arm.com
Thu May 23 17:20:44 UTC 2024


Add implementations of sad_x3 and sad_x4 primitives using Neon DotProd
instructions, which are mandatory from Armv8.4.

The UABD/UDOT instruction sequences accumulate into wider (32-bit) lanes,
which simplifies the final reductions.
---
 source/common/aarch64/asm-primitives.cpp |   2 +
 source/common/aarch64/fun-decls.h        |   2 +
 source/common/aarch64/sad-neon-dotprod.S | 160 +++++++++++++++++++++++
 3 files changed, 164 insertions(+)

diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index 22e812e18..bc0798c4e 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -1183,6 +1183,8 @@ void setupSve2Primitives(EncoderPrimitives &)
 void setupNeonDotProdPrimitives(EncoderPrimitives &p)
 {
     LUMA_PU_MULTIPLE_16(sad, pixel_sad, neon_dotprod);
+    LUMA_PU_MULTIPLE_16(sad_x3, sad_x3, neon_dotprod); // Registers the sad_x3 DotProd kernels; presumably the 16/32/48/64-wide sizes emitted in sad-neon-dotprod.S (confirm against the macro definition).
+    LUMA_PU_MULTIPLE_16(sad_x4, sad_x4, neon_dotprod); // Same for the sad_x4 kernels.
 }
 #else // !HIGH_BIT_DEPTH
 void setupNeonDotProdPrimitives(EncoderPrimitives &)
diff --git a/source/common/aarch64/fun-decls.h b/source/common/aarch64/fun-decls.h
index ad357f245..ba5496032 100644
--- a/source/common/aarch64/fun-decls.h
+++ b/source/common/aarch64/fun-decls.h
@@ -173,6 +173,8 @@ DECLS(sve);
 DECLS(sve2);
 
 FUNCDEF_PU_MULT_16(int, pixel_sad, neon_dotprod, const pixel*, intptr_t, const pixel*, intptr_t);
+FUNCDEF_PU_MULT_16(void, sad_x3, neon_dotprod, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); // As used by the asm: block read with FENC_STRIDE, three blocks compared against it, their common stride, three int32_t results.
+FUNCDEF_PU_MULT_16(void, sad_x4, neon_dotprod, const pixel*, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); // Same layout with four compared blocks and four int32_t results.
 
 void PFX(pixel_planecopy_cp_neon(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift));
 
diff --git a/source/common/aarch64/sad-neon-dotprod.S b/source/common/aarch64/sad-neon-dotprod.S
index 54da6ea26..21ff144f2 100644
--- a/source/common/aarch64/sad-neon-dotprod.S
+++ b/source/common/aarch64/sad-neon-dotprod.S
@@ -140,3 +140,163 @@ SAD_NEON_DOTPROD_LOOP  64, 16
 SAD_NEON_DOTPROD_LOOP  64, 32
 SAD_NEON_DOTPROD_LOOP  64, 48
 SAD_NEON_DOTPROD_LOOP  64, 64
+
+.macro PREP_ARGS_SAD_X_NEON_DOTPROD x       // Common entry setup for the sad_x3 / sad_x4 kernels (x is 3 or 4).
+    mov             x9, #FENC_STRIDE        // x9: post-index step applied to x0 after each row load.
+
+// Make function arguments for x == 3 look like x == 4: stride in x5, results pointer in x6.
+.if \x == 3
+    mov             x6, x5                  // Move the results pointer first: x5 is overwritten on the next line.
+    mov             x5, x4                  // x5 = stride used to advance the compared-block pointers.
+.endif
+
+    // v31: 1 in every byte lane, so UDOT against it adds each group of four bytes into a 32-bit lane.
+    movi            v31.16b, #1
+.endm
+
+.macro SAD_X_NEON_DOTPROD_START x           // Zero the 32-bit accumulators, one vector per compared block.
+    movi v16.4s, #0                         // v16: running sum for the block at x1.
+    movi v17.4s, #0                         // v17: running sum for the block at x2.
+    movi v18.4s, #0                         // v18: running sum for the block at x3.
+.if \x == 4
+    movi v19.4s, #0                         // v19: running sum for the block at x4 (sad_x4 only).
+.endif
+.endm
+
+.macro SAD_X_NEON_DOTPROD_END x             // Reduce the accumulators, store the sums at x6 and return.
+.if \x == 3
+    addv            s0, v16.4s              // Add the four 32-bit lanes of each accumulator into a scalar.
+    addv            s1, v17.4s
+    addv            s2, v18.4s
+    stp             s0, s1, [x6]            // Sums 0 and 1 go to byte offsets 0 and 4.
+    str             s2, [x6, #8]            // Sum 2 goes to byte offset 8; only 12 bytes are written.
+.elseif \x == 4
+    addp            v16.4s, v16.4s, v17.4s  // Pairwise add: v16 = two partial sums of v16, then two of v17.
+    addp            v18.4s, v18.4s, v19.4s  // Likewise for v18 and v19.
+    addp            v16.4s, v16.4s, v18.4s  // v16 = totals of v16, v17, v18, v19 in lanes 0..3.
+    str             q16, [x6]               // Store all four sums with a single 16-byte write.
+.endif
+    ret                                     // The macro ends the enclosing function.
+.endm
+
+// Emits sad_x<x>_16x<h>_neon_dotprod: 16-byte-wide rows, fully unrolled over all h rows (no loop counter).
+.macro SAD_X_NEON_DOTPROD_16 x, h
+function PFX(sad_x\x\()_16x\h\()_neon_dotprod)
+    PREP_ARGS_SAD_X_NEON_DOTPROD \x
+    SAD_X_NEON_DOTPROD_START \x
+.rept \h
+    ld1             {v6.16b}, [x0], x9      // v6 = one 16-byte row from x0; x0 advances by FENC_STRIDE.
+    ld1             {v0.16b}, [x1], x5      // v0..v2 (and v3 when x == 4) = the same row from each compared block;
+    ld1             {v1.16b}, [x2], x5      // each pointer advances by the stride held in x5.
+    ld1             {v2.16b}, [x3], x5
+.if \x == 4
+    ld1             {v3.16b}, [x4], x5
+.endif
+    uabd            v20.16b, v0.16b, v6.16b     // Per-byte absolute difference against the x0 row.
+    udot            v16.4s, v20.16b, v31.16b    // Dot product with all-ones adds the differences into the 32-bit lanes.
+    uabd            v21.16b, v1.16b, v6.16b
+    udot            v17.4s, v21.16b, v31.16b
+    uabd            v22.16b, v2.16b, v6.16b
+    udot            v18.4s, v22.16b, v31.16b
+.if \x == 4
+    uabd            v23.16b, v3.16b, v6.16b
+    udot            v19.4s, v23.16b, v31.16b
+.endif
+.endr
+    SAD_X_NEON_DOTPROD_END \x
+endfunc
+.endm
+
+.macro SAD_X_NEON_DOTPROD_32 base v1        // One 32-byte row: base = pointer register, v1 = accumulator register name.
+    ld1             {v0.16b-v1.16b}, [ \base ], x5  // Literal registers v0-v1 here, not the macro parameter; base advances by x5.
+    uabd            v24.16b, v0.16b, v6.16b         // Compared against v6-v7, which the invoking macro loads from x0.
+    udot            \v1\().4s, v24.16b, v31.16b     // Accumulate the byte differences into the chosen accumulator.
+    uabd            v25.16b, v1.16b, v7.16b
+    udot            \v1\().4s, v25.16b, v31.16b
+.endm
+
+.macro SAD_X_NEON_DOTPROD_48 base v1        // One 48-byte row: base = pointer register, v1 = accumulator register name.
+    ld1             {v0.16b-v2.16b}, [ \base ], x5  // Literal registers v0-v2; base advances by the stride in x5.
+    uabd            v24.16b, v0.16b, v4.16b         // Compared against v4-v6, which the invoking macro loads from x0.
+    udot            \v1\().4s, v24.16b, v31.16b     // Accumulate the byte differences into the chosen accumulator.
+    uabd            v25.16b, v1.16b, v5.16b
+    udot            \v1\().4s, v25.16b, v31.16b
+    uabd            v26.16b, v2.16b, v6.16b
+    udot            \v1\().4s, v26.16b, v31.16b
+.endm
+
+.macro SAD_X_NEON_DOTPROD_64 base v1        // One 64-byte row: base = pointer register, v1 = accumulator register name.
+    ld1             {v0.16b-v3.16b}, [ \base ], x5  // Literal registers v0-v3; base advances by the stride in x5.
+    uabd            v24.16b, v0.16b, v4.16b         // Compared against v4-v7, which the invoking macro loads from x0.
+    udot            \v1\().4s, v24.16b, v31.16b     // Accumulate the byte differences into the chosen accumulator.
+    uabd            v25.16b, v1.16b, v5.16b
+    udot            \v1\().4s, v25.16b, v31.16b
+    uabd            v26.16b, v2.16b, v6.16b
+    udot            \v1\().4s, v26.16b, v31.16b
+    uabd            v27.16b, v3.16b, v7.16b
+    udot            \v1\().4s, v27.16b, v31.16b
+.endm
+
+// Emits sad_x<x>_<w>x<h>_neon_dotprod as a counted loop whose body is unrolled to process 4 rows per iteration.
+.macro SAD_X_NEON_DOTPROD_LOOP x, w, h
+function PFX(sad_x\x\()_\w\()x\h\()_neon_dotprod)
+    PREP_ARGS_SAD_X_NEON_DOTPROD \x
+    SAD_X_NEON_DOTPROD_START \x
+    mov             w12, #\h/4              // w12 = iteration count; every height instantiated with this macro is a multiple of 4.
+.Loop_sad_x\x\()_\w\()x\h:
+    sub             w12, w12, #1            // Decremented up front; tested by the cbnz at the bottom of the loop.
+ .rept 4
+  .if \w == 16                              // NOTE(review): no width-16 instantiation uses this macro here, and SAD_X_NEON_DOTPROD_16 emits a whole function rather than a row step.
+    ld1             {v6.16b}, [x0], x9
+  .elseif \w == 32
+    ld1             {v6.16b-v7.16b}, [x0], x9       // Row from x0 in v6-v7, as the 32-wide row macro expects.
+  .elseif \w == 48
+    ld1             {v4.16b-v6.16b}, [x0], x9       // Row from x0 in v4-v6, as the 48-wide row macro expects.
+  .elseif \w == 64
+    ld1             {v4.16b-v7.16b}, [x0], x9       // Row from x0 in v4-v7, as the 64-wide row macro expects.
+  .endif
+    SAD_X_NEON_DOTPROD_\w x1, v16           // One row of each compared block, accumulated into its own vector.
+    SAD_X_NEON_DOTPROD_\w x2, v17
+    SAD_X_NEON_DOTPROD_\w x3, v18
+  .if \x == 4
+    SAD_X_NEON_DOTPROD_\w x4, v19
+  .endif
+ .endr
+    cbnz            w12, .Loop_sad_x\x\()_\w\()x\h
+    SAD_X_NEON_DOTPROD_END \x
+endfunc
+.endm
+
+SAD_X_NEON_DOTPROD_16 3, 4                  // sad_x3, width 16: fully unrolled variants.
+SAD_X_NEON_DOTPROD_16 3, 8
+SAD_X_NEON_DOTPROD_16 3, 12
+SAD_X_NEON_DOTPROD_16 3, 16
+SAD_X_NEON_DOTPROD_16 3, 32
+SAD_X_NEON_DOTPROD_16 3, 64
+SAD_X_NEON_DOTPROD_LOOP 3, 32, 8            // sad_x3, widths 32/48/64: looped variants, 4 rows per iteration.
+SAD_X_NEON_DOTPROD_LOOP 3, 32, 16
+SAD_X_NEON_DOTPROD_LOOP 3, 32, 24
+SAD_X_NEON_DOTPROD_LOOP 3, 32, 32
+SAD_X_NEON_DOTPROD_LOOP 3, 32, 64
+SAD_X_NEON_DOTPROD_LOOP 3, 48, 64
+SAD_X_NEON_DOTPROD_LOOP 3, 64, 16
+SAD_X_NEON_DOTPROD_LOOP 3, 64, 32
+SAD_X_NEON_DOTPROD_LOOP 3, 64, 48
+SAD_X_NEON_DOTPROD_LOOP 3, 64, 64
+
+SAD_X_NEON_DOTPROD_16 4, 4                  // sad_x4, width 16: fully unrolled variants.
+SAD_X_NEON_DOTPROD_16 4, 8
+SAD_X_NEON_DOTPROD_16 4, 12
+SAD_X_NEON_DOTPROD_16 4, 16
+SAD_X_NEON_DOTPROD_16 4, 32
+SAD_X_NEON_DOTPROD_16 4, 64
+SAD_X_NEON_DOTPROD_LOOP 4, 32, 8            // sad_x4, widths 32/48/64: looped variants, 4 rows per iteration.
+SAD_X_NEON_DOTPROD_LOOP 4, 32, 16
+SAD_X_NEON_DOTPROD_LOOP 4, 32, 24
+SAD_X_NEON_DOTPROD_LOOP 4, 32, 32
+SAD_X_NEON_DOTPROD_LOOP 4, 32, 64
+SAD_X_NEON_DOTPROD_LOOP 4, 48, 64
+SAD_X_NEON_DOTPROD_LOOP 4, 64, 16
+SAD_X_NEON_DOTPROD_LOOP 4, 64, 32
+SAD_X_NEON_DOTPROD_LOOP 4, 64, 48
+SAD_X_NEON_DOTPROD_LOOP 4, 64, 64
-- 
2.42.1

-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0008-AArch64-Add-Armv8.4-Neon-DotProd-implementations-of-.patch
Type: text/x-patch
Size: 7381 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20240523/e6d64eda/attachment-0001.bin>


More information about the x265-devel mailing list