[x265] [PATCH 3/3] AArch64: Add Neon asm implementation of HBD SAD4D
Gerda Zsejke More
gerdazsejke.more at arm.com
Fri Nov 15 11:18:22 UTC 2024
Add a Neon asm implementation of high bitdepth SAD4D functions for
all block sizes. This implementation is 6%-11% faster on Neoverse
platforms than the existing sad_x4_neon<w,h> Neon intrinsics
implementation.
---
source/common/aarch64/asm-primitives.cpp | 4 +-
source/common/aarch64/sad-a.S | 581 +++++++++++++++--------
2 files changed, 373 insertions(+), 212 deletions(-)
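
For readers unfamiliar with the SAD4D primitive, the sketch below shows the
scalar behaviour these kernels implement. It is an illustrative reference
only, not x265 code: pixel is assumed to be uint16_t (high bit depth build),
lx/ly stand in for the block width and height baked into each generated
function, and FENC_STRIDE is the fixed stride of the fenc buffer (64 pixels
in x265). The fenc block is compared against four reference blocks that share
frefstride, and one 32-bit SAD is written per reference.

    #include <stdint.h>
    #include <stdlib.h>

    #define FENC_STRIDE 64  /* fixed fenc stride, in pixels (assumption noted above) */

    /* Scalar model of sad_x4_<lx>x<ly> for the high bit depth build
     * (pixel == uint16_t). res[0..3] receive one SAD per reference block. */
    static void sad_x4_ref(const uint16_t *fenc,
                           const uint16_t *ref0, const uint16_t *ref1,
                           const uint16_t *ref2, const uint16_t *ref3,
                           intptr_t frefstride, int32_t *res,
                           int lx, int ly)
    {
        const uint16_t *refs[4] = { ref0, ref1, ref2, ref3 };

        for (int r = 0; r < 4; r++)
        {
            const uint16_t *p1 = fenc;
            const uint16_t *p2 = refs[r];
            int32_t sum = 0;

            for (int y = 0; y < ly; y++)
            {
                for (int x = 0; x < lx; x++)
                    sum += abs((int)p1[x] - (int)p2[x]);
                p1 += FENC_STRIDE;   /* fenc rows use the fixed stride */
                p2 += frefstride;    /* reference rows use frefstride  */
            }
            res[r] = sum;
        }
    }
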
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index 283256679..0a20085bf 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -527,6 +527,7 @@ void setupNeonPrimitives(EncoderPrimitives &p)
// sad
ALL_LUMA_PU(sad, pixel_sad, neon);
ALL_LUMA_PU(sad_x3, sad_x3, neon);
+ ALL_LUMA_PU(sad_x4, sad_x4, neon);
#if !HIGH_BIT_DEPTH
// pixel_avg_pp
@@ -541,9 +542,6 @@ void setupNeonPrimitives(EncoderPrimitives &p)
ALL_CHROMA_420_PU(addAvg[ALIGNED], addAvg, neon);
ALL_CHROMA_422_PU(addAvg[ALIGNED], addAvg, neon);
- // sad
- ALL_LUMA_PU(sad_x4, sad_x4, neon);
-
// sse_pp
p.cu[BLOCK_4x4].sse_pp = PFX(pixel_sse_pp_4x4_neon);
p.cu[BLOCK_8x8].sse_pp = PFX(pixel_sse_pp_8x8_neon);
diff --git a/source/common/aarch64/sad-a.S b/source/common/aarch64/sad-a.S
index 642fd29f3..bf5495ae4 100644
--- a/source/common/aarch64/sad-a.S
+++ b/source/common/aarch64/sad-a.S
@@ -921,46 +921,59 @@ SAD_FUNC_LOOP_LARGE 64, 32
SAD_FUNC_LOOP_LARGE 64, 48
SAD_FUNC_LOOP_LARGE 64, 64
-// void sad_x3(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res)
-.macro SAD_x3_4 f
- ld1 {v0.4h}, [x0], x6
- ld1 {v1.4h}, [x1], x4
- ld1 {v2.4h}, [x2], x4
- ld1 {v3.4h}, [x3], x4
+// void sad_x3(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res)
+// void sad_x4(const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res)
+.macro SAD_xN_4 n, f
+ ld1 {v0.4h}, [x0], x7
+ ld1 {v1.4h}, [x1], x5
+ ld1 {v2.4h}, [x2], x5
+ ld1 {v3.4h}, [x3], x5
\f v16.4s, v0.4h, v1.4h
\f v17.4s, v0.4h, v2.4h
\f v18.4s, v0.4h, v3.4h
+.if \n == 4
+ ld1 {v4.4h}, [x4], x5
+ \f v19.4s, v0.4h, v4.4h
+.endif
.endm
-.macro SAD_x3_4xH h
- SAD_x3_4 uabdl
+.macro SAD_xN_4xH n, h
+ SAD_xN_4 \n, uabdl
.rept \h - 1
- SAD_x3_4 uabal
+ SAD_xN_4 \n, uabal
.endr
.endm
-.macro SAD_x3_8x2 f
- ld1 {v0.8h}, [x0], x6
- ld1 {v1.8h}, [x1], x4
- ld1 {v2.8h}, [x2], x4
- ld1 {v3.8h}, [x3], x4
+.macro SAD_xN_8x2 n, f
+ ld1 {v0.8h}, [x0], x7
+ ld1 {v1.8h}, [x1], x5
+ ld1 {v2.8h}, [x2], x5
+ ld1 {v3.8h}, [x3], x5
\f v16.8h, v0.8h, v1.8h
\f v17.8h, v0.8h, v2.8h
\f v18.8h, v0.8h, v3.8h
+.if \n == 4
+ ld1 {v4.8h}, [x4], x5
+ \f v22.8h, v0.8h, v4.8h
+.endif
- ld1 {v0.8h}, [x0], x6
- ld1 {v1.8h}, [x1], x4
- ld1 {v2.8h}, [x2], x4
- ld1 {v3.8h}, [x3], x4
+ ld1 {v0.8h}, [x0], x7
+ ld1 {v1.8h}, [x1], x5
+ ld1 {v2.8h}, [x2], x5
+ ld1 {v3.8h}, [x3], x5
\f v19.8h, v0.8h, v1.8h
\f v20.8h, v0.8h, v2.8h
\f v21.8h, v0.8h, v3.8h
+.if \n == 4
+ ld1 {v4.8h}, [x4], x5
+ \f v23.8h, v0.8h, v4.8h
+.endif
.endm
-.macro SAD_x3_8xH h
- SAD_x3_8x2 uabd
-.rept \h/2 - 1
- SAD_x3_8x2 uaba
+.macro SAD_xN_8xH n, h
+ SAD_xN_8x2 \n, uabd
+.rept \h/2 - 1
+ SAD_xN_8x2 \n, uaba
.endr
uaddlp v16.4s, v16.8h
uadalp v16.4s, v19.8h
@@ -968,28 +981,45 @@ SAD_FUNC_LOOP_LARGE 64, 64
uadalp v17.4s, v20.8h
uaddlp v18.4s, v18.8h
uadalp v18.4s, v21.8h
+.if \n == 4
+ uaddlp v19.4s, v22.8h
+ uadalp v19.4s, v23.8h
+.endif
.endm
-.macro SAD_x3_FUNC w, h
-function PFX(sad_x3_\w\()x\h\()_neon)
+.macro SAD_xN_FUNC n, w, h
+function PFX(sad_x\n\()_\w\()x\h\()_neon)
+ // Make function arguments for n == 3 look like n == 4.
+.if \n == 3
+ mov x6, x5
+ mov x5, x4
+.endif
+
// Stride is given in terms of pixel channel size, so double to get number of bytes.
- add x4, x4, x4
- mov x6, #(FENC_STRIDE << 1)
+ add x5, x5, x5
+ mov x7, #(FENC_STRIDE << 1)
- SAD_x3_\w\()xH \h
+ SAD_xN_\w\()xH \n, \h
+.if \n == 3
addp v0.4s, v16.4s, v17.4s
addp v1.4s, v18.4s, v18.4s
addp v0.4s, v0.4s, v1.4s
- str d0, [x5]
- add x5, x5, #8
- st1 {v0.s}[2], [x5]
+ str d0, [x6]
+ add x6, x6, #8
+ st1 {v0.s}[2], [x6]
+.else
+ addp v16.4s, v16.4s, v17.4s
+ addp v18.4s, v18.4s, v19.4s
+ addp v16.4s, v16.4s, v18.4s
+ str q16, [x6]
+.endif
ret
endfunc
.endm
-.macro SAD_x3_12 f
+.macro SAD_xN_12 n, f
ldr q0, [x0]
ldr q1, [x1]
ldr q2, [x2]
@@ -1004,57 +1034,82 @@ endfunc
\f v17.8h, v4.8h, v5.8h
\f v19.8h, v4.8h, v6.8h
\f v21.8h, v4.8h, v7.8h
- add x0, x0, x6
- add x1, x1, x4
- add x2, x2, x4
- add x3, x3, x4
+ add x0, x0, x7
+ add x1, x1, x5
+ add x2, x2, x5
+ add x3, x3, x5
+.if \n == 4
+ ldr q3, [x4]
+ ldr d7, [x4, #16]
+ \f v22.8h, v0.8h, v3.8h
+ \f v23.8h, v4.8h, v7.8h
+ add x4, x4, x5
+.endif
.endm
-.macro SAD_x3_16 f
- ld1 {v0.8h-v1.8h}, [x0], x6
- ld1 {v2.8h-v3.8h}, [x1], x4
+.macro SAD_xN_16 n f
+ ld1 {v0.8h-v1.8h}, [x0], x7
+ ld1 {v2.8h-v3.8h}, [x1], x5
\f v16.8h, v0.8h, v2.8h
\f v17.8h, v1.8h, v3.8h
- ld1 {v4.8h-v5.8h}, [x2], x4
+ ld1 {v4.8h-v5.8h}, [x2], x5
\f v18.8h, v0.8h, v4.8h
\f v19.8h, v1.8h, v5.8h
- ld1 {v6.8h-v7.8h}, [x3], x4
+ ld1 {v6.8h-v7.8h}, [x3], x5
\f v20.8h, v0.8h, v6.8h
\f v21.8h, v1.8h, v7.8h
+.if \n == 4
+ ld1 {v6.8h-v7.8h}, [x4], x5
+ \f v22.8h, v0.8h, v6.8h
+ \f v23.8h, v1.8h, v7.8h
+.endif
.endm
-.macro SAD_x3_32 f
- ld1 {v0.8h-v3.8h}, [x0], x6
- ld1 {v4.8h-v7.8h}, [x1], x4
+.macro SAD_xN_32 n f
+ ld1 {v0.8h-v3.8h}, [x0], x7
+ ld1 {v4.8h-v7.8h}, [x1], x5
\f v16.8h, v0.8h, v4.8h
uaba v16.8h, v1.8h, v5.8h
\f v17.8h, v2.8h, v6.8h
uaba v17.8h, v3.8h, v7.8h
- ld1 {v4.8h-v7.8h},[x2], x4
+ ld1 {v4.8h-v7.8h}, [x2], x5
\f v18.8h, v0.8h, v4.8h
uaba v18.8h, v1.8h, v5.8h
\f v19.8h, v2.8h, v6.8h
uaba v19.8h, v3.8h, v7.8h
- ld1 {v4.8h-v7.8h},[x3], x4
+ ld1 {v4.8h-v7.8h}, [x3], x5
\f v20.8h, v0.8h, v4.8h
uaba v20.8h, v1.8h, v5.8h
\f v21.8h, v2.8h, v6.8h
uaba v21.8h, v3.8h, v7.8h
+.if \n == 4
+ ld1 {v4.8h-v7.8h}, [x4], x5
+ \f v22.8h, v0.8h, v4.8h
+ uaba v22.8h, v1.8h, v5.8h
+ \f v23.8h, v2.8h, v6.8h
+ uaba v23.8h, v3.8h, v7.8h
+.endif
.endm
-.macro SAD_x3_FUNC_LOOP w, h
-function PFX(sad_x3_\w\()x\h\()_neon)
+.macro SAD_xN_FUNC_LOOP n, w, h
+function PFX(sad_x\n\()_\w\()x\h\()_neon)
+ // Make function arguments for n == 3 look like n == 4.
+.if \n == 3
+ mov x6, x5
+ mov x5, x4
+.endif
+
// Stride is given in terms of pixel channel size, so double to get number of bytes.
- add x4, x4, x4
- mov x6, #(FENC_STRIDE << 1)
+ add x5, x5, x5
+ mov x7, #(FENC_STRIDE << 1)
- SAD_x3_\w uabd
+ SAD_xN_\w \n, uabd
- mov w9, #\h - 1
-.Loop_x_\w\()x\h:
- sub w9, w9, #1
- SAD_x3_\w uaba
- cbnz w9, .Loop_x_\w\()x\h
+ mov w8, #\h - 1
+.Loop_x\n\()_\w\()x\h:
+ sub w8, w8, #1
+ SAD_xN_\w \n, uaba
+ cbnz w8, .Loop_x\n\()_\w\()x\h
uaddlp v16.4s, v16.8h
uadalp v16.4s, v17.8h
@@ -1062,61 +1117,86 @@ function PFX(sad_x3_\w\()x\h\()_neon)
uadalp v18.4s, v19.8h
uaddlp v20.4s, v20.8h
uadalp v20.4s, v21.8h
+
+.if \n == 3
addp v0.4s, v16.4s, v18.4s
addp v1.4s, v20.4s, v20.4s
addp v0.4s, v0.4s, v1.4s
- str d0, [x5]
- add x5, x5, #8
- st1 {v0.s}[2], [x5]
+ str d0, [x6]
+ add x6, x6, #8
+ st1 {v0.s}[2], [x6]
+.else
+ uaddlp v22.4s, v22.8h
+ uadalp v22.4s, v23.8h
+ addp v16.4s, v16.4s, v18.4s
+ addp v20.4s, v20.4s, v22.4s
+ addp v16.4s, v16.4s, v20.4s
+ str q16, [x6]
+.endif
ret
endfunc
.endm
-.macro SAD_x3_16_WIDEN f
- ld1 {v0.8h-v1.8h}, [x0], x6
- ld1 {v2.8h-v3.8h}, [x1], x4
+.macro SAD_xN_16_WIDEN n f
+ ld1 {v0.8h-v1.8h}, [x0], x7
+ ld1 {v2.8h-v3.8h}, [x1], x5
uabd v22.8h, v0.8h, v2.8h
\f v16.4s, v22.8h
uabd v23.8h, v1.8h, v3.8h
\f v17.4s, v23.8h
- ld1 {v4.8h-v5.8h}, [x2], x4
+ ld1 {v4.8h-v5.8h}, [x2], x5
uabd v24.8h, v0.8h, v4.8h
\f v18.4s, v24.8h
uabd v25.8h, v1.8h, v5.8h
\f v19.4s, v25.8h
- ld1 {v6.8h-v7.8h}, [x3], x4
+ ld1 {v6.8h-v7.8h}, [x3], x5
uabd v26.8h, v0.8h, v6.8h
\f v20.4s, v26.8h
uabd v27.8h, v1.8h, v7.8h
\f v21.4s, v27.8h
+.if \n == 4
+ ld1 {v2.8h-v3.8h}, [x4], x5
+ uabd v28.8h, v0.8h, v2.8h
+ \f v30.4s, v28.8h
+ uabd v29.8h, v1.8h, v3.8h
+ \f v31.4s, v29.8h
+.endif
.endm
-.macro SAD_x3_24_WIDEN f
- ld1 {v0.8h-v2.8h}, [x0], x6
- ld1 {v3.8h-v5.8h}, [x1], x4
- uabd v22.8h, v0.8h, v3.8h
- uaba v22.8h, v1.8h, v4.8h
- \f v16.4s, v22.8h
- uabd v23.8h, v2.8h, v5.8h
- \f v17.4s, v23.8h
- ld1 {v28.8h-v30.8h}, [x2], x4
- uabd v24.8h, v0.8h, v28.8h
- uaba v24.8h, v1.8h, v29.8h
- \f v18.4s, v24.8h
- uabd v25.8h, v2.8h, v30.8h
- \f v19.4s, v25.8h
- ld1 {v3.8h-v5.8h}, [x3], x4
- uabd v26.8h, v0.8h, v3.8h
- uaba v26.8h, v1.8h, v4.8h
- \f v20.4s, v26.8h
- uabd v27.8h, v2.8h, v5.8h
- \f v21.4s, v27.8h
+.macro SAD_xN_24_WIDEN n f
+ ld1 {v0.8h-v2.8h}, [x0], x7
+ ld1 {v3.8h-v5.8h}, [x1], x5
+ uabd v6.8h, v0.8h, v3.8h
+ uaba v6.8h, v1.8h, v4.8h
+ \f v16.4s, v6.8h
+ uabd v7.8h, v2.8h, v5.8h
+ \f v17.4s, v7.8h
+ ld1 {v27.8h-v29.8h}, [x2], x5
+ uabd v22.8h, v0.8h, v27.8h
+ uaba v22.8h, v1.8h, v28.8h
+ \f v18.4s, v22.8h
+ uabd v23.8h, v2.8h, v29.8h
+ \f v19.4s, v23.8h
+ ld1 {v3.8h-v5.8h}, [x3], x5
+ uabd v24.8h, v0.8h, v3.8h
+ uaba v24.8h, v1.8h, v4.8h
+ \f v20.4s, v24.8h
+ uabd v25.8h, v2.8h, v5.8h
+ \f v21.4s, v25.8h
+.if \n == 4
+ ld1 {v27.8h-v29.8h}, [x4], x5
+ uabd v22.8h, v0.8h, v27.8h
+ uaba v22.8h, v1.8h, v28.8h
+ \f v30.4s, v22.8h
+ uabd v23.8h, v2.8h, v29.8h
+ \f v31.4s, v23.8h
+.endif
.endm
-.macro SAD_x3_32_WIDEN f
- ld1 {v0.8h-v3.8h}, [x0], x6
- ld1 {v4.8h-v7.8h}, [x1], x4
+.macro SAD_xN_32_WIDEN n f
+ ld1 {v0.8h-v3.8h}, [x0], x7
+ ld1 {v4.8h-v7.8h}, [x1], x5
uabd v22.8h, v0.8h, v4.8h
uaba v22.8h, v1.8h, v5.8h
\f v16.4s, v22.8h
@@ -1124,7 +1204,7 @@ endfunc
uaba v23.8h, v3.8h, v7.8h
\f v17.4s, v23.8h
- ld1 {v4.8h-v7.8h}, [x2], x4
+ ld1 {v4.8h-v7.8h}, [x2], x5
uabd v24.8h, v0.8h, v4.8h
uaba v24.8h, v1.8h, v5.8h
\f v18.4s, v24.8h
@@ -1132,174 +1212,257 @@ endfunc
uaba v25.8h, v3.8h, v7.8h
\f v19.4s, v25.8h
- ld1 {v4.8h-v7.8h}, [x3], x4
+ ld1 {v4.8h-v7.8h}, [x3], x5
uabd v26.8h, v0.8h, v4.8h
uaba v26.8h, v1.8h, v5.8h
\f v20.4s, v26.8h
uabd v27.8h, v2.8h, v6.8h
uaba v27.8h, v3.8h, v7.8h
\f v21.4s, v27.8h
+
+.if \n == 4
+ ld1 {v4.8h-v7.8h}, [x4], x5
+ uabd v22.8h, v0.8h, v4.8h
+ uaba v22.8h, v1.8h, v5.8h
+ \f v30.4s, v22.8h
+ uabd v23.8h, v2.8h, v6.8h
+ uaba v23.8h, v3.8h, v7.8h
+ \f v31.4s, v23.8h
+.endif
.endm
-.macro SAD_x3_48_WIDEN f
+.macro SAD_xN_48_WIDEN n f
ld1 {v0.8h-v3.8h}, [x0]
- ld1 {v28.8h-v31.8h}, [x1]
- uabd v6.8h, v0.8h, v28.8h
- uaba v6.8h, v1.8h, v29.8h
+ ld1 {v26.8h-v29.8h}, [x1]
+ uabd v6.8h, v0.8h, v26.8h
+ uaba v6.8h, v1.8h, v27.8h
\f v16.4s, v6.8h
- uabd v7.8h, v2.8h, v30.8h
- uaba v7.8h, v3.8h, v31.8h
+ uabd v7.8h, v2.8h, v28.8h
+ uaba v7.8h, v3.8h, v29.8h
\f v17.4s, v7.8h
ldp q4, q5, [x0, #64]
- ldp q28, q29, [x1, #64]
- uabd v22.8h, v4.8h, v28.8h
- uaba v22.8h, v5.8h, v29.8h
+ ldp q26, q27, [x1, #64]
+ uabd v22.8h, v4.8h, v26.8h
+ uaba v22.8h, v5.8h, v27.8h
uadalp v16.4s, v22.8h
- ld1 {v28.8h-v31.8h}, [x2]
- uabd v23.8h, v0.8h, v28.8h
- uaba v23.8h, v1.8h, v29.8h
+ ld1 {v26.8h-v29.8h}, [x2]
+ uabd v23.8h, v0.8h, v26.8h
+ uaba v23.8h, v1.8h, v27.8h
\f v18.4s, v23.8h
- uabd v24.8h, v2.8h, v30.8h
- uaba v24.8h, v3.8h, v31.8h
+ uabd v24.8h, v2.8h, v28.8h
+ uaba v24.8h, v3.8h, v29.8h
\f v19.4s, v24.8h
- ldp q28, q29, [x2, #64]
- uabd v25.8h, v4.8h, v28.8h
- uaba v25.8h, v5.8h, v29.8h
+ ldp q26, q27, [x2, #64]
+ uabd v25.8h, v4.8h, v26.8h
+ uaba v25.8h, v5.8h, v27.8h
uadalp v18.4s, v25.8h
- ld1 {v28.8h-v31.8h}, [x3]
- uabd v26.8h, v0.8h, v28.8h
- uaba v26.8h, v1.8h, v29.8h
- \f v20.4s, v26.8h
- uabd v27.8h, v2.8h, v30.8h
- uaba v27.8h, v3.8h, v31.8h
- \f v21.4s, v27.8h
- ldp q28, q29, [x3, #64]
- uabd v6.8h, v4.8h, v28.8h
- uaba v6.8h, v5.8h, v29.8h
- uadalp v20.4s, v6.8h
+ ld1 {v26.8h-v29.8h}, [x3]
+ uabd v6.8h, v0.8h, v26.8h
+ uaba v6.8h, v1.8h, v27.8h
+ \f v20.4s, v6.8h
+ uabd v7.8h, v2.8h, v28.8h
+ uaba v7.8h, v3.8h, v29.8h
+ \f v21.4s, v7.8h
+ ldp q26, q27, [x3, #64]
+ uabd v22.8h, v4.8h, v26.8h
+ uaba v22.8h, v5.8h, v27.8h
+ uadalp v20.4s, v22.8h
+
+ add x0, x0, x7
+ add x1, x1, x5
+ add x2, x2, x5
+ add x3, x3, x5
- add x0, x0, x6
- add x1, x1, x4
- add x2, x2, x4
- add x3, x3, x4
+.if \n == 4
+ ld1 {v26.8h-v29.8h}, [x4]
+ uabd v6.8h, v0.8h, v26.8h
+ uaba v6.8h, v1.8h, v27.8h
+ \f v30.4s, v6.8h
+ uabd v7.8h, v2.8h, v28.8h
+ uaba v7.8h, v3.8h, v29.8h
+ \f v31.4s, v7.8h
+ ldp q26, q27, [x4, #64]
+ uabd v22.8h, v4.8h, v26.8h
+ uaba v22.8h, v5.8h, v27.8h
+ uadalp v30.4s, v22.8h
+ add x4, x4, x5
+.endif
.endm
-.macro SAD_x3_64_WIDEN f
+.macro SAD_xN_64_WIDEN n f
ld1 {v0.8h-v3.8h}, [x0]
- ld1 {v28.8h-v31.8h}, [x1]
- uabd v22.8h, v0.8h, v28.8h
- uaba v22.8h, v1.8h, v29.8h
+ ld1 {v26.8h-v29.8h}, [x1]
+ uabd v22.8h, v0.8h, v26.8h
+ uaba v22.8h, v1.8h, v27.8h
\f v16.4s, v22.8h
- uabd v23.8h, v2.8h, v30.8h
- uaba v23.8h, v3.8h, v31.8h
+ uabd v23.8h, v2.8h, v28.8h
+ uaba v23.8h, v3.8h, v29.8h
\f v17.4s, v23.8h
ldp q4, q5, [x0, #64]
ldp q6, q7, [x0, #96]
- ldp q28, q29, [x1, #64]
- ldp q30, q31, [x1, #96]
- uabd v24.8h, v4.8h, v28.8h
- uaba v24.8h, v5.8h, v29.8h
+ ldp q26, q27, [x1, #64]
+ ldp q28, q29, [x1, #96]
+ uabd v24.8h, v4.8h, v26.8h
+ uaba v24.8h, v5.8h, v27.8h
uadalp v16.4s, v24.8h
- uabd v25.8h, v6.8h, v30.8h
- uaba v25.8h, v7.8h, v31.8h
+ uabd v25.8h, v6.8h, v28.8h
+ uaba v25.8h, v7.8h, v29.8h
uadalp v17.4s, v25.8h
- ld1 {v28.8h-v31.8h}, [x2]
- uabd v26.8h, v0.8h, v28.8h
- uaba v26.8h, v1.8h, v29.8h
- \f v18.4s, v26.8h
- uabd v27.8h, v2.8h, v30.8h
- uaba v27.8h, v3.8h, v31.8h
- \f v19.4s, v27.8h
- ldp q28, q29, [x2, #64]
- ldp q30, q31, [x2, #96]
- uabd v22.8h, v4.8h, v28.8h
- uaba v22.8h, v5.8h, v29.8h
- uadalp v18.4s, v22.8h
- uabd v23.8h, v6.8h, v30.8h
- uaba v23.8h, v7.8h, v31.8h
- uadalp v19.4s, v23.8h
+ ld1 {v26.8h-v29.8h}, [x2]
+ uabd v22.8h, v0.8h, v26.8h
+ uaba v22.8h, v1.8h, v27.8h
+ \f v18.4s, v22.8h
+ uabd v23.8h, v2.8h, v28.8h
+ uaba v23.8h, v3.8h, v29.8h
+ \f v19.4s, v23.8h
+ ldp q26, q27, [x2, #64]
+ ldp q28, q29, [x2, #96]
+ uabd v24.8h, v4.8h, v26.8h
+ uaba v24.8h, v5.8h, v27.8h
+ uadalp v18.4s, v24.8h
+ uabd v25.8h, v6.8h, v28.8h
+ uaba v25.8h, v7.8h, v29.8h
+ uadalp v19.4s, v25.8h
+
+ ld1 {v26.8h-v29.8h}, [x3]
+ uabd v22.8h, v0.8h, v26.8h
+ uaba v22.8h, v1.8h, v27.8h
+ \f v20.4s, v22.8h
+ uabd v23.8h, v2.8h, v28.8h
+ uaba v23.8h, v3.8h, v29.8h
+ \f v21.4s, v23.8h
+ ldp q26, q27, [x3, #64]
+ ldp q28, q29, [x3, #96]
+ uabd v24.8h, v4.8h, v26.8h
+ uaba v24.8h, v5.8h, v27.8h
+ uadalp v20.4s, v24.8h
+ uabd v25.8h, v6.8h, v28.8h
+ uaba v25.8h, v7.8h, v29.8h
+ uadalp v21.4s, v25.8h
+
+ add x0, x0, x7
+ add x1, x1, x5
+ add x2, x2, x5
+ add x3, x3, x5
+
+.if \n == 4
+ ld1 {v26.8h-v29.8h}, [x4]
+ uabd v22.8h, v0.8h, v26.8h
+ uaba v22.8h, v1.8h, v27.8h
+ \f v30.4s, v22.8h
+ uabd v23.8h, v2.8h, v28.8h
+ uaba v23.8h, v3.8h, v29.8h
+ \f v31.4s, v23.8h
+ ldp q26, q27, [x4, #64]
+ ldp q28, q29, [x4, #96]
+ uabd v24.8h, v4.8h, v26.8h
+ uaba v24.8h, v5.8h, v27.8h
+ uadalp v30.4s, v24.8h
+ uabd v25.8h, v6.8h, v28.8h
+ uaba v25.8h, v7.8h, v29.8h
+ uadalp v31.4s, v25.8h
+ add x4, x4, x5
+.endif
+.endm
+
+.macro SAD_xN_FUNC_LOOP_LARGE n, w, h
+function PFX(sad_x\n\()_\w\()x\h\()_neon)
+ // Make function arguments for n == 3 look like n == 4.
+.if \n == 3
+ mov x6, x5
+ mov x5, x4
+.endif
- ld1 {v28.8h-v31.8h}, [x3]
- uabd v24.8h, v0.8h, v28.8h
- uaba v24.8h, v1.8h, v29.8h
- \f v20.4s, v24.8h
- uabd v25.8h, v2.8h, v30.8h
- uaba v25.8h, v3.8h, v31.8h
- \f v21.4s, v25.8h
- ldp q28, q29, [x3, #64]
- ldp q30, q31, [x3, #96]
- uabd v26.8h, v4.8h, v28.8h
- uaba v26.8h, v5.8h, v29.8h
- uadalp v20.4s, v26.8h
- uabd v27.8h, v6.8h, v30.8h
- uaba v27.8h, v7.8h, v31.8h
- uadalp v21.4s, v27.8h
-
- add x0, x0, x6
- add x1, x1, x4
- add x2, x2, x4
- add x3, x3, x4
-.endm
-
-.macro SAD_x3_FUNC_LOOP_LARGE w, h
-function PFX(sad_x3_\w\()x\h\()_neon)
// Stride is given in terms of pixel channel size, so double to get number of bytes.
- add x4, x4, x4
- mov x6, #(FENC_STRIDE << 1)
+ add x5, x5, x5
+ mov x7, #(FENC_STRIDE << 1)
- SAD_x3_\w\()_WIDEN uaddlp
- SAD_x3_\w\()_WIDEN uadalp
+ SAD_xN_\w\()_WIDEN \n, uaddlp
+ SAD_xN_\w\()_WIDEN \n, uadalp
- mov w9, #(\h - 2)/2
-.Loop_x_\w\()x\h:
- sub w9, w9, #1
+ mov w8, #(\h - 2)/2
+.Loop_x\n\()_\w\()x\h:
+ sub w8, w8, #1
.rept 2
- SAD_x3_\w\()_WIDEN uadalp
+ SAD_xN_\w\()_WIDEN \n, uadalp
.endr
- cbnz w9, .Loop_x_\w\()x\h
+ cbnz w8, .Loop_x\n\()_\w\()x\h
add v16.4s, v16.4s, v17.4s
add v17.4s, v18.4s, v19.4s
add v18.4s, v20.4s, v21.4s
+.if \n == 3
addp v0.4s, v16.4s, v17.4s
addp v1.4s, v18.4s, v18.4s
addp v0.4s, v0.4s, v1.4s
-
- str d0, [x5]
- add x5, x5, #8
- st1 {v0.s}[2], [x5]
+ str d0, [x6]
+ add x6, x6, #8
+ st1 {v0.s}[2], [x6]
+.else
+ add v19.4s, v30.4s, v31.4s
+ addp v16.4s, v16.4s, v17.4s
+ addp v18.4s, v18.4s, v19.4s
+ addp v16.4s, v16.4s, v18.4s
+ str q16, [x6]
+.endif
ret
endfunc
.endm
-SAD_x3_FUNC 4, 4
-SAD_x3_FUNC 4, 8
-SAD_x3_FUNC 4, 16
-SAD_x3_FUNC 8, 4
-SAD_x3_FUNC 8, 8
-SAD_x3_FUNC 8, 16
-SAD_x3_FUNC 8, 32
-SAD_x3_FUNC_LOOP 12, 16
-SAD_x3_FUNC_LOOP 16, 4
-SAD_x3_FUNC_LOOP 16, 8
-SAD_x3_FUNC_LOOP 16, 12
-SAD_x3_FUNC_LOOP 16, 16
-SAD_x3_FUNC_LOOP 32, 8
-SAD_x3_FUNC_LOOP_LARGE 16, 32
-SAD_x3_FUNC_LOOP_LARGE 16, 64
-SAD_x3_FUNC_LOOP_LARGE 24, 32
-SAD_x3_FUNC_LOOP_LARGE 32, 16
-SAD_x3_FUNC_LOOP_LARGE 32, 24
-SAD_x3_FUNC_LOOP_LARGE 32, 32
-SAD_x3_FUNC_LOOP_LARGE 32, 64
-SAD_x3_FUNC_LOOP_LARGE 48, 64
-SAD_x3_FUNC_LOOP_LARGE 64, 16
-SAD_x3_FUNC_LOOP_LARGE 64, 32
-SAD_x3_FUNC_LOOP_LARGE 64, 48
-SAD_x3_FUNC_LOOP_LARGE 64, 64
+SAD_xN_FUNC 3, 4, 4
+SAD_xN_FUNC 3, 4, 8
+SAD_xN_FUNC 3, 4, 16
+SAD_xN_FUNC 3, 8, 4
+SAD_xN_FUNC 3, 8, 8
+SAD_xN_FUNC 3, 8, 16
+SAD_xN_FUNC 3, 8, 32
+SAD_xN_FUNC_LOOP 3, 12, 16
+SAD_xN_FUNC_LOOP 3, 16, 4
+SAD_xN_FUNC_LOOP 3, 16, 8
+SAD_xN_FUNC_LOOP 3, 16, 12
+SAD_xN_FUNC_LOOP 3, 16, 16
+SAD_xN_FUNC_LOOP 3, 32, 8
+SAD_xN_FUNC_LOOP_LARGE 3, 16, 32
+SAD_xN_FUNC_LOOP_LARGE 3, 16, 64
+SAD_xN_FUNC_LOOP_LARGE 3, 24, 32
+SAD_xN_FUNC_LOOP_LARGE 3, 32, 16
+SAD_xN_FUNC_LOOP_LARGE 3, 32, 24
+SAD_xN_FUNC_LOOP_LARGE 3, 32, 32
+SAD_xN_FUNC_LOOP_LARGE 3, 32, 64
+SAD_xN_FUNC_LOOP_LARGE 3, 48, 64
+SAD_xN_FUNC_LOOP_LARGE 3, 64, 16
+SAD_xN_FUNC_LOOP_LARGE 3, 64, 32
+SAD_xN_FUNC_LOOP_LARGE 3, 64, 48
+SAD_xN_FUNC_LOOP_LARGE 3, 64, 64
+
+SAD_xN_FUNC 4, 4, 4
+SAD_xN_FUNC 4, 4, 8
+SAD_xN_FUNC 4, 4, 16
+SAD_xN_FUNC 4, 8, 4
+SAD_xN_FUNC 4, 8, 8
+SAD_xN_FUNC 4, 8, 16
+SAD_xN_FUNC 4, 8, 32
+SAD_xN_FUNC_LOOP 4, 12, 16
+SAD_xN_FUNC_LOOP 4, 16, 4
+SAD_xN_FUNC_LOOP 4, 16, 8
+SAD_xN_FUNC_LOOP 4, 16, 12
+SAD_xN_FUNC_LOOP 4, 16, 16
+SAD_xN_FUNC_LOOP 4, 32, 8
+SAD_xN_FUNC_LOOP_LARGE 4, 16, 32
+SAD_xN_FUNC_LOOP_LARGE 4, 16, 64
+SAD_xN_FUNC_LOOP_LARGE 4, 24, 32
+SAD_xN_FUNC_LOOP_LARGE 4, 32, 16
+SAD_xN_FUNC_LOOP_LARGE 4, 32, 24
+SAD_xN_FUNC_LOOP_LARGE 4, 32, 32
+SAD_xN_FUNC_LOOP_LARGE 4, 32, 64
+SAD_xN_FUNC_LOOP_LARGE 4, 48, 64
+SAD_xN_FUNC_LOOP_LARGE 4, 64, 16
+SAD_xN_FUNC_LOOP_LARGE 4, 64, 32
+SAD_xN_FUNC_LOOP_LARGE 4, 64, 48
+SAD_xN_FUNC_LOOP_LARGE 4, 64, 64
#endif // !HIGH_BIT_DEPTH
--
2.39.5 (Apple Git-154)