[x265] [PATCH 3/8] AArch64: Remove SVE2 SAD/SADxN primitives
Hari Limaye
hari.limaye at arm.com
Thu May 23 17:18:33 UTC 2024
Remove the SVE2 implementations of the pixel_sad, sad_x3 and sad_x4
primitives. These functions simply fall back to the shared Neon code
paths when the SVE vector length is not greater than 128 bits, and
there are no SVE2 CPUs with a vector length greater than 128 bits.
---
source/common/CMakeLists.txt | 4 +-
source/common/aarch64/asm-primitives.cpp | 6 +-
source/common/aarch64/sad-a-common.S | 476 ---------------------
source/common/aarch64/sad-a-sve2.S | 517 -----------------------
source/common/aarch64/sad-a.S | 439 ++++++++++++++++++-
5 files changed, 443 insertions(+), 999 deletions(-)
delete mode 100644 source/common/aarch64/sad-a-common.S
delete mode 100644 source/common/aarch64/sad-a-sve2.S
diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
index 7d0506909..7b01c3b6b 100644
--- a/source/common/CMakeLists.txt
+++ b/source/common/CMakeLists.txt
@@ -109,9 +109,9 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
enable_language(ASM)
# add ARM assembly/intrinsic files here
- set(A_SRCS asm.S mc-a.S mc-a-common.S sad-a.S sad-a-common.S pixel-util.S pixel-util-common.S p2s.S p2s-common.S ipfilter.S ipfilter-common.S blockcopy8.S blockcopy8-common.S ssd-a.S ssd-a-common.S)
+ set(A_SRCS asm.S mc-a.S mc-a-common.S sad-a.S pixel-util.S pixel-util-common.S p2s.S p2s-common.S ipfilter.S ipfilter-common.S blockcopy8.S blockcopy8-common.S ssd-a.S ssd-a-common.S)
set(A_SRCS_SVE asm-sve.S blockcopy8-sve.S p2s-sve.S pixel-util-sve.S ssd-a-sve.S)
- set(A_SRCS_SVE2 mc-a-sve2.S sad-a-sve2.S pixel-util-sve2.S ipfilter-sve2.S ssd-a-sve2.S)
+ set(A_SRCS_SVE2 mc-a-sve2.S pixel-util-sve2.S ipfilter-sve2.S ssd-a-sve2.S)
set(VEC_PRIMITIVES)
set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index 356901dd9..66c241c37 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -1673,9 +1673,9 @@ void setupSve2Primitives(EncoderPrimitives &p)
CHROMA_422_PU_CAN_USE_SVE2(addAvg[ALIGNED], addAvg);
// sad
- ALL_LUMA_PU(sad, pixel_sad, sve2);
- ALL_LUMA_PU(sad_x3, sad_x3, sve2);
- ALL_LUMA_PU(sad_x4, sad_x4, sve2);
+ ALL_LUMA_PU(sad, pixel_sad, neon);
+ ALL_LUMA_PU(sad_x3, sad_x3, neon);
+ ALL_LUMA_PU(sad_x4, sad_x4, neon);
// sse_pp
p.cu[BLOCK_4x4].sse_pp = PFX(pixel_sse_pp_4x4_sve);
diff --git a/source/common/aarch64/sad-a-common.S b/source/common/aarch64/sad-a-common.S
deleted file mode 100644
index a94607369..000000000
--- a/source/common/aarch64/sad-a-common.S
+++ /dev/null
@@ -1,476 +0,0 @@
-/*****************************************************************************
- * Copyright (C) 2022-2024 MulticoreWare, Inc
- *
- * Authors: David Chen <david.chen at myais.com.cn>
- Hari Limaye <hari.limaye at arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at license @ x265.com.
- *****************************************************************************/
-
-// This file contains the macros written using NEON instruction set
-// that are also used by the SVE2 functions
-
-#include "asm.S"
-
-.arch armv8-a
-
-#ifdef __APPLE__
-.section __RODATA,__rodata
-#else
-.section .rodata
-#endif
-
-.align 4
-
-.macro SAD_START_4 f
- ldr s0, [x0]
- ldr s1, [x2]
- add x0, x0, x1
- add x2, x2, x3
- ld1 {v0.s}[1], [x0], x1
- ld1 {v1.s}[1], [x2], x3
- \f v16.8h, v0.8b, v1.8b
-.endm
-
-.macro SAD_4 h
-.rept \h / 2 - 1
- SAD_START_4 uabal
-.endr
-.endm
-
-.macro SAD_START_8 f
- ld1 {v0.8b}, [x0], x1
- ld1 {v1.8b}, [x2], x3
- ld1 {v2.8b}, [x0], x1
- ld1 {v3.8b}, [x2], x3
- \f v16.8h, v0.8b, v1.8b
- \f v17.8h, v2.8b, v3.8b
-.endm
-
-.macro SAD_8 h
-.rept \h / 2 - 1
- SAD_START_8 uabal
-.endr
-.endm
-
-.macro SAD_START_16
- movi v16.16b, #0
- movi v17.16b, #0
-.endm
-
-.macro SAD_16
- ld1 {v0.16b}, [x0], x1
- ld1 {v1.16b}, [x2], x3
- ld1 {v2.16b}, [x0], x1
- ld1 {v3.16b}, [x2], x3
- uabd v20.16b, v0.16b, v1.16b
- uadalp v16.8h, v20.16b
- uabd v21.16b, v2.16b, v3.16b
- uadalp v17.8h, v21.16b
-.endm
-
-.macro SAD_END_16
- add v16.8h, v16.8h, v17.8h
- uaddlv s0, v16.8h
- fmov x0, d0
- ret
-.endm
-
-.macro SAD_START_32
- movi v16.16b, #0
- movi v17.16b, #0
- movi v18.16b, #0
- movi v19.16b, #0
-.endm
-
-.macro SAD_32
- ld1 {v0.16b-v1.16b}, [x0], x1
- ld1 {v2.16b-v3.16b}, [x2], x3
- ld1 {v4.16b-v5.16b}, [x0], x1
- ld1 {v6.16b-v7.16b}, [x2], x3
- uabd v20.16b, v0.16b, v2.16b
- uadalp v16.8h, v20.16b
- uabd v21.16b, v1.16b, v3.16b
- uadalp v17.8h, v21.16b
- uabd v22.16b, v4.16b, v6.16b
- uadalp v18.8h, v22.16b
- uabd v23.16b, v5.16b, v7.16b
- uadalp v19.8h, v23.16b
-.endm
-
-.macro SAD_END_32
- add v16.8h, v16.8h, v17.8h
- add v17.8h, v18.8h, v19.8h
- add v16.8h, v16.8h, v17.8h
- uaddlv s0, v16.8h
- fmov w0, s0
- ret
-.endm
-
-.macro SAD_START_64
- movi v16.16b, #0
- movi v17.16b, #0
- movi v18.16b, #0
- movi v19.16b, #0
-.endm
-
-.macro SAD_64
- ld1 {v0.16b-v3.16b}, [x0], x1
- ld1 {v4.16b-v7.16b}, [x2], x3
- ld1 {v24.16b-v27.16b}, [x0], x1
- ld1 {v28.16b-v31.16b}, [x2], x3
- uabd v20.16b, v0.16b, v4.16b
- uadalp v16.8h, v20.16b
- uabd v21.16b, v1.16b, v5.16b
- uadalp v17.8h, v21.16b
- uabd v22.16b, v2.16b, v6.16b
- uadalp v18.8h, v22.16b
- uabd v23.16b, v3.16b, v7.16b
- uadalp v19.8h, v23.16b
- uabd v20.16b, v24.16b, v28.16b
- uadalp v16.8h, v20.16b
- uabd v21.16b, v25.16b, v29.16b
- uadalp v17.8h, v21.16b
- uabd v22.16b, v26.16b, v30.16b
- uadalp v18.8h, v22.16b
- uabd v23.16b, v27.16b, v31.16b
- uadalp v19.8h, v23.16b
-.endm
-
-.macro SAD_END_64
- uaddlp v16.4s, v16.8h
- uadalp v16.4s, v17.8h
- uadalp v16.4s, v18.8h
- uadalp v16.4s, v19.8h
- uaddlv d0, v16.4s
- fmov x0, d0
- ret
-.endm
-
-.macro SAD_START_12
- movrel x12, sad12_mask
- ld1 {v31.16b}, [x12]
- movi v16.16b, #0
- movi v17.16b, #0
-.endm
-
-.macro SAD_12
- ld1 {v0.16b}, [x0], x1
- and v0.16b, v0.16b, v31.16b
- ld1 {v1.16b}, [x2], x3
- and v1.16b, v1.16b, v31.16b
- ld1 {v2.16b}, [x0], x1
- and v2.16b, v2.16b, v31.16b
- ld1 {v3.16b}, [x2], x3
- and v3.16b, v3.16b, v31.16b
- uabd v20.16b, v0.16b, v1.16b
- uadalp v16.8h, v20.16b
- uabd v21.16b, v2.16b, v3.16b
- uadalp v17.8h, v21.16b
-.endm
-
-.macro SAD_END_12
- add v16.8h, v16.8h, v17.8h
- uaddlv s0, v16.8h
- fmov w0, s0
- ret
-.endm
-
-.macro SAD_START_24
- movi v16.16b, #0
- movi v17.16b, #0
- sub x1, x1, #16
- sub x3, x3, #16
-.endm
-
-.macro SAD_24
- ld1 {v0.16b}, [x0], #16
- ld1 {v1.8b}, [x0], x1
- ld1 {v2.16b}, [x2], #16
- ld1 {v3.8b}, [x2], x3
- ld1 {v4.16b}, [x0], #16
- ld1 {v5.8b}, [x0], x1
- ld1 {v6.16b}, [x2], #16
- ld1 {v7.8b}, [x2], x3
- uabd v20.16b, v0.16b, v2.16b
- uadalp v16.8h, v20.16b
- uabal v17.8h, v1.8b, v3.8b
- uabd v20.16b, v4.16b, v6.16b
- uadalp v16.8h, v20.16b
- uabal v17.8h, v5.8b, v7.8b
-.endm
-
-.macro SAD_END_24
- add v16.8h, v16.8h, v17.8h
- uaddlv s0, v16.8h
- fmov w0, s0
- ret
-.endm
-
-.macro SAD_START_48
- movi v16.16b, #0
- movi v17.16b, #0
- movi v18.16b, #0
-.endm
-
-.macro SAD_48
- ld1 {v0.16b-v2.16b}, [x0], x1
- ld1 {v4.16b-v6.16b}, [x2], x3
- ld1 {v24.16b-v26.16b}, [x0], x1
- ld1 {v28.16b-v30.16b}, [x2], x3
- uabd v20.16b, v0.16b, v4.16b
- uadalp v16.8h, v20.16b
- uabd v21.16b, v1.16b, v5.16b
- uadalp v17.8h, v21.16b
- uabd v22.16b, v2.16b, v6.16b
- uadalp v18.8h, v22.16b
- uabd v20.16b, v24.16b, v28.16b
- uadalp v16.8h, v20.16b
- uabd v21.16b, v25.16b, v29.16b
- uadalp v17.8h, v21.16b
- uabd v22.16b, v26.16b, v30.16b
- uadalp v18.8h, v22.16b
-.endm
-
-.macro SAD_END_48
- uaddlp v16.4s, v16.8h
- uadalp v16.4s, v17.8h
- uadalp v16.4s, v18.8h
- uaddlv d0, v16.4s
- fmov x0, d0
- ret
-.endm
-
-.macro SAD_X_START_4 h, x, f
- ldr s0, [x0]
- ldr s1, [x1]
- ldr s2, [x2]
- ldr s3, [x3]
- add x0, x0, x9
- add x1, x1, x5
- add x2, x2, x5
- add x3, x3, x5
- ld1 {v0.s}[1], [x0], x9
- ld1 {v1.s}[1], [x1], x5
- ld1 {v2.s}[1], [x2], x5
- ld1 {v3.s}[1], [x3], x5
- \f v16.8h, v0.8b, v1.8b
- \f v17.8h, v0.8b, v2.8b
- \f v18.8h, v0.8b, v3.8b
-.if \x == 4
- ldr s4, [x4]
- add x4, x4, x5
- ld1 {v4.s}[1], [x4], x5
- \f v19.8h, v0.8b, v4.8b
-.endif
-.endm
-
-.macro SAD_X_4 h, x
-.rept \h/2 - 1
- SAD_X_START_4 \h, \x, uabal
-.endr
-.endm
-
-.macro SAD_X_END_4 x
- uaddlv s0, v16.8h
- uaddlv s1, v17.8h
- uaddlv s2, v18.8h
- stp s0, s1, [x6]
-.if \x == 3
- str s2, [x6, #8]
-.elseif \x == 4
- uaddlv s3, v19.8h
- stp s2, s3, [x6, #8]
-.endif
- ret
-.endm
-
-.macro SAD_X_START_8 h, x, f
- ld1 {v0.8b}, [x0], x9
- ld1 {v1.8b}, [x1], x5
- ld1 {v2.8b}, [x2], x5
- ld1 {v3.8b}, [x3], x5
- \f v16.8h, v0.8b, v1.8b
- \f v17.8h, v0.8b, v2.8b
- \f v18.8h, v0.8b, v3.8b
-.if \x == 4
- ld1 {v4.8b}, [x4], x5
- \f v19.8h, v0.8b, v4.8b
-.endif
-.endm
-
-.macro SAD_X_8 h x
-.rept \h - 1
- SAD_X_START_8 \h, \x, uabal
-.endr
-.endm
-
-.macro SAD_X_END_8 x
- SAD_X_END_4 \x
-.endm
-
-.macro SAD_X_START_12 x
- SAD_X_START_16 \x
-.endm
-
-.macro SAD_X_12 base v1 v2
- // v2: unused
- // v31: bitmask for 12xh blocks
- ld1 {v0.16b}, [ \base ], x5
- and v0.16b, v0.16b, v31.16b
-
- uabd v24.16b, v0.16b, v6.16b
- uadalp \v1\().8h, v24.16b
-.endm
-
-.macro SAD_X_END_12 x
- SAD_X_END_4 \x
-.endm
-
-.macro SAD_X_START_16 x
- movi v16.16b, #0
- movi v17.16b, #0
- movi v18.16b, #0
-.if \x == 4
- movi v19.16b, #0
-.endif
-.endm
-
-.macro SAD_X_16 base v1 v2
- // v2: unused
- ld1 {v0.16b}, [ \base ], x5
- uabd v24.16b, v0.16b, v6.16b
- uadalp \v1\().8h, v24.16b
-.endm
-
-.macro SAD_X_END_16 x
- SAD_X_END_4 \x
-.endm
-
-.macro SAD_X_START_LARGE x
- movi v16.16b, #0
- movi v17.16b, #0
- movi v18.16b, #0
- movi v20.16b, #0
- movi v21.16b, #0
- movi v22.16b, #0
-.if \x == 4
- movi v19.16b, #0
- movi v23.16b, #0
-.endif
-.endm
-
-.macro SAD_X_END_LARGE x
- uaddlp v16.4s, v16.8h
- uadalp v16.4s, v20.8h
- uaddlp v17.4s, v17.8h
- uadalp v17.4s, v21.8h
- uaddlp v18.4s, v18.8h
- uadalp v18.4s, v22.8h
-.if \x == 3
- addv s0, v16.4s
- addv s1, v17.4s
- addv s2, v18.4s
- stp s0, s1, [x6], #8
- str s2, [x6]
-.elseif \x == 4
- uaddlp v19.4s, v19.8h
- uadalp v19.4s, v23.8h
- addp v16.4s, v16.4s, v17.4s
- addp v18.4s, v18.4s, v19.4s
- addp v16.4s, v16.4s, v18.4s
- str q16, [x6]
-.endif
- ret
-.endm
-
-.macro SAD_X_START_24 x
- SAD_X_START_LARGE \x
- sub x5, x5, #16
- sub x9, x9, #16
-.endm
-
-.macro SAD_X_24 base v1 v2
- ld1 {v0.16b}, [ \base ], #16
- ld1 {v1.8b}, [ \base ], x5
- uabd v24.16b, v0.16b, v6.16b
- uadalp \v1\().8h, v24.16b
- uabal \v2\().8h, v1.8b, v7.8b
-.endm
-
-.macro SAD_X_END_24 x
- SAD_X_END_LARGE \x
-.endm
-
-.macro SAD_X_START_32 x
- SAD_X_START_LARGE \x
-.endm
-
-.macro SAD_X_32 base v1 v2
- ld1 {v0.16b-v1.16b}, [ \base ], x5
- uabd v24.16b, v0.16b, v6.16b
- uadalp \v1\().8h, v24.16b
- uabd v25.16b, v1.16b, v7.16b
- uadalp \v2\().8h, v25.16b
-.endm
-
-.macro SAD_X_END_32 x
- SAD_X_END_LARGE \x
-.endm
-
-.macro SAD_X_START_48 x
- SAD_X_START_LARGE \x
-.endm
-
-.macro SAD_X_48 base v1 v2
- ld1 {v0.16b-v2.16b}, [ \base ], x5
- uabd v24.16b, v0.16b, v4.16b
- uadalp \v1\().8h, v24.16b
- uabd v25.16b, v1.16b, v5.16b
- uadalp \v2\().8h, v25.16b
- uabd v26.16b, v2.16b, v6.16b
- uadalp \v1\().8h, v26.16b
-.endm
-
-.macro SAD_X_END_48 x
- SAD_X_END_LARGE \x
-.endm
-
-.macro SAD_X_START_64 x
- SAD_X_START_LARGE \x
-.endm
-
-.macro SAD_X_64 base v1 v2
- ld1 {v0.16b-v3.16b}, [ \base ], x5
- uabd v24.16b, v0.16b, v4.16b
- uadalp \v1\().8h, v24.16b
- uabd v25.16b, v1.16b, v5.16b
- uadalp \v2\().8h, v25.16b
- uabd v26.16b, v2.16b, v6.16b
- uadalp \v1\().8h, v26.16b
- uabd v27.16b, v3.16b, v7.16b
- uadalp \v2\().8h, v27.16b
-.endm
-
-.macro SAD_X_END_64 x
- SAD_X_END_LARGE \x
-.endm
-
-const sad12_mask, align=8
-.byte 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0
-endconst
diff --git a/source/common/aarch64/sad-a-sve2.S b/source/common/aarch64/sad-a-sve2.S
deleted file mode 100644
index 72432055d..000000000
--- a/source/common/aarch64/sad-a-sve2.S
+++ /dev/null
@@ -1,517 +0,0 @@
-/*****************************************************************************
- * Copyright (C) 2022-2024 MulticoreWare, Inc
- *
- * Authors: David Chen <david.chen at myais.com.cn>
- Hari Limaye <hari.limaye at arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at license @ x265.com.
- *****************************************************************************/
-
-#include "asm-sve.S"
-#include "sad-a-common.S"
-
-.arch armv8-a+sve2
-
-#ifdef __APPLE__
-.section __RODATA,__rodata
-#else
-.section .rodata
-#endif
-
-.align 4
-
-.text
-
-.macro SAD_SVE2_16 h
- mov z16.d, #0
- ptrue p0.h, vl16
-.rept \h
- ld1b {z0.h}, p0/z, [x0]
- ld1b {z2.h}, p0/z, [x2]
- add x0, x0, x1
- add x2, x2, x3
- uaba z16.h, z0.h, z2.h
-.endr
- uaddv d0, p0, z16.h
- fmov w0, s0
- ret
-.endm
-
-.macro SAD_SVE2_32 h
- ptrue p0.b, vl32
-.rept \h
- ld1b {z0.b}, p0/z, [x0]
- ld1b {z4.b}, p0/z, [x2]
- add x0, x0, x1
- add x2, x2, x3
- uabalb z16.h, z0.b, z4.b
- uabalt z16.h, z0.b, z4.b
-.endr
- uaddv d0, p0, z16.h
- fmov w0, s0
- ret
-.endm
-
-.macro SAD_SVE2_64 h
- cmp x9, #48
- bgt .vl_gt_48_pixel_sad_64x\h
- mov z16.d, #0
- mov z17.d, #0
- mov z18.d, #0
- mov z19.d, #0
- ptrue p0.b, vl32
-.rept \h
- ld1b {z0.b}, p0/z, [x0]
- ld1b {z1.b}, p0/z, [x0, #1, mul vl]
- ld1b {z4.b}, p0/z, [x2]
- ld1b {z5.b}, p0/z, [x2, #1, mul vl]
- add x0, x0, x1
- add x2, x2, x3
- uabalb z16.h, z0.b, z4.b
- uabalt z17.h, z0.b, z4.b
- uabalb z18.h, z1.b, z5.b
- uabalt z19.h, z1.b, z5.b
-.endr
- add z16.h, z16.h, z17.h
- add z17.h, z18.h, z19.h
- add z16.h, z16.h, z17.h
- uadalp z24.s, p0/m, z16.h
- uaddv d5, p0, z24.s
- fmov x0, d5
- ret
-.vl_gt_48_pixel_sad_64x\h\():
- mov z16.d, #0
- mov z17.d, #0
- mov z24.d, #0
- ptrue p0.b, vl64
-.rept \h
- ld1b {z0.b}, p0/z, [x0]
- ld1b {z4.b}, p0/z, [x2]
- add x0, x0, x1
- add x2, x2, x3
- uabalb z16.h, z0.b, z4.b
- uabalt z17.h, z0.b, z4.b
-.endr
- add z16.h, z16.h, z17.h
- uadalp z24.s, p0/m, z16.h
- uaddv d5, p0, z24.s
- fmov x0, d5
- ret
-.endm
-
-.macro SAD_SVE2_24 h
- mov z16.d, #0
- mov x10, #24
- mov x11, #0
- whilelt p0.b, x11, x10
-.rept \h
- ld1b {z0.b}, p0/z, [x0]
- ld1b {z8.b}, p0/z, [x2]
- add x0, x0, x1
- add x2, x2, x3
- uabalb z16.h, z0.b, z8.b
- uabalt z16.h, z0.b, z8.b
-.endr
- uaddv d5, p0, z16.h
- fmov w0, s5
- ret
-.endm
-
-.macro SAD_SVE2_48 h
- cmp x9, #48
- bgt .vl_gt_48_pixel_sad_48x\h
- mov z16.d, #0
- mov z17.d, #0
- mov z18.d, #0
- mov z19.d, #0
- ptrue p0.b, vl32
- ptrue p1.b, vl16
-.rept \h
- ld1b {z0.b}, p0/z, [x0]
- ld1b {z1.b}, p1/z, [x0, #1, mul vl]
- ld1b {z8.b}, p0/z, [x2]
- ld1b {z9.b}, p1/z, [x2, #1, mul vl]
- add x0, x0, x1
- add x2, x2, x3
- uabalb z16.h, z0.b, z8.b
- uabalt z17.h, z0.b, z8.b
- uabalb z18.h, z1.b, z9.b
- uabalt z19.h, z1.b, z9.b
-.endr
- add z16.h, z16.h, z17.h
- add z17.h, z18.h, z19.h
- add z16.h, z16.h, z17.h
- uaddv d5, p0, z16.h
- fmov w0, s5
- ret
-.vl_gt_48_pixel_sad_48x\h\():
- mov z16.d, #0
- mov z17.d, #0
- mov x10, #48
- mov x11, #0
- whilelt p0.b, x11, x10
-.rept \h
- ld1b {z0.b}, p0/z, [x0]
- ld1b {z8.b}, p0/z, [x2]
- add x0, x0, x1
- add x2, x2, x3
- uabalb z16.h, z0.b, z8.b
- uabalt z17.h, z0.b, z8.b
-.endr
- add z16.h, z16.h, z17.h
- uaddv d5, p0, z16.h
- fmov w0, s5
- ret
-.endm
-
-// Fully unrolled.
-.macro SAD_FUNC_SVE2 w, h
-function PFX(pixel_sad_\w\()x\h\()_sve2)
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_pixel_sad_\w\()x\h
- SAD_START_\w uabdl
- SAD_\w \h
-.if \w > 4
- add v16.8h, v16.8h, v17.8h
-.endif
- uaddlv s0, v16.8h
- fmov w0, s0
- ret
-.vl_gt_16_pixel_sad_\w\()x\h\():
-.if \w == 4 || \w == 8 || \w == 12
- SAD_START_\w uabdl
- SAD_\w \h
-.if \w > 4
- add v16.8h, v16.8h, v17.8h
-.endif
- uaddlv s0, v16.8h
- fmov w0, s0
- ret
-.else
- SAD_SVE2_\w \h
-.endif
-endfunc
-.endm
-
-// Loop unrolled to process 4 rows per iteration.
-.macro SAD_FUNC_LOOP_SVE2 w, h
-function PFX(pixel_sad_\w\()x\h\()_sve2)
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_pixel_sad_loop_\w\()x\h
- SAD_START_\w
-
- mov w9, #\h/4
-.Loop_sve2_\w\()x\h:
- sub w9, w9, #1
-.rept 2
- SAD_\w
-.endr
- cbnz w9, .Loop_sve2_\w\()x\h
-
- SAD_END_\w
-
-.vl_gt_16_pixel_sad_loop_\w\()x\h\():
-.if \w == 4 || \w == 8 || \w == 12
- SAD_START_\w
-
- mov w9, #\h/8
-.Loop_sve2_loop_\w\()x\h:
- sub w9, w9, #1
-.rept 4
- SAD_\w
-.endr
- cbnz w9, .Loop_sve2_loop_\w\()x\h
-
- SAD_END_\w
-.else
- SAD_SVE2_\w \h
-.endif
-endfunc
-.endm
-
-SAD_FUNC_SVE2 4, 4
-SAD_FUNC_SVE2 4, 8
-SAD_FUNC_SVE2 4, 16
-SAD_FUNC_SVE2 8, 4
-SAD_FUNC_SVE2 8, 8
-SAD_FUNC_SVE2 8, 16
-SAD_FUNC_SVE2 8, 32
-
-SAD_FUNC_LOOP_SVE2 16, 4
-SAD_FUNC_LOOP_SVE2 16, 8
-SAD_FUNC_LOOP_SVE2 16, 12
-SAD_FUNC_LOOP_SVE2 16, 16
-SAD_FUNC_LOOP_SVE2 16, 32
-SAD_FUNC_LOOP_SVE2 16, 64
-SAD_FUNC_LOOP_SVE2 32, 8
-SAD_FUNC_LOOP_SVE2 32, 16
-SAD_FUNC_LOOP_SVE2 32, 24
-SAD_FUNC_LOOP_SVE2 32, 32
-SAD_FUNC_LOOP_SVE2 32, 64
-SAD_FUNC_LOOP_SVE2 64, 16
-SAD_FUNC_LOOP_SVE2 64, 32
-SAD_FUNC_LOOP_SVE2 64, 48
-SAD_FUNC_LOOP_SVE2 64, 64
-SAD_FUNC_LOOP_SVE2 12, 16
-SAD_FUNC_LOOP_SVE2 24, 32
-SAD_FUNC_LOOP_SVE2 48, 64
-
-// SAD_X3 and SAD_X4 code start
-
-.macro SAD_X_SVE2_24_INNER_GT_16 base z
- ld1b {z4.b}, p0/z, [ \base ]
- add \base, \base, x5
- uabalb \z\().h, z4.b, z0.b
- uabalt \z\().h, z4.b, z0.b
-.endm
-
-.macro SAD_X_SVE2_24 h x
- mov z20.d, #0
- mov z21.d, #0
- mov z22.d, #0
- mov z23.d, #0
- mov x10, #24
- mov x11, #0
- whilelt p0.b, x11, x10
-.rept \h
- ld1b {z0.b}, p0/z, [x0]
- add x0, x0, x9
- SAD_X_SVE2_24_INNER_GT_16 x1, z20
- SAD_X_SVE2_24_INNER_GT_16 x2, z21
- SAD_X_SVE2_24_INNER_GT_16 x3, z22
-.if \x == 4
- SAD_X_SVE2_24_INNER_GT_16 x4, z23
-.endif
-.endr
- uaddlv s0, v20.8h
- uaddlv s1, v21.8h
- uaddlv s2, v22.8h
- stp s0, s1, [x6]
-.if \x == 3
- str s2, [x6, #8]
-.elseif \x == 4
- uaddv d0, p0, z20.h
- uaddv d1, p0, z21.h
- uaddv d2, p0, z22.h
- stp s2, s3, [x6, #8]
-.endif
- ret
-.endm
-
-.macro SAD_X_SVE2_32_INNER_GT_16 base z
- ld1b {z4.b}, p0/z, [ \base ]
- add \base, \base, x5
- uabalb \z\().h, z4.b, z0.b
- uabalt \z\().h, z4.b, z0.b
-.endm
-
-.macro SAD_X_SVE2_32 h x
- mov z20.d, #0
- mov z21.d, #0
- mov z22.d, #0
- mov z23.d, #0
- ptrue p0.b, vl32
-.rept \h
- ld1b {z0.b}, p0/z, [x0]
- add x0, x0, x9
- SAD_X_SVE2_32_INNER_GT_16 x1, z20
- SAD_X_SVE2_32_INNER_GT_16 x2, z21
- SAD_X_SVE2_32_INNER_GT_16 x3, z22
-.if \x == 4
- SAD_X_SVE2_32_INNER_GT_16 x4, z23
-.endif
-.endr
- uaddv d0, p0, z20.h
- uaddv d1, p0, z21.h
- uaddv d2, p0, z22.h
- stp s0, s1, [x6]
-.if \x == 3
- str s2, [x6, #8]
-.elseif \x == 4
- uaddv d3, p0, z23.h
- stp s2, s3, [x6, #8]
-.endif
- ret
-.endm
-
-// static void x264_pixel_sad_x3_##size(pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, intptr_t i_stride, int scores[3])
-// static void x264_pixel_sad_x4_##size(pixel *fenc, pixel *pix0, pixel *pix1,pixel *pix2, pixel *pix3, intptr_t i_stride, int scores[4])
-.macro SAD_X_FUNC_SVE2 x, w, h
-function PFX(sad_x\x\()_\w\()x\h\()_sve2)
- mov x9, #FENC_STRIDE
-
-// Make function arguments for x == 3 look like x == 4.
-.if \x == 3
- mov x6, x5
- mov x5, x4
-.endif
- rdvl x11, #1
- cmp x11, #16
- bgt .vl_gt_16_sad_x\x\()_\w\()x\h
-.if \w == 12
- movrel x12, sad12_mask
- ld1 {v31.16b}, [x12]
-.endif
-
- SAD_X_START_\w \h, \x, uabdl
- SAD_X_\w \h, \x
- SAD_X_END_\w \x
-.vl_gt_16_sad_x\x\()_\w\()x\h\():
-.if \w == 24 || \w == 32
- SAD_X_SVE2_\w \h, \x
-.else
-.if \w == 12
- movrel x12, sad12_mask
- ld1 {v31.16b}, [x12]
-.endif
-
- SAD_X_START_\w \h, \x, uabdl
- SAD_X_\w \h, \x
- SAD_X_END_\w \x
-.endif
-endfunc
-.endm
-
-.macro SAD_X_LOOP_SVE2 x, w, h
-function PFX(sad_x\x\()_\w\()x\h\()_sve2)
- mov x9, #FENC_STRIDE
-
-// Make function arguments for x == 3 look like x == 4.
-.if \x == 3
- mov x6, x5
- mov x5, x4
-.endif
- rdvl x11, #1
- cmp x11, #16
- bgt .vl_gt_16_sad_x_loop_\x\()_\w\()x\h
- SAD_X_START_\w \x
- mov w12, #\h/4
-.Loop_sad_sve2_x\x\()_\w\()x\h:
- sub w12, w12, #1
- .rept 4
- .if \w == 12
- ld1 {v6.16b}, [x0], x9
- and v6.16b, v6.16b, v31.16b
- .elseif \w == 16
- ld1 {v6.16b}, [x0], x9
- .elseif \w == 24
- ld1 {v6.16b}, [x0], #16
- ld1 {v7.8b}, [x0], x9
- .elseif \w == 32
- ld1 {v6.16b-v7.16b}, [x0], x9
- .elseif \w == 48
- ld1 {v4.16b-v6.16b}, [x0], x9
- .elseif \w == 64
- ld1 {v4.16b-v7.16b}, [x0], x9
- .endif
- SAD_X_\w x1, v16, v20
- SAD_X_\w x2, v17, v21
- SAD_X_\w x3, v18, v22
- .if \x == 4
- SAD_X_\w x4, v19, v23
- .endif
- .endr
- cbnz w12, .Loop_sad_sve2_x\x\()_\w\()x\h
- SAD_X_END_\w \x
-.vl_gt_16_sad_x_loop_\x\()_\w\()x\h\():
-.if \w == 24 || \w == 32
- SAD_X_SVE2_\w \h, \x
- ret
-.else
- SAD_X_START_\w \x
- mov w12, #\h/4
-.Loop_sad_sve2_gt_16_x\x\()_\w\()x\h:
- sub w12, w12, #1
- .rept 4
- .if \w == 24
- ld1 {v6.16b}, [x0], #16
- ld1 {v7.8b}, [x0], x9
- .elseif \w == 32
- ld1 {v6.16b-v7.16b}, [x0], x9
- .elseif \w == 48
- ld1 {v4.16b-v6.16b}, [x0], x9
- .elseif \w == 64
- ld1 {v4.16b-v7.16b}, [x0], x9
- .endif
- SAD_X_\w x1, v16, v20
- SAD_X_\w x2, v17, v21
- SAD_X_\w x3, v18, v22
- .if \x == 4
- SAD_X_\w x4, v19, v23
- .endif
- .endr
- cbnz w12, .Loop_sad_sve2_gt_16_x\x\()_\w\()x\h
- SAD_X_END_\w \x
-.endif
-endfunc
-.endm
-
-
-SAD_X_FUNC_SVE2 3, 4, 4
-SAD_X_FUNC_SVE2 3, 4, 8
-SAD_X_FUNC_SVE2 3, 4, 16
-SAD_X_FUNC_SVE2 3, 8, 4
-SAD_X_FUNC_SVE2 3, 8, 8
-SAD_X_FUNC_SVE2 3, 8, 16
-SAD_X_FUNC_SVE2 3, 8, 32
-SAD_X_LOOP_SVE2 3, 12, 16
-SAD_X_LOOP_SVE2 3, 16, 4
-SAD_X_LOOP_SVE2 3, 16, 8
-SAD_X_LOOP_SVE2 3, 16, 12
-SAD_X_LOOP_SVE2 3, 16, 16
-SAD_X_LOOP_SVE2 3, 16, 32
-SAD_X_LOOP_SVE2 3, 16, 64
-SAD_X_LOOP_SVE2 3, 24, 32
-SAD_X_LOOP_SVE2 3, 32, 8
-SAD_X_LOOP_SVE2 3, 32, 16
-SAD_X_LOOP_SVE2 3, 32, 24
-SAD_X_LOOP_SVE2 3, 32, 32
-SAD_X_LOOP_SVE2 3, 32, 64
-SAD_X_LOOP_SVE2 3, 48, 64
-SAD_X_LOOP_SVE2 3, 64, 16
-SAD_X_LOOP_SVE2 3, 64, 32
-SAD_X_LOOP_SVE2 3, 64, 48
-SAD_X_LOOP_SVE2 3, 64, 64
-
-SAD_X_FUNC_SVE2 4, 4, 4
-SAD_X_FUNC_SVE2 4, 4, 8
-SAD_X_FUNC_SVE2 4, 4, 16
-SAD_X_FUNC_SVE2 4, 8, 4
-SAD_X_FUNC_SVE2 4, 8, 8
-SAD_X_FUNC_SVE2 4, 8, 16
-SAD_X_FUNC_SVE2 4, 8, 32
-SAD_X_LOOP_SVE2 4, 12, 16
-SAD_X_LOOP_SVE2 4, 16, 4
-SAD_X_LOOP_SVE2 4, 16, 8
-SAD_X_LOOP_SVE2 4, 16, 12
-SAD_X_LOOP_SVE2 4, 16, 16
-SAD_X_LOOP_SVE2 4, 16, 32
-SAD_X_LOOP_SVE2 4, 16, 64
-SAD_X_LOOP_SVE2 4, 24, 32
-SAD_X_LOOP_SVE2 4, 32, 8
-SAD_X_LOOP_SVE2 4, 32, 16
-SAD_X_LOOP_SVE2 4, 32, 24
-SAD_X_LOOP_SVE2 4, 32, 32
-SAD_X_LOOP_SVE2 4, 32, 64
-SAD_X_LOOP_SVE2 4, 48, 64
-SAD_X_LOOP_SVE2 4, 64, 16
-SAD_X_LOOP_SVE2 4, 64, 32
-SAD_X_LOOP_SVE2 4, 64, 48
-SAD_X_LOOP_SVE2 4, 64, 64
diff --git a/source/common/aarch64/sad-a.S b/source/common/aarch64/sad-a.S
index 4fef9e24c..5dc50fb9d 100644
--- a/source/common/aarch64/sad-a.S
+++ b/source/common/aarch64/sad-a.S
@@ -24,7 +24,6 @@
*****************************************************************************/
#include "asm.S"
-#include "sad-a-common.S"
#ifdef __APPLE__
.section __RODATA,__rodata
@@ -36,6 +35,225 @@
.text
+.macro SAD_START_4 f
+ ldr s0, [x0]
+ ldr s1, [x2]
+ add x0, x0, x1
+ add x2, x2, x3
+ ld1 {v0.s}[1], [x0], x1
+ ld1 {v1.s}[1], [x2], x3
+ \f v16.8h, v0.8b, v1.8b
+.endm
+
+.macro SAD_4 h
+.rept \h / 2 - 1
+ SAD_START_4 uabal
+.endr
+.endm
+
+.macro SAD_START_8 f
+ ld1 {v0.8b}, [x0], x1
+ ld1 {v1.8b}, [x2], x3
+ ld1 {v2.8b}, [x0], x1
+ ld1 {v3.8b}, [x2], x3
+ \f v16.8h, v0.8b, v1.8b
+ \f v17.8h, v2.8b, v3.8b
+.endm
+
+.macro SAD_8 h
+.rept \h / 2 - 1
+ SAD_START_8 uabal
+.endr
+.endm
+
+.macro SAD_START_16
+ movi v16.16b, #0
+ movi v17.16b, #0
+.endm
+
+.macro SAD_16
+ ld1 {v0.16b}, [x0], x1
+ ld1 {v1.16b}, [x2], x3
+ ld1 {v2.16b}, [x0], x1
+ ld1 {v3.16b}, [x2], x3
+ uabd v20.16b, v0.16b, v1.16b
+ uadalp v16.8h, v20.16b
+ uabd v21.16b, v2.16b, v3.16b
+ uadalp v17.8h, v21.16b
+.endm
+
+.macro SAD_END_16
+ add v16.8h, v16.8h, v17.8h
+ uaddlv s0, v16.8h
+ fmov x0, d0
+ ret
+.endm
+
+.macro SAD_START_32
+ movi v16.16b, #0
+ movi v17.16b, #0
+ movi v18.16b, #0
+ movi v19.16b, #0
+.endm
+
+.macro SAD_32
+ ld1 {v0.16b-v1.16b}, [x0], x1
+ ld1 {v2.16b-v3.16b}, [x2], x3
+ ld1 {v4.16b-v5.16b}, [x0], x1
+ ld1 {v6.16b-v7.16b}, [x2], x3
+ uabd v20.16b, v0.16b, v2.16b
+ uadalp v16.8h, v20.16b
+ uabd v21.16b, v1.16b, v3.16b
+ uadalp v17.8h, v21.16b
+ uabd v22.16b, v4.16b, v6.16b
+ uadalp v18.8h, v22.16b
+ uabd v23.16b, v5.16b, v7.16b
+ uadalp v19.8h, v23.16b
+.endm
+
+.macro SAD_END_32
+ add v16.8h, v16.8h, v17.8h
+ add v17.8h, v18.8h, v19.8h
+ add v16.8h, v16.8h, v17.8h
+ uaddlv s0, v16.8h
+ fmov w0, s0
+ ret
+.endm
+
+.macro SAD_START_64
+ movi v16.16b, #0
+ movi v17.16b, #0
+ movi v18.16b, #0
+ movi v19.16b, #0
+.endm
+
+.macro SAD_64
+ ld1 {v0.16b-v3.16b}, [x0], x1
+ ld1 {v4.16b-v7.16b}, [x2], x3
+ ld1 {v24.16b-v27.16b}, [x0], x1
+ ld1 {v28.16b-v31.16b}, [x2], x3
+ uabd v20.16b, v0.16b, v4.16b
+ uadalp v16.8h, v20.16b
+ uabd v21.16b, v1.16b, v5.16b
+ uadalp v17.8h, v21.16b
+ uabd v22.16b, v2.16b, v6.16b
+ uadalp v18.8h, v22.16b
+ uabd v23.16b, v3.16b, v7.16b
+ uadalp v19.8h, v23.16b
+ uabd v20.16b, v24.16b, v28.16b
+ uadalp v16.8h, v20.16b
+ uabd v21.16b, v25.16b, v29.16b
+ uadalp v17.8h, v21.16b
+ uabd v22.16b, v26.16b, v30.16b
+ uadalp v18.8h, v22.16b
+ uabd v23.16b, v27.16b, v31.16b
+ uadalp v19.8h, v23.16b
+.endm
+
+.macro SAD_END_64
+ uaddlp v16.4s, v16.8h
+ uadalp v16.4s, v17.8h
+ uadalp v16.4s, v18.8h
+ uadalp v16.4s, v19.8h
+ uaddlv d0, v16.4s
+ fmov x0, d0
+ ret
+.endm
+
+.macro SAD_START_12
+ movrel x12, sad12_mask
+ ld1 {v31.16b}, [x12]
+ movi v16.16b, #0
+ movi v17.16b, #0
+.endm
+
+.macro SAD_12
+ ld1 {v0.16b}, [x0], x1
+ and v0.16b, v0.16b, v31.16b
+ ld1 {v1.16b}, [x2], x3
+ and v1.16b, v1.16b, v31.16b
+ ld1 {v2.16b}, [x0], x1
+ and v2.16b, v2.16b, v31.16b
+ ld1 {v3.16b}, [x2], x3
+ and v3.16b, v3.16b, v31.16b
+ uabd v20.16b, v0.16b, v1.16b
+ uadalp v16.8h, v20.16b
+ uabd v21.16b, v2.16b, v3.16b
+ uadalp v17.8h, v21.16b
+.endm
+
+.macro SAD_END_12
+ add v16.8h, v16.8h, v17.8h
+ uaddlv s0, v16.8h
+ fmov w0, s0
+ ret
+.endm
+
+.macro SAD_START_24
+ movi v16.16b, #0
+ movi v17.16b, #0
+ sub x1, x1, #16
+ sub x3, x3, #16
+.endm
+
+.macro SAD_24
+ ld1 {v0.16b}, [x0], #16
+ ld1 {v1.8b}, [x0], x1
+ ld1 {v2.16b}, [x2], #16
+ ld1 {v3.8b}, [x2], x3
+ ld1 {v4.16b}, [x0], #16
+ ld1 {v5.8b}, [x0], x1
+ ld1 {v6.16b}, [x2], #16
+ ld1 {v7.8b}, [x2], x3
+ uabd v20.16b, v0.16b, v2.16b
+ uadalp v16.8h, v20.16b
+ uabal v17.8h, v1.8b, v3.8b
+ uabd v20.16b, v4.16b, v6.16b
+ uadalp v16.8h, v20.16b
+ uabal v17.8h, v5.8b, v7.8b
+.endm
+
+.macro SAD_END_24
+ add v16.8h, v16.8h, v17.8h
+ uaddlv s0, v16.8h
+ fmov w0, s0
+ ret
+.endm
+
+.macro SAD_START_48
+ movi v16.16b, #0
+ movi v17.16b, #0
+ movi v18.16b, #0
+.endm
+
+.macro SAD_48
+ ld1 {v0.16b-v2.16b}, [x0], x1
+ ld1 {v4.16b-v6.16b}, [x2], x3
+ ld1 {v24.16b-v26.16b}, [x0], x1
+ ld1 {v28.16b-v30.16b}, [x2], x3
+ uabd v20.16b, v0.16b, v4.16b
+ uadalp v16.8h, v20.16b
+ uabd v21.16b, v1.16b, v5.16b
+ uadalp v17.8h, v21.16b
+ uabd v22.16b, v2.16b, v6.16b
+ uadalp v18.8h, v22.16b
+ uabd v20.16b, v24.16b, v28.16b
+ uadalp v16.8h, v20.16b
+ uabd v21.16b, v25.16b, v29.16b
+ uadalp v17.8h, v21.16b
+ uabd v22.16b, v26.16b, v30.16b
+ uadalp v18.8h, v22.16b
+.endm
+
+.macro SAD_END_48
+ uaddlp v16.4s, v16.8h
+ uadalp v16.4s, v17.8h
+ uadalp v16.4s, v18.8h
+ uaddlv d0, v16.4s
+ fmov x0, d0
+ ret
+.endm
+
// Fully unrolled.
.macro SAD_FUNC w, h
function PFX(pixel_sad_\w\()x\h\()_neon)
@@ -96,6 +314,221 @@ SAD_FUNC_LOOP 48, 64
// SAD_X3 and SAD_X4 code start
+.macro SAD_X_START_4 h, x, f
+ ldr s0, [x0]
+ ldr s1, [x1]
+ ldr s2, [x2]
+ ldr s3, [x3]
+ add x0, x0, x9
+ add x1, x1, x5
+ add x2, x2, x5
+ add x3, x3, x5
+ ld1 {v0.s}[1], [x0], x9
+ ld1 {v1.s}[1], [x1], x5
+ ld1 {v2.s}[1], [x2], x5
+ ld1 {v3.s}[1], [x3], x5
+ \f v16.8h, v0.8b, v1.8b
+ \f v17.8h, v0.8b, v2.8b
+ \f v18.8h, v0.8b, v3.8b
+.if \x == 4
+ ldr s4, [x4]
+ add x4, x4, x5
+ ld1 {v4.s}[1], [x4], x5
+ \f v19.8h, v0.8b, v4.8b
+.endif
+.endm
+
+.macro SAD_X_4 h, x
+.rept \h/2 - 1
+ SAD_X_START_4 \h, \x, uabal
+.endr
+.endm
+
+.macro SAD_X_END_4 x
+ uaddlv s0, v16.8h
+ uaddlv s1, v17.8h
+ uaddlv s2, v18.8h
+ stp s0, s1, [x6]
+.if \x == 3
+ str s2, [x6, #8]
+.elseif \x == 4
+ uaddlv s3, v19.8h
+ stp s2, s3, [x6, #8]
+.endif
+ ret
+.endm
+
+.macro SAD_X_START_8 h, x, f
+ ld1 {v0.8b}, [x0], x9
+ ld1 {v1.8b}, [x1], x5
+ ld1 {v2.8b}, [x2], x5
+ ld1 {v3.8b}, [x3], x5
+ \f v16.8h, v0.8b, v1.8b
+ \f v17.8h, v0.8b, v2.8b
+ \f v18.8h, v0.8b, v3.8b
+.if \x == 4
+ ld1 {v4.8b}, [x4], x5
+ \f v19.8h, v0.8b, v4.8b
+.endif
+.endm
+
+.macro SAD_X_8 h x
+.rept \h - 1
+ SAD_X_START_8 \h, \x, uabal
+.endr
+.endm
+
+.macro SAD_X_END_8 x
+ SAD_X_END_4 \x
+.endm
+
+.macro SAD_X_START_12 x
+ SAD_X_START_16 \x
+.endm
+
+.macro SAD_X_12 base v1 v2
+ // v2: unused
+ // v31: bitmask for 12xh blocks
+ ld1 {v0.16b}, [ \base ], x5
+ and v0.16b, v0.16b, v31.16b
+
+ uabd v24.16b, v0.16b, v6.16b
+ uadalp \v1\().8h, v24.16b
+.endm
+
+.macro SAD_X_END_12 x
+ SAD_X_END_4 \x
+.endm
+
+.macro SAD_X_START_16 x
+ movi v16.16b, #0
+ movi v17.16b, #0
+ movi v18.16b, #0
+.if \x == 4
+ movi v19.16b, #0
+.endif
+.endm
+
+.macro SAD_X_16 base v1 v2
+ // v2: unused
+ ld1 {v0.16b}, [ \base ], x5
+ uabd v24.16b, v0.16b, v6.16b
+ uadalp \v1\().8h, v24.16b
+.endm
+
+.macro SAD_X_END_16 x
+ SAD_X_END_4 \x
+.endm
+
+.macro SAD_X_START_LARGE x
+ movi v16.16b, #0
+ movi v17.16b, #0
+ movi v18.16b, #0
+ movi v20.16b, #0
+ movi v21.16b, #0
+ movi v22.16b, #0
+.if \x == 4
+ movi v19.16b, #0
+ movi v23.16b, #0
+.endif
+.endm
+
+.macro SAD_X_END_LARGE x
+ uaddlp v16.4s, v16.8h
+ uadalp v16.4s, v20.8h
+ uaddlp v17.4s, v17.8h
+ uadalp v17.4s, v21.8h
+ uaddlp v18.4s, v18.8h
+ uadalp v18.4s, v22.8h
+.if \x == 3
+ addv s0, v16.4s
+ addv s1, v17.4s
+ addv s2, v18.4s
+ stp s0, s1, [x6], #8
+ str s2, [x6]
+.elseif \x == 4
+ uaddlp v19.4s, v19.8h
+ uadalp v19.4s, v23.8h
+ addp v16.4s, v16.4s, v17.4s
+ addp v18.4s, v18.4s, v19.4s
+ addp v16.4s, v16.4s, v18.4s
+ str q16, [x6]
+.endif
+ ret
+.endm
+
+.macro SAD_X_START_24 x
+ SAD_X_START_LARGE \x
+ sub x5, x5, #16
+ sub x9, x9, #16
+.endm
+
+.macro SAD_X_24 base v1 v2
+ ld1 {v0.16b}, [ \base ], #16
+ ld1 {v1.8b}, [ \base ], x5
+ uabd v24.16b, v0.16b, v6.16b
+ uadalp \v1\().8h, v24.16b
+ uabal \v2\().8h, v1.8b, v7.8b
+.endm
+
+.macro SAD_X_END_24 x
+ SAD_X_END_LARGE \x
+.endm
+
+.macro SAD_X_START_32 x
+ SAD_X_START_LARGE \x
+.endm
+
+.macro SAD_X_32 base v1 v2
+ ld1 {v0.16b-v1.16b}, [ \base ], x5
+ uabd v24.16b, v0.16b, v6.16b
+ uadalp \v1\().8h, v24.16b
+ uabd v25.16b, v1.16b, v7.16b
+ uadalp \v2\().8h, v25.16b
+.endm
+
+.macro SAD_X_END_32 x
+ SAD_X_END_LARGE \x
+.endm
+
+.macro SAD_X_START_48 x
+ SAD_X_START_LARGE \x
+.endm
+
+.macro SAD_X_48 base v1 v2
+ ld1 {v0.16b-v2.16b}, [ \base ], x5
+ uabd v24.16b, v0.16b, v4.16b
+ uadalp \v1\().8h, v24.16b
+ uabd v25.16b, v1.16b, v5.16b
+ uadalp \v2\().8h, v25.16b
+ uabd v26.16b, v2.16b, v6.16b
+ uadalp \v1\().8h, v26.16b
+.endm
+
+.macro SAD_X_END_48 x
+ SAD_X_END_LARGE \x
+.endm
+
+.macro SAD_X_START_64 x
+ SAD_X_START_LARGE \x
+.endm
+
+.macro SAD_X_64 base v1 v2
+ ld1 {v0.16b-v3.16b}, [ \base ], x5
+ uabd v24.16b, v0.16b, v4.16b
+ uadalp \v1\().8h, v24.16b
+ uabd v25.16b, v1.16b, v5.16b
+ uadalp \v2\().8h, v25.16b
+ uabd v26.16b, v2.16b, v6.16b
+ uadalp \v1\().8h, v26.16b
+ uabd v27.16b, v3.16b, v7.16b
+ uadalp \v2\().8h, v27.16b
+.endm
+
+.macro SAD_X_END_64 x
+ SAD_X_END_LARGE \x
+.endm
+
// static void x264_pixel_sad_x3_##size(pixel *fenc, pixel *pix0, pixel *pix1, pixel *pix2, intptr_t i_stride, int scores[3])
// static void x264_pixel_sad_x4_##size(pixel *fenc, pixel *pix0, pixel *pix1,pixel *pix2, pixel *pix3, intptr_t i_stride, int scores[4])
.macro SAD_X_FUNC x, w, h
@@ -213,3 +646,7 @@ SAD_X_LOOP 4, 64, 16
SAD_X_LOOP 4, 64, 32
SAD_X_LOOP 4, 64, 48
SAD_X_LOOP 4, 64, 64
+
+const sad12_mask, align=8
+.byte 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 0, 0, 0, 0
+endconst
--
2.42.1