[x265] [PATCH 2/3] AArch64: Remove SVE and SVE2 sse_pp primitives
Hari Limaye
hari.limaye at arm.com
Tue Jun 25 12:52:31 UTC 2024
Remove the SVE and SVE2 implementations of the sse_pp primitives, as
these are now slower than the optimised Neon implementations.
Additionally, the SVE2 implementations for the 32x32 and 32x64 block
sizes contain bugs and produce incorrect results when the SVE vector
length is greater than 128 bits. In the VL>128 path of the 32x32
kernel, for example, the row loop uses smullb where smlalb appears
intended, overwriting the accumulator on every iteration.
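
For reference, sse_pp computes the sum of squared differences between
two pixel blocks. Below is a minimal scalar sketch of the operation the
removed kernels implement, assuming the general shape of x265's
pixel-compare primitives (the exact typedef names, such as pixel and
sse_t, vary by build configuration):

    #include <cstdint>
    #include <cstddef>

    // Scalar reference for sse_pp: sum of squared differences between
    // two width x height blocks of 8-bit pixels, walked row by row.
    static uint32_t sse_pp_ref(const uint8_t *pix1, intptr_t stride1,
                               const uint8_t *pix2, intptr_t stride2,
                               int width, int height)
    {
        uint32_t sum = 0;
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
            {
                int d = pix1[x] - pix2[x];
                sum += (uint32_t)(d * d);
            }
            pix1 += stride1;
            pix2 += stride2;
        }
        return sum;
    }

The kernels deleted below are specialised instances of this loop for
fixed block sizes; the retained Neon versions compute the same result
faster.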
---
source/common/CMakeLists.txt | 2 +-
source/common/aarch64/asm-primitives.cpp | 13 --
source/common/aarch64/ssd-a-sve.S | 78 -------
source/common/aarch64/ssd-a-sve2.S | 261 -----------------------
4 files changed, 1 insertion(+), 353 deletions(-)
delete mode 100644 source/common/aarch64/ssd-a-sve.S
diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
index 2b382c856..568c1a50c 100644
--- a/source/common/CMakeLists.txt
+++ b/source/common/CMakeLists.txt
@@ -110,7 +110,7 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
# add ARM assembly/intrinsic files here
set(A_SRCS asm.S mc-a.S mc-a-common.S sad-a.S pixel-util.S pixel-util-common.S p2s.S p2s-common.S ipfilter.S ipfilter-common.S blockcopy8.S blockcopy8-common.S ssd-a.S ssd-a-common.S)
- set(A_SRCS_SVE asm-sve.S blockcopy8-sve.S p2s-sve.S pixel-util-sve.S ssd-a-sve.S)
+ set(A_SRCS_SVE asm-sve.S blockcopy8-sve.S p2s-sve.S pixel-util-sve.S)
set(A_SRCS_SVE2 mc-a-sve2.S pixel-util-sve2.S ipfilter-sve2.S ssd-a-sve2.S)
set(A_SRCS_NEON_DOTPROD sad-neon-dotprod.S)
set(VEC_PRIMITIVES)
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index bc0798c4e..fc7d205db 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -1010,12 +1010,6 @@ void setupSvePrimitives(EncoderPrimitives &p)
p.cu[BLOCK_64x64].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_64x64_sve);
#if !HIGH_BIT_DEPTH
- // sse_pp
- p.cu[BLOCK_4x4].sse_pp = PFX(pixel_sse_pp_4x4_sve);
-
- p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].sse_pp = PFX(pixel_sse_pp_4x4_sve);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].sse_pp = PFX(pixel_sse_pp_4x8_sve);
-
p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sub_ps = PFX(pixel_sub_ps_8x16_sve);
// satd
@@ -1069,13 +1063,6 @@ void setupSve2Primitives(EncoderPrimitives &p)
CHROMA_422_PU_CAN_USE_SVE2(addAvg[NONALIGNED], addAvg);
CHROMA_422_PU_CAN_USE_SVE2(addAvg[ALIGNED], addAvg);
- // sse_pp
- p.cu[BLOCK_32x32].sse_pp = PFX(pixel_sse_pp_32x32_sve2);
- p.cu[BLOCK_64x64].sse_pp = PFX(pixel_sse_pp_64x64_sve2);
-
- p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sse_pp = PFX(pixel_sse_pp_32x32_sve2);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = PFX(pixel_sse_pp_32x64_sve2);
-
// sse_ss
p.cu[BLOCK_4x4].sse_ss = PFX(pixel_sse_ss_4x4_sve2);
p.cu[BLOCK_8x8].sse_ss = PFX(pixel_sse_ss_8x8_sve2);
diff --git a/source/common/aarch64/ssd-a-sve.S b/source/common/aarch64/ssd-a-sve.S
deleted file mode 100644
index 74fea4513..000000000
--- a/source/common/aarch64/ssd-a-sve.S
+++ /dev/null
@@ -1,78 +0,0 @@
-/*****************************************************************************
- * Copyright (C) 2022-2023 MulticoreWare, Inc
- *
- * Authors: David Chen <david.chen at myais.com.cn>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at license @ x265.com.
- *****************************************************************************/
-
-#include "asm-sve.S"
-
-.arch armv8-a+sve
-
-#ifdef __APPLE__
-.section __RODATA,__rodata
-#else
-.section .rodata
-#endif
-
-.align 4
-
-.text
-
-function PFX(pixel_sse_pp_4x4_sve)
- ptrue p0.s, vl4
- ld1b {z0.s}, p0/z, [x0]
- ld1b {z17.s}, p0/z, [x2]
- add x0, x0, x1
- add x2, x2, x3
- sub z0.s, p0/m, z0.s, z17.s
- mul z0.s, p0/m, z0.s, z0.s
-.rept 3
- ld1b {z16.s}, p0/z, [x0]
- ld1b {z17.s}, p0/z, [x2]
- add x0, x0, x1
- add x2, x2, x3
- sub z16.s, p0/m, z16.s, z17.s
- mla z0.s, p0/m, z16.s, z16.s
-.endr
- uaddv d0, p0, z0.s
- fmov w0, s0
- ret
-endfunc
-
-function PFX(pixel_sse_pp_4x8_sve)
- ptrue p0.s, vl4
- ld1b {z0.s}, p0/z, [x0]
- ld1b {z17.s}, p0/z, [x2]
- add x0, x0, x1
- add x2, x2, x3
- sub z0.s, p0/m, z0.s, z17.s
- mul z0.s, p0/m, z0.s, z0.s
-.rept 7
- ld1b {z16.s}, p0/z, [x0]
- ld1b {z17.s}, p0/z, [x2]
- add x0, x0, x1
- add x2, x2, x3
- sub z16.s, p0/m, z16.s, z17.s
- mla z0.s, p0/m, z16.s, z16.s
-.endr
- uaddv d0, p0, z0.s
- fmov w0, s0
- ret
-endfunc
diff --git a/source/common/aarch64/ssd-a-sve2.S b/source/common/aarch64/ssd-a-sve2.S
index 8077bd93c..b3e84b69b 100644
--- a/source/common/aarch64/ssd-a-sve2.S
+++ b/source/common/aarch64/ssd-a-sve2.S
@@ -36,267 +36,6 @@
.text
-function PFX(pixel_sse_pp_32x32_sve2)
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_pixel_sse_pp_32x32
- mov w12, #8
- movi v0.16b, #0
- movi v1.16b, #0
-.Loop_sse_pp_32_sve2:
- sub w12, w12, #1
-.rept 4
- ld1 {v16.16b,v17.16b}, [x0], x1
- ld1 {v18.16b,v19.16b}, [x2], x3
- usubl v2.8h, v16.8b, v18.8b
- usubl2 v3.8h, v16.16b, v18.16b
- usubl v4.8h, v17.8b, v19.8b
- usubl2 v5.8h, v17.16b, v19.16b
- smlal v0.4s, v2.4h, v2.4h
- smlal2 v1.4s, v2.8h, v2.8h
- smlal v0.4s, v3.4h, v3.4h
- smlal2 v1.4s, v3.8h, v3.8h
- smlal v0.4s, v4.4h, v4.4h
- smlal2 v1.4s, v4.8h, v4.8h
- smlal v0.4s, v5.4h, v5.4h
- smlal2 v1.4s, v5.8h, v5.8h
-.endr
- cbnz w12, .Loop_sse_pp_32_sve2
- add v0.4s, v0.4s, v1.4s
- ret_v0_w0
-.vl_gt_16_pixel_sse_pp_32x32:
- ptrue p0.b, vl32
- ld1b {z16.b}, p0/z, [x0]
- ld1b {z18.b}, p0/z, [x2]
- add x0, x0, x1
- add x2, x2, x3
- usublb z1.h, z16.b, z18.b
- usublt z2.h, z16.b, z18.b
- smullb z0.s, z1.h, z1.h
- smlalt z0.s, z1.h, z1.h
- smlalb z0.s, z2.h, z2.h
- smlalt z0.s, z2.h, z2.h
-.rept 31
- ld1b {z16.b}, p0/z, [x0]
- ld1b {z18.b}, p0/z, [x2]
- add x0, x0, x1
- add x2, x2, x3
- usublb z1.h, z16.b, z18.b
- usublt z2.h, z16.b, z18.b
- smullb z0.s, z1.h, z1.h
- smlalt z0.s, z1.h, z1.h
- smlalb z0.s, z2.h, z2.h
- smlalt z0.s, z2.h, z2.h
-.endr
- uaddv d3, p0, z0.s
- fmov w0, s3
- ret
-endfunc
-
-function PFX(pixel_sse_pp_32x64_sve2)
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_pixel_sse_pp_32x64
- ptrue p0.b, vl16
- ld1b {z16.b}, p0/z, [x0]
- ld1b {z17.b}, p0/z, [x0, #1, mul vl]
- ld1b {z18.b}, p0/z, [x2]
- ld1b {z19.b}, p0/z, [x2, #1, mul vl]
- add x0, x0, x1
- add x2, x2, x3
- usublb z1.h, z16.b, z18.b
- usublt z2.h, z16.b, z18.b
- usublb z3.h, z17.b, z19.b
- usublt z4.h, z17.b, z19.b
- smullb z20.s, z1.h, z1.h
- smullt z21.s, z1.h, z1.h
- smlalb z20.s, z2.h, z2.h
- smlalt z21.s, z2.h, z2.h
- smlalb z20.s, z3.h, z3.h
- smlalt z21.s, z3.h, z3.h
- smlalb z20.s, z4.h, z4.h
- smlalt z21.s, z4.h, z4.h
-.rept 63
- ld1b {z16.b}, p0/z, [x0]
- ld1b {z17.b}, p0/z, [x0, #1, mul vl]
- ld1b {z18.b}, p0/z, [x2]
- ld1b {z19.b}, p0/z, [x2, #1, mul vl]
- add x0, x0, x1
- add x2, x2, x3
- usublb z1.h, z16.b, z18.b
- usublt z2.h, z16.b, z18.b
- usublb z3.h, z17.b, z19.b
- usublt z4.h, z17.b, z19.b
- smlalb z20.s, z1.h, z1.h
- smlalt z21.s, z1.h, z1.h
- smlalb z20.s, z2.h, z2.h
- smlalt z21.s, z2.h, z2.h
- smlalb z20.s, z3.h, z3.h
- smlalt z21.s, z3.h, z3.h
- smlalb z20.s, z4.h, z4.h
- smlalt z21.s, z4.h, z4.h
-.endr
- uaddv d3, p0, z20.s
- fmov w0, s3
- uaddv d4, p0, z21.s
- fmov w1, s4
- add w0, w0, w1
- ret
-.vl_gt_16_pixel_sse_pp_32x64:
- ptrue p0.b, vl32
- ld1b {z16.b}, p0/z, [x0]
- ld1b {z18.b}, p0/z, [x2]
- add x0, x0, x1
- add x2, x2, x3
- usublb z1.h, z16.b, z18.b
- usublt z2.h, z16.b, z18.b
- smullb z20.s, z1.h, z1.h
- smullt z21.s, z1.h, z1.h
- smlalb z20.s, z2.h, z2.h
- smlalt z21.s, z2.h, z2.h
-.rept 63
- ld1b {z16.b}, p0/z, [x0]
- ld1b {z18.b}, p0/z, [x2]
- add x0, x0, x1
- add x2, x2, x3
- usublb z1.h, z16.b, z18.b
- usublt z2.h, z16.b, z18.b
- smlalb z20.s, z1.h, z1.h
- smlalt z21.s, z1.h, z1.h
- smlalb z20.s, z2.h, z2.h
- smlalt z21.s, z2.h, z2.h
-.endr
- uaddv d3, p0, z20.s
- fmov w0, s3
- uaddv d4, p0, z21.s
- fmov w1, s4
- add w0, w0, w1
- ret
-endfunc
-
-function PFX(pixel_sse_pp_64x64_sve2)
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_pixel_sse_pp_64x64
- mov w12, #16
- movi v0.16b, #0
- movi v1.16b, #0
-
-.Loop_sse_pp_64_sve2:
- sub w12, w12, #1
-.rept 4
- ld1 {v16.16b-v19.16b}, [x0], x1
- ld1 {v20.16b-v23.16b}, [x2], x3
-
- usubl v2.8h, v16.8b, v20.8b
- usubl2 v3.8h, v16.16b, v20.16b
- usubl v4.8h, v17.8b, v21.8b
- usubl2 v5.8h, v17.16b, v21.16b
- smlal v0.4s, v2.4h, v2.4h
- smlal2 v1.4s, v2.8h, v2.8h
- smlal v0.4s, v3.4h, v3.4h
- smlal2 v1.4s, v3.8h, v3.8h
- smlal v0.4s, v4.4h, v4.4h
- smlal2 v1.4s, v4.8h, v4.8h
- smlal v0.4s, v5.4h, v5.4h
- smlal2 v1.4s, v5.8h, v5.8h
-
- usubl v2.8h, v18.8b, v22.8b
- usubl2 v3.8h, v18.16b, v22.16b
- usubl v4.8h, v19.8b, v23.8b
- usubl2 v5.8h, v19.16b, v23.16b
- smlal v0.4s, v2.4h, v2.4h
- smlal2 v1.4s, v2.8h, v2.8h
- smlal v0.4s, v3.4h, v3.4h
- smlal2 v1.4s, v3.8h, v3.8h
- smlal v0.4s, v4.4h, v4.4h
- smlal2 v1.4s, v4.8h, v4.8h
- smlal v0.4s, v5.4h, v5.4h
- smlal2 v1.4s, v5.8h, v5.8h
-.endr
- cbnz w12, .Loop_sse_pp_64_sve2
- add v0.4s, v0.4s, v1.4s
- ret_v0_w0
-.vl_gt_16_pixel_sse_pp_64x64:
- cmp x9, #48
- bgt .vl_gt_48_pixel_sse_pp_64x64
- ptrue p0.b, vl32
- ld1b {z16.b}, p0/z, [x0]
- ld1b {z17.b}, p0/z, [x0, #1, mul vl]
- ld1b {z20.b}, p0/z, [x2]
- ld1b {z21.b}, p0/z, [x2, #1, mul vl]
- add x0, x0, x1
- add x2, x2, x3
- usublb z1.h, z16.b, z20.b
- usublt z2.h, z16.b, z20.b
- usublb z3.h, z17.b, z21.b
- usublt z4.h, z17.b, z21.b
- smullb z24.s, z1.h, z1.h
- smullt z25.s, z1.h, z1.h
- smlalb z24.s, z2.h, z2.h
- smlalt z25.s, z2.h, z2.h
- smlalb z24.s, z3.h, z3.h
- smlalt z25.s, z3.h, z3.h
- smlalb z24.s, z4.h, z4.h
- smlalt z25.s, z4.h, z4.h
-.rept 63
- ld1b {z16.b}, p0/z, [x0]
- ld1b {z17.b}, p0/z, [x0, #1, mul vl]
- ld1b {z20.b}, p0/z, [x2]
- ld1b {z21.b}, p0/z, [x2, #1, mul vl]
- add x0, x0, x1
- add x2, x2, x3
- usublb z1.h, z16.b, z20.b
- usublt z2.h, z16.b, z20.b
- usublb z3.h, z17.b, z21.b
- usublt z4.h, z17.b, z21.b
- smlalb z24.s, z1.h, z1.h
- smlalt z25.s, z1.h, z1.h
- smlalb z24.s, z2.h, z2.h
- smlalt z25.s, z2.h, z2.h
- smlalb z24.s, z3.h, z3.h
- smlalt z25.s, z3.h, z3.h
- smlalb z24.s, z4.h, z4.h
- smlalt z25.s, z4.h, z4.h
-.endr
- uaddv d3, p0, z24.s
- fmov w0, s3
- uaddv d4, p0, z25.s
- fmov w1, s4
- add w0, w0, w1
- ret
-.vl_gt_48_pixel_sse_pp_64x64:
- ptrue p0.b, vl64
- ld1b {z16.b}, p0/z, [x0]
- ld1b {z20.b}, p0/z, [x2]
- add x0, x0, x1
- add x2, x2, x3
- usublb z1.h, z16.b, z20.b
- usublt z2.h, z16.b, z20.b
- smullb z24.s, z1.h, z1.h
- smullt z25.s, z1.h, z1.h
- smlalb z24.s, z2.h, z2.h
- smlalt z25.s, z2.h, z2.h
-.rept 63
- ld1b {z16.b}, p0/z, [x0]
- ld1b {z20.b}, p0/z, [x2]
- add x0, x0, x1
- add x2, x2, x3
- usublb z1.h, z16.b, z20.b
- usublt z2.h, z16.b, z20.b
- smlalb z24.s, z1.h, z1.h
- smlalt z25.s, z1.h, z1.h
- smlalb z24.s, z2.h, z2.h
- smlalt z25.s, z2.h, z2.h
-.endr
- uaddv d3, p0, z24.s
- fmov w0, s3
- uaddv d4, p0, z25.s
- fmov w1, s4
- add w0, w0, w1
- ret
-endfunc
-
function PFX(pixel_sse_ss_4x4_sve2)
ptrue p0.b, vl8
ld1b {z16.b}, p0/z, [x0]
--
2.42.1