[x265] [PATCH 8/8] AArch64: Implement cpy1Dto2D_shr using Neon intrinsics
Li Zhang
li.zhang2 at arm.com
Mon May 19 16:44:09 UTC 2025
Delete the Neon and SVE assembly implementations of these kernels as
they are slower than the new Neon intrinsics implementation.
---
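Note for reviewers (not part of the commit message): cpy1Dto2D_shr copies a
size x size block of int16_t coefficients from a contiguous 1D buffer into a
strided 2D block, applying a rounding arithmetic shift right by `shift' to
each value. The intrinsics below use vrshlq_s16 with a negated shift amount,
which folds the add-rounding-constant-then-shift sequence of the deleted
assembly into a single rounding-shift instruction. A minimal scalar sketch of
the intended behaviour follows, for illustration only; the helper name
cpy1Dto2D_shr_ref is hypothetical and the code is not part of this patch.

    // Scalar reference sketch of the cpy1Dto2D_shr semantics
    // (illustration only, not part of this patch).
    #include <cstdint>

    template<int size>
    static void cpy1Dto2D_shr_ref(int16_t *dst, const int16_t *src,
                                  intptr_t dstStride, int shift)
    {
        const int round = 1 << (shift - 1); // rounding offset, assumes shift > 0
        for (int y = 0; y < size; y++)
        {
            for (int x = 0; x < size; x++)
                dst[x] = (int16_t)((src[x] + round) >> shift);
            src += size;      // 1D source is contiguous
            dst += dstStride; // 2D destination advances by its stride
        }
    }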
source/common/CMakeLists.txt | 2 +-
source/common/aarch64/asm-primitives.cpp | 12 --
source/common/aarch64/blockcopy8-common.S | 39 ----
source/common/aarch64/blockcopy8-sve.S | 206 ----------------------
source/common/aarch64/blockcopy8.S | 99 -----------
source/common/aarch64/pixel-prim.cpp | 36 ++++
6 files changed, 37 insertions(+), 357 deletions(-)
delete mode 100644 source/common/aarch64/blockcopy8-common.S
diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
index a6f56c8c8..7eb40fb05 100644
--- a/source/common/CMakeLists.txt
+++ b/source/common/CMakeLists.txt
@@ -112,7 +112,7 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
enable_language(ASM)
# Add Arm assembly files here.
- set(A_SRCS asm.S mc-a.S mc-a-common.S sad-a.S pixel-util.S pixel-util-common.S p2s.S p2s-common.S blockcopy8.S blockcopy8-common.S ssd-a.S ssd-a-common.S intrapred.S dct.S)
+ set(A_SRCS asm.S mc-a.S mc-a-common.S sad-a.S pixel-util.S pixel-util-common.S p2s.S p2s-common.S blockcopy8.S ssd-a.S ssd-a-common.S intrapred.S dct.S)
set(A_SRCS_NEON_DOTPROD sad-neon-dotprod.S ssd-neon-dotprod.S)
set(A_SRCS_SVE asm-sve.S blockcopy8-sve.S p2s-sve.S pixel-util-sve.S ssd-a-sve.S)
set(A_SRCS_SVE2 mc-a-sve2.S pixel-util-sve2.S)
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index 6d4e0b67a..536af1d5d 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -420,13 +420,6 @@ void setupNeonPrimitives(EncoderPrimitives &p)
p.cu[BLOCK_16x16].count_nonzero = PFX(count_nonzero_16_neon);
p.cu[BLOCK_32x32].count_nonzero = PFX(count_nonzero_32_neon);
- // cpy1Dto2D_shr
- p.cu[BLOCK_4x4].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_4x4_neon);
- p.cu[BLOCK_8x8].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_8x8_neon);
- p.cu[BLOCK_16x16].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_16x16_neon);
- p.cu[BLOCK_32x32].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_32x32_neon);
- p.cu[BLOCK_64x64].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_64x64_neon);
-
// sad
ALL_LUMA_PU(sad, pixel_sad, neon);
ALL_LUMA_PU(sad_x3, sad_x3, neon);
@@ -600,11 +593,6 @@ void setupSvePrimitives(EncoderPrimitives &p)
LUMA_TU_CAN_USE_SVE(blockfill_s[ALIGNED], blockfill_s);
LUMA_TU_CAN_USE_SVE(blockfill_s[NONALIGNED], blockfill_s);
- // cpy1Dto2D_shr
- p.cu[BLOCK_16x16].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_16x16_sve);
- p.cu[BLOCK_32x32].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_32x32_sve);
- p.cu[BLOCK_64x64].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_64x64_sve);
-
// sse_ss
p.cu[BLOCK_4x4].sse_ss = PFX(pixel_sse_ss_4x4_sve);
p.cu[BLOCK_8x8].sse_ss = PFX(pixel_sse_ss_8x8_sve);
diff --git a/source/common/aarch64/blockcopy8-common.S b/source/common/aarch64/blockcopy8-common.S
deleted file mode 100644
index 6d92756fc..000000000
--- a/source/common/aarch64/blockcopy8-common.S
+++ /dev/null
@@ -1,39 +0,0 @@
-/*****************************************************************************
- * Copyright (C) 2022-2023 MulticoreWare, Inc
- *
- * Authors: David Chen <david.chen at myais.com.cn>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at license @ x265.com.
- *****************************************************************************/
-
-// This file contains the macros written using NEON instruction set
-// that are also used by the SVE2 functions
-
-#include "asm.S"
-
-.arch armv8-a
-
-// void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
-.macro cpy1Dto2D_shr_start
- add x2, x2, x2
- dup v0.8h, w3
- cmeq v1.8h, v1.8h, v1.8h
- sshl v1.8h, v1.8h, v0.8h
- sri v1.8h, v1.8h, #1
- neg v0.8h, v0.8h
-.endm
diff --git a/source/common/aarch64/blockcopy8-sve.S b/source/common/aarch64/blockcopy8-sve.S
index e2154414c..401167038 100644
--- a/source/common/aarch64/blockcopy8-sve.S
+++ b/source/common/aarch64/blockcopy8-sve.S
@@ -22,7 +22,6 @@
*****************************************************************************/
#include "asm-sve.S"
-#include "blockcopy8-common.S"
.arch armv8-a+sve
@@ -69,208 +68,3 @@ function PFX(blockfill_s_32x32_sve)
.endr
ret
endfunc
-
-// void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
-
-function PFX(cpy1Dto2D_shr_16x16_sve)
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_cpy1Dto2D_shr_16x16
- cpy1Dto2D_shr_start
- mov w12, #4
-.Loop_cpy1Dto2D_shr_16:
- sub w12, w12, #1
-.rept 4
- ld1 {v2.8h-v3.8h}, [x1], #32
- sub v2.8h, v2.8h, v1.8h
- sub v3.8h, v3.8h, v1.8h
- sshl v2.8h, v2.8h, v0.8h
- sshl v3.8h, v3.8h, v0.8h
- st1 {v2.8h-v3.8h}, [x0], x2
-.endr
- cbnz w12, .Loop_cpy1Dto2D_shr_16
- ret
-.vl_gt_16_cpy1Dto2D_shr_16x16:
- dup z0.h, w3
- sub w4, w3, #1
- dup z1.h, w4
- ptrue p0.h, vl16
- mov z2.h, #1
- lsl z2.h, p0/m, z2.h, z1.h
-.rept 16
- ld1d {z5.d}, p0/z, [x1]
- add x1, x1, #32
- add z5.h, p0/m, z5.h, z2.h
- asr z5.h, p0/m, z5.h, z0.h
- st1d {z5.d}, p0, [x0]
- add x0, x0, x2, lsl #1
-.endr
- ret
-endfunc
-
-function PFX(cpy1Dto2D_shr_32x32_sve)
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_cpy1Dto2D_shr_32x32
- cpy1Dto2D_shr_start
- mov w12, #16
-.Loop_cpy1Dto2D_shr_32_sve:
- sub w12, w12, #1
-.rept 2
- ld1 {v2.16b-v5.16b}, [x1], #64
- sub v2.8h, v2.8h, v1.8h
- sub v3.8h, v3.8h, v1.8h
- sub v4.8h, v4.8h, v1.8h
- sub v5.8h, v5.8h, v1.8h
- sshl v2.8h, v2.8h, v0.8h
- sshl v3.8h, v3.8h, v0.8h
- sshl v4.8h, v4.8h, v0.8h
- sshl v5.8h, v5.8h, v0.8h
- st1 {v2.16b-v5.16b}, [x0], x2
-.endr
- cbnz w12, .Loop_cpy1Dto2D_shr_32_sve
- ret
-.vl_gt_16_cpy1Dto2D_shr_32x32:
- dup z0.h, w3
- sub w4, w3, #1
- dup z1.h, w4
- cmp x9, #48
- bgt .vl_gt_48_cpy1Dto2D_shr_32x32
- ptrue p0.h, vl16
- mov z2.h, #1
- lsl z2.h, p0/m, z2.h, z1.h
-.rept 32
- ld1d {z5.d}, p0/z, [x1]
- ld1d {z6.d}, p0/z, [x1, #1, mul vl]
- add x1, x1, #64
- add z5.h, p0/m, z5.h, z2.h
- add z6.h, p0/m, z6.h, z2.h
- asr z5.h, p0/m, z5.h, z0.h
- asr z6.h, p0/m, z6.h, z0.h
- st1d {z5.d}, p0, [x0]
- st1d {z6.d}, p0, [x0, #1, mul vl]
- add x0, x0, x2, lsl #1
-.endr
- ret
-.vl_gt_48_cpy1Dto2D_shr_32x32:
- ptrue p0.h, vl32
- mov z2.h, #1
- lsl z2.h, p0/m, z2.h, z1.h
-.rept 32
- ld1d {z5.d}, p0/z, [x1]
- add x1, x1, #64
- add z5.h, p0/m, z5.h, z2.h
- asr z5.h, p0/m, z5.h, z0.h
- st1d {z5.d}, p0, [x0]
- add x0, x0, x2, lsl #1
-.endr
- ret
-endfunc
-
-function PFX(cpy1Dto2D_shr_64x64_sve)
- dup z0.h, w3
- sub w4, w3, #1
- dup z1.h, w4
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_cpy1Dto2D_shr_64x64
- ptrue p0.h, vl8
- mov z2.h, #1
- lsl z2.h, p0/m, z2.h, z1.h
-.rept 64
- ld1d {z5.d}, p0/z, [x1]
- ld1d {z6.d}, p0/z, [x1, #1, mul vl]
- ld1d {z7.d}, p0/z, [x1, #2, mul vl]
- ld1d {z8.d}, p0/z, [x1, #3, mul vl]
- ld1d {z9.d}, p0/z, [x1, #4, mul vl]
- ld1d {z10.d}, p0/z, [x1, #5, mul vl]
- ld1d {z11.d}, p0/z, [x1, #6, mul vl]
- ld1d {z12.d}, p0/z, [x1, #7, mul vl]
- add x1, x1, #128
- add z5.h, p0/m, z5.h, z2.h
- add z6.h, p0/m, z6.h, z2.h
- add z7.h, p0/m, z7.h, z2.h
- add z8.h, p0/m, z8.h, z2.h
- add z9.h, p0/m, z9.h, z2.h
- add z10.h, p0/m, z10.h, z2.h
- add z11.h, p0/m, z11.h, z2.h
- add z12.h, p0/m, z12.h, z2.h
- asr z5.h, p0/m, z5.h, z0.h
- asr z6.h, p0/m, z6.h, z0.h
- asr z7.h, p0/m, z7.h, z0.h
- asr z8.h, p0/m, z8.h, z0.h
- asr z9.h, p0/m, z9.h, z0.h
- asr z10.h, p0/m, z10.h, z0.h
- asr z11.h, p0/m, z11.h, z0.h
- asr z12.h, p0/m, z12.h, z0.h
- st1d {z5.d}, p0, [x0]
- st1d {z6.d}, p0, [x0, #1, mul vl]
- st1d {z7.d}, p0, [x0, #2, mul vl]
- st1d {z8.d}, p0, [x0, #3, mul vl]
- st1d {z9.d}, p0, [x0, #4, mul vl]
- st1d {z10.d}, p0, [x0, #5, mul vl]
- st1d {z11.d}, p0, [x0, #6, mul vl]
- st1d {z12.d}, p0, [x0, #7, mul vl]
- add x0, x0, x2, lsl #1
-.endr
- ret
-.vl_gt_16_cpy1Dto2D_shr_64x64:
- cmp x9, #48
- bgt .vl_gt_48_cpy1Dto2D_shr_64x64
- ptrue p0.h, vl16
- mov z2.h, #1
- lsl z2.h, p0/m, z2.h, z1.h
-.rept 64
- ld1d {z5.d}, p0/z, [x1]
- ld1d {z6.d}, p0/z, [x1, #1, mul vl]
- ld1d {z7.d}, p0/z, [x1, #2, mul vl]
- ld1d {z8.d}, p0/z, [x1, #3, mul vl]
- add x1, x1, #128
- add z5.h, p0/m, z5.h, z2.h
- add z6.h, p0/m, z6.h, z2.h
- add z7.h, p0/m, z7.h, z2.h
- add z8.h, p0/m, z8.h, z2.h
- asr z5.h, p0/m, z5.h, z0.h
- asr z6.h, p0/m, z6.h, z0.h
- asr z7.h, p0/m, z7.h, z0.h
- asr z8.h, p0/m, z8.h, z0.h
- st1d {z5.d}, p0, [x0]
- st1d {z6.d}, p0, [x0, #1, mul vl]
- st1d {z7.d}, p0, [x0, #2, mul vl]
- st1d {z8.d}, p0, [x0, #3, mul vl]
- add x0, x0, x2, lsl #1
-.endr
- ret
-.vl_gt_48_cpy1Dto2D_shr_64x64:
- cmp x9, #112
- bgt .vl_gt_112_cpy1Dto2D_shr_64x64
- ptrue p0.h, vl32
- mov z2.h, #1
- lsl z2.h, p0/m, z2.h, z1.h
-.rept 64
- ld1d {z5.d}, p0/z, [x1]
- ld1d {z6.d}, p0/z, [x1, #1, mul vl]
- add x1, x1, #128
- add z5.h, p0/m, z5.h, z2.h
- add z6.h, p0/m, z6.h, z2.h
- asr z5.h, p0/m, z5.h, z0.h
- asr z6.h, p0/m, z6.h, z0.h
- st1d {z5.d}, p0, [x0]
- st1d {z6.d}, p0, [x0, #1, mul vl]
- add x0, x0, x2, lsl #1
-.endr
- ret
-.vl_gt_112_cpy1Dto2D_shr_64x64:
- ptrue p0.h, vl64
- mov z2.h, #1
- lsl z2.h, p0/m, z2.h, z1.h
-.rept 64
- ld1d {z5.d}, p0/z, [x1]
- add x1, x1, #128
- add z5.h, p0/m, z5.h, z2.h
- asr z5.h, p0/m, z5.h, z0.h
- st1d {z5.d}, p0, [x0]
- add x0, x0, x2, lsl #1
-.endr
- ret
-endfunc
diff --git a/source/common/aarch64/blockcopy8.S b/source/common/aarch64/blockcopy8.S
index 5118b3ede..00b49df4d 100644
--- a/source/common/aarch64/blockcopy8.S
+++ b/source/common/aarch64/blockcopy8.S
@@ -22,7 +22,6 @@
*****************************************************************************/
#include "asm.S"
-#include "blockcopy8-common.S"
#ifdef __APPLE__
.section __RODATA,__rodata
@@ -249,101 +248,3 @@ function PFX(count_nonzero_32_neon)
fmov w0, s0
ret
endfunc
-
-function PFX(cpy1Dto2D_shr_4x4_neon)
- cpy1Dto2D_shr_start
- ld1 {v2.16b-v3.16b}, [x1]
- sub v2.8h, v2.8h, v1.8h
- sub v3.8h, v3.8h, v1.8h
- sshl v2.8h, v2.8h, v0.8h
- sshl v3.8h, v3.8h, v0.8h
- st1 {v2.d}[0], [x0], x2
- st1 {v2.d}[1], [x0], x2
- st1 {v3.d}[0], [x0], x2
- st1 {v3.d}[1], [x0], x2
- ret
-endfunc
-
-function PFX(cpy1Dto2D_shr_8x8_neon)
- cpy1Dto2D_shr_start
-.rept 4
- ld1 {v2.16b-v3.16b}, [x1], #32
- sub v2.8h, v2.8h, v1.8h
- sub v3.8h, v3.8h, v1.8h
- sshl v2.8h, v2.8h, v0.8h
- sshl v3.8h, v3.8h, v0.8h
- st1 {v2.16b}, [x0], x2
- st1 {v3.16b}, [x0], x2
-.endr
- ret
-endfunc
-
-function PFX(cpy1Dto2D_shr_16x16_neon)
- cpy1Dto2D_shr_start
- mov w12, #4
-.Loop_cpy1Dto2D_shr_16:
- sub w12, w12, #1
-.rept 4
- ld1 {v2.8h-v3.8h}, [x1], #32
- sub v2.8h, v2.8h, v1.8h
- sub v3.8h, v3.8h, v1.8h
- sshl v2.8h, v2.8h, v0.8h
- sshl v3.8h, v3.8h, v0.8h
- st1 {v2.8h-v3.8h}, [x0], x2
-.endr
- cbnz w12, .Loop_cpy1Dto2D_shr_16
- ret
-endfunc
-
-function PFX(cpy1Dto2D_shr_32x32_neon)
- cpy1Dto2D_shr_start
- mov w12, #16
-.Loop_cpy1Dto2D_shr_32:
- sub w12, w12, #1
-.rept 2
- ld1 {v2.16b-v5.16b}, [x1], #64
- sub v2.8h, v2.8h, v1.8h
- sub v3.8h, v3.8h, v1.8h
- sub v4.8h, v4.8h, v1.8h
- sub v5.8h, v5.8h, v1.8h
- sshl v2.8h, v2.8h, v0.8h
- sshl v3.8h, v3.8h, v0.8h
- sshl v4.8h, v4.8h, v0.8h
- sshl v5.8h, v5.8h, v0.8h
- st1 {v2.16b-v5.16b}, [x0], x2
-.endr
- cbnz w12, .Loop_cpy1Dto2D_shr_32
- ret
-endfunc
-
-function PFX(cpy1Dto2D_shr_64x64_neon)
- cpy1Dto2D_shr_start
- mov w12, #32
- sub x2, x2, #64
-.Loop_cpy1Dto2D_shr_64:
- sub w12, w12, #1
-.rept 2
- ld1 {v2.16b-v5.16b}, [x1], #64
- ld1 {v16.16b-v19.16b}, [x1], #64
- sub v2.8h, v2.8h, v1.8h
- sub v3.8h, v3.8h, v1.8h
- sub v4.8h, v4.8h, v1.8h
- sub v5.8h, v5.8h, v1.8h
- sub v16.8h, v16.8h, v1.8h
- sub v17.8h, v17.8h, v1.8h
- sub v18.8h, v18.8h, v1.8h
- sub v19.8h, v19.8h, v1.8h
- sshl v2.8h, v2.8h, v0.8h
- sshl v3.8h, v3.8h, v0.8h
- sshl v4.8h, v4.8h, v0.8h
- sshl v5.8h, v5.8h, v0.8h
- sshl v16.8h, v16.8h, v0.8h
- sshl v17.8h, v17.8h, v0.8h
- sshl v18.8h, v18.8h, v0.8h
- sshl v19.8h, v19.8h, v0.8h
- st1 {v2.16b-v5.16b}, [x0], #64
- st1 {v16.16b-v19.16b}, [x0], x2
-.endr
- cbnz w12, .Loop_cpy1Dto2D_shr_64
- ret
-endfunc
diff --git a/source/common/aarch64/pixel-prim.cpp b/source/common/aarch64/pixel-prim.cpp
index b3d657961..575c9cab8 100644
--- a/source/common/aarch64/pixel-prim.cpp
+++ b/source/common/aarch64/pixel-prim.cpp
@@ -1438,6 +1438,41 @@ void cpy1Dto2D_shl_neon(int16_t *dst, const int16_t *src, intptr_t dstStride, in
}
}
+template<int size>
+void cpy1Dto2D_shr_neon(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
+{
+ X265_CHECK((((intptr_t)dst | (dstStride * sizeof(*dst))) & 15) == 0 || size == 4, "dst alignment error\n");
+ X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n");
+ X265_CHECK(shift > 0, "invalid shift\n");
+
+ for (int h = 0; h < size; h++)
+ {
+ for (int w = 0; w + 16 <= size; w += 16)
+ {
+ int16x8_t s0_lo = vld1q_s16(src + w);
+ int16x8_t s0_hi = vld1q_s16(src + w + 8);
+ int16x8_t d0_lo = vrshlq_s16(s0_lo, vdupq_n_s16(-shift));
+ int16x8_t d0_hi = vrshlq_s16(s0_hi, vdupq_n_s16(-shift));
+ vst1q_s16(dst + w, d0_lo);
+ vst1q_s16(dst + w + 8, d0_hi);
+ }
+ if (size == 8)
+ {
+ int16x8_t s0 = vld1q_s16(src);
+ int16x8_t d0 = vrshlq_s16(s0, vdupq_n_s16(-shift));
+ vst1q_s16(dst, d0);
+ }
+ if (size == 4)
+ {
+ int16x4_t s0 = vld1_s16(src);
+ int16x4_t d0 = vrshl_s16(s0, vdup_n_s16(-shift));
+ vst1_s16(dst, d0);
+ }
+
+ src += size;
+ dst += dstStride;
+ }
+}
template<int size>
uint64_t pixel_var_neon(const uint8_t *pix, intptr_t i_stride)
@@ -1922,6 +1957,7 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
p.cu[BLOCK_ ## W ## x ## H].cpy2Dto1D_shr = cpy2Dto1D_shr_neon<W>; \
p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl[NONALIGNED] = cpy1Dto2D_shl_neon<W>; \
p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl[ALIGNED] = cpy1Dto2D_shl_neon<W>; \
+ p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shr = cpy1Dto2D_shr_neon<W>; \
p.cu[BLOCK_ ## W ## x ## H].psy_cost_pp = psyCost_pp_neon<BLOCK_ ## W ## x ## H>; \
p.cu[BLOCK_ ## W ## x ## H].transpose = transpose_neon<W>;
#endif // HIGH_BIT_DEPTH
--
2.39.5 (Apple Git-154)