[x265] [PATCH 7/8] AArch64: Implement cpy2Dto1D_shr using Neon intrinsics
Li Zhang
li.zhang2 at arm.com
Mon May 19 16:43:54 UTC 2025
Delete the Neon and SVE assembly implementations of these kernels as
they are slower than the new Neon intrinsics implementation.
---
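Note for reviewers (not part of the commit): the new kernel uses vrshlq_s16 with a
negated shift amount, i.e. a rounding shift right, which folds the
add-rounding-constant-then-arithmetic-shift sequence of the removed assembly into a
single instruction per vector. The snippet below is a minimal standalone sketch of
that equivalence; the shift value and input data are made up for illustration and are
not taken from the patch.

    #include <arm_neon.h>
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        const int shift = 3;  // example shift; the primitive requires shift > 0
        int16_t src[8] = { -100, -5, -1, 0, 1, 5, 100, 32000 };
        int16_t dst[8];

        // Rounding shift right by `shift`: same result as the scalar
        // (x + (1 << (shift - 1))) >> shift that the removed asm computed
        // with an explicit add/asr (SVE) or sub/sshl (Neon) sequence.
        int16x8_t s0 = vld1q_s16(src);
        int16x8_t d0 = vrshlq_s16(s0, vdupq_n_s16(-shift));
        vst1q_s16(dst, d0);

        for (int i = 0; i < 8; i++)
        {
            int16_t ref = (int16_t)((src[i] + (1 << (shift - 1))) >> shift);
            printf("%6d -> neon %6d, scalar %6d\n", src[i], dst[i], ref);
        }
        return 0;
    }
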
source/common/aarch64/asm-primitives.cpp | 10 --
source/common/aarch64/blockcopy8-common.S | 9 --
source/common/aarch64/blockcopy8-sve.S | 138 ----------------------
source/common/aarch64/blockcopy8.S | 67 -----------
source/common/aarch64/pixel-prim.cpp | 36 ++++++
5 files changed, 36 insertions(+), 224 deletions(-)
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index e0d8500ef..6d4e0b67a 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -420,12 +420,6 @@ void setupNeonPrimitives(EncoderPrimitives &p)
p.cu[BLOCK_16x16].count_nonzero = PFX(count_nonzero_16_neon);
p.cu[BLOCK_32x32].count_nonzero = PFX(count_nonzero_32_neon);
- // cpy2Dto1D_shr
- p.cu[BLOCK_4x4].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_4x4_neon);
- p.cu[BLOCK_8x8].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_8x8_neon);
- p.cu[BLOCK_16x16].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_16x16_neon);
- p.cu[BLOCK_32x32].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32x32_neon);
-
// cpy1Dto2D_shr
p.cu[BLOCK_4x4].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_4x4_neon);
p.cu[BLOCK_8x8].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_8x8_neon);
@@ -606,10 +600,6 @@ void setupSvePrimitives(EncoderPrimitives &p)
LUMA_TU_CAN_USE_SVE(blockfill_s[ALIGNED], blockfill_s);
LUMA_TU_CAN_USE_SVE(blockfill_s[NONALIGNED], blockfill_s);
- // cpy2Dto1D_shr
- p.cu[BLOCK_16x16].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_16x16_sve);
- p.cu[BLOCK_32x32].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32x32_sve);
-
// cpy1Dto2D_shr
p.cu[BLOCK_16x16].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_16x16_sve);
p.cu[BLOCK_32x32].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_32x32_sve);
diff --git a/source/common/aarch64/blockcopy8-common.S b/source/common/aarch64/blockcopy8-common.S
index 6599bb49e..6d92756fc 100644
--- a/source/common/aarch64/blockcopy8-common.S
+++ b/source/common/aarch64/blockcopy8-common.S
@@ -37,12 +37,3 @@
sri v1.8h, v1.8h, #1
neg v0.8h, v0.8h
.endm
-
-.macro cpy2Dto1D_shr_start
- add x2, x2, x2
- dup v0.8h, w3
- cmeq v1.8h, v1.8h, v1.8h
- sshl v1.8h, v1.8h, v0.8h
- sri v1.8h, v1.8h, #1
- neg v0.8h, v0.8h
-.endm
diff --git a/source/common/aarch64/blockcopy8-sve.S b/source/common/aarch64/blockcopy8-sve.S
index 0e737271f..e2154414c 100644
--- a/source/common/aarch64/blockcopy8-sve.S
+++ b/source/common/aarch64/blockcopy8-sve.S
@@ -70,144 +70,6 @@ function PFX(blockfill_s_32x32_sve)
ret
endfunc
-// void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
-
-function PFX(cpy2Dto1D_shr_4x4_sve)
- dup z0.h, w3
- sub w4, w3, #1
- dup z1.h, w4
- ptrue p0.h, vl8
- mov z2.h, #1
- lsl z2.h, p0/m, z2.h, z1.h
- lsl x2, x2, #1
- index z3.d, #0, x2
- index z4.d, #0, #8
-.rept 2
- ld1d {z5.d}, p0/z, [x1, z3.d]
- add x1, x1, x2, lsl #1
- add z5.h, p0/m, z5.h, z2.h
- asr z5.h, p0/m, z5.h, z0.h
- st1d {z5.d}, p0, [x0, z4.d]
- add x0, x0, #16
-.endr
- ret
-endfunc
-
-function PFX(cpy2Dto1D_shr_8x8_sve)
- dup z0.h, w3
- sub w4, w3, #1
- dup z1.h, w4
- ptrue p0.h, vl8
- mov z2.h, #1
- lsl z2.h, p0/m, z2.h, z1.h
-.rept 8
- ld1d {z5.d}, p0/z, [x1]
- add x1, x1, x2, lsl #1
- add z5.h, p0/m, z5.h, z2.h
- asr z5.h, p0/m, z5.h, z0.h
- st1d {z5.d}, p0, [x0]
- add x0, x0, #16
-.endr
- ret
-endfunc
-
-function PFX(cpy2Dto1D_shr_16x16_sve)
- dup z0.h, w3
- sub w4, w3, #1
- dup z1.h, w4
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_cpy2Dto1D_shr_16x16
- ptrue p0.h, vl8
- mov z2.h, #1
- lsl z2.h, p0/m, z2.h, z1.h
-.rept 16
- ld1d {z5.d}, p0/z, [x1]
- ld1d {z6.d}, p0/z, [x1, #1, mul vl]
- add x1, x1, x2, lsl #1
- add z5.h, p0/m, z5.h, z2.h
- add z6.h, p0/m, z6.h, z2.h
- asr z5.h, p0/m, z5.h, z0.h
- asr z6.h, p0/m, z6.h, z0.h
- st1d {z5.d}, p0, [x0]
- st1d {z6.d}, p0, [x0, #1, mul vl]
- add x0, x0, #32
-.endr
- ret
-.vl_gt_16_cpy2Dto1D_shr_16x16:
- ptrue p0.h, vl16
- mov z2.h, #1
- lsl z2.h, p0/m, z2.h, z1.h
-.rept 16
- ld1d {z5.d}, p0/z, [x1]
- add x1, x1, x2, lsl #1
- add z5.h, p0/m, z5.h, z2.h
- asr z5.h, p0/m, z5.h, z0.h
- st1d {z5.d}, p0, [x0]
- add x0, x0, #32
-.endr
- ret
-endfunc
-
-function PFX(cpy2Dto1D_shr_32x32_sve)
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_cpy2Dto1D_shr_32x32
- cpy2Dto1D_shr_start
- mov w12, #16
-.Loop_cpy2Dto1D_shr_32_sve:
- sub w12, w12, #1
-.rept 2
- ld1 {v2.8h-v5.8h}, [x1], x2
- sub v2.8h, v2.8h, v1.8h
- sub v3.8h, v3.8h, v1.8h
- sub v4.8h, v4.8h, v1.8h
- sub v5.8h, v5.8h, v1.8h
- sshl v2.8h, v2.8h, v0.8h
- sshl v3.8h, v3.8h, v0.8h
- sshl v4.8h, v4.8h, v0.8h
- sshl v5.8h, v5.8h, v0.8h
- st1 {v2.8h-v5.8h}, [x0], #64
-.endr
- cbnz w12, .Loop_cpy2Dto1D_shr_32_sve
- ret
-.vl_gt_16_cpy2Dto1D_shr_32x32:
- dup z0.h, w3
- sub w4, w3, #1
- dup z1.h, w4
- cmp x9, #48
- bgt .vl_gt_48_cpy2Dto1D_shr_32x32
- ptrue p0.h, vl16
- mov z2.h, #1
- lsl z2.h, p0/m, z2.h, z1.h
-.rept 32
- ld1d {z5.d}, p0/z, [x1]
- ld1d {z6.d}, p0/z, [x1, #1, mul vl]
- add x1, x1, x2, lsl #1
- add z5.h, p0/m, z5.h, z2.h
- add z6.h, p0/m, z6.h, z2.h
- asr z5.h, p0/m, z5.h, z0.h
- asr z6.h, p0/m, z6.h, z0.h
- st1d {z5.d}, p0, [x0]
- st1d {z6.d}, p0, [x0, #1, mul vl]
- add x0, x0, #64
-.endr
- ret
-.vl_gt_48_cpy2Dto1D_shr_32x32:
- ptrue p0.h, vl32
- mov z2.h, #1
- lsl z2.h, p0/m, z2.h, z1.h
-.rept 32
- ld1d {z5.d}, p0/z, [x1]
- add x1, x1, x2, lsl #1
- add z5.h, p0/m, z5.h, z2.h
- asr z5.h, p0/m, z5.h, z0.h
- st1d {z5.d}, p0, [x0]
- add x0, x0, #64
-.endr
- ret
-endfunc
-
// void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
function PFX(cpy1Dto2D_shr_16x16_sve)
diff --git a/source/common/aarch64/blockcopy8.S b/source/common/aarch64/blockcopy8.S
index fef698cab..5118b3ede 100644
--- a/source/common/aarch64/blockcopy8.S
+++ b/source/common/aarch64/blockcopy8.S
@@ -250,73 +250,6 @@ function PFX(count_nonzero_32_neon)
ret
endfunc
-// void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
-function PFX(cpy2Dto1D_shr_4x4_neon)
- cpy2Dto1D_shr_start
- ld1 {v2.d}[0], [x1], x2
- ld1 {v2.d}[1], [x1], x2
- ld1 {v3.d}[0], [x1], x2
- ld1 {v3.d}[1], [x1], x2
- sub v2.8h, v2.8h, v1.8h
- sub v3.8h, v3.8h, v1.8h
- sshl v2.8h, v2.8h, v0.8h
- sshl v3.8h, v3.8h, v0.8h
- stp q2, q3, [x0]
- ret
-endfunc
-
-function PFX(cpy2Dto1D_shr_8x8_neon)
- cpy2Dto1D_shr_start
-.rept 4
- ld1 {v2.16b}, [x1], x2
- ld1 {v3.16b}, [x1], x2
- sub v2.8h, v2.8h, v1.8h
- sub v3.8h, v3.8h, v1.8h
- sshl v2.8h, v2.8h, v0.8h
- sshl v3.8h, v3.8h, v0.8h
- stp q2, q3, [x0], #32
-.endr
- ret
-endfunc
-
-function PFX(cpy2Dto1D_shr_16x16_neon)
- cpy2Dto1D_shr_start
- mov w12, #4
-.Loop_cpy2Dto1D_shr_16:
- sub w12, w12, #1
-.rept 4
- ld1 {v2.8h-v3.8h}, [x1], x2
- sub v2.8h, v2.8h, v1.8h
- sub v3.8h, v3.8h, v1.8h
- sshl v2.8h, v2.8h, v0.8h
- sshl v3.8h, v3.8h, v0.8h
- st1 {v2.8h-v3.8h}, [x0], #32
-.endr
- cbnz w12, .Loop_cpy2Dto1D_shr_16
- ret
-endfunc
-
-function PFX(cpy2Dto1D_shr_32x32_neon)
- cpy2Dto1D_shr_start
- mov w12, #16
-.Loop_cpy2Dto1D_shr_32:
- sub w12, w12, #1
-.rept 2
- ld1 {v2.8h-v5.8h}, [x1], x2
- sub v2.8h, v2.8h, v1.8h
- sub v3.8h, v3.8h, v1.8h
- sub v4.8h, v4.8h, v1.8h
- sub v5.8h, v5.8h, v1.8h
- sshl v2.8h, v2.8h, v0.8h
- sshl v3.8h, v3.8h, v0.8h
- sshl v4.8h, v4.8h, v0.8h
- sshl v5.8h, v5.8h, v0.8h
- st1 {v2.8h-v5.8h}, [x0], #64
-.endr
- cbnz w12, .Loop_cpy2Dto1D_shr_32
- ret
-endfunc
-
function PFX(cpy1Dto2D_shr_4x4_neon)
cpy1Dto2D_shr_start
ld1 {v2.16b-v3.16b}, [x1]
diff --git a/source/common/aarch64/pixel-prim.cpp b/source/common/aarch64/pixel-prim.cpp
index a8aa6f420..b3d657961 100644
--- a/source/common/aarch64/pixel-prim.cpp
+++ b/source/common/aarch64/pixel-prim.cpp
@@ -1719,6 +1719,41 @@ void cpy2Dto1D_shl_neon(int16_t *dst, const int16_t *src, intptr_t srcStride, in
}
}
+template<int size>
+void cpy2Dto1D_shr_neon(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
+{
+ X265_CHECK(((intptr_t)dst & 15) == 0, "dst alignment error\n");
+ X265_CHECK((((intptr_t)src | (srcStride * sizeof(*src))) & 15) == 0 || size == 4, "src alignment error\n");
+ X265_CHECK(shift > 0, "invalid shift\n");
+
+ for (int h = 0; h < size; h++)
+ {
+ for (int w = 0; w + 16 <= size; w += 16)
+ {
+ int16x8_t s0_lo = vld1q_s16(src + w);
+ int16x8_t s0_hi = vld1q_s16(src + w + 8);
+ int16x8_t d0_lo = vrshlq_s16(s0_lo, vdupq_n_s16(-shift));
+ int16x8_t d0_hi = vrshlq_s16(s0_hi, vdupq_n_s16(-shift));
+ vst1q_s16(dst + w, d0_lo);
+ vst1q_s16(dst + w + 8, d0_hi);
+ }
+ if (size == 8)
+ {
+ int16x8_t s0 = vld1q_s16(src);
+ int16x8_t d0 = vrshlq_s16(s0, vdupq_n_s16(-shift));
+ vst1q_s16(dst, d0);
+ }
+ if (size == 4)
+ {
+ int16x4_t s0 = vld1_s16(src);
+ int16x4_t d0 = vrshl_s16(s0, vdup_n_s16(-shift));
+ vst1_s16(dst, d0);
+ }
+
+ src += srcStride;
+ dst += size;
+ }
+}
template<int w, int h>
int satd4_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2)
@@ -1884,6 +1919,7 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
p.cu[BLOCK_ ## W ## x ## H].copy_ss = blockcopy_ss_neon<W, H>; \
p.cu[BLOCK_ ## W ## x ## H].copy_sp = blockcopy_sp_neon<W, H>; \
p.cu[BLOCK_ ## W ## x ## H].cpy2Dto1D_shl = cpy2Dto1D_shl_neon<W>; \
+ p.cu[BLOCK_ ## W ## x ## H].cpy2Dto1D_shr = cpy2Dto1D_shr_neon<W>; \
p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl[NONALIGNED] = cpy1Dto2D_shl_neon<W>; \
p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl[ALIGNED] = cpy1Dto2D_shl_neon<W>; \
p.cu[BLOCK_ ## W ## x ## H].psy_cost_pp = psyCost_pp_neon<BLOCK_ ## W ## x ## H>; \
--
2.39.5 (Apple Git-154)