[x265] [PATCH 5/8] AArch64: Optimize cpy1Dto2D_shl Neon intrinsics implementation
Li Zhang
li.zhang2 at arm.com
Mon May 19 16:43:15 UTC 2025
Unroll the cpy1Dto2D_shl_neon intrinsics implementation to enable use of
LDP and STP instructions.
Delete the Neon and SVE assembly implementations of these kernels, as
they are no faster than the intrinsics implementation and only serve to
increase binary size.
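
The unrolling matters because two adjacent 128-bit loads (or stores)
can be merged by the compiler into a single LDP (or STP) instruction.
Below is a minimal standalone sketch of the pattern, not part of the
patch; the helper name shl_row16_sketch is illustrative only:

#include <arm_neon.h>
#include <stdint.h>

// Shift one 16-lane row left by 'shift'. The back-to-back vld1q/vst1q
// pairs at adjacent addresses are what allow LDP/STP formation.
static void shl_row16_sketch(int16_t *dst, const int16_t *src, int shift)
{
    int16x8_t s_lo = vld1q_s16(src);      // lanes 0-7
    int16x8_t s_hi = vld1q_s16(src + 8);  // lanes 8-15, adjacent load
    int16x8_t d_lo = vshlq_s16(s_lo, vdupq_n_s16(shift));
    int16x8_t d_hi = vshlq_s16(s_hi, vdupq_n_s16(shift));
    vst1q_s16(dst, d_lo);                 // adjacent store
    vst1q_s16(dst + 8, d_hi);             // second half: STP candidate
}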
---
source/common/aarch64/asm-primitives.cpp | 22 -----
source/common/aarch64/blockcopy8-sve.S | 102 -----------------------
source/common/aarch64/blockcopy8.S | 86 -------------------
source/common/aarch64/pixel-prim.cpp | 25 ++++--
4 files changed, 19 insertions(+), 216 deletions(-)
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index 1715ae115..642468124 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -433,19 +433,6 @@ void setupNeonPrimitives(EncoderPrimitives &p)
p.cu[BLOCK_16x16].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_16x16_neon);
p.cu[BLOCK_32x32].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32x32_neon);
- // cpy1Dto2D_shl
- p.cu[BLOCK_4x4].cpy1Dto2D_shl[ALIGNED] = PFX(cpy1Dto2D_shl_4x4_neon);
- p.cu[BLOCK_8x8].cpy1Dto2D_shl[ALIGNED] = PFX(cpy1Dto2D_shl_8x8_neon);
- p.cu[BLOCK_16x16].cpy1Dto2D_shl[ALIGNED] = PFX(cpy1Dto2D_shl_16x16_neon);
- p.cu[BLOCK_32x32].cpy1Dto2D_shl[ALIGNED] = PFX(cpy1Dto2D_shl_32x32_neon);
- p.cu[BLOCK_64x64].cpy1Dto2D_shl[ALIGNED] = PFX(cpy1Dto2D_shl_64x64_neon);
-
- p.cu[BLOCK_4x4].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_4x4_neon);
- p.cu[BLOCK_8x8].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_8x8_neon);
- p.cu[BLOCK_16x16].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_16x16_neon);
- p.cu[BLOCK_32x32].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_32x32_neon);
- p.cu[BLOCK_64x64].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_64x64_neon);
-
// cpy1Dto2D_shr
p.cu[BLOCK_4x4].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_4x4_neon);
p.cu[BLOCK_8x8].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_8x8_neon);
@@ -635,15 +622,6 @@ void setupSvePrimitives(EncoderPrimitives &p)
p.cu[BLOCK_16x16].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_16x16_sve);
p.cu[BLOCK_32x32].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32x32_sve);
- // cpy1Dto2D_shl
- p.cu[BLOCK_16x16].cpy1Dto2D_shl[ALIGNED] = PFX(cpy1Dto2D_shl_16x16_sve);
- p.cu[BLOCK_32x32].cpy1Dto2D_shl[ALIGNED] = PFX(cpy1Dto2D_shl_32x32_sve);
- p.cu[BLOCK_64x64].cpy1Dto2D_shl[ALIGNED] = PFX(cpy1Dto2D_shl_64x64_sve);
-
- p.cu[BLOCK_16x16].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_16x16_sve);
- p.cu[BLOCK_32x32].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_32x32_sve);
- p.cu[BLOCK_64x64].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_64x64_sve);
-
// cpy1Dto2D_shr
p.cu[BLOCK_16x16].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_16x16_sve);
p.cu[BLOCK_32x32].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_32x32_sve);
diff --git a/source/common/aarch64/blockcopy8-sve.S b/source/common/aarch64/blockcopy8-sve.S
index d724e8427..98dfc7584 100644
--- a/source/common/aarch64/blockcopy8-sve.S
+++ b/source/common/aarch64/blockcopy8-sve.S
@@ -335,108 +335,6 @@ function PFX(cpy2Dto1D_shr_32x32_sve)
ret
endfunc
-// void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
-
-function PFX(cpy1Dto2D_shl_16x16_sve)
- dup z0.h, w3
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_cpy1Dto2D_shl_16x16
- ptrue p0.h, vl8
-.rept 16
- ld1h {z1.h}, p0/z, [x1]
- ld1h {z2.h}, p0/z, [x1, #1, mul vl]
- lsl z1.h, p0/m, z1.h, z0.h
- lsl z2.h, p0/m, z2.h, z0.h
- st1h {z1.h}, p0, [x0]
- st1h {z2.h}, p0, [x0, #1, mul vl]
- add x1, x1, #32
- add x0, x0, x2, lsl #1
-.endr
- ret
-.vl_gt_16_cpy1Dto2D_shl_16x16:
- ptrue p0.h, vl16
-.rept 16
- ld1h {z1.h}, p0/z, [x1]
- lsl z1.h, p0/m, z1.h, z0.h
- st1h {z1.h}, p0, [x0]
- add x1, x1, #32
- add x0, x0, x2, lsl #1
-.endr
- ret
-endfunc
-
-function PFX(cpy1Dto2D_shl_32x32_sve)
- dup z0.h, w3
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_cpy1Dto2D_shl_32x32
- ptrue p0.h, vl8
-.rept 32
- ld1h {z1.h}, p0/z, [x1]
- ld1h {z2.h}, p0/z, [x1, #1, mul vl]
- ld1h {z3.h}, p0/z, [x1, #2, mul vl]
- ld1h {z4.h}, p0/z, [x1, #3, mul vl]
- lsl z1.h, p0/m, z1.h, z0.h
- lsl z2.h, p0/m, z2.h, z0.h
- lsl z3.h, p0/m, z3.h, z0.h
- lsl z4.h, p0/m, z4.h, z0.h
- st1h {z1.h}, p0, [x0]
- st1h {z2.h}, p0, [x0, #1, mul vl]
- st1h {z3.h}, p0, [x0, #2, mul vl]
- st1h {z4.h}, p0, [x0, #3, mul vl]
- add x1, x1, #64
- add x0, x0, x2, lsl #1
-.endr
- ret
-.vl_gt_16_cpy1Dto2D_shl_32x32:
- cmp x9, #48
- bgt .vl_gt_48_cpy1Dto2D_shl_32x32
- ptrue p0.h, vl16
-.rept 32
- ld1h {z1.h}, p0/z, [x1]
- ld1h {z2.h}, p0/z, [x1, #1, mul vl]
- lsl z1.h, p0/m, z1.h, z0.h
- lsl z2.h, p0/m, z2.h, z0.h
- st1h {z1.h}, p0, [x0]
- st1h {z2.h}, p0, [x0, #1, mul vl]
- add x1, x1, #64
- add x0, x0, x2, lsl #1
-.endr
- ret
-.vl_gt_48_cpy1Dto2D_shl_32x32:
- ptrue p0.h, vl32
-.rept 32
- ld1h {z1.h}, p0/z, [x1]
- lsl z1.h, p0/m, z1.h, z0.h
- st1h {z1.h}, p0, [x0]
- add x1, x1, #64
- add x0, x0, x2, lsl #1
-.endr
- ret
-endfunc
-
-function PFX(cpy1Dto2D_shl_64x64_sve)
- dup z0.h, w3
- mov x8, #64
- mov w12, #64
-.L_init_cpy1Dto2D_shl_64x64:
- sub w12, w12, 1
- mov x9, #0
- whilelt p0.h, x9, x8
-.L_cpy1Dto2D_shl_64x64:
- ld1h {z1.h}, p0/z, [x1, x9, lsl #1]
- lsl z1.h, p0/m, z1.h, z0.h
- st1h {z1.h}, p0, [x0, x9, lsl #1]
- inch x9
- whilelt p0.h, x9, x8
- b.first .L_cpy1Dto2D_shl_64x64
- add x1, x1, #128
- add x0, x0, x2, lsl #1
- cbnz w12, .L_init_cpy1Dto2D_shl_64x64
- ret
-endfunc
-
// void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
function PFX(cpy1Dto2D_shr_16x16_sve)
diff --git a/source/common/aarch64/blockcopy8.S b/source/common/aarch64/blockcopy8.S
index 9db578d1e..f2ca35215 100644
--- a/source/common/aarch64/blockcopy8.S
+++ b/source/common/aarch64/blockcopy8.S
@@ -403,92 +403,6 @@ function PFX(cpy2Dto1D_shr_32x32_neon)
ret
endfunc
-// void cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift)
-.macro cpy1Dto2D_shl_start
- add x2, x2, x2
- dup v0.8h, w3
-.endm
-
-function PFX(cpy1Dto2D_shl_4x4_neon)
- cpy1Dto2D_shl_start
- ld1 {v2.16b-v3.16b}, [x1]
- sshl v2.8h, v2.8h, v0.8h
- sshl v3.8h, v3.8h, v0.8h
- st1 {v2.d}[0], [x0], x2
- st1 {v2.d}[1], [x0], x2
- st1 {v3.d}[0], [x0], x2
- st1 {v3.d}[1], [x0], x2
- ret
-endfunc
-
-function PFX(cpy1Dto2D_shl_8x8_neon)
- cpy1Dto2D_shl_start
-.rept 4
- ld1 {v2.16b-v3.16b}, [x1], #32
- sshl v2.8h, v2.8h, v0.8h
- sshl v3.8h, v3.8h, v0.8h
- st1 {v2.16b}, [x0], x2
- st1 {v3.16b}, [x0], x2
-.endr
- ret
-endfunc
-
-function PFX(cpy1Dto2D_shl_16x16_neon)
- cpy1Dto2D_shl_start
- mov w12, #4
-.Loop_cpy1Dto2D_shl_16:
- sub w12, w12, #1
-.rept 4
- ld1 {v2.16b-v3.16b}, [x1], #32
- sshl v2.8h, v2.8h, v0.8h
- sshl v3.8h, v3.8h, v0.8h
- st1 {v2.16b-v3.16b}, [x0], x2
-.endr
- cbnz w12, .Loop_cpy1Dto2D_shl_16
- ret
-endfunc
-
-function PFX(cpy1Dto2D_shl_32x32_neon)
- cpy1Dto2D_shl_start
- mov w12, #16
-.Loop_cpy1Dto2D_shl_32:
- sub w12, w12, #1
-.rept 2
- ld1 {v2.16b-v5.16b}, [x1], #64
- sshl v2.8h, v2.8h, v0.8h
- sshl v3.8h, v3.8h, v0.8h
- sshl v4.8h, v4.8h, v0.8h
- sshl v5.8h, v5.8h, v0.8h
- st1 {v2.16b-v5.16b}, [x0], x2
-.endr
- cbnz w12, .Loop_cpy1Dto2D_shl_32
- ret
-endfunc
-
-function PFX(cpy1Dto2D_shl_64x64_neon)
- cpy1Dto2D_shl_start
- mov w12, #32
- sub x2, x2, #64
-.Loop_cpy1Dto2D_shl_64:
- sub w12, w12, #1
-.rept 2
- ld1 {v2.16b-v5.16b}, [x1], #64
- ld1 {v16.16b-v19.16b}, [x1], #64
- sshl v2.8h, v2.8h, v0.8h
- sshl v3.8h, v3.8h, v0.8h
- sshl v4.8h, v4.8h, v0.8h
- sshl v5.8h, v5.8h, v0.8h
- sshl v16.8h, v16.8h, v0.8h
- sshl v17.8h, v17.8h, v0.8h
- sshl v18.8h, v18.8h, v0.8h
- sshl v19.8h, v19.8h, v0.8h
- st1 {v2.16b-v5.16b}, [x0], #64
- st1 {v16.16b-v19.16b}, [x0], x2
-.endr
- cbnz w12, .Loop_cpy1Dto2D_shl_64
- ret
-endfunc
-
function PFX(cpy1Dto2D_shr_4x4_neon)
cpy1Dto2D_shr_start
ld1 {v2.16b-v3.16b}, [x1]
diff --git a/source/common/aarch64/pixel-prim.cpp b/source/common/aarch64/pixel-prim.cpp
index 055b3e35c..aa91ff407 100644
--- a/source/common/aarch64/pixel-prim.cpp
+++ b/source/common/aarch64/pixel-prim.cpp
@@ -1409,17 +1409,30 @@ void cpy1Dto2D_shl_neon(int16_t *dst, const int16_t *src, intptr_t dstStride, in
X265_CHECK(((intptr_t)src & 15) == 0, "src alignment error\n");
X265_CHECK(shift >= 0, "invalid shift\n");
- for (int i = 0; i < size; i++)
+ for (int h = 0; h < size; h++)
{
- int j = 0;
- for (; (j + 8) <= size; j += 8)
+ for (int w = 0; w + 16 <= size; w += 16)
{
- vst1q_s16(dst + j, vshlq_s16(vld1q_s16(src + j), vdupq_n_s16(shift)));
+ int16x8_t s0_lo = vld1q_s16(src + w);
+ int16x8_t s0_hi = vld1q_s16(src + w + 8);
+ int16x8_t d0_lo = vshlq_s16(s0_lo, vdupq_n_s16(shift));
+ int16x8_t d0_hi = vshlq_s16(s0_hi, vdupq_n_s16(shift));
+ vst1q_s16(dst + w, d0_lo);
+ vst1q_s16(dst + w + 8, d0_hi);
}
- for (; j < size; j++)
+ if (size == 8)
{
- dst[j] = src[j] << shift;
+ int16x8_t s0 = vld1q_s16(src);
+ int16x8_t d0 = vshlq_s16(s0, vdupq_n_s16(shift));
+ vst1q_s16(dst, d0);
}
+ if (size == 4)
+ {
+ int16x4_t s0 = vld1_s16(src);
+ int16x4_t d0 = vshl_s16(s0, vdup_n_s16(shift));
+ vst1_s16(dst, d0);
+ }
+
src += size;
dst += dstStride;
}
--
2.39.5 (Apple Git-154)