[x265] [PATCH 6/8] AArch64: Optimize cpy2Dto1D_shl Neon intrinsics implementation
Li Zhang
li.zhang2 at arm.com
Mon May 19 16:43:40 UTC 2025
The original cpy2Dto1D_shl_neon intrinsics implementation is scalar, so
change it to use Neon SIMD instructions.
Delete the Neon and SVE assembly implementations of these kernels, as
they are no faster than the new intrinsics implementation and only serve to increase binary size.
---
source/common/aarch64/asm-primitives.cpp | 12 ---
source/common/aarch64/blockcopy8-sve.S | 127 -----------------------
source/common/aarch64/blockcopy8.S | 86 ---------------
source/common/aarch64/pixel-prim.cpp | 24 ++++-
4 files changed, 21 insertions(+), 228 deletions(-)
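For context, the vectorized inner loop added to pixel-prim.cpp follows the pattern sketched below. This is a minimal, self-contained illustration only, assuming arm_neon.h and a power-of-two block size (4, 8, 16, 32, or 64); the function name and the trailing dst/src pointer advances are written out here for readability and are not copied verbatim from the patch.

```c
// Standalone sketch of the copy-2D-to-1D-with-left-shift pattern.
#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

static void cpy2Dto1D_shl_sketch(int16_t *dst, const int16_t *src,
                                 ptrdiff_t srcStride, int shift, int size)
{
    const int16x8_t vshift = vdupq_n_s16((int16_t)shift);

    for (int h = 0; h < size; h++)
    {
        int w = 0;
        // Main path: 16 int16 elements (two Neon vectors) per iteration.
        for (; w + 16 <= size; w += 16)
        {
            int16x8_t lo = vld1q_s16(src + w);
            int16x8_t hi = vld1q_s16(src + w + 8);
            vst1q_s16(dst + w, vshlq_s16(lo, vshift));
            vst1q_s16(dst + w + 8, vshlq_s16(hi, vshift));
        }
        if (size == 8)  // one full vector per row
        {
            vst1q_s16(dst + w, vshlq_s16(vld1q_s16(src + w), vshift));
        }
        if (size == 4)  // half a vector per row
        {
            vst1_s16(dst + w,
                     vshl_s16(vld1_s16(src + w), vdup_n_s16((int16_t)shift)));
        }
        src += srcStride; // step down the 2D source
        dst += size;      // destination is a contiguous 1D buffer
    }
}
```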
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index 642468124..e0d8500ef 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -420,13 +420,6 @@ void setupNeonPrimitives(EncoderPrimitives &p)
p.cu[BLOCK_16x16].count_nonzero = PFX(count_nonzero_16_neon);
p.cu[BLOCK_32x32].count_nonzero = PFX(count_nonzero_32_neon);
- // cpy2Dto1D_shl
- p.cu[BLOCK_4x4].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_4x4_neon);
- p.cu[BLOCK_8x8].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_8x8_neon);
- p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16x16_neon);
- p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32x32_neon);
- p.cu[BLOCK_64x64].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_64x64_neon);
-
// cpy2Dto1D_shr
p.cu[BLOCK_4x4].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_4x4_neon);
p.cu[BLOCK_8x8].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_8x8_neon);
@@ -613,11 +606,6 @@ void setupSvePrimitives(EncoderPrimitives &p)
LUMA_TU_CAN_USE_SVE(blockfill_s[ALIGNED], blockfill_s);
LUMA_TU_CAN_USE_SVE(blockfill_s[NONALIGNED], blockfill_s);
- // cpy2Dto1D_shl
- p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16x16_sve);
- p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32x32_sve);
- p.cu[BLOCK_64x64].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_64x64_sve);
-
// cpy2Dto1D_shr
p.cu[BLOCK_16x16].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_16x16_sve);
p.cu[BLOCK_32x32].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32x32_sve);
diff --git a/source/common/aarch64/blockcopy8-sve.S b/source/common/aarch64/blockcopy8-sve.S
index 98dfc7584..0e737271f 100644
--- a/source/common/aarch64/blockcopy8-sve.S
+++ b/source/common/aarch64/blockcopy8-sve.S
@@ -70,133 +70,6 @@ function PFX(blockfill_s_32x32_sve)
ret
endfunc
-// void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
-.macro cpy2Dto1D_shl_start_sve
- add x2, x2, x2
- mov z0.h, w3
-.endm
-
-function PFX(cpy2Dto1D_shl_16x16_sve)
- dup z0.h, w3
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_cpy2Dto1D_shl_16x16
- cpy2Dto1D_shl_start_sve
- mov w12, #4
-.Loop_cpy2Dto1D_shl_16_sve:
- sub w12, w12, #1
-.rept 4
- ld1 {v2.16b-v3.16b}, [x1], x2
- sshl v2.8h, v2.8h, v0.8h
- sshl v3.8h, v3.8h, v0.8h
- st1 {v2.16b-v3.16b}, [x0], #32
-.endr
- cbnz w12, .Loop_cpy2Dto1D_shl_16_sve
- ret
-.vl_gt_16_cpy2Dto1D_shl_16x16:
- ptrue p0.h, vl16
-.rept 16
- ld1h {z1.h}, p0/z, [x1]
- lsl z1.h, p0/m, z1.h, z0.h
- st1h {z1.h}, p0, [x0]
- add x1, x1, x2, lsl #1
- add x0, x0, #32
-.endr
- ret
-endfunc
-
-function PFX(cpy2Dto1D_shl_32x32_sve)
- dup z0.h, w3
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_cpy2Dto1D_shl_32x32
- cpy2Dto1D_shl_start_sve
- mov w12, #16
-.Loop_cpy2Dto1D_shl_32_sve:
- sub w12, w12, #1
-.rept 2
- ld1 {v2.16b-v5.16b}, [x1], x2
- sshl v2.8h, v2.8h, v0.8h
- sshl v3.8h, v3.8h, v0.8h
- sshl v4.8h, v4.8h, v0.8h
- sshl v5.8h, v5.8h, v0.8h
- st1 {v2.16b-v5.16b}, [x0], #64
-.endr
- cbnz w12, .Loop_cpy2Dto1D_shl_32_sve
- ret
-.vl_gt_16_cpy2Dto1D_shl_32x32:
- cmp x9, #48
- bgt .vl_gt_48_cpy2Dto1D_shl_32x32
- ptrue p0.h, vl16
-.rept 32
- ld1h {z1.h}, p0/z, [x1]
- ld1h {z2.h}, p0/z, [x1, #1, mul vl]
- lsl z1.h, p0/m, z1.h, z0.h
- lsl z2.h, p0/m, z2.h, z0.h
- st1h {z1.h}, p0, [x0]
- st1h {z2.h}, p0, [x0, #1, mul vl]
- add x1, x1, x2, lsl #1
- add x0, x0, #64
-.endr
- ret
-.vl_gt_48_cpy2Dto1D_shl_32x32:
- ptrue p0.h, vl32
-.rept 32
- ld1h {z1.h}, p0/z, [x1]
- lsl z1.h, p0/m, z1.h, z0.h
- st1h {z1.h}, p0, [x0]
- add x1, x1, x2, lsl #1
- add x0, x0, #64
-.endr
- ret
-endfunc
-
-function PFX(cpy2Dto1D_shl_64x64_sve)
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_cpy2Dto1D_shl_64x64
- cpy2Dto1D_shl_start_sve
- mov w12, #32
- sub x2, x2, #64
-.Loop_cpy2Dto1D_shl_64_sve:
- sub w12, w12, #1
-.rept 2
- ld1 {v2.16b-v5.16b}, [x1], #64
- ld1 {v16.16b-v19.16b}, [x1], x2
- sshl v2.8h, v2.8h, v0.8h
- sshl v3.8h, v3.8h, v0.8h
- sshl v4.8h, v4.8h, v0.8h
- sshl v5.8h, v5.8h, v0.8h
- sshl v16.8h, v16.8h, v0.8h
- sshl v17.8h, v17.8h, v0.8h
- sshl v18.8h, v18.8h, v0.8h
- sshl v19.8h, v19.8h, v0.8h
- st1 {v2.16b-v5.16b}, [x0], #64
- st1 {v16.16b-v19.16b}, [x0], #64
-.endr
- cbnz w12, .Loop_cpy2Dto1D_shl_64_sve
- ret
-.vl_gt_16_cpy2Dto1D_shl_64x64:
- dup z0.h, w3
- mov x8, #64
- mov w12, #64
-.L_init_cpy2Dto1D_shl_64x64:
- sub w12, w12, 1
- mov x9, #0
- whilelt p0.h, x9, x8
-.L_cpy2Dto1D_shl_64x64:
- ld1h {z1.h}, p0/z, [x1, x9, lsl #1]
- lsl z1.h, p0/m, z1.h, z0.h
- st1h {z1.h}, p0, [x0, x9, lsl #1]
- inch x9
- whilelt p0.h, x9, x8
- b.first .L_cpy2Dto1D_shl_64x64
- add x1, x1, x2, lsl #1
- add x0, x0, #128
- cbnz w12, .L_init_cpy2Dto1D_shl_64x64
- ret
-endfunc
-
// void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
function PFX(cpy2Dto1D_shr_4x4_sve)
diff --git a/source/common/aarch64/blockcopy8.S b/source/common/aarch64/blockcopy8.S
index f2ca35215..fef698cab 100644
--- a/source/common/aarch64/blockcopy8.S
+++ b/source/common/aarch64/blockcopy8.S
@@ -250,92 +250,6 @@ function PFX(count_nonzero_32_neon)
ret
endfunc
-// void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
-.macro cpy2Dto1D_shl_start
- add x2, x2, x2
- dup v0.8h, w3
-.endm
-
-function PFX(cpy2Dto1D_shl_4x4_neon)
- cpy2Dto1D_shl_start
- ld1 {v2.d}[0], [x1], x2
- ld1 {v2.d}[1], [x1], x2
- ld1 {v3.d}[0], [x1], x2
- ld1 {v3.d}[1], [x1], x2
- sshl v2.8h, v2.8h, v0.8h
- sshl v3.8h, v3.8h, v0.8h
- st1 {v2.16b-v3.16b}, [x0]
- ret
-endfunc
-
-function PFX(cpy2Dto1D_shl_8x8_neon)
- cpy2Dto1D_shl_start
-.rept 4
- ld1 {v2.16b}, [x1], x2
- ld1 {v3.16b}, [x1], x2
- sshl v2.8h, v2.8h, v0.8h
- sshl v3.8h, v3.8h, v0.8h
- st1 {v2.16b-v3.16b}, [x0], #32
-.endr
- ret
-endfunc
-
-function PFX(cpy2Dto1D_shl_16x16_neon)
- cpy2Dto1D_shl_start
- mov w12, #4
-.Loop_cpy2Dto1D_shl_16:
- sub w12, w12, #1
-.rept 4
- ld1 {v2.16b-v3.16b}, [x1], x2
- sshl v2.8h, v2.8h, v0.8h
- sshl v3.8h, v3.8h, v0.8h
- st1 {v2.16b-v3.16b}, [x0], #32
-.endr
- cbnz w12, .Loop_cpy2Dto1D_shl_16
- ret
-endfunc
-
-function PFX(cpy2Dto1D_shl_32x32_neon)
- cpy2Dto1D_shl_start
- mov w12, #16
-.Loop_cpy2Dto1D_shl_32:
- sub w12, w12, #1
-.rept 2
- ld1 {v2.16b-v5.16b}, [x1], x2
- sshl v2.8h, v2.8h, v0.8h
- sshl v3.8h, v3.8h, v0.8h
- sshl v4.8h, v4.8h, v0.8h
- sshl v5.8h, v5.8h, v0.8h
- st1 {v2.16b-v5.16b}, [x0], #64
-.endr
- cbnz w12, .Loop_cpy2Dto1D_shl_32
- ret
-endfunc
-
-function PFX(cpy2Dto1D_shl_64x64_neon)
- cpy2Dto1D_shl_start
- mov w12, #32
- sub x2, x2, #64
-.Loop_cpy2Dto1D_shl_64:
- sub w12, w12, #1
-.rept 2
- ld1 {v2.16b-v5.16b}, [x1], #64
- ld1 {v16.16b-v19.16b}, [x1], x2
- sshl v2.8h, v2.8h, v0.8h
- sshl v3.8h, v3.8h, v0.8h
- sshl v4.8h, v4.8h, v0.8h
- sshl v5.8h, v5.8h, v0.8h
- sshl v16.8h, v16.8h, v0.8h
- sshl v17.8h, v17.8h, v0.8h
- sshl v18.8h, v18.8h, v0.8h
- sshl v19.8h, v19.8h, v0.8h
- st1 {v2.16b-v5.16b}, [x0], #64
- st1 {v16.16b-v19.16b}, [x0], #64
-.endr
- cbnz w12, .Loop_cpy2Dto1D_shl_64
- ret
-endfunc
-
// void cpy2Dto1D_shr(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift)
function PFX(cpy2Dto1D_shr_4x4_neon)
cpy2Dto1D_shr_start
diff --git a/source/common/aarch64/pixel-prim.cpp b/source/common/aarch64/pixel-prim.cpp
index aa91ff407..a8aa6f420 100644
--- a/source/common/aarch64/pixel-prim.cpp
+++ b/source/common/aarch64/pixel-prim.cpp
@@ -1689,11 +1689,29 @@ void cpy2Dto1D_shl_neon(int16_t *dst, const int16_t *src, intptr_t srcStride, in
X265_CHECK((((intptr_t)src | (srcStride * sizeof(*src))) & 15) == 0 || size == 4, "src alignment error\n");
X265_CHECK(shift >= 0, "invalid shift\n");
- for (int i = 0; i < size; i++)
+ for (int h = 0; h < size; h++)
{
- for (int j = 0; j < size; j++)
+ int w = 0;
+ for (; w + 16 <= size; w += 16)
+ {
+ int16x8_t a0_lo = vld1q_s16(src + w);
+ int16x8_t a0_hi = vld1q_s16(src + w + 8);
+ int16x8_t d0_lo = vshlq_s16(a0_lo, vdupq_n_s16(shift));
+ int16x8_t d0_hi = vshlq_s16(a0_hi, vdupq_n_s16(shift));
+ vst1q_s16(dst + w, d0_lo);
+ vst1q_s16(dst + w + 8, d0_hi);
+ }
+ if (size == 8)
+ {
+ int16x8_t a0 = vld1q_s16(src + w);
+ int16x8_t d0 = vshlq_s16(a0, vdupq_n_s16(shift));
+ vst1q_s16(dst + w, d0);
+ }
+ if (size == 4)
{
- dst[j] = src[j] << shift;
+ int16x4_t a0 = vld1_s16(src + w);
+ int16x4_t d0 = vshl_s16(a0, vdup_n_s16(shift));
+ vst1_s16(dst + w, d0);
}
src += srcStride;
--
2.39.5 (Apple Git-154)