[x265] [PATCH 3/8] AArch64: Implement blockcopy_ss primitives using Neon intrinsics
Li Zhang
li.zhang2 at arm.com
Mon May 19 16:42:36 UTC 2025
Delete the Neon and SVE assembly implementations of these kernels as
they are no faster, and only serve to increase binary size.
Co-authored-by: Jonathan Wright <jonathan.wright at arm.com>
---
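Note for reviewers: the behaviour these kernels provide is a plain strided copy of
16-bit coefficients, matching the x265_blockcopy_ss signature in the comment removed
from blockcopy8.S below. A minimal scalar sketch of that behaviour follows; the name
blockcopy_ss_ref is only for illustration and this is not the exact C fallback in
pixel.cpp.

    #include <cstdint>   // int16_t, intptr_t

    template<int bx, int by>
    void blockcopy_ss_ref(int16_t *a, intptr_t stridea,
                          const int16_t *b, intptr_t strideb)
    {
        // Copy a bx-wide, by-tall block of 16-bit coefficients row by row,
        // advancing each buffer by its own stride (in elements).
        for (int y = 0; y < by; y++)
        {
            for (int x = 0; x < bx; x++)
                a[x] = b[x];
            a += stridea;
            b += strideb;
        }
    }

The Neon template in pixel-prim.cpp below vectorizes this loop in 16-, 8- and
4-element steps, covering all of the luma and chroma block widths used by the
copy_ss primitives.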
source/common/aarch64/asm-primitives.cpp | 28 ---
source/common/aarch64/blockcopy8-sve.S | 207 -----------------------
source/common/aarch64/blockcopy8.S | 120 -------------
source/common/aarch64/pixel-prim.cpp | 40 ++++-
4 files changed, 39 insertions(+), 356 deletions(-)
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index 463da8319..981c6352a 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -404,13 +404,6 @@ void setupNeonPrimitives(EncoderPrimitives &p)
ALL_CHROMA_444_PU(p2s[NONALIGNED], filterPixelToShort, neon);
ALL_LUMA_PU(convert_p2s[NONALIGNED], filterPixelToShort, neon);
- // Blockcopy_ss
- p.cu[BLOCK_4x4].copy_ss = PFX(blockcopy_ss_4x4_neon);
- p.cu[BLOCK_8x8].copy_ss = PFX(blockcopy_ss_8x8_neon);
- p.cu[BLOCK_16x16].copy_ss = PFX(blockcopy_ss_16x16_neon);
- p.cu[BLOCK_32x32].copy_ss = PFX(blockcopy_ss_32x32_neon);
- p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_neon);
-
// Blockcopy_sp
p.cu[BLOCK_4x4].copy_sp = PFX(blockcopy_sp_4x4_neon);
p.cu[BLOCK_8x8].copy_sp = PFX(blockcopy_sp_8x8_neon);
@@ -418,16 +411,6 @@ void setupNeonPrimitives(EncoderPrimitives &p)
p.cu[BLOCK_32x32].copy_sp = PFX(blockcopy_sp_32x32_neon);
p.cu[BLOCK_64x64].copy_sp = PFX(blockcopy_sp_64x64_neon);
- // chroma blockcopy_ss
- p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].copy_ss = PFX(blockcopy_ss_4x4_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].copy_ss = PFX(blockcopy_ss_8x8_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_ss = PFX(blockcopy_ss_16x16_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_ss = PFX(blockcopy_ss_32x32_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].copy_ss = PFX(blockcopy_ss_4x8_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].copy_ss = PFX(blockcopy_ss_8x16_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_ss = PFX(blockcopy_ss_16x32_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_ss = PFX(blockcopy_ss_32x64_neon);
-
// chroma blockcopy_sp
p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].copy_sp = PFX(blockcopy_sp_4x4_neon);
p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].copy_sp = PFX(blockcopy_sp_8x8_neon);
@@ -656,23 +639,12 @@ void setupSvePrimitives(EncoderPrimitives &p)
CHROMA_444_PU_SVE_FILTER_PIXEL_TO_SHORT(p2s[NONALIGNED]);
LUMA_PU_SVE_FILTER_PIXEL_TO_SHORT(convert_p2s[NONALIGNED]);
- // Blockcopy_ss
- p.cu[BLOCK_16x16].copy_ss = PFX(blockcopy_ss_16x16_sve);
- p.cu[BLOCK_32x32].copy_ss = PFX(blockcopy_ss_32x32_sve);
- p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_sve);
-
// Blockcopy_sp
p.cu[BLOCK_4x4].copy_sp = PFX(blockcopy_sp_4x4_sve);
p.cu[BLOCK_8x8].copy_sp = PFX(blockcopy_sp_8x8_sve);
p.cu[BLOCK_16x16].copy_sp = PFX(blockcopy_sp_16x16_sve);
p.cu[BLOCK_32x32].copy_sp = PFX(blockcopy_sp_32x32_sve);
- // chroma blockcopy_ss
- p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_ss = PFX(blockcopy_ss_16x16_sve);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_ss = PFX(blockcopy_ss_32x32_sve);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_ss = PFX(blockcopy_ss_16x32_sve);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_ss = PFX(blockcopy_ss_32x64_sve);
-
// chroma blockcopy_sp
p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].copy_sp = PFX(blockcopy_sp_4x4_sve);
p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].copy_sp = PFX(blockcopy_sp_8x8_sve);
diff --git a/source/common/aarch64/blockcopy8-sve.S b/source/common/aarch64/blockcopy8-sve.S
index 9f9406e6e..976d80dd1 100644
--- a/source/common/aarch64/blockcopy8-sve.S
+++ b/source/common/aarch64/blockcopy8-sve.S
@@ -166,213 +166,6 @@ function PFX(blockcopy_sp_32x32_sve)
ret
endfunc
-function PFX(blockcopy_ss_16x16_sve)
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_blockcopy_ss_16_16
- lsl x1, x1, #1
- lsl x3, x3, #1
-.rept 8
- ld1 {v0.8h-v1.8h}, [x2], x3
- ld1 {v2.8h-v3.8h}, [x2], x3
- st1 {v0.8h-v1.8h}, [x0], x1
- st1 {v2.8h-v3.8h}, [x0], x1
-.endr
- ret
-.vl_gt_16_blockcopy_ss_16_16:
- ptrue p0.h, vl16
-.rept 16
- ld1h {z0.h}, p0/z, [x2]
- st1h {z0.h}, p0, [x0]
- add x2, x2, x3, lsl #1
- add x0, x0, x1, lsl #1
-.endr
- ret
-endfunc
-
-function PFX(blockcopy_ss_32x32_sve)
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_blockcopy_ss_32_32
- lsl x1, x1, #1
- lsl x3, x3, #1
- mov w12, #4
-.Loop_css32_sve:
- sub w12, w12, #1
-.rept 8
- ld1 {v0.8h-v3.8h}, [x2], x3
- st1 {v0.8h-v3.8h}, [x0], x1
-.endr
- cbnz w12, .Loop_css32_sve
- ret
-.vl_gt_16_blockcopy_ss_32_32:
- cmp x9, #48
- bgt .vl_gt_48_blockcopy_ss_32_32
- ptrue p0.h, vl16
-.rept 32
- ld1h {z0.h}, p0/z, [x2]
- ld1h {z1.h}, p0/z, [x2, #1, mul vl]
- st1h {z0.h}, p0, [x0]
- st1h {z1.h}, p0, [x0, #1, mul vl]
- add x2, x2, x3, lsl #1
- add x0, x0, x1, lsl #1
-.endr
- ret
-.vl_gt_48_blockcopy_ss_32_32:
- ptrue p0.h, vl32
-.rept 32
- ld1h {z0.h}, p0/z, [x2]
- st1h {z0.h}, p0, [x0]
- add x2, x2, x3, lsl #1
- add x0, x0, x1, lsl #1
-.endr
- ret
-endfunc
-
-function PFX(blockcopy_ss_64x64_sve)
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_blockcopy_ss_64_64
- lsl x1, x1, #1
- sub x1, x1, #64
- lsl x3, x3, #1
- sub x3, x3, #64
- mov w12, #8
-.Loop_css64_sve:
- sub w12, w12, #1
-.rept 8
- ld1 {v0.8h-v3.8h}, [x2], #64
- ld1 {v4.8h-v7.8h}, [x2], x3
- st1 {v0.8h-v3.8h}, [x0], #64
- st1 {v4.8h-v7.8h}, [x0], x1
-.endr
- cbnz w12, .Loop_css64_sve
- ret
-.vl_gt_16_blockcopy_ss_64_64:
- cmp x9, #48
- bgt .vl_gt_48_blockcopy_ss_64_64
- mov w12, #8
- ptrue p0.b, vl32
-.vl_gt_16_loop_css64_sve:
- sub w12, w12, #1
-.rept 8
- ld1b {z0.b}, p0/z, [x2]
- ld1b {z1.b}, p0/z, [x2, #1, mul vl]
- ld1b {z2.b}, p0/z, [x2, #2, mul vl]
- ld1b {z3.b}, p0/z, [x2, #3, mul vl]
- st1b {z0.b}, p0, [x0]
- st1b {z1.b}, p0, [x0, #1, mul vl]
- st1b {z2.b}, p0, [x0, #2, mul vl]
- st1b {z3.b}, p0, [x0, #3, mul vl]
- add x2, x2, x3, lsl #1
- add x0, x0, x1, lsl #1
-.endr
- cbnz w12, .vl_gt_16_loop_css64_sve
- ret
-.vl_gt_48_blockcopy_ss_64_64:
- cmp x9, #112
- bgt .vl_gt_112_blockcopy_ss_64_64
- mov w12, #8
- ptrue p0.b, vl64
-.vl_gt_48_loop_css64_sve:
- sub w12, w12, #1
-.rept 8
- ld1b {z0.b}, p0/z, [x2]
- ld1b {z1.b}, p0/z, [x2, #1, mul vl]
- st1b {z0.b}, p0, [x0]
- st1b {z1.b}, p0, [x0, #1, mul vl]
- add x2, x2, x3, lsl #1
- add x0, x0, x1, lsl #1
-.endr
- cbnz w12, .vl_gt_48_loop_css64_sve
- ret
-.vl_gt_112_blockcopy_ss_64_64:
- mov w12, #8
- ptrue p0.b, vl128
-.vl_gt_112_loop_css64_sve:
- sub w12, w12, #1
-.rept 8
- ld1b {z0.b}, p0/z, [x2]
- st1b {z0.b}, p0, [x0]
- add x2, x2, x3, lsl #1
- add x0, x0, x1, lsl #1
-.endr
- cbnz w12, .vl_gt_112_loop_css64_sve
- ret
-endfunc
-
-/******** Chroma blockcopy********/
-function PFX(blockcopy_ss_16x32_sve)
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_blockcopy_ss_16_32
- lsl x1, x1, #1
- lsl x3, x3, #1
-.rept 16
- ld1 {v0.8h-v1.8h}, [x2], x3
- ld1 {v2.8h-v3.8h}, [x2], x3
- st1 {v0.8h-v1.8h}, [x0], x1
- st1 {v2.8h-v3.8h}, [x0], x1
-.endr
- ret
-.vl_gt_16_blockcopy_ss_16_32:
- ptrue p0.h, vl16
-.rept 32
- ld1h {z0.h}, p0/z, [x2]
- st1h {z0.h}, p0, [x0]
- add x2, x2, x3, lsl #1
- add x0, x0, x1, lsl #1
-.endr
- ret
-endfunc
-
-function PFX(blockcopy_ss_32x64_sve)
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_blockcopy_ss_32_64
- lsl x1, x1, #1
- lsl x3, x3, #1
- mov w12, #8
-.Loop_css32x64_sve:
- sub w12, w12, #1
-.rept 8
- ld1 {v0.8h-v3.8h}, [x2], x3
- st1 {v0.8h-v3.8h}, [x0], x1
-.endr
- cbnz w12, .Loop_css32x64_sve
- ret
-.vl_gt_16_blockcopy_ss_32_64:
- cmp x9, #48
- bgt .vl_gt_48_blockcopy_ss_32_64
- mov w12, #8
- ptrue p0.b, vl32
-.vl_gt_32_loop_css32x64_sve:
- sub w12, w12, #1
-.rept 8
- ld1b {z0.b}, p0/z, [x2]
- ld1b {z1.b}, p0/z, [x2, #1, mul vl]
- st1b {z0.b}, p0, [x0]
- st1b {z1.b}, p0, [x0, #1, mul vl]
- add x2, x2, x3, lsl #1
- add x0, x0, x1, lsl #1
-.endr
- cbnz w12, .vl_gt_32_loop_css32x64_sve
- ret
-.vl_gt_48_blockcopy_ss_32_64:
- mov w12, #8
- ptrue p0.b, vl64
-.vl_gt_48_loop_css32x64_sve:
- sub w12, w12, #1
-.rept 8
- ld1b {z0.b}, p0/z, [x2]
- st1b {z0.b}, p0, [x0]
- add x2, x2, x3, lsl #1
- add x0, x0, x1, lsl #1
-.endr
- cbnz w12, .vl_gt_48_loop_css32x64_sve
- ret
-endfunc
-
// chroma blockcopy_sp
function PFX(blockcopy_sp_4x8_sve)
ptrue p0.h, vl4
diff --git a/source/common/aarch64/blockcopy8.S b/source/common/aarch64/blockcopy8.S
index 11685d254..8ac54a1e1 100644
--- a/source/common/aarch64/blockcopy8.S
+++ b/source/common/aarch64/blockcopy8.S
@@ -123,126 +123,6 @@ function PFX(blockcopy_sp_64x64_neon)
ret
endfunc
-// void x265_blockcopy_ss(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
-function PFX(blockcopy_ss_4x4_neon)
- lsl x1, x1, #1
- lsl x3, x3, #1
-.rept 2
- ld1 {v0.8b}, [x2], x3
- ld1 {v1.8b}, [x2], x3
- st1 {v0.8b}, [x0], x1
- st1 {v1.8b}, [x0], x1
-.endr
- ret
-endfunc
-
-function PFX(blockcopy_ss_8x8_neon)
- lsl x1, x1, #1
- lsl x3, x3, #1
-.rept 4
- ld1 {v0.8h}, [x2], x3
- ld1 {v1.8h}, [x2], x3
- st1 {v0.8h}, [x0], x1
- st1 {v1.8h}, [x0], x1
-.endr
- ret
-endfunc
-
-function PFX(blockcopy_ss_16x16_neon)
- lsl x1, x1, #1
- lsl x3, x3, #1
-.rept 8
- ld1 {v0.8h-v1.8h}, [x2], x3
- ld1 {v2.8h-v3.8h}, [x2], x3
- st1 {v0.8h-v1.8h}, [x0], x1
- st1 {v2.8h-v3.8h}, [x0], x1
-.endr
- ret
-endfunc
-
-function PFX(blockcopy_ss_32x32_neon)
- lsl x1, x1, #1
- lsl x3, x3, #1
- mov w12, #4
-.Loop_css32:
- sub w12, w12, #1
-.rept 8
- ld1 {v0.8h-v3.8h}, [x2], x3
- st1 {v0.8h-v3.8h}, [x0], x1
-.endr
- cbnz w12, .Loop_css32
- ret
-endfunc
-
-function PFX(blockcopy_ss_64x64_neon)
- lsl x1, x1, #1
- sub x1, x1, #64
- lsl x3, x3, #1
- sub x3, x3, #64
- mov w12, #8
-.Loop_css64:
- sub w12, w12, #1
-.rept 8
- ld1 {v0.8h-v3.8h}, [x2], #64
- ld1 {v4.8h-v7.8h}, [x2], x3
- st1 {v0.8h-v3.8h}, [x0], #64
- st1 {v4.8h-v7.8h}, [x0], x1
-.endr
- cbnz w12, .Loop_css64
- ret
-endfunc
-
-/******** Chroma blockcopy********/
-function PFX(blockcopy_ss_4x8_neon)
- lsl x1, x1, #1
- lsl x3, x3, #1
-.rept 4
- ld1 {v0.8b}, [x2], x3
- ld1 {v1.8b}, [x2], x3
- st1 {v0.8b}, [x0], x1
- st1 {v1.8b}, [x0], x1
-.endr
- ret
-endfunc
-
-function PFX(blockcopy_ss_8x16_neon)
- lsl x1, x1, #1
- lsl x3, x3, #1
-.rept 8
- ld1 {v0.8h}, [x2], x3
- ld1 {v1.8h}, [x2], x3
- st1 {v0.8h}, [x0], x1
- st1 {v1.8h}, [x0], x1
-.endr
- ret
-endfunc
-
-function PFX(blockcopy_ss_16x32_neon)
- lsl x1, x1, #1
- lsl x3, x3, #1
-.rept 16
- ld1 {v0.8h-v1.8h}, [x2], x3
- ld1 {v2.8h-v3.8h}, [x2], x3
- st1 {v0.8h-v1.8h}, [x0], x1
- st1 {v2.8h-v3.8h}, [x0], x1
-.endr
- ret
-endfunc
-
-function PFX(blockcopy_ss_32x64_neon)
- lsl x1, x1, #1
- lsl x3, x3, #1
- mov w12, #8
-.Loop_css32x64:
- sub w12, w12, #1
-.rept 8
- ld1 {v0.8h-v3.8h}, [x2], x3
- st1 {v0.8h-v3.8h}, [x0], x1
-.endr
- cbnz w12, .Loop_css32x64
- ret
-endfunc
-
// chroma blockcopy_sp
function PFX(blockcopy_sp_4x8_neon)
lsl x3, x3, #1
diff --git a/source/common/aarch64/pixel-prim.cpp b/source/common/aarch64/pixel-prim.cpp
index 80678a827..4be409ab1 100644
--- a/source/common/aarch64/pixel-prim.cpp
+++ b/source/common/aarch64/pixel-prim.cpp
@@ -1017,6 +1017,35 @@ void blockcopy_pp_neon(pixel *dst, intptr_t dst_stride, const pixel *src,
}
}
+template<int width, int height>
+void blockcopy_ss_neon(int16_t *dst, intptr_t dst_stride, const int16_t *src,
+ intptr_t src_stride)
+{
+ for (int h = 0; h < height; h++)
+ {
+ int w = 0;
+ for (; w + 16 <= width; w += 16)
+ {
+ int16x8_t a0 = vld1q_s16(src + w + 0);
+ int16x8_t a1 = vld1q_s16(src + w + 8);
+ vst1q_s16(dst + w + 0, a0);
+ vst1q_s16(dst + w + 8, a1);
+ }
+ if (width & 8)
+ {
+ vst1q_s16(dst + w, vld1q_s16(src + w));
+ w += 8;
+ }
+ if (width & 4)
+ {
+ vst1_s16(dst + w, vld1_s16(src + w));
+ }
+
+ dst += dst_stride;
+ src += src_stride;
+ }
+}
+
template<int bx, int by>
void pixel_sub_ps_neon(int16_t *a, intptr_t dstride, const pixel *b0, const pixel *b1, intptr_t sstride0,
@@ -1775,6 +1804,7 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
p.cu[BLOCK_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
p.cu[BLOCK_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>; \
p.cu[BLOCK_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
+ p.cu[BLOCK_ ## W ## x ## H].copy_ss = blockcopy_ss_neon<W, H>; \
p.cu[BLOCK_ ## W ## x ## H].cpy2Dto1D_shl = cpy2Dto1D_shl_neon<W>; \
p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl[NONALIGNED] = cpy1Dto2D_shl_neon<W>; \
p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl[ALIGNED] = cpy1Dto2D_shl_neon<W>; \
@@ -1787,7 +1817,7 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
p.cu[BLOCK_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>; \
p.cu[BLOCK_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
p.cu[BLOCK_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \
- p.cu[BLOCK_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
+ p.cu[BLOCK_ ## W ## x ## H].copy_ss = blockcopy_ss_neon<W, H>; \
p.cu[BLOCK_ ## W ## x ## H].cpy2Dto1D_shl = cpy2Dto1D_shl_neon<W>; \
p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl[NONALIGNED] = cpy1Dto2D_shl_neon<W>; \
p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl[ALIGNED] = cpy1Dto2D_shl_neon<W>; \
@@ -1946,12 +1976,14 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
#if HIGH_BIT_DEPTH
#define CHROMA_CU_420(W, H) \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ss = blockcopy_ss_neon<W, H>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>;
#define CHROMA_CU_S_420(W, H) \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ss = blockcopy_ss_neon<W, H>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>;
@@ -1959,6 +1991,7 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
#define CHROMA_CU_420(W, H) \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ss = blockcopy_ss_neon<W, H>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>;
@@ -1966,6 +1999,7 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
#define CHROMA_CU_S_420(W, H) \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ss = blockcopy_ss_neon<W, H>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>;
@@ -2046,12 +2080,14 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
#if HIGH_BIT_DEPTH
#define CHROMA_CU_422(W, H) \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_ss = blockcopy_ss_neon<W, H>; \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>;
#define CHROMA_CU_S_422(W, H) \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_ss = blockcopy_ss_neon<W, H>; \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>;
@@ -2059,6 +2095,7 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
#define CHROMA_CU_422(W, H) \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_ss = blockcopy_ss_neon<W, H>; \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>;
@@ -2066,6 +2103,7 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
#define CHROMA_CU_S_422(W, H) \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_ss = blockcopy_ss_neon<W, H>; \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>;
--
2.39.5 (Apple Git-154)