[x265] [PATCH 4/8] AArch64: Implement blockcopy_sp primitives using Neon intrinsics
Li Zhang
li.zhang2 at arm.com
Mon May 19 16:42:57 UTC 2025
Delete the Neon and SVE assembly implementations of these kernels as
they are no faster, and only serve to increase binary size.
Co-authored-by: Jonathan Wright <jonathan.wright at arm.com>
---
source/common/aarch64/asm-primitives.cpp | 33 ----
source/common/aarch64/blockcopy8-common.S | 6 -
source/common/aarch64/blockcopy8-sve.S | 220 ----------------------
source/common/aarch64/blockcopy8.S | 152 ---------------
source/common/aarch64/pixel-prim.cpp | 38 ++++
5 files changed, 38 insertions(+), 411 deletions(-)
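
Note (illustrative, not part of the patch): the behavior implemented by the new Neon template below is a plain, non-saturating narrowing copy from int16_t coefficients to 8-bit pixels. A minimal scalar sketch of the same operation, for reference only:

    // Illustrative scalar equivalent of blockcopy_sp for 8-bit builds.
    // Each int16_t source value is truncated to a pixel, matching the
    // non-saturating vmovn_s16 narrowing used by the Neon kernel below.
    static void blockcopy_sp_ref(uint8_t *dst, intptr_t dst_stride,
                                 const int16_t *src, intptr_t src_stride,
                                 int width, int height)
    {
        for (int h = 0; h < height; h++)
        {
            for (int w = 0; w < width; w++)
                dst[w] = (uint8_t)src[w];
            dst += dst_stride;
            src += src_stride;
        }
    }

Callers reach the vectorized version through the primitives table, e.g. p.cu[BLOCK_16x16].copy_sp(dst, dstStride, coeff, coeffStride), once setupPixelPrimitives_neon() has populated the function pointers.
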
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index 981c6352a..1715ae115 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -404,23 +404,6 @@ void setupNeonPrimitives(EncoderPrimitives &p)
ALL_CHROMA_444_PU(p2s[NONALIGNED], filterPixelToShort, neon);
ALL_LUMA_PU(convert_p2s[NONALIGNED], filterPixelToShort, neon);
- // Blockcopy_sp
- p.cu[BLOCK_4x4].copy_sp = PFX(blockcopy_sp_4x4_neon);
- p.cu[BLOCK_8x8].copy_sp = PFX(blockcopy_sp_8x8_neon);
- p.cu[BLOCK_16x16].copy_sp = PFX(blockcopy_sp_16x16_neon);
- p.cu[BLOCK_32x32].copy_sp = PFX(blockcopy_sp_32x32_neon);
- p.cu[BLOCK_64x64].copy_sp = PFX(blockcopy_sp_64x64_neon);
-
- // chroma blockcopy_sp
- p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].copy_sp = PFX(blockcopy_sp_4x4_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].copy_sp = PFX(blockcopy_sp_8x8_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_sp = PFX(blockcopy_sp_16x16_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_sp = PFX(blockcopy_sp_32x32_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].copy_sp = PFX(blockcopy_sp_4x8_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].copy_sp = PFX(blockcopy_sp_8x16_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_sp = PFX(blockcopy_sp_16x32_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_sp = PFX(blockcopy_sp_32x64_neon);
-
// Block_fill
ALL_LUMA_TU(blockfill_s[ALIGNED], blockfill_s, neon);
ALL_LUMA_TU(blockfill_s[NONALIGNED], blockfill_s, neon);
@@ -639,22 +622,6 @@ void setupSvePrimitives(EncoderPrimitives &p)
CHROMA_444_PU_SVE_FILTER_PIXEL_TO_SHORT(p2s[NONALIGNED]);
LUMA_PU_SVE_FILTER_PIXEL_TO_SHORT(convert_p2s[NONALIGNED]);
- // Blockcopy_sp
- p.cu[BLOCK_4x4].copy_sp = PFX(blockcopy_sp_4x4_sve);
- p.cu[BLOCK_8x8].copy_sp = PFX(blockcopy_sp_8x8_sve);
- p.cu[BLOCK_16x16].copy_sp = PFX(blockcopy_sp_16x16_sve);
- p.cu[BLOCK_32x32].copy_sp = PFX(blockcopy_sp_32x32_sve);
-
- // chroma blockcopy_sp
- p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].copy_sp = PFX(blockcopy_sp_4x4_sve);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].copy_sp = PFX(blockcopy_sp_8x8_sve);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_sp = PFX(blockcopy_sp_16x16_sve);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_sp = PFX(blockcopy_sp_32x32_sve);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].copy_sp = PFX(blockcopy_sp_4x8_sve);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].copy_sp = PFX(blockcopy_sp_8x16_sve);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_sp = PFX(blockcopy_sp_16x32_sve);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_sp = PFX(blockcopy_sp_32x64_sve);
-
// Block_fill
LUMA_TU_CAN_USE_SVE(blockfill_s[ALIGNED], blockfill_s);
LUMA_TU_CAN_USE_SVE(blockfill_s[NONALIGNED], blockfill_s);
diff --git a/source/common/aarch64/blockcopy8-common.S b/source/common/aarch64/blockcopy8-common.S
index 2f2ab556d..6599bb49e 100644
--- a/source/common/aarch64/blockcopy8-common.S
+++ b/source/common/aarch64/blockcopy8-common.S
@@ -46,9 +46,3 @@
sri v1.8h, v1.8h, #1
neg v0.8h, v0.8h
.endm
-
-const xtn_xtn2_table, align=4
-.byte 0, 2, 4, 6, 8, 10, 12, 14
-.byte 16, 18, 20, 22, 24, 26, 28, 30
-endconst
-
diff --git a/source/common/aarch64/blockcopy8-sve.S b/source/common/aarch64/blockcopy8-sve.S
index 976d80dd1..d724e8427 100644
--- a/source/common/aarch64/blockcopy8-sve.S
+++ b/source/common/aarch64/blockcopy8-sve.S
@@ -36,226 +36,6 @@
.text
-/* void blockcopy_sp(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
- *
- * r0 - a
- * r1 - stridea
- * r2 - b
- * r3 - strideb */
-
-function PFX(blockcopy_sp_4x4_sve)
- ptrue p0.h, vl4
-.rept 2
- ld1h {z0.h}, p0/z, [x2]
- add x2, x2, x3, lsl #1
- st1b {z0.h}, p0, [x0]
- add x0, x0, x1
- ld1h {z1.h}, p0/z, [x2]
- add x2, x2, x3, lsl #1
- st1b {z1.h}, p0, [x0]
- add x0, x0, x1
-.endr
- ret
-endfunc
-
-function PFX(blockcopy_sp_8x8_sve)
- ptrue p0.h, vl8
-.rept 4
- ld1h {z0.h}, p0/z, [x2]
- add x2, x2, x3, lsl #1
- st1b {z0.h}, p0, [x0]
- add x0, x0, x1
- ld1h {z1.h}, p0/z, [x2]
- add x2, x2, x3, lsl #1
- st1b {z1.h}, p0, [x0]
- add x0, x0, x1
-.endr
- ret
-endfunc
-
-function PFX(blockcopy_sp_16x16_sve)
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_blockcopy_sp_16_16
- lsl x3, x3, #1
- movrel x11, xtn_xtn2_table
- ld1 {v31.16b}, [x11]
-.rept 8
- ld1 {v0.8h-v1.8h}, [x2], x3
- ld1 {v2.8h-v3.8h}, [x2], x3
- tbl v0.16b, {v0.16b,v1.16b}, v31.16b
- tbl v1.16b, {v2.16b,v3.16b}, v31.16b
- st1 {v0.16b}, [x0], x1
- st1 {v1.16b}, [x0], x1
-.endr
- ret
-.vl_gt_16_blockcopy_sp_16_16:
- ptrue p0.h, vl16
-.rept 8
- ld1h {z0.h}, p0/z, [x2]
- st1b {z0.h}, p0, [x0]
- add x2, x2, x3, lsl #1
- add x0, x0, x1
- ld1h {z1.h}, p0/z, [x2]
- st1b {z1.h}, p0, [x0]
- add x2, x2, x3, lsl #1
- add x0, x0, x1
-.endr
- ret
-endfunc
-
-function PFX(blockcopy_sp_32x32_sve)
- mov w12, #4
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_blockcopy_sp_32_32
- lsl x3, x3, #1
- movrel x11, xtn_xtn2_table
- ld1 {v31.16b}, [x11]
-.Loop_csp32_sve:
- sub w12, w12, #1
-.rept 4
- ld1 {v0.8h-v3.8h}, [x2], x3
- ld1 {v4.8h-v7.8h}, [x2], x3
- tbl v0.16b, {v0.16b,v1.16b}, v31.16b
- tbl v1.16b, {v2.16b,v3.16b}, v31.16b
- tbl v2.16b, {v4.16b,v5.16b}, v31.16b
- tbl v3.16b, {v6.16b,v7.16b}, v31.16b
- st1 {v0.16b-v1.16b}, [x0], x1
- st1 {v2.16b-v3.16b}, [x0], x1
-.endr
- cbnz w12, .Loop_csp32_sve
- ret
-.vl_gt_16_blockcopy_sp_32_32:
- cmp x9, #48
- bgt .vl_gt_48_blockcopy_sp_32_32
- ptrue p0.h, vl16
-.vl_gt_16_loop_csp32_sve:
- sub w12, w12, #1
-.rept 4
- ld1h {z0.h}, p0/z, [x2]
- ld1h {z1.h}, p0/z, [x2, #1, mul vl]
- st1b {z0.h}, p0, [x0]
- st1b {z1.h}, p0, [x0, #1, mul vl]
- add x2, x2, x3, lsl #1
- add x0, x0, x1
- ld1h {z2.h}, p0/z, [x2]
- ld1h {z3.h}, p0/z, [x2, #1, mul vl]
- st1b {z2.h}, p0, [x0]
- st1b {z3.h}, p0, [x0, #1, mul vl]
- add x2, x2, x3, lsl #1
- add x0, x0, x1
-.endr
- cbnz w12, .vl_gt_16_loop_csp32_sve
- ret
-.vl_gt_48_blockcopy_sp_32_32:
- ptrue p0.h, vl32
-.vl_gt_48_loop_csp32_sve:
- sub w12, w12, #1
-.rept 4
- ld1h {z0.h}, p0/z, [x2]
- st1b {z0.h}, p0, [x0]
- add x2, x2, x3, lsl #1
- add x0, x0, x1
- ld1h {z1.h}, p0/z, [x2]
- st1b {z1.h}, p0, [x0]
- add x2, x2, x3, lsl #1
- add x0, x0, x1
-.endr
- cbnz w12, .vl_gt_48_loop_csp32_sve
- ret
-endfunc
-
-// chroma blockcopy_sp
-function PFX(blockcopy_sp_4x8_sve)
- ptrue p0.h, vl4
-.rept 8
- ld1h {z0.h}, p0/z, [x2]
- st1b {z0.h}, p0, [x0]
- add x2, x2, x3, lsl #1
- add x0, x0, x1
-.endr
- ret
-endfunc
-
-function PFX(blockcopy_sp_8x16_sve)
- ptrue p0.h, vl8
-.rept 16
- ld1h {z0.h}, p0/z, [x2]
- st1b {z0.h}, p0, [x0]
- add x2, x2, x3, lsl #1
- add x0, x0, x1
-.endr
- ret
-endfunc
-
-function PFX(blockcopy_sp_16x32_sve)
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_blockcopy_sp_16_32
- ptrue p0.h, vl8
-.rept 32
- ld1h {z0.h}, p0/z, [x2]
- ld1h {z1.h}, p0/z, [x2, #1, mul vl]
- st1b {z0.h}, p0, [x0]
- st1b {z1.h}, p0, [x0, #1, mul vl]
- add x2, x2, x3, lsl #1
- add x0, x0, x1
-.endr
- ret
-.vl_gt_16_blockcopy_sp_16_32:
- ptrue p0.h, vl16
-.rept 32
- ld1h {z0.h}, p0/z, [x2]
- st1b {z0.h}, p0, [x0]
- add x2, x2, x3, lsl #1
- add x0, x0, x1
-.endr
- ret
-endfunc
-
-function PFX(blockcopy_sp_32x64_sve)
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_blockcopy_sp_32_64
- ptrue p0.h, vl8
-.rept 64
- ld1h {z0.h}, p0/z, [x2]
- ld1h {z1.h}, p0/z, [x2, #1, mul vl]
- ld1h {z2.h}, p0/z, [x2, #2, mul vl]
- ld1h {z3.h}, p0/z, [x2, #3, mul vl]
- st1b {z0.h}, p0, [x0]
- st1b {z1.h}, p0, [x0, #1, mul vl]
- st1b {z2.h}, p0, [x0, #2, mul vl]
- st1b {z3.h}, p0, [x0, #3, mul vl]
- add x2, x2, x3, lsl #1
- add x0, x0, x1
-.endr
- ret
-.vl_gt_16_blockcopy_sp_32_64:
- cmp x9, #48
- bgt .vl_gt_48_blockcopy_sp_32_64
- ptrue p0.h, vl16
-.rept 64
- ld1h {z0.h}, p0/z, [x2]
- ld1h {z1.h}, p0/z, [x2, #1, mul vl]
- st1b {z0.h}, p0, [x0]
- st1b {z1.h}, p0, [x0, #1, mul vl]
- add x2, x2, x3, lsl #1
- add x0, x0, x1
-.endr
- ret
-.vl_gt_48_blockcopy_sp_32_64:
- ptrue p0.h, vl32
-.rept 64
- ld1h {z0.h}, p0/z, [x2]
- st1b {z0.h}, p0, [x0]
- add x2, x2, x3, lsl #1
- add x0, x0, x1
-.endr
- ret
-endfunc
-
function PFX(blockfill_s_32x32_sve)
rdvl x9, #1
cmp x9, #16
diff --git a/source/common/aarch64/blockcopy8.S b/source/common/aarch64/blockcopy8.S
index 8ac54a1e1..9db578d1e 100644
--- a/source/common/aarch64/blockcopy8.S
+++ b/source/common/aarch64/blockcopy8.S
@@ -34,158 +34,6 @@
.text
-/* void blockcopy_sp(pixel* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
- *
- * r0 - a
- * r1 - stridea
- * r2 - b
- * r3 - strideb */
-function PFX(blockcopy_sp_4x4_neon)
- lsl x3, x3, #1
-.rept 2
- ld1 {v0.8h}, [x2], x3
- ld1 {v1.8h}, [x2], x3
- xtn v0.8b, v0.8h
- xtn v1.8b, v1.8h
- st1 {v0.s}[0], [x0], x1
- st1 {v1.s}[0], [x0], x1
-.endr
- ret
-endfunc
-
-function PFX(blockcopy_sp_8x8_neon)
- lsl x3, x3, #1
-.rept 4
- ld1 {v0.8h}, [x2], x3
- ld1 {v1.8h}, [x2], x3
- xtn v0.8b, v0.8h
- xtn v1.8b, v1.8h
- st1 {v0.d}[0], [x0], x1
- st1 {v1.d}[0], [x0], x1
-.endr
- ret
-endfunc
-
-function PFX(blockcopy_sp_16x16_neon)
- lsl x3, x3, #1
- movrel x11, xtn_xtn2_table
- ld1 {v31.16b}, [x11]
-.rept 8
- ld1 {v0.8h-v1.8h}, [x2], x3
- ld1 {v2.8h-v3.8h}, [x2], x3
- tbl v0.16b, {v0.16b,v1.16b}, v31.16b
- tbl v1.16b, {v2.16b,v3.16b}, v31.16b
- st1 {v0.16b}, [x0], x1
- st1 {v1.16b}, [x0], x1
-.endr
- ret
-endfunc
-
-function PFX(blockcopy_sp_32x32_neon)
- mov w12, #4
- lsl x3, x3, #1
- movrel x11, xtn_xtn2_table
- ld1 {v31.16b}, [x11]
-.Loop_csp32:
- sub w12, w12, #1
-.rept 4
- ld1 {v0.8h-v3.8h}, [x2], x3
- ld1 {v4.8h-v7.8h}, [x2], x3
- tbl v0.16b, {v0.16b,v1.16b}, v31.16b
- tbl v1.16b, {v2.16b,v3.16b}, v31.16b
- tbl v2.16b, {v4.16b,v5.16b}, v31.16b
- tbl v3.16b, {v6.16b,v7.16b}, v31.16b
- st1 {v0.16b-v1.16b}, [x0], x1
- st1 {v2.16b-v3.16b}, [x0], x1
-.endr
- cbnz w12, .Loop_csp32
- ret
-endfunc
-
-function PFX(blockcopy_sp_64x64_neon)
- mov w12, #16
- lsl x3, x3, #1
- sub x3, x3, #64
- movrel x11, xtn_xtn2_table
- ld1 {v31.16b}, [x11]
-.Loop_csp64:
- sub w12, w12, #1
-.rept 4
- ld1 {v0.8h-v3.8h}, [x2], #64
- ld1 {v4.8h-v7.8h}, [x2], x3
- tbl v0.16b, {v0.16b,v1.16b}, v31.16b
- tbl v1.16b, {v2.16b,v3.16b}, v31.16b
- tbl v2.16b, {v4.16b,v5.16b}, v31.16b
- tbl v3.16b, {v6.16b,v7.16b}, v31.16b
- st1 {v0.16b-v3.16b}, [x0], x1
-.endr
- cbnz w12, .Loop_csp64
- ret
-endfunc
-
-// chroma blockcopy_sp
-function PFX(blockcopy_sp_4x8_neon)
- lsl x3, x3, #1
-.rept 4
- ld1 {v0.8h}, [x2], x3
- ld1 {v1.8h}, [x2], x3
- xtn v0.8b, v0.8h
- xtn v1.8b, v1.8h
- st1 {v0.s}[0], [x0], x1
- st1 {v1.s}[0], [x0], x1
-.endr
- ret
-endfunc
-
-function PFX(blockcopy_sp_8x16_neon)
- lsl x3, x3, #1
-.rept 8
- ld1 {v0.8h}, [x2], x3
- ld1 {v1.8h}, [x2], x3
- xtn v0.8b, v0.8h
- xtn v1.8b, v1.8h
- st1 {v0.d}[0], [x0], x1
- st1 {v1.d}[0], [x0], x1
-.endr
- ret
-endfunc
-
-function PFX(blockcopy_sp_16x32_neon)
- lsl x3, x3, #1
- movrel x11, xtn_xtn2_table
- ld1 {v31.16b}, [x11]
-.rept 16
- ld1 {v0.8h-v1.8h}, [x2], x3
- ld1 {v2.8h-v3.8h}, [x2], x3
- tbl v0.16b, {v0.16b,v1.16b}, v31.16b
- tbl v1.16b, {v2.16b,v3.16b}, v31.16b
- st1 {v0.16b}, [x0], x1
- st1 {v1.16b}, [x0], x1
-.endr
- ret
-endfunc
-
-function PFX(blockcopy_sp_32x64_neon)
- mov w12, #8
- lsl x3, x3, #1
- movrel x11, xtn_xtn2_table
- ld1 {v31.16b}, [x11]
-.Loop_csp32x64:
- sub w12, w12, #1
-.rept 4
- ld1 {v0.8h-v3.8h}, [x2], x3
- ld1 {v4.8h-v7.8h}, [x2], x3
- tbl v0.16b, {v0.16b,v1.16b}, v31.16b
- tbl v1.16b, {v2.16b,v3.16b}, v31.16b
- tbl v2.16b, {v4.16b,v5.16b}, v31.16b
- tbl v3.16b, {v6.16b,v7.16b}, v31.16b
- st1 {v0.16b-v1.16b}, [x0], x1
- st1 {v2.16b-v3.16b}, [x0], x1
-.endr
- cbnz w12, .Loop_csp32x64
- ret
-endfunc
-
// void x265_blockfill_s_neon(int16_t* dst, intptr_t dstride, int16_t val)
function PFX(blockfill_s_4x4_neon)
dup v0.4h, w2
diff --git a/source/common/aarch64/pixel-prim.cpp b/source/common/aarch64/pixel-prim.cpp
index 4be409ab1..055b3e35c 100644
--- a/source/common/aarch64/pixel-prim.cpp
+++ b/source/common/aarch64/pixel-prim.cpp
@@ -1046,6 +1046,39 @@ void blockcopy_ss_neon(int16_t *dst, intptr_t dst_stride, const int16_t *src,
}
}
+#if !HIGH_BIT_DEPTH
+template<int width, int height>
+void blockcopy_sp_neon(pixel *dst, intptr_t dst_stride, const int16_t *src,
+ intptr_t src_stride)
+{
+ for (int h = 0; h < height; h++)
+ {
+ int w = 0;
+ for (; w + 16 <= width; w += 16) {
+ int16x8_t s0 = vld1q_s16(src + w + 0);
+ int16x8_t s1 = vld1q_s16(src + w + 8);
+ int8x16_t s01 = vcombine_s8(vmovn_s16(s0), vmovn_s16(s1));
+ vst1q_u8(dst + w, vreinterpretq_u8_s8(s01));
+ }
+ if (width & 8)
+ {
+ int16x8_t s0 = vld1q_s16(src + w);
+ int8x8_t s0_s8 = vmovn_s16(s0);
+ vst1_u8(dst + w, vreinterpret_u8_s8(s0_s8));
+ w += 8;
+ }
+ if (width & 4)
+ {
+ int16x4_t s0 = vld1_s16(src + w);
+ int8x8_t s0_s8 = vmovn_s16(vcombine_s16(s0, vdup_n_s16(0)));
+ store_u8x4x1(dst + w, vreinterpret_u8_s8(s0_s8));
+ }
+
+ dst += dst_stride;
+ src += src_stride;
+ }
+}
+#endif // !HIGH_BIT_DEPTH
template<int bx, int by>
void pixel_sub_ps_neon(int16_t *a, intptr_t dstride, const pixel *b0, const pixel *b1, intptr_t sstride0,
@@ -1818,6 +1851,7 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
p.cu[BLOCK_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
p.cu[BLOCK_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \
p.cu[BLOCK_ ## W ## x ## H].copy_ss = blockcopy_ss_neon<W, H>; \
+ p.cu[BLOCK_ ## W ## x ## H].copy_sp = blockcopy_sp_neon<W, H>; \
p.cu[BLOCK_ ## W ## x ## H].cpy2Dto1D_shl = cpy2Dto1D_shl_neon<W>; \
p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl[NONALIGNED] = cpy1Dto2D_shl_neon<W>; \
p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl[ALIGNED] = cpy1Dto2D_shl_neon<W>; \
@@ -1992,6 +2026,7 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ss = blockcopy_ss_neon<W, H>; \
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_sp = blockcopy_sp_neon<W, H>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>;
@@ -2000,6 +2035,7 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ss = blockcopy_ss_neon<W, H>; \
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_sp = blockcopy_sp_neon<W, H>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>;
@@ -2096,6 +2132,7 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_ss = blockcopy_ss_neon<W, H>; \
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_sp = blockcopy_sp_neon<W, H>; \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>;
@@ -2104,6 +2141,7 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_ss = blockcopy_ss_neon<W, H>; \
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_sp = blockcopy_sp_neon<W, H>; \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>;
--
2.39.5 (Apple Git-154)