[x265] [PATCH 1/8] AArch64: Optimize blockcopy_pp_neon intrinsics implementation
Li Zhang
li.zhang2 at arm.com
Mon May 19 16:42:00 UTC 2025
Unroll the blockcopy_pp_neon intrinsics implementation to enable the use
of LDP and STP instructions.

Delete the Neon and SVE assembly implementations of these kernels as
they are no faster and only serve to increase binary size.
Co-authored-by: Jonathan Wright <jonathan.wright at arm.com>
---
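Note (not part of the commit message): a minimal standalone sketch of the
unrolling idea, assuming plain arm_neon.h intrinsics; copy_row_32 is a
hypothetical helper name, not an x265 function. Issuing two adjacent 16-byte
loads and two adjacent 16-byte stores per row allows the compiler to merge
each pair into a single LDP/STP of Q registers on AArch64, which is what the
unrolled blockcopy_pp_neon loops below rely on.

    #include <arm_neon.h>
    #include <stdint.h>

    // Hypothetical example, not x265 code: copy one 32-byte row.
    // The two adjacent loads can be fused into one LDP and the two
    // adjacent stores into one STP by the compiler.
    static void copy_row_32(uint8_t *dst, const uint8_t *src)
    {
        uint8x16_t s_lo = vld1q_u8(src);        // bytes 0..15
        uint8x16_t s_hi = vld1q_u8(src + 16);   // bytes 16..31
        vst1q_u8(dst, s_lo);
        vst1q_u8(dst + 16, s_hi);
    }
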
source/common/aarch64/asm-primitives.cpp | 33 ---
source/common/aarch64/blockcopy8-sve.S | 107 ---------
source/common/aarch64/blockcopy8.S | 286 -----------------------
source/common/aarch64/pixel-prim.cpp | 74 +++---
4 files changed, 48 insertions(+), 452 deletions(-)
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index 2b6d3f812..2c6911d8b 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -404,27 +404,6 @@ void setupNeonPrimitives(EncoderPrimitives &p)
ALL_CHROMA_444_PU(p2s[NONALIGNED], filterPixelToShort, neon);
ALL_LUMA_PU(convert_p2s[NONALIGNED], filterPixelToShort, neon);
-#if !HIGH_BIT_DEPTH
- // Blockcopy_pp
- ALL_LUMA_PU(copy_pp, blockcopy_pp, neon);
- ALL_CHROMA_420_PU(copy_pp, blockcopy_pp, neon);
- ALL_CHROMA_422_PU(copy_pp, blockcopy_pp, neon);
- p.cu[BLOCK_4x4].copy_pp = PFX(blockcopy_pp_4x4_neon);
- p.cu[BLOCK_8x8].copy_pp = PFX(blockcopy_pp_8x8_neon);
- p.cu[BLOCK_16x16].copy_pp = PFX(blockcopy_pp_16x16_neon);
- p.cu[BLOCK_32x32].copy_pp = PFX(blockcopy_pp_32x32_neon);
- p.cu[BLOCK_64x64].copy_pp = PFX(blockcopy_pp_64x64_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].copy_pp = PFX(blockcopy_pp_4x4_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].copy_pp = PFX(blockcopy_pp_8x8_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_pp = PFX(blockcopy_pp_16x16_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_pp = PFX(blockcopy_pp_32x32_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].copy_pp = PFX(blockcopy_pp_4x8_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].copy_pp = PFX(blockcopy_pp_8x16_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_pp = PFX(blockcopy_pp_16x32_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_pp = PFX(blockcopy_pp_32x64_neon);
-
-#endif // !HIGH_BIT_DEPTH
-
// Blockcopy_ss
p.cu[BLOCK_4x4].copy_ss = PFX(blockcopy_ss_4x4_neon);
p.cu[BLOCK_8x8].copy_ss = PFX(blockcopy_ss_8x8_neon);
@@ -694,18 +673,6 @@ void setupSvePrimitives(EncoderPrimitives &p)
CHROMA_444_PU_SVE_FILTER_PIXEL_TO_SHORT(p2s[NONALIGNED]);
LUMA_PU_SVE_FILTER_PIXEL_TO_SHORT(convert_p2s[NONALIGNED]);
-#if !HIGH_BIT_DEPTH
- // Blockcopy_pp
- LUMA_PU_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(copy_pp, blockcopy_pp);
- CHROMA_420_PU_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(copy_pp, blockcopy_pp);
- CHROMA_422_PU_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(copy_pp, blockcopy_pp);
- p.cu[BLOCK_32x32].copy_pp = PFX(blockcopy_pp_32x32_sve);
- p.cu[BLOCK_64x64].copy_pp = PFX(blockcopy_pp_64x64_sve);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_pp = PFX(blockcopy_pp_32x32_sve);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_pp = PFX(blockcopy_pp_32x64_sve);
-
-#endif // !HIGH_BIT_DEPTH
-
// Blockcopy_ss
p.cu[BLOCK_16x16].copy_ss = PFX(blockcopy_ss_16x16_sve);
p.cu[BLOCK_32x32].copy_ss = PFX(blockcopy_ss_32x32_sve);
diff --git a/source/common/aarch64/blockcopy8-sve.S b/source/common/aarch64/blockcopy8-sve.S
index 5bbb5e70d..e80722654 100644
--- a/source/common/aarch64/blockcopy8-sve.S
+++ b/source/common/aarch64/blockcopy8-sve.S
@@ -702,113 +702,6 @@ function PFX(blockcopy_sp_32x64_sve)
ret
endfunc
-/* blockcopy_pp(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) */
-
-function PFX(blockcopy_pp_32x8_sve)
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_blockcopy_pp_32_8
-.rept 8
- ld1 {v0.16b-v1.16b}, [x2], x3
- st1 {v0.16b-v1.16b}, [x0], x1
-.endr
- ret
-.vl_gt_16_blockcopy_pp_32_8:
- ptrue p0.b, vl32
-.rept 8
- ld1b {z0.b}, p0/z, [x2]
- st1b {z0.b}, p0, [x0]
- add x2, x2, x3
- add x0, x0, x1
-.endr
- ret
-endfunc
-
-.macro blockcopy_pp_32xN_sve h
-function PFX(blockcopy_pp_32x\h\()_sve)
- mov w12, #\h / 8
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_blockcopy_pp_32xN_\h
-.Loop_sve_32x\h\():
- sub w12, w12, #1
-.rept 8
- ld1 {v0.16b-v1.16b}, [x2], x3
- st1 {v0.16b-v1.16b}, [x0], x1
-.endr
- cbnz w12, .Loop_sve_32x\h
- ret
-.vl_gt_16_blockcopy_pp_32xN_\h:
- ptrue p0.b, vl32
-.L_gt_16_blockcopy_pp_32xN_\h:
- sub w12, w12, #1
-.rept 8
- ld1b {z0.b}, p0/z, [x2]
- st1b {z0.b}, p0, [x0]
- add x2, x2, x3
- add x0, x0, x1
-.endr
- cbnz w12, .L_gt_16_blockcopy_pp_32xN_\h
- ret
-endfunc
-.endm
-
-blockcopy_pp_32xN_sve 16
-blockcopy_pp_32xN_sve 24
-blockcopy_pp_32xN_sve 32
-blockcopy_pp_32xN_sve 64
-blockcopy_pp_32xN_sve 48
-
-.macro blockcopy_pp_64xN_sve h
-function PFX(blockcopy_pp_64x\h\()_sve)
- mov w12, #\h / 4
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_blockcopy_pp_64xN_\h
-.Loop_sve_64x\h\():
- sub w12, w12, #1
-.rept 4
- ld1 {v0.16b-v3.16b}, [x2], x3
- st1 {v0.16b-v3.16b}, [x0], x1
-.endr
- cbnz w12, .Loop_sve_64x\h
- ret
-.vl_gt_16_blockcopy_pp_64xN_\h:
- cmp x9, #48
- bgt .vl_gt_48_blockcopy_pp_64xN_\h
- ptrue p0.b, vl32
-.L_le_32_blockcopy_pp_64xN_\h:
- sub w12, w12, #1
-.rept 4
- ld1b {z0.b}, p0/z, [x2]
- ld1b {z1.b}, p0/z, [x2, #1, mul vl]
- st1b {z0.b}, p0, [x0]
- st1b {z1.b}, p0, [x0, #1, mul vl]
- add x2, x2, x3
- add x0, x0, x1
-.endr
- cbnz w12, .L_le_32_blockcopy_pp_64xN_\h
- ret
-.vl_gt_48_blockcopy_pp_64xN_\h:
- ptrue p0.b, vl64
-.L_blockcopy_pp_64xN_\h:
- sub w12, w12, #1
-.rept 4
- ld1b {z0.b}, p0/z, [x2]
- st1b {z0.b}, p0, [x0]
- add x2, x2, x3
- add x0, x0, x1
-.endr
- cbnz w12, .L_blockcopy_pp_64xN_\h
- ret
-endfunc
-.endm
-
-blockcopy_pp_64xN_sve 16
-blockcopy_pp_64xN_sve 32
-blockcopy_pp_64xN_sve 48
-blockcopy_pp_64xN_sve 64
-
function PFX(blockfill_s_32x32_sve)
rdvl x9, #1
cmp x9, #16
diff --git a/source/common/aarch64/blockcopy8.S b/source/common/aarch64/blockcopy8.S
index 1ad371c57..d466f8ea8 100644
--- a/source/common/aarch64/blockcopy8.S
+++ b/source/common/aarch64/blockcopy8.S
@@ -459,292 +459,6 @@ function PFX(blockcopy_sp_32x64_neon)
ret
endfunc
-/* blockcopy_pp(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) */
-
-function PFX(blockcopy_pp_2x4_neon)
- ldrh w9, [x2]
- add x4, x1, x1
- add x14, x3, x3
- strh w9, [x0]
- ldrh w10, [x2, x3]
- add x5, x4, x1
- add x15, x14, x3
- strh w10, [x0, x1]
- ldrh w11, [x2, x14]
- strh w11, [x0, x4]
- ldrh w12, [x2, x15]
- strh w12, [x0, x5]
- ret
-endfunc
-
-.macro blockcopy_pp_2xN_neon h
-function PFX(blockcopy_pp_2x\h\()_neon)
- add x4, x1, x1
- add x5, x4, x1
- add x6, x5, x1
-
- add x14, x3, x3
- add x15, x14, x3
- add x16, x15, x3
-
-.rept \h / 4
- ldrh w9, [x2]
- strh w9, [x0]
- ldrh w10, [x2, x3]
- strh w10, [x0, x1]
- ldrh w11, [x2, x14]
- strh w11, [x0, x4]
- ldrh w12, [x2, x15]
- strh w12, [x0, x5]
- add x2, x2, x16
- add x0, x0, x6
-.endr
- ret
-endfunc
-.endm
-
-blockcopy_pp_2xN_neon 8
-blockcopy_pp_2xN_neon 16
-
-function PFX(blockcopy_pp_4x2_neon)
- ldr w9, [x2]
- str w9, [x0]
- ldr w10, [x2, x3]
- str w10, [x0, x1]
- ret
-endfunc
-
-function PFX(blockcopy_pp_4x4_neon)
- ldr w9, [x2]
- add x4, x1, x1
- add x14, x3, x3
- str w9, [x0]
- ldr w10, [x2, x3]
- add x5, x4, x1
- add x15, x14, x3
- str w10, [x0, x1]
- ldr w11, [x2, x14]
- str w11, [x0, x4]
- ldr w12, [x2, x15]
- str w12, [x0, x5]
- ret
-endfunc
-
-.macro blockcopy_pp_4xN_neon h
-function PFX(blockcopy_pp_4x\h\()_neon)
- add x4, x1, x1
- add x5, x4, x1
- add x6, x5, x1
-
- add x14, x3, x3
- add x15, x14, x3
- add x16, x15, x3
-
-.rept \h / 4
- ldr w9, [x2]
- str w9, [x0]
- ldr w10, [x2, x3]
- str w10, [x0, x1]
- ldr w11, [x2, x14]
- str w11, [x0, x4]
- ldr w12, [x2, x15]
- str w12, [x0, x5]
- add x2, x2, x16
- add x0, x0, x6
-.endr
- ret
-endfunc
-.endm
-
-blockcopy_pp_4xN_neon 8
-blockcopy_pp_4xN_neon 16
-blockcopy_pp_4xN_neon 32
-
-.macro blockcopy_pp_6xN_neon h
-function PFX(blockcopy_pp_6x\h\()_neon)
- sub x1, x1, #4
-.rept \h
- ld1 {v0.8b}, [x2], x3
- st1 {v0.s}[0], [x0], #4
- st1 {v0.h}[2], [x0], x1
-.endr
- ret
-endfunc
-.endm
-
-blockcopy_pp_6xN_neon 8
-blockcopy_pp_6xN_neon 16
-
-.macro blockcopy_pp_8xN_neon h
-function PFX(blockcopy_pp_8x\h\()_neon)
-.rept \h
- ld1 {v0.4h}, [x2], x3
- st1 {v0.4h}, [x0], x1
-.endr
- ret
-endfunc
-.endm
-
-blockcopy_pp_8xN_neon 2
-blockcopy_pp_8xN_neon 4
-blockcopy_pp_8xN_neon 6
-blockcopy_pp_8xN_neon 8
-blockcopy_pp_8xN_neon 12
-blockcopy_pp_8xN_neon 16
-blockcopy_pp_8xN_neon 32
-
-function PFX(blockcopy_pp_8x64_neon)
- mov w12, #4
-.Loop_pp_8x64:
- sub w12, w12, #1
-.rept 16
- ld1 {v0.4h}, [x2], x3
- st1 {v0.4h}, [x0], x1
-.endr
- cbnz w12, .Loop_pp_8x64
- ret
-endfunc
-
-.macro blockcopy_pp_16xN_neon h
-function PFX(blockcopy_pp_16x\h\()_neon)
-.rept \h
- ld1 {v0.8h}, [x2], x3
- st1 {v0.8h}, [x0], x1
-.endr
- ret
-endfunc
-.endm
-
-blockcopy_pp_16xN_neon 4
-blockcopy_pp_16xN_neon 8
-blockcopy_pp_16xN_neon 12
-blockcopy_pp_16xN_neon 16
-
-.macro blockcopy_pp_16xN1_neon h
-function PFX(blockcopy_pp_16x\h\()_neon)
- mov w12, #\h / 8
-.Loop_16x\h\():
-.rept 8
- ld1 {v0.8h}, [x2], x3
- st1 {v0.8h}, [x0], x1
-.endr
- sub w12, w12, #1
- cbnz w12, .Loop_16x\h
- ret
-endfunc
-.endm
-
-blockcopy_pp_16xN1_neon 24
-blockcopy_pp_16xN1_neon 32
-blockcopy_pp_16xN1_neon 64
-
-function PFX(blockcopy_pp_12x16_neon)
- sub x1, x1, #8
-.rept 16
- ld1 {v0.16b}, [x2], x3
- str d0, [x0], #8
- st1 {v0.s}[2], [x0], x1
-.endr
- ret
-endfunc
-
-function PFX(blockcopy_pp_12x32_neon)
- sub x1, x1, #8
- mov w12, #4
-.Loop_pp_12x32:
- sub w12, w12, #1
-.rept 8
- ld1 {v0.16b}, [x2], x3
- str d0, [x0], #8
- st1 {v0.s}[2], [x0], x1
-.endr
- cbnz w12, .Loop_pp_12x32
- ret
-endfunc
-
-function PFX(blockcopy_pp_24x32_neon)
- mov w12, #4
-.Loop_24x32:
- sub w12, w12, #1
-.rept 8
- ld1 {v0.8b-v2.8b}, [x2], x3
- st1 {v0.8b-v2.8b}, [x0], x1
-.endr
- cbnz w12, .Loop_24x32
- ret
-endfunc
-
-function PFX(blockcopy_pp_24x64_neon)
- mov w12, #4
-.Loop_24x64:
- sub w12, w12, #1
-.rept 16
- ld1 {v0.8b-v2.8b}, [x2], x3
- st1 {v0.8b-v2.8b}, [x0], x1
-.endr
- cbnz w12, .Loop_24x64
- ret
-endfunc
-
-function PFX(blockcopy_pp_32x8_neon)
-.rept 8
- ld1 {v0.16b-v1.16b}, [x2], x3
- st1 {v0.16b-v1.16b}, [x0], x1
-.endr
- ret
-endfunc
-
-.macro blockcopy_pp_32xN_neon h
-function PFX(blockcopy_pp_32x\h\()_neon)
- mov w12, #\h / 8
-.Loop_32x\h\():
- sub w12, w12, #1
-.rept 8
- ld1 {v0.16b-v1.16b}, [x2], x3
- st1 {v0.16b-v1.16b}, [x0], x1
-.endr
- cbnz w12, .Loop_32x\h
- ret
-endfunc
-.endm
-
-blockcopy_pp_32xN_neon 16
-blockcopy_pp_32xN_neon 24
-blockcopy_pp_32xN_neon 32
-blockcopy_pp_32xN_neon 64
-blockcopy_pp_32xN_neon 48
-
-function PFX(blockcopy_pp_48x64_neon)
- mov w12, #8
-.Loop_48x64:
- sub w12, w12, #1
-.rept 8
- ld1 {v0.16b-v2.16b}, [x2], x3
- st1 {v0.16b-v2.16b}, [x0], x1
-.endr
- cbnz w12, .Loop_48x64
- ret
-endfunc
-
-.macro blockcopy_pp_64xN_neon h
-function PFX(blockcopy_pp_64x\h\()_neon)
- mov w12, #\h / 4
-.Loop_64x\h\():
- sub w12, w12, #1
-.rept 4
- ld1 {v0.16b-v3.16b}, [x2], x3
- st1 {v0.16b-v3.16b}, [x0], x1
-.endr
- cbnz w12, .Loop_64x\h
- ret
-endfunc
-.endm
-
-blockcopy_pp_64xN_neon 16
-blockcopy_pp_64xN_neon 32
-blockcopy_pp_64xN_neon 48
-blockcopy_pp_64xN_neon 64
-
// void x265_blockfill_s_neon(int16_t* dst, intptr_t dstride, int16_t val)
function PFX(blockfill_s_4x4_neon)
dup v0.4h, w2
diff --git a/source/common/aarch64/pixel-prim.cpp b/source/common/aarch64/pixel-prim.cpp
index 4ba0ad1eb..9afd9f913 100644
--- a/source/common/aarch64/pixel-prim.cpp
+++ b/source/common/aarch64/pixel-prim.cpp
@@ -942,45 +942,67 @@ void blockcopy_ps_neon(int16_t *a, intptr_t stridea, const pixel *b, intptr_t st
}
-template<int bx, int by>
-void blockcopy_pp_neon(pixel *a, intptr_t stridea, const pixel *b, intptr_t strideb)
+template<int width, int height>
+void blockcopy_pp_neon(pixel *dst, intptr_t dst_stride, const pixel *src,
+ intptr_t src_stride)
{
- for (int y = 0; y < by; y++)
+ for (int h = 0; h < height; h++)
{
- int x = 0;
+ int w = 0;
#if HIGH_BIT_DEPTH
- for (; (x + 8) <= bx; x += 8)
+ for (; w + 16 <= width; w += 16)
{
- vst1q_u16(a + x, vld1q_u16(b + x));
+ uint16x8_t s0_lo = vld1q_u16(src + w);
+ uint16x8_t s0_hi = vld1q_u16(src + w + 8);
+ vst1q_u16(dst + w, s0_lo);
+ vst1q_u16(dst + w + 8, s0_hi);
}
- if (bx & 4)
+ if (width & 8)
{
- vst1_u16(a + x, vld1_u16(b + x));
- x += 4;
+ uint16x8_t s0 = vld1q_u16(src + w);
+ vst1q_u16(dst + w, s0);
+ w += 8;
+ }
+ if (width & 4)
+ {
+ uint16x4_t s0 = vld1_u16(src + w);
+ vst1_u16(dst + w, s0);
+ w += 4;
}
#else
- for (; (x + 16) <= bx; x += 16)
+ for (; w + 32 <= width; w += 32)
{
- vst1q_u8(a + x, vld1q_u8(b + x));
+ uint8x16_t s0_lo = vld1q_u8(src + w);
+ uint8x16_t s0_hi = vld1q_u8(src + w + 16);
+ vst1q_u8(dst + w, s0_lo);
+ vst1q_u8(dst + w + 16, s0_hi);
}
- if (bx & 8)
+ if (width & 16)
{
- vst1_u8(a + x, vld1_u8(b + x));
- x += 8;
+ uint8x16_t s0 = vld1q_u8(src + w);
+ vst1q_u8(dst + w, s0);
+ w += 16;
}
- if (bx & 4)
+ if (width & 8)
{
- store_u8x4x1(a + x, load_u8x4x1(b + x));
- x += 4;
+ uint8x8_t s0 = vld1_u8(src + w);
+ vst1_u8(dst + w, s0);
+ w += 8;
+ }
+ if (width & 4)
+ {
+ uint8x8_t s0 = load_u8x4x1(src + w);
+ store_u8x4x1(dst + w, s0);
+ w += 4;
}
#endif
- for (; x < bx; x++)
+ for (; w < width; w++)
{
- a[x] = b[x];
+ dst[w] = src[w];
}
- a += stridea;
- b += strideb;
+ src += src_stride;
+ dst += dst_stride;
}
}
@@ -1816,11 +1838,11 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
p.cu[BLOCK_64x64].var = pixel_var_neon<64>;
#endif // !(HIGH_BIT_DEPTH)
- p.cu[BLOCK_16x16].blockfill_s[NONALIGNED] = blockfill_s_neon<16>;
+ p.cu[BLOCK_16x16].blockfill_s[NONALIGNED] = blockfill_s_neon<16>;
p.cu[BLOCK_16x16].blockfill_s[ALIGNED] = blockfill_s_neon<16>;
- p.cu[BLOCK_32x32].blockfill_s[NONALIGNED] = blockfill_s_neon<32>;
+ p.cu[BLOCK_32x32].blockfill_s[NONALIGNED] = blockfill_s_neon<32>;
p.cu[BLOCK_32x32].blockfill_s[ALIGNED] = blockfill_s_neon<32>;
- p.cu[BLOCK_64x64].blockfill_s[NONALIGNED] = blockfill_s_neon<64>;
+ p.cu[BLOCK_64x64].blockfill_s[NONALIGNED] = blockfill_s_neon<64>;
p.cu[BLOCK_64x64].blockfill_s[ALIGNED] = blockfill_s_neon<64>;
@@ -1999,8 +2021,8 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>;
-
-
+
+
CHROMA_CU_S_422(4, 8)
CHROMA_CU_422(8, 16)
CHROMA_CU_422(16, 32)
--
2.39.5 (Apple Git-154)