[x265] [PATCH 1/2] AArch64: Add SBD and HBD Neon implementation of planecopy_cp
Micro Daryl Robles
microdaryl.robles at arm.com
Mon Apr 7 10:57:22 UTC 2025
Add a Neon intrinsics implementation of planecopy_cp that works for both
SBD and HBD builds. Remove the Neon asm for SBD, as the intrinsics
implementation is faster on most of the cores measured below.
This Neon implementation does not use the runtime shift argument: the
shift is fixed at compile time as 'X265_DEPTH - 8', which matches the
value passed by the actual call to the function.
Relative performance compared to Neon asm [SBD]:
Neoverse N1: 1.35x
Neoverse N2: 0.92x
Neoverse V1: 1.09x
Neoverse V2: 1.28x
Relative performance compared to scalar C [HBD]:
Neoverse N1: 2.90x
Neoverse N2: 3.23x
Neoverse V1: 3.46x
Neoverse V2: 3.00x
---
source/common/aarch64/asm-primitives.cpp | 3 --
source/common/aarch64/fun-decls.h | 2 -
source/common/aarch64/pixel-prim.cpp | 56 +++++++++++++++++++++++-
source/common/aarch64/pixel-util.S | 42 ------------------
source/test/pixelharness.cpp | 8 ++--
5 files changed, 60 insertions(+), 51 deletions(-)
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index c1317eb74..621dbf334 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -651,9 +651,6 @@ void setupNeonPrimitives(EncoderPrimitives &p)
p.scale1D_128to64[NONALIGNED] = PFX(scale1D_128to64_neon);
p.scale1D_128to64[ALIGNED] = PFX(scale1D_128to64_neon);
- // planecopy
- p.planecopy_cp = PFX(pixel_planecopy_cp_neon);
-
// satd
ALL_LUMA_PU(satd, pixel_satd, neon);
diff --git a/source/common/aarch64/fun-decls.h b/source/common/aarch64/fun-decls.h
index 5fdede910..22fefb398 100644
--- a/source/common/aarch64/fun-decls.h
+++ b/source/common/aarch64/fun-decls.h
@@ -162,8 +162,6 @@ FUNCDEF_PU_MULT_16(void, sad_x3, neon_dotprod, const pixel*, const pixel*, const
FUNCDEF_PU_MULT_16(void, sad_x4, neon_dotprod, const pixel*, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*);
FUNCDEF_PU(sse_t, pixel_sse_pp, neon_dotprod, const pixel*, intptr_t, const pixel*, intptr_t);
-void PFX(pixel_planecopy_cp_neon(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift));
-
uint64_t PFX(pixel_var_8x8_neon(const pixel* pix, intptr_t stride));
uint64_t PFX(pixel_var_16x16_neon(const pixel* pix, intptr_t stride));
uint64_t PFX(pixel_var_32x32_neon(const pixel* pix, intptr_t stride));
diff --git a/source/common/aarch64/pixel-prim.cpp b/source/common/aarch64/pixel-prim.cpp
index 52ce522ad..4a7831428 100644
--- a/source/common/aarch64/pixel-prim.cpp
+++ b/source/common/aarch64/pixel-prim.cpp
@@ -1087,6 +1087,60 @@ void addAvg_neon(const int16_t *src0, const int16_t *src1, pixel *dst, intptr_t
}
}
+void planecopy_cp_neon(const uint8_t *src, intptr_t srcStride, pixel *dst,
+ intptr_t dstStride, int width, int height, int shift)
+{ // Copies 'height' rows of 'width' 8-bit samples from src to dst: widened and shifted left by X265_DEPTH - 8 under HIGH_BIT_DEPTH, copied as-is otherwise.
+ X265_CHECK(width >= 16, "width length error\n"); // Each row ends with a 16-sample vector anchored at width - 16.
+ X265_CHECK(height >= 1, "height length error\n"); // The row loop below is a do-while and always runs at least once.
+ X265_CHECK(shift == X265_DEPTH - 8, "shift value error\n");
+ // The shift amount is the compile-time constant X265_DEPTH - 8, so the runtime argument is only validated above.
+ (void)shift;
+
+ do
+ {
+#if HIGH_BIT_DEPTH
+ for (int w = 0; w < width - 16; w += 16) // Strict '<' always leaves the final 1..16 samples of the row to the tail block.
+ {
+ uint8x16_t in = vld1q_u8(src + w);
+ uint16x8_t t0 = vshll_n_u8(vget_low_u8(in), X265_DEPTH - 8); // Widen the low 8 samples to 16 bits and shift left.
+ uint16x8_t t1 = vshll_n_u8(vget_high_u8(in), X265_DEPTH - 8); // Same for the high 8 samples.
+ vst1q_u16(dst + w + 0, t0);
+ vst1q_u16(dst + w + 8, t1);
+ }
+ // Tail - handles the last 16 samples of the row and may redo samples the loop already wrote; src must be different from dst for this to work.
+ {
+ uint8x16_t in = vld1q_u8(src + width - 16);
+ uint16x8_t t0 = vshll_n_u8(vget_low_u8(in), X265_DEPTH - 8);
+ uint16x8_t t1 = vshll_n_u8(vget_high_u8(in), X265_DEPTH - 8);
+ vst1q_u16(dst + width - 16, t0);
+ vst1q_u16(dst + width - 8, t1);
+ }
+#else
+ int w; // Declared outside the loop because the 16-sample step below continues from where the loop stopped.
+ for (w = 0; w < width - 32; w += 32) // Two 16-byte vectors (32 samples) per iteration.
+ {
+ uint8x16_t in0 = vld1q_u8(src + w + 0);
+ uint8x16_t in1 = vld1q_u8(src + w + 16);
+ vst1q_u8(dst + w + 0, in0);
+ vst1q_u8(dst + w + 16, in1);
+ }
+ if (w < width - 16) // More than 16 samples remain: copy one full vector before the tail.
+ {
+ uint8x16_t in = vld1q_u8(src + w);
+ vst1q_u8(dst + w, in);
+ }
+ // Tail - copies the last 16 samples of the row and may redo samples already copied above; src must be different from dst for this to work.
+ {
+ uint8x16_t in = vld1q_u8(src + width - 16);
+ vst1q_u8(dst + width - 16, in);
+ }
+#endif // HIGH_BIT_DEPTH
+ dst += dstStride; // Advance both pointers to the next row.
+ src += srcStride;
+ }
+ while (--height != 0);
+}
+
template<int lx, int ly>
void pixelavg_pp_neon(pixel *dst, intptr_t dstride, const pixel *src0, intptr_t sstride0, const pixel *src1,
intptr_t sstride1, int)
@@ -1711,7 +1765,7 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
p.chroma[X265_CSP_I422].cu[BLOCK_32x32].sa8d = sa8d16<16, 32>;
p.chroma[X265_CSP_I422].cu[BLOCK_64x64].sa8d = sa8d16<32, 64>;
-
+ p.planecopy_cp = planecopy_cp_neon;
}
diff --git a/source/common/aarch64/pixel-util.S b/source/common/aarch64/pixel-util.S
index 1825466ea..72f8bbc8b 100644
--- a/source/common/aarch64/pixel-util.S
+++ b/source/common/aarch64/pixel-util.S
@@ -565,48 +565,6 @@ function PFX(scale2D_64to32_neon)
ret
endfunc
-// void planecopy_cp_c(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift)
-function PFX(pixel_planecopy_cp_neon)
- dup v2.16b, w6
- sub x5, x5, #1
-.Loop_h:
- mov x6, x0
- mov x12, x2
- mov x7, #0
-.Loop_w:
- ldr q0, [x6], #16
- ushl v0.16b, v0.16b, v2.16b
- str q0, [x12], #16
- add x7, x7, #16
- cmp x7, x4
- blt .Loop_w
-
- add x0, x0, x1
- add x2, x2, x3
- sub x5, x5, #1
- cbnz x5, .Loop_h
-
-// handle last row
- mov x5, x4
- lsr x5, x5, #3
-.LoopW8:
- ldr d0, [x0], #8
- ushl v0.8b, v0.8b, v2.8b
- str d0, [x2], #8
- sub x4, x4, #8
- sub x5, x5, #1
- cbnz x5, .LoopW8
-
- mov x5, #8
- sub x5, x5, x4
- sub x0, x0, x5
- sub x2, x2, x5
- ldr d0, [x0]
- ushl v0.8b, v0.8b, v2.8b
- str d0, [x2]
- ret
-endfunc
-
//******* satd *******
.macro satd_4x4_neon
ldr s0, [x0]
diff --git a/source/test/pixelharness.cpp b/source/test/pixelharness.cpp
index 10f66cda1..f46f9ae3d 100644
--- a/source/test/pixelharness.cpp
+++ b/source/test/pixelharness.cpp
@@ -1564,13 +1564,14 @@ bool PixelHarness::check_planecopy_cp(planecopy_cp_t ref, planecopy_cp_t opt)
int height = 16 + rand() % 48;
intptr_t srcStride = 64;
intptr_t dstStride = width;
+ int shift = X265_DEPTH - 8;
int j = 0;
for (int i = 0; i < ITERS; i++)
{
int index = i % TEST_CASES;
- checked(opt, uchar_test_buff[index] + j, srcStride, opt_dest, dstStride, width, height, (int)2);
- ref(uchar_test_buff[index] + j, srcStride, ref_dest, dstStride, width, height, (int)2);
+ checked(opt, uchar_test_buff[index] + j, srcStride, opt_dest, dstStride, width, height, shift);
+ ref(uchar_test_buff[index] + j, srcStride, ref_dest, dstStride, width, height, shift);
if (memcmp(ref_dest, opt_dest, sizeof(ref_dest)))
return false;
@@ -3665,7 +3666,8 @@ void PixelHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimi
if (opt.planecopy_cp)
{
HEADER0("planecopy_cp");
- REPORT_SPEEDUP(opt.planecopy_cp, ref.planecopy_cp, uchar_test_buff[0], 64, pbuf1, 64, 64, 64, 2);
+ REPORT_SPEEDUP(opt.planecopy_cp, ref.planecopy_cp, uchar_test_buff[0], 64, pbuf1,
+ 64, 64, 64, X265_DEPTH - 8);
}
if (opt.propagateCost)
--
2.34.1
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0001-AArch64-Add-SBD-and-HBD-Neon-implementation-of-plane.patch
Type: text/x-diff
Size: 8491 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20250407/f67eace3/attachment.patch>
More information about the x265-devel
mailing list