[x265] [PATCH 1/2] AArch64: Add SBD and HBD Neon implementation of planecopy_cp

Mon Apr 7 10:57:22 UTC 2025

Add a Neon intrinsics implementation of planecopy_cp that works for both
SBD and HBD builds. Remove the SBD Neon assembly, as the intrinsics
implementation performs better on most of the cores measured below.

This Neon implementation is optimised by fixing the shift value at
'X265_DEPTH - 8', which matches how the function is actually called.

Relative performance compared to Neon asm [SBD]:
 Neoverse N1: 1.35x
 Neoverse N2: 0.92x
 Neoverse V1: 1.09x
 Neoverse V2: 1.28x

Relative performance compared to scalar C [HBD]:
 Neoverse N1: 2.90x
 Neoverse N2: 3.23x
 Neoverse V1: 3.46x
 Neoverse V2: 3.00x
---
 source/common/aarch64/asm-primitives.cpp |  3 --
 source/common/aarch64/fun-decls.h        |  2 -
 source/common/aarch64/pixel-prim.cpp     | 56 +++++++++++++++++++++++-
 source/common/aarch64/pixel-util.S       | 42 ------------------
 source/test/pixelharness.cpp             |  8 ++--
 5 files changed, 60 insertions(+), 51 deletions(-)

diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index c1317eb74..621dbf334 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -651,9 +651,6 @@ void setupNeonPrimitives(EncoderPrimitives &p)
     p.scale1D_128to64[NONALIGNED] = PFX(scale1D_128to64_neon);
     p.scale1D_128to64[ALIGNED] = PFX(scale1D_128to64_neon);
 
-    // planecopy
-    p.planecopy_cp = PFX(pixel_planecopy_cp_neon);
-
     // satd
     ALL_LUMA_PU(satd, pixel_satd, neon);
 
diff --git a/source/common/aarch64/fun-decls.h b/source/common/aarch64/fun-decls.h
index 5fdede910..22fefb398 100644
--- a/source/common/aarch64/fun-decls.h
+++ b/source/common/aarch64/fun-decls.h
@@ -162,8 +162,6 @@ FUNCDEF_PU_MULT_16(void, sad_x3, neon_dotprod, const pixel*, const pixel*, const
 FUNCDEF_PU_MULT_16(void, sad_x4, neon_dotprod, const pixel*, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*);
 FUNCDEF_PU(sse_t, pixel_sse_pp, neon_dotprod, const pixel*, intptr_t, const pixel*, intptr_t);
 
-void PFX(pixel_planecopy_cp_neon(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift));
-
 uint64_t PFX(pixel_var_8x8_neon(const pixel* pix, intptr_t stride));
 uint64_t PFX(pixel_var_16x16_neon(const pixel* pix, intptr_t stride));
 uint64_t PFX(pixel_var_32x32_neon(const pixel* pix, intptr_t stride));
diff --git a/source/common/aarch64/pixel-prim.cpp b/source/common/aarch64/pixel-prim.cpp
index 52ce522ad..4a7831428 100644
--- a/source/common/aarch64/pixel-prim.cpp
+++ b/source/common/aarch64/pixel-prim.cpp
@@ -1087,6 +1087,60 @@ void addAvg_neon(const int16_t *src0, const int16_t *src1, pixel *dst, intptr_t
     }
 }
 
+void planecopy_cp_neon(const uint8_t *src, intptr_t srcStride, pixel *dst,
+                       intptr_t dstStride, int width, int height, int shift)
+{
+    X265_CHECK(width >= 16, "width length error\n");
+    X265_CHECK(height >= 1, "height length error\n");
+    X265_CHECK(shift == X265_DEPTH - 8, "shift value error\n");
+
+    (void)shift; // Only referenced by the check above; the code uses X265_DEPTH - 8 directly.
+
+    do
+    {
+#if HIGH_BIT_DEPTH
+        for (int w = 0; w < width - 16; w += 16) // 16-pixel groups that end before the tail
+        {
+            uint8x16_t in = vld1q_u8(src + w);
+            uint16x8_t t0 = vshll_n_u8(vget_low_u8(in), X265_DEPTH - 8); // widen to 16 bits and shift left
+            uint16x8_t t1 = vshll_n_u8(vget_high_u8(in), X265_DEPTH - 8);
+            vst1q_u16(dst + w + 0, t0);
+            vst1q_u16(dst + w + 8, t1);
+        }
+        // Tail: redo the last 16 pixels of the row (may overlap earlier stores); src must not alias dst.
+        {
+            uint8x16_t in = vld1q_u8(src + width - 16);
+            uint16x8_t t0 = vshll_n_u8(vget_low_u8(in), X265_DEPTH - 8);
+            uint16x8_t t1 = vshll_n_u8(vget_high_u8(in), X265_DEPTH - 8);
+            vst1q_u16(dst + width - 16, t0);
+            vst1q_u16(dst + width - 8, t1);
+        }
+#else // !HIGH_BIT_DEPTH: plain byte copy, no widening or shift
+        int w;
+        for (w = 0; w < width - 32; w += 32) // 32-pixel groups, two vectors per iteration
+        {
+            uint8x16_t in0 = vld1q_u8(src + w + 0);
+            uint8x16_t in1 = vld1q_u8(src + w + 16);
+            vst1q_u8(dst + w + 0, in0);
+            vst1q_u8(dst + w + 16, in1);
+        }
+        if (w < width - 16) // one more 16-pixel group fits before the tail
+        {
+            uint8x16_t in = vld1q_u8(src + w);
+            vst1q_u8(dst + w, in);
+        }
+        // Tail: redo the last 16 pixels of the row (may overlap earlier stores); src must not alias dst.
+        {
+            uint8x16_t in = vld1q_u8(src + width - 16);
+            vst1q_u8(dst + width - 16, in);
+        }
+#endif // HIGH_BIT_DEPTH
+        dst += dstStride;
+        src += srcStride;
+    }
+    while (--height != 0);
+}
+
 template<int lx, int ly>
 void pixelavg_pp_neon(pixel *dst, intptr_t dstride, const pixel *src0, intptr_t sstride0, const pixel *src1,
                       intptr_t sstride1, int)
@@ -1711,7 +1765,7 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
     p.chroma[X265_CSP_I422].cu[BLOCK_32x32].sa8d = sa8d16<16, 32>;
     p.chroma[X265_CSP_I422].cu[BLOCK_64x64].sa8d = sa8d16<32, 64>;
 
-
+    p.planecopy_cp = planecopy_cp_neon;
 }
 
 
diff --git a/source/common/aarch64/pixel-util.S b/source/common/aarch64/pixel-util.S
index 1825466ea..72f8bbc8b 100644
--- a/source/common/aarch64/pixel-util.S
+++ b/source/common/aarch64/pixel-util.S
@@ -565,48 +565,6 @@ function PFX(scale2D_64to32_neon)
     ret
 endfunc
 
-// void planecopy_cp_c(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift)
-function PFX(pixel_planecopy_cp_neon)
-    dup             v2.16b, w6
-    sub             x5, x5, #1
-.Loop_h:
-    mov             x6, x0
-    mov             x12, x2
-    mov             x7, #0
-.Loop_w:
-    ldr             q0, [x6], #16
-    ushl            v0.16b, v0.16b, v2.16b
-    str             q0, [x12], #16
-    add             x7, x7, #16
-    cmp             x7, x4
-    blt             .Loop_w
-
-    add             x0, x0, x1
-    add             x2, x2, x3
-    sub             x5, x5, #1
-    cbnz            x5, .Loop_h
-
-// handle last row
-    mov             x5, x4
-    lsr             x5, x5, #3
-.LoopW8:
-    ldr             d0, [x0], #8
-    ushl            v0.8b, v0.8b, v2.8b
-    str             d0, [x2], #8
-    sub             x4, x4, #8
-    sub             x5, x5, #1
-    cbnz            x5, .LoopW8
-
-    mov             x5, #8
-    sub             x5, x5, x4
-    sub             x0, x0, x5
-    sub             x2, x2, x5
-    ldr             d0, [x0]
-    ushl            v0.8b, v0.8b, v2.8b
-    str             d0, [x2]
-    ret
-endfunc
-
 //******* satd *******
 .macro satd_4x4_neon
     ldr             s0, [x0]
diff --git a/source/test/pixelharness.cpp b/source/test/pixelharness.cpp
index 10f66cda1..f46f9ae3d 100644
--- a/source/test/pixelharness.cpp
+++ b/source/test/pixelharness.cpp
@@ -1564,13 +1564,14 @@ bool PixelHarness::check_planecopy_cp(planecopy_cp_t ref, planecopy_cp_t opt)
     int height = 16 + rand() % 48;
     intptr_t srcStride = 64;
     intptr_t dstStride = width;
+    int shift = X265_DEPTH - 8;
     int j = 0;
 
     for (int i = 0; i < ITERS; i++)
     {
         int index = i % TEST_CASES;
-        checked(opt, uchar_test_buff[index] + j, srcStride, opt_dest, dstStride, width, height, (int)2);
-        ref(uchar_test_buff[index] + j, srcStride, ref_dest, dstStride, width, height, (int)2);
+        checked(opt, uchar_test_buff[index] + j, srcStride, opt_dest, dstStride, width, height, shift);
+        ref(uchar_test_buff[index] + j, srcStride, ref_dest, dstStride, width, height, shift);
 
         if (memcmp(ref_dest, opt_dest, sizeof(ref_dest)))
             return false;
@@ -3665,7 +3666,8 @@ void PixelHarness::measureSpeed(const EncoderPrimitives& ref, const EncoderPrimi
     if (opt.planecopy_cp)
     {
         HEADER0("planecopy_cp");
-        REPORT_SPEEDUP(opt.planecopy_cp, ref.planecopy_cp, uchar_test_buff[0], 64, pbuf1, 64, 64, 64, 2);
+        REPORT_SPEEDUP(opt.planecopy_cp, ref.planecopy_cp, uchar_test_buff[0], 64, pbuf1,
+                       64, 64, 64, X265_DEPTH - 8);
     }
 
     if (opt.propagateCost)
-- 
2.34.1

-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0001-AArch64-Add-SBD-and-HBD-Neon-implementation-of-plane.patch
Type: text/x-diff
Size: 8491 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20250407/f67eace3/attachment.patch>