[x265] [PATCH 1/2] AArch64: Add Neon implementation of 4x4 intra_pred_planar

Thu Jan 23 14:15:30 UTC 2025

Relative performance compared to scalar C:

 Neoverse N1: 2.65x
 Neoverse N2: 2.27x
 Neoverse V1: 2.67x
 Neoverse V2: 2.82x
---
 source/common/aarch64/intrapred-prim.cpp | 47 ++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/source/common/aarch64/intrapred-prim.cpp b/source/common/aarch64/intrapred-prim.cpp
index 68a9f26ac..c5d47fe0d 100644
--- a/source/common/aarch64/intrapred-prim.cpp
+++ b/source/common/aarch64/intrapred-prim.cpp
@@ -4,6 +4,7 @@
 
 #if HAVE_NEON
 #include "arm64-utils.h"
+#include "mem-neon.h"
 #include <arm_neon.h>
 
 using namespace X265_NS;
@@ -399,6 +400,51 @@ void planar_pred_neon(pixel * dst, intptr_t dstStride, const pixel * srcPix, int
         }
 }
 
+#if !HIGH_BIT_DEPTH
+void intra_pred_planar4_neon(pixel *dst, intptr_t dstStride, const pixel *srcPix,
+                             int /*dirMode*/, int /*bFilter*/)
+{
+    const int log2Size = 2;
+    const int blkSize = 1 << log2Size;
+    // Neon 4x4 planar intra prediction; the dirMode and bFilter arguments are unused.
+    uint8x16_t src = vld1q_u8(srcPix + 1); // 16 bytes following srcPix[0]
+    // NOTE(review): presumably src[0..7] are above samples and src[8..15] left samples - confirm.
+    uint8x8_t above = // src[0..3] repeated in both halves, so one vector serves two rows
+        vreinterpret_u8_u32(vdup_laneq_u32(vreinterpretq_u32_u8(src), 0));
+    // Broadcast src[4] (topRight) and src[12] (bottomLeft) to every lane.
+    uint8x8_t topRight = vdup_laneq_u8(src, blkSize);
+    uint8x8_t bottomLeft = vdup_laneq_u8(src, 3 * blkSize);
+    // Per-lane weights; each 8-lane half holds two 4-pixel rows (rows 0 and 2).
+    const uint8_t c[2][16] =
+    {
+        {3, 2, 1, 0, 3, 2, 1, 0, 1, 2, 3, 4, 1, 2, 3, 4}, // low: (3 - x) for left, high: (x + 1) for topRight
+        {3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3}  // low: (3 - y) for above, high: (y + 1) for bottomLeft
+    };
+
+    const uint8x16_t c0 = vld1q_u8(c[0]);
+    const uint8x16_t c1 = vld1q_u8(c[1]);
+    // t = (x + 1) * topRight + (3 - y) * above + (y + 1) * bottomLeft for rows 0 and 2, in 16 bits.
+    uint16x8_t t = vmull_u8(topRight, vget_high_u8(c0));
+    t = vmlal_u8(t, above, vget_low_u8(c1));
+    t = vmlal_u8(t, bottomLeft, vget_high_u8(c1));
+    // Rows 0 and 2: add (3 - x) * left sample, then rounding shift right by 3 and narrow to 8 bits.
+    uint8x8_t index02 = vcreate_u8(0x0A0A0A0A08080808); // lanes 0-3 pick src[8], lanes 4-7 pick src[10]
+    uint8x8_t left02 = vqtbl1_u8(src, index02);
+    uint16x8_t t02 = vmlal_u8(t, left02, vget_low_u8(c0));
+    uint8x8_t d02 = vrshrn_n_u16(t02, log2Size + 1);
+    // Rows 1 and 3: same horizontal term with src[9] / src[11]; vertical weights are fixed up below.
+    uint8x8_t index13 = vcreate_u8(0x0B0B0B0B09090909); // lanes 0-3 pick src[9], lanes 4-7 pick src[11]
+    uint8x8_t left13 = vqtbl1_u8(src, index13);
+    uint16x8_t t13 = vmlal_u8(t, left13, vget_low_u8(c0));
+    uint16x8_t sub_bottomLeft_above = vsubl_u8(bottomLeft, above); // may wrap; cancels modulo 2^16 in the add
+    t13 = vaddq_u16(t13, sub_bottomLeft_above); // above weights (3,1) -> (2,0), bottomLeft weights (1,3) -> (2,4)
+    uint8x8_t d13 = vrshrn_n_u16(t13, log2Size + 1);
+    // NOTE(review): presumably each call writes the vector's two 4-byte halves to rows 2 * dstStride apart.
+    store_u8x4_strided_xN<2>(dst + 0 * dstStride, 2 * dstStride, &d02);
+    store_u8x4_strided_xN<2>(dst + 1 * dstStride, 2 * dstStride, &d13);
+}
+#endif
+
 static void dcPredFilter(const pixel* above, const pixel* left, pixel* dst, intptr_t dststride, int size)
 {
     // boundary pixels processing
@@ -576,6 +622,7 @@ void setupIntraPrimitives_neon(EncoderPrimitives &p)
     p.cu[BLOCK_32x32].intra_pred_allangs = all_angs_pred_neon<5>;
 
 #if !HIGH_BIT_DEPTH
+    p.cu[BLOCK_4x4].intra_pred[PLANAR_IDX] = intra_pred_planar4_neon;
     p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar8_neon);
     p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_neon);
 #endif
-- 
2.34.1

-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0001-AArch64-Add-Neon-implementation-of-4x4-intra_pred_pl.patch
Type: text/x-diff
Size: 3364 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20250123/3e0c9971/attachment.patch>