[x265] [PATCH ARM 2/6] AArch64: intra_pred_planar8_neon, intra_pred_planar16_neon

Thu Sep 12 13:37:49 UTC 2024

From 1fdb5829e81aecf78665d4afdec98d784243bb9e Mon Sep 17 00:00:00 2001
From: Min Chen <chenm003 at 163.com>
Date: Sat, 31 Aug 2024 07:50:47 -0700
Subject: [PATCH 2/6] AArch64: intra_pred_planar8_neon,
 intra_pred_planar16_neon

---
 source/common/CMakeLists.txt             |   2 +-
 source/common/aarch64/intrapred-prim.cpp |   7 +
 source/common/aarch64/intrapred.S        | 171 +++++++++++++++++++++++
 3 files changed, 179 insertions(+), 1 deletion(-)
 create mode 100644 source/common/aarch64/intrapred.S

diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
index 4b7145132..45d880110 100644
--- a/source/common/CMakeLists.txt
+++ b/source/common/CMakeLists.txt
@@ -111,7 +111,7 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
     enable_language(ASM)

     # add ARM assembly/intrinsic files here
-    set(A_SRCS asm.S mc-a.S mc-a-common.S sad-a.S pixel-util.S pixel-util-common.S p2s.S p2s-common.S blockcopy8.S blockcopy8-common.S ssd-a.S ssd-a-common.S)
+    set(A_SRCS asm.S mc-a.S mc-a-common.S sad-a.S pixel-util.S pixel-util-common.S p2s.S p2s-common.S blockcopy8.S blockcopy8-common.S ssd-a.S ssd-a-common.S intrapred.S)
     set(A_SRCS_SVE asm-sve.S blockcopy8-sve.S p2s-sve.S pixel-util-sve.S)
     set(A_SRCS_SVE2 mc-a-sve2.S pixel-util-sve2.S ssd-a-sve2.S)
     set(A_SRCS_NEON_DOTPROD sad-neon-dotprod.S ssd-neon-dotprod.S)
diff --git a/source/common/aarch64/intrapred-prim.cpp b/source/common/aarch64/intrapred-prim.cpp
index 9bf50c4aa..8624dd2a6 100644
--- a/source/common/aarch64/intrapred-prim.cpp
+++ b/source/common/aarch64/intrapred-prim.cpp
@@ -237,6 +237,8 @@ void all_angs_pred_neon(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
 namespace X265_NS
 {
 // x265 private namespace
+extern "C" void PFX(intra_pred_planar8_neon)(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
+extern "C" void PFX(intra_pred_planar16_neon)(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);

 void setupIntraPrimitives_neon(EncoderPrimitives &p)
 {
@@ -256,6 +258,11 @@ void setupIntraPrimitives_neon(EncoderPrimitives &p)
     p.cu[BLOCK_8x8].intra_pred_allangs = all_angs_pred_neon<3>;
     p.cu[BLOCK_16x16].intra_pred_allangs = all_angs_pred_neon<4>;
     p.cu[BLOCK_32x32].intra_pred_allangs = all_angs_pred_neon<5>;
+
+#if !HIGH_BIT_DEPTH
+    p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar8_neon);
+    p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_neon);
+#endif
 }

 }
diff --git a/source/common/aarch64/intrapred.S b/source/common/aarch64/intrapred.S
new file mode 100644
index 000000000..2f91ebfa3
--- /dev/null
+++ b/source/common/aarch64/intrapred.S
@@ -0,0 +1,171 @@
+/*****************************************************************************
+ * Copyright (C) 2021 MulticoreWare, Inc
+ *
+ * Authors: Min Chen <min.chen at multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+// Functions in this file:
+// ***** planar_pred *****
+
+#include "asm.S"
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.align 4
+
+.text                                   // NOTE(review): the table below follows this directive, so it is emitted in the text section, not in the read-only data section selected above
+
+.align 4
+tbl_const_1to8_7to0:                    // per-column planar weights: four rows of 8 bytes, addressed with adr by both functions below
+    .byte 1, 2, 3, 4, 5, 6, 7, 8        // row 0: x + 1 for x = 0..7
+    .byte 7, 6, 5, 4, 3, 2, 1, 0        // row 1: 7 - x for x = 0..7 (blkSize - 1 - x in the 8x8 kernel)
+    .byte 9, 10, 11, 12, 13, 14, 15, 16 // row 2: x + 1 for x = 8..15 (16x16 kernel only)
+    .byte 15, 14, 13, 12, 11, 10, 9, 8  // row 3: 15 - x for x = 0..7 (16x16 kernel only; row order matters for its ld2 + ext sequence)
+
+// ***** planar_pred *****
+// void planar_pred(pixel* dst, intptr_t dstStride, const pixel* srcPix, int /*dirMode*/, int /*bFilter*/)
+function PFX(intra_pred_planar8_neon)
+// Register map (v6, not listed below, accumulates the terms that do not depend on left[y])
+// x0  = dst (advanced by dstStride after each stored row)
+// x1  = dstStride
+// x2  = *srcPix
+// x3  = w3: topRight scalar, then x3: left[7:0] packed with left[y] in the low byte (assumes little-endian layout)
+// x4  = tmp (w4: bottomLeft scalar, x4: table address, w4: row counter)
+// v0  = above[7:0], then bottomLeft - above[x] (added to v6 once per row)
+// v1  = rep(left[y]) for the current row
+// v2  = topRight = rep(above[blkSize])
+// v3  = bottomLeft = rep(left[blkSize]), then per-row sum and output pixels
+// v4  = const[8 7 6 5 4 3 2 1] = x + 1
+// v5  = const[7 6 5 4 3 2 1 0] = blkSize - 1 - x
+
+//{
+//    const int blkSize = 1 << log2Size;
+//    const pixel* above = srcPix + 1;
+//    const pixel* left  = srcPix + (2 * blkSize + 1);
+//    pixel topRight = above[blkSize];
+//    pixel bottomLeft = left[blkSize];
+//    for (int y = 0; y < blkSize; y++)
+//        for (int x = 0; x < blkSize; x++)
+//            dst[y * dstStride + x] = (pixel) (((blkSize - 1 - x) * left[y] + (blkSize - 1 -y) * above[x] + (x + 1) * topRight + (y + 1) * bottomLeft + blkSize) >> (log2Size + 1));
+//}
+
+    ldurb           w3, [x2, #(1+8)]                // topRight = above[8]
+    ldurb           w4, [x2, #(2*8+1+8)]            // bottomLeft = left[8]
+    dup             v2.8b, w3                       // v2 = topRight_b
+    dup             v3.8h, w4                       // v3 = bottomLeft_h
+    ldr             x3, [x2, #(2*8+1)]              // x3 = left[7:0]_b, left[0] in bits 7:0
+    ldr             d0, [x2, #1]                    // v0 = above[x]_b
+
+    adr             x4, tbl_const_1to8_7to0
+    ldr             d4, [x4]                        // v4 = const_b[8 7 6 5 4 3 2 1]
+    ldr             d5, [x4, #8]                    // v5 = const_b[7 6 5 4 3 2 1 0]
+
+    ushll           v6.8h, v0.8b, #3                // v6 = 8 * above[x]
+    usubw           v0.8h, v3.8h, v0.8b             // v0 = bottomLeft - above[x] (wraps in 16 bits when above[x] > bottomLeft)
+
+    umlal           v6.8h, v4.8b, v2.8b             // v6 = 8 * above[x] + (x + 1) * topRight
+
+    mov             w4, #8                          // w4 = rows remaining
+
+1:
+    dup             v1.8b, w3                       // v1 = rep(left[y]), taken from the low byte of x3
+    lsr             x3, x3, #8                      // move left[y + 1] into the low byte for the next row
+    add             v6.8h, v6.8h, v0.8h             // v6 = (blkSize - 1 - y) * above[x] + (x + 1) * topRight + (y + 1) * bottomLeft
+    mov             v3.16b, v6.16b                  // work on a copy so v6 keeps accumulating across rows
+    umlal           v3.8h, v5.8b, v1.8b             // v3 = v6 + (blkSize - 1 - x) * left[y]
+    rshrn           v3.8b, v3.8h, #4                // rounding shift: (v3 + 8) >> 4, narrowed to bytes
+    sub             w4, w4, #1
+    st1             {v3.8b}, [x0], x1               // store row y, then dst += dstStride
+    cbnz            w4, 1b
+
+    ret
+endfunc
+
+// void planar_pred(pixel* dst, intptr_t dstStride, const pixel* srcPix, int /*dirMode*/, int /*bFilter*/)
+function PFX(intra_pred_planar16_neon)
+// Register map (also: v6,v7 = bottomLeft - above[x]; v16,v17 = sum of the terms independent of left[y]; v18,v19 = per-row result)
+// x0  = dst (advanced by dstStride after each stored row)
+// x1  = dstStride
+// x2  = *srcPix
+// x3  = w3: topRight scalar
+// x4  = tmp (w4: bottomLeft scalar, x4: table address, w4: row counter)
+// v0  = left[15:0], rotated by one byte per row so that left[y] sits in lane 0
+// v1  = above[15:0], then rep(left[y]) inside the loop
+// v2  = topRight = rep(above[blkSize])
+// v3  = bottomLeft = rep(left[blkSize])
+// v4  = const[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] = x + 1
+// v5  = const[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] = blkSize - 1 - x
+
+//{
+//    const int blkSize = 1 << log2Size;
+//    const pixel* above = srcPix + 1;
+//    const pixel* left  = srcPix + (2 * blkSize + 1);
+//    pixel topRight = above[blkSize];
+//    pixel bottomLeft = left[blkSize];
+//    for (int y = 0; y < blkSize; y++)
+//        for (int x = 0; x < blkSize; x++)
+//            dst[y * dstStride + x] = (pixel) (((blkSize - 1 - x) * left[y] + (blkSize - 1 -y) * above[x] + (x + 1) * topRight + (y + 1) * bottomLeft + blkSize) >> (log2Size + 1));
+//}
+
+    ldurb           w3, [x2, #(1+16)]               // topRight = above[16]
+    ldurb           w4, [x2, #(2*16+1+16)]          // bottomLeft = left[16]
+    ldr             q0, [x2, #(2*16+1)]             // v0 = left[x]_b
+    ldr             q1, [x2, #1]                    // v1 = above[x]_b
+    dup             v2.16b, w3                      // v2 = topRight_b
+    dup             v3.8h, w4                       // v3 = bottomLeft_h
+
+    adr             x4, tbl_const_1to8_7to0
+    ld2             {v4.2d, v5.2d}, [x4]            // de-interleave 8-byte table rows: v4 = const_b[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1], v5 = rows 1 and 3
+    ext             v5.16b, v5.16b, v5.16b, #8      // swap the two halves: v5 = const_b[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]
+
+    ushll           v16.8h, v1.8b, #4               // v16,v17 = 16 * above[x]
+    ushll2          v17.8h, v1.16b, #4
+    usubw           v6.8h, v3.8h, v1.8b             // v6,v7 = bottomLeft - above[x] (wraps in 16 bits when above[x] > bottomLeft)
+    usubw2          v7.8h, v3.8h, v1.16b
+
+    umlal           v16.8h, v4.8b, v2.8b            // v16,v17 = 16 * above[x] + (x + 1) * topRight
+    umlal2          v17.8h, v4.16b, v2.16b
+
+    mov             w4, #16                         // w4 = rows remaining
+
+1:
+    dup             v1.16b, v0.b[0]                 // v1 = rep(left[y])
+    ext             v0.16b, v0.16b, v0.16b, #1      // rotate so left[y + 1] becomes lane 0 for the next row
+
+    add             v16.8h, v16.8h, v6.8h           // v16,v17 = (blkSize - 1 - y) * above[x] + (x + 1) * topRight + (y + 1) * bottomLeft
+    add             v17.8h, v17.8h, v7.8h
+
+    mov             v18.16b, v16.16b                // work on copies so v16,v17 keep accumulating across rows
+    mov             v19.16b, v17.16b
+
+    umlal           v18.8h, v5.8b, v1.8b            // v18,v19 = v16,v17 + (blkSize - 1 - x) * left[y]
+    umlal2          v19.8h, v5.16b, v1.16b
+    rshrn           v18.8b, v18.8h, #5              // rounding shift: (sum + 16) >> 5, narrowed to bytes
+    rshrn2          v18.16b, v19.8h, #5
+    st1             {v18.16b}, [x0], x1             // store row y, then dst += dstStride
+    sub             w4, w4, #1
+    cbnz            w4, 1b
+
+    ret
+endfunc
-- 
2.36.0.windows.1
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20240912/6efa8f05/attachment-0001.htm>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0002-AARCH64-intra_pred_planar8_neon-intra_pred_planar16_.patch
Type: application/octet-stream
Size: 9612 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20240912/6efa8f05/attachment-0001.obj>