[x265] [PATCH ARM 2/6] AArch64: intra_pred_planar8_neon, intra_pred_planar16_neon
Pavan Tarun Chakka Venkata
pavan.tarun at multicorewareinc.com
Thu Sep 12 13:37:49 UTC 2024
>From 1fdb5829e81aecf78665d4afdec98d784243bb9e Mon Sep 17 00:00:00 2001
From: Min Chen <chenm003 at 163.com>
Date: Sat, 31 Aug 2024 07:50:47 -0700
Subject: [PATCH 2/6] AArch64: intra_pred_planar8_neon,
intra_pred_planar16_neon
---
source/common/CMakeLists.txt | 2 +-
source/common/aarch64/intrapred-prim.cpp | 7 +
source/common/aarch64/intrapred.S | 171 +++++++++++++++++++++++
3 files changed, 179 insertions(+), 1 deletion(-)
create mode 100644 source/common/aarch64/intrapred.S
diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
index 4b7145132..45d880110 100644
--- a/source/common/CMakeLists.txt
+++ b/source/common/CMakeLists.txt
@@ -111,7 +111,7 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
enable_language(ASM)
# add ARM assembly/intrinsic files here
- set(A_SRCS asm.S mc-a.S mc-a-common.S sad-a.S pixel-util.S
pixel-util-common.S p2s.S p2s-common.S blockcopy8.S blockcopy8-common.S
ssd-a.S ssd-a-common.S)
+ set(A_SRCS asm.S mc-a.S mc-a-common.S sad-a.S pixel-util.S
pixel-util-common.S p2s.S p2s-common.S blockcopy8.S blockcopy8-common.S
ssd-a.S ssd-a-common.S intrapred.S)
set(A_SRCS_SVE asm-sve.S blockcopy8-sve.S p2s-sve.S pixel-util-sve.S)
set(A_SRCS_SVE2 mc-a-sve2.S pixel-util-sve2.S ssd-a-sve2.S)
set(A_SRCS_NEON_DOTPROD sad-neon-dotprod.S ssd-neon-dotprod.S)
diff --git a/source/common/aarch64/intrapred-prim.cpp
b/source/common/aarch64/intrapred-prim.cpp
index 9bf50c4aa..8624dd2a6 100644
--- a/source/common/aarch64/intrapred-prim.cpp
+++ b/source/common/aarch64/intrapred-prim.cpp
@@ -237,6 +237,8 @@ void all_angs_pred_neon(pixel *dest, pixel *refPix,
pixel *filtPix, int bLuma)
namespace X265_NS
{
// x265 private namespace
+extern "C" void PFX(intra_pred_planar8_neon)(pixel* dst, intptr_t
dstStride, const pixel* srcPix, int dirMode, int bFilter);
+extern "C" void PFX(intra_pred_planar16_neon)(pixel* dst, intptr_t
dstStride, const pixel* srcPix, int dirMode, int bFilter);
void setupIntraPrimitives_neon(EncoderPrimitives &p)
{
@@ -256,6 +258,11 @@ void setupIntraPrimitives_neon(EncoderPrimitives &p)
p.cu[BLOCK_8x8].intra_pred_allangs = all_angs_pred_neon<3>;
p.cu[BLOCK_16x16].intra_pred_allangs = all_angs_pred_neon<4>;
p.cu[BLOCK_32x32].intra_pred_allangs = all_angs_pred_neon<5>;
+
+#if !HIGH_BIT_DEPTH
+ p.cu[BLOCK_8x8].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar8_neon);
+ p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] =
PFX(intra_pred_planar16_neon);
+#endif
}
}
diff --git a/source/common/aarch64/intrapred.S
b/source/common/aarch64/intrapred.S
new file mode 100644
index 000000000..2f91ebfa3
--- /dev/null
+++ b/source/common/aarch64/intrapred.S
@@ -0,0 +1,171 @@
+/*****************************************************************************
+ * Copyright (C) 2021 MulticoreWare, Inc
+ *
+ * Authors: Min Chen <min.chen at multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111,
USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+
*****************************************************************************/
+
+// Functions in this file:
+// ***** planar_pred *****
+
+#include "asm.S"
+
+#ifdef __APPLE__
+.section __RODATA,__rodata
+#else
+.section .rodata
+#endif
+
+.align 4
+
+.text // switches back to the code section: the table below is emitted in .text, where the routines reach it with adr
+
+.align 4
+tbl_const_1to8_7to0: // per-column byte weights for the planar predictors, four rows of eight bytes
+ .byte 1, 2, 3, 4, 5, 6, 7, 8 // (x + 1) for x = 0..7
+ .byte 7, 6, 5, 4, 3, 2, 1, 0 // (7 - x) for x = 0..7 in the 8x8 routine; (15 - x) for x = 8..15 in the 16x16 routine
+ .byte 9, 10, 11, 12, 13, 14, 15, 16 // (x + 1) for x = 8..15
+ .byte 15, 14, 13, 12, 11, 10, 9, 8 // (15 - x) for x = 0..7
+
+// ***** planar_pred *****
+// void planar_pred(pixel* dst, intptr_t dstStride, const pixel* srcPix,
int /*dirMode*/, int /*bFilter*/)
+function PFX(intra_pred_planar8_neon)
+// 8x8 planar intra prediction on byte pixels. Register map:
+// x0 = dst (post-incremented by dstStride on every row store)
+// x1 = dstStride
+// x2 = *srcPix
+// x3 = w3: topRight byte, then left[7:0] packed (lowest byte = left[y] of the current row, little-endian load)
+// x4 = tmp (bottomLeft byte, then table address, then row counter)
+// v0 = above[7:0], then bottomLeft - above[x] as 16-bit lanes
+// v1 = left[y] replicated to every byte lane (rewritten each row)
+// v2 = topRight = rep(above[blkSize])
+// v3 = bottomLeft = rep(left[blkSize]); reused as the per-row result inside the loop
+// v4 = const[8 7 6 5 4 3 2 1]
+// v5 = const[7 6 5 4 3 2 1 0]; v6 = running row accumulator (16-bit lanes)
+
+//{
+// const int blkSize = 1 << log2Size;
+// const pixel* above = srcPix + 1;
+// const pixel* left = srcPix + (2 * blkSize + 1);
+// pixel topRight = above[blkSize];
+// pixel bottomLeft = left[blkSize];
+// for (int y = 0; y < blkSize; y++)
+// for (int x = 0; x < blkSize; x++)
+// dst[y * dstStride + x] = (pixel) (((blkSize - 1 - x) *
left[y] + (blkSize - 1 -y) * above[x] + (x + 1) * topRight + (y + 1) *
bottomLeft + blkSize) >> (log2Size + 1));
+//}
+
+ ldurb w3, [x2, #(1+8)] // w3 = topRight = above[8]
+ ldurb w4, [x2, #(2*8+1+8)] // w4 = bottomLeft = left[8]
+ dup v2.8b, w3 // v2 = topRight_b
+ dup v3.8h, w4 // v3 = bottomLeft_h
+ ldr x3, [x2, #(2*8+1)] // x3 = left[7:0]_b, left[0] in the lowest byte
+ ldr d0, [x2, #1] // v0 = above[x]_b
+
+ adr x4, tbl_const_1to8_7to0 // x4 = address of the weight table
+ ldr d4, [x4] // v4 = const_b[8 7 6
5 4 3 2 1]
+ ldr d5, [x4, #8] // v5 = const_b[7 6 5
4 3 2 1 0]
+
+ ushll v6.8h, v0.8b, #3 // v6 = 8 * above[x]
+ usubw v0.8h, v3.8h, v0.8b // v0 = bottomLeft -
above[x]
+
+ umlal v6.8h, v4.8b, v2.8b // v6 = 8 * above[x] +
(x + 1) * topRight
+
+ mov w4, #8 // w4 = rows remaining
+
+1:
+ dup v1.8b, w3 // v1 = left[y] in every byte lane
+ lsr x3, x3, #8 // bring the next row's left[] byte into the lowest byte
+ add v6.8h, v6.8h, v0.8h // v6 = (blkSize - 1
-y=0) * above[x] + (x + 1) * topRight + (y=0 + 1) * bottomLeft
+ mov v3.16b, v6.16b // work on a copy so v6 keeps accumulating across rows
+ umlal v3.8h, v5.8b, v1.8b // v3 = (blkSize - 1 -
x) * left[y=0] + (blkSize - 1 -y=0) * above[x] + (x + 1) * topRight + (y=0
+ 1) * bottomLeft
+ rshrn v3.8b, v3.8h, #4 // rounding narrow: (sum + 8) >> 4, i.e. + blkSize then >> (log2Size + 1)
+ sub w4, w4, #1
+ st1 {v3.8b}, [x0], x1 // store one 8-pixel row, then dst += dstStride
+ cbnz w4, 1b
+
+ ret
+endfunc
+
+// void planar_pred(pixel* dst, intptr_t dstStride, const pixel* srcPix,
int /*dirMode*/, int /*bFilter*/)
+function PFX(intra_pred_planar16_neon)
+// 16x16 planar intra prediction on byte pixels. Register map:
+// x0 = dst (post-incremented by dstStride on every row store)
+// x1 = dstStride
+// x2 = *srcPix
+// x3 = w3: topRight byte only (left[] is held in v0 in this routine)
+// x4 = tmp (bottomLeft byte, then table address, then row counter)
+// v0 = left[15:0], rotated by one byte per row so lane 0 is left[y]
+// v1 = above[15:0] during setup, then left[y] replicated to every byte lane
+// v2 = topRight = rep(above[blkSize])
+// v3 = bottomLeft = rep(left[blkSize])
+// v4 = const[16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1]
+// v5 = const[15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0]; v6,v7 = bottomLeft - above[x]; v16,v17 = row accumulator; v18,v19 = row result
+
+//{
+// const int blkSize = 1 << log2Size;
+// const pixel* above = srcPix + 1;
+// const pixel* left = srcPix + (2 * blkSize + 1);
+// pixel topRight = above[blkSize];
+// pixel bottomLeft = left[blkSize];
+// for (int y = 0; y < blkSize; y++)
+// for (int x = 0; x < blkSize; x++)
+// dst[y * dstStride + x] = (pixel) (((blkSize - 1 - x) *
left[y] + (blkSize - 1 -y) * above[x] + (x + 1) * topRight + (y + 1) *
bottomLeft + blkSize) >> (log2Size + 1));
+//}
+
+ ldurb w3, [x2, #(1+16)] // w3 = topRight = above[16]
+ ldurb w4, [x2, #(2*16+1+16)] // w4 = bottomLeft = left[16]
+ ldr q0, [x2, #(2*16+1)] // v0 = left[15:0]_b
+ ldr q1, [x2, #1] // v1 = above[15:0]_b
+ dup v2.16b, w3 // v2 = topRight_b
+ dup v3.8h, w4 // v3 = bottomLeft_h
+
+ adr x4, tbl_const_1to8_7to0 // x4 = address of the weight table
+ ld2 {v4.2d, v5.2d}, [x4] // v4 = const_b[16 15
14 13 12 11 10 9 8 7 6 5 4 3 2 1]
+ ext v5.16b, v5.16b, v5.16b, #8 // v5 = const_b[15 14
13 12 11 10 9 8 7 6 5 4 3 2 1 0]
+
+ ushll v16.8h, v1.8b, #4 // v16,v17 = 16 *
above[x]
+ ushll2 v17.8h, v1.16b, #4 // upper eight columns
+ usubw v6.8h, v3.8h, v1.8b // v6,v7 = bottomLeft
- above[x]
+ usubw2 v7.8h, v3.8h, v1.16b // upper eight columns
+
+ umlal v16.8h, v4.8b, v2.8b // v16,v17 = 16 *
above[x] + (x + 1) * topRight
+ umlal2 v17.8h, v4.16b, v2.16b // upper eight columns
+
+ mov w4, #16 // w4 = rows remaining
+
+1:
+ dup v1.16b, v0.b[0] // v1 = left[y] in every byte lane
+ ext v0.16b, v0.16b, v0.16b, #1 // rotate v0 by one byte so the next row's left[] lands in lane 0
+
+ add v16.8h, v16.8h, v6.8h // v16,v17 = (blkSize
- 1 -y=0) * above[x] + (x + 1) * topRight + (y=0 + 1) * bottomLeft
+ add v17.8h, v17.8h, v7.8h
+
+ mov v18.16b, v16.16b // work on copies so v16,v17 keep accumulating across rows
+ mov v19.16b, v17.16b
+
+ umlal v18.8h, v5.8b, v1.8b // v18,v19 = (blkSize - 1
- x) * left[y=0] + (blkSize - 1 -y=0) * above[x] + (x + 1) * topRight +
(y=0 + 1) * bottomLeft
+ umlal2 v19.8h, v5.16b, v1.16b
+ rshrn v18.8b, v18.8h, #5 // rounding narrow: (sum + 16) >> 5, i.e. + blkSize then >> (log2Size + 1)
+ rshrn2 v18.16b, v19.8h, #5 // upper eight columns into the high half of v18
+ st1 {v18.16b}, [x0], x1 // store one 16-pixel row, then dst += dstStride
+ sub w4, w4, #1
+ cbnz w4, 1b
+
+ ret
+endfunc
--
2.36.0.windows.1
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20240912/6efa8f05/attachment-0001.htm>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0002-AARCH64-intra_pred_planar8_neon-intra_pred_planar16_.patch
Type: application/octet-stream
Size: 9612 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20240912/6efa8f05/attachment-0001.obj>
More information about the x265-devel
mailing list