[x265] [PATCH] AArch64: Add 8bit and 10bit neon intrinsics for intraFilter and intrapred DC
chen
chenm003 at 163.com
Sat Sep 14 01:46:43 UTC 2024
Hi Harshitha,
Thanks for the patch; my comments are inline.
Regards,
Chen
At 2024-09-13 17:37:03, "Karam Singh" <karam.singh at multicorewareinc.com> wrote:
From e5ad11568ec5c7b5a9d624d8f9d8f5810f14f546 Mon Sep 17 00:00:00 2001
From: Harshitha Suresh <harshitha at multicorewareinc.com>
Date: Fri, 13 Sep 2024 13:54:08 +0530
Subject: [PATCH] AArch64: Add 8bit and 10bit neon intrinsics for intraFilter
and intrapred DC
---
source/common/aarch64/intrapred-prim.cpp | 335 ++++++++++++++++++++++-
1 file changed, 321 insertions(+), 14 deletions(-)
diff --git a/source/common/aarch64/intrapred-prim.cpp b/source/common/aarch64/intrapred-prim.cpp
index 8624dd2a6..68a9f26ac 100644
--- a/source/common/aarch64/intrapred-prim.cpp
+++ b/source/common/aarch64/intrapred-prim.cpp
@@ -2,7 +2,7 @@
#include "primitives.h"
-#if 1
+#if HAVE_NEON
#include "arm64-utils.h"
#include <arm_neon.h>
@@ -12,6 +12,52 @@ namespace
{
+template<int tuSize>
+void intraFilter_neon(const pixel* samples, pixel* filtered) /* 1:2:1 filtering of left and top reference samples */
+{
+ const int tuSize2 = tuSize << 1;
+ pixel topLeft = samples[0], topLast = samples[tuSize2], leftLast = samples[tuSize2 + tuSize2];
+
+ uint16x8_t two_vec = vdupq_n_u16(2);
+#if !HIGH_BIT_DEPTH
+ {
+ for(int i = 0; i < tuSize2 + tuSize2; i+=8)
+ {
+ uint16x8_t sample1 = vmovl_u8(vld1_u8(&samples[i]));
+ uint16x8_t sample2 = vmovl_u8(vld1_u8(&samples[i-1]));
+ uint16x8_t sample3 = vmovl_u8(vld1_u8(&samples[i+1]));
+
+ uint16x8_t result1 = vaddq_u16(vshlq_n_u16(sample1,1), sample2 );
MC: The widening instructions ADDL, SHLL and MULL (e.g. vaddl_u8, vshll_n_u8, vmull_u8) may reduce the number of vmovl calls.
+ uint16x8_t result2 = vaddq_u16(sample3, two_vec);
+ uint16x8_t result3 = vaddq_u16(result1,result2);
+ vst1_u8(&filtered[i] , vmovn_u16(vshrq_n_u16(result3, 2)));
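MC: A narrowing shift (VQSHRN / VQSHRUN; a rounding form such as vrshrn_n_u16 also absorbs the rounding constant) may remove both the '+2' and the 'vmovn'.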
+ }
+ }
+#else
+ {
+ for(int i = 0; i < tuSize2 + tuSize2; i+=8)
+ {
+ uint16x8_t sample1 = vld1q_u16(&samples[i]);
+ uint16x8_t sample2 = vld1q_u16(&samples[i-1]);
+ uint16x8_t sample3 = vld1q_u16(&samples[i+1]);
+
+ uint16x8_t result1 = vaddq_u16(vshlq_n_u16(sample1,1), sample2 );
+ uint16x8_t result2 = vaddq_u16(sample3, two_vec);
+ uint16x8_t result3 = vaddq_u16(result1,result2);
+ vst1q_u16(&filtered[i] , vshrq_n_u16(result3, 2));
MC: vrshrq_n_u16 (rounding shift right) may remove the explicit '+2'.
+ }
+ }
+#endif
+ // filtering top
+ filtered[tuSize2] = topLast;
+
+ // filtering top-left
+ filtered[0] = ((topLeft << 1) + samples[1] + samples[tuSize2 + 1] + 2) >> 2;
+
+ // filtering left
+ filtered[tuSize2 + 1] = ((samples[tuSize2 + 1] << 1) + topLeft + samples[tuSize2 + 2] + 2) >> 2;
+ filtered[tuSize2 + tuSize2] = leftLast;
+}
template<int width>
void intra_pred_ang_neon(pixel *dst, intptr_t dstStride, const pixel *srcPix0, int dirMode, int bFilter)
@@ -188,6 +234,7 @@ void intra_pred_ang_neon(pixel *dst, intptr_t dstStride, const pixel *srcPix0, i
}
}
+#endif
template<int log2Size>
void all_angs_pred_neon(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
{
@@ -232,6 +279,270 @@ void all_angs_pred_neon(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
}
}
}
+
+template<int log2Size>
+void planar_pred_neon(pixel * dst, intptr_t dstStride, const pixel * srcPix, int /*dirMode*/, int /*bFilter*/)
MC: We already have an assembly version for reference; this function needs a different algorithm to get better performance.
+{
+ const int blkSize = 1 << log2Size;
+
+ const pixel* above = srcPix + 1;
+ const pixel* left = srcPix + (2 * blkSize + 1);
+
+ switch (blkSize) {
+ case 8:
+ {
+ const uint16_t log2SizePlusOne = log2Size + 1;
+ uint16x8_t blkSizeVec = vdupq_n_u16(blkSize);
+ uint16x8_t topRight = vdupq_n_u16(above[blkSize]);
+ uint16_t bottomLeft = left[blkSize];
+ uint16x8_t oneVec = vdupq_n_u16(1);
+ uint16x8_t blkSizeSubOneVec = vdupq_n_u16(blkSize - 1);
+
+ for (int y = 0; y < blkSize; y++) {
+ // (blkSize - 1 - y)
+ uint16x8_t vlkSizeYVec = vdupq_n_u16(blkSize - 1 - y);
+ // (y + 1) * bottomLeft
+ uint16x8_t bottomLeftYVec = vdupq_n_u16((y + 1) * bottomLeft);
+ // left[y]
+ uint16x8_t leftYVec = vdupq_n_u16(left[y]);
+
+ for (int x = 0; x < blkSize; x += 8) {
+ int idx = y * dstStride + x;
+ uint16x8_t xvec = { (uint16_t)(x + 0), (uint16_t)(x + 1),
+ (uint16_t)(x + 2), (uint16_t)(x + 3),
+ (uint16_t)(x + 4), (uint16_t)(x + 5),
+ (uint16_t)(x + 6), (uint16_t)(x + 7) };
MC: The initialization style above is very slow and is not portable across compilers.
+
+ // (blkSize - 1 - y) * above[x]
+ uint16x8_t aboveVec = { (uint16_t)(above[x + 0]),
+ (uint16_t)(above[x + 1]),
+ (uint16_t)(above[x + 2]),
+ (uint16_t)(above[x + 3]),
+ (uint16_t)(above[x + 4]),
+ (uint16_t)(above[x + 5]),
+ (uint16_t)(above[x + 6]),
+ (uint16_t)(above[x + 7]) };
+
+ aboveVec = vmulq_u16(aboveVec, vlkSizeYVec);
+
+ // (blkSize - 1 - x) * left[y]
+ uint16x8_t first = vsubq_u16(blkSizeSubOneVec, xvec);
+ first = vmulq_u16(first, leftYVec);
+
+ // (x + 1) * topRight
+ uint16x8_t second = vaddq_u16(xvec, oneVec);
+ second = vmulq_u16(second, topRight);
+
+ uint16x8_t resVec = vaddq_u16(first, second);
+ resVec = vaddq_u16(resVec, aboveVec);
+ resVec = vaddq_u16(resVec, bottomLeftYVec);
+ resVec = vaddq_u16(resVec, blkSizeVec);
+ resVec = vshrq_n_u16(resVec, log2SizePlusOne);
+
+ for (int i = 0; i < 8; i++)
+ dst[idx + i] = (pixel)resVec[i];
+ }
+}
+ }
+ break;
+ case 4:
+ case 32:
+ case 16:
+ {
+ const uint32_t log2SizePlusOne = log2Size + 1;
+ uint32x4_t blkSizeVec = vdupq_n_u32(blkSize);
+ uint32x4_t topRight = vdupq_n_u32(above[blkSize]);
+ uint32_t bottomLeft = left[blkSize];
+ uint32x4_t oneVec = vdupq_n_u32(1);
+ uint32x4_t blkSizeSubOneVec = vdupq_n_u32(blkSize - 1);
+
+ for (int y = 0; y < blkSize; y++) {
+ // (blkSize - 1 - y)
+ uint32x4_t vlkSizeYVec = vdupq_n_u32(blkSize - 1 - y);
+ // (y + 1) * bottomLeft
+ uint32x4_t bottomLeftYVec = vdupq_n_u32((y + 1) * bottomLeft);
MC: We do not need a multiply in every loop iteration; it can be replaced by an add.
+ // left[y]
+ uint32x4_t leftYVec = vdupq_n_u32(left[y]);
+
+ for (int x = 0; x < blkSize; x += 4) {
MC: Processing only 4 pixels at a time gives poor parallel performance.
+ int idx = y * dstStride + x;
+ uint32x4_t xvec = { (uint32_t)(x + 0), (uint32_t)(x + 1),
+ (uint32_t)(x + 2), (uint32_t)(x + 3) };
+
+ // (blkSize - 1 - y) * above[x]
+ uint32x4_t aboveVec = { (uint32_t)(above[x + 0]),
+ (uint32_t)(above[x + 1]),
+ (uint32_t)(above[x + 2]),
+ (uint32_t)(above[x + 3]) };
+ aboveVec = vmulq_u32(aboveVec, vlkSizeYVec);
+
+ // (blkSize - 1 - x) * left[y]
+ uint32x4_t first = vsubq_u32(blkSizeSubOneVec, xvec);
+ first = vmulq_u32(first, leftYVec);
+
+ // (x + 1) * topRight
+ uint32x4_t second = vaddq_u32(xvec, oneVec);
+ second = vmulq_u32(second, topRight);
+
+ uint32x4_t resVec = vaddq_u32(first, second);
+ resVec = vaddq_u32(resVec, aboveVec);
+ resVec = vaddq_u32(resVec, bottomLeftYVec);
+ resVec = vaddq_u32(resVec, blkSizeVec);
+ resVec = vshrq_n_u32(resVec, log2SizePlusOne);
+
+ for (int i = 0; i < 4; i++)
+ dst[idx + i] = (pixel)resVec[i];
MC: Why not store through the vector unit? Also, the result is 32 bits wide while dst[] is pixel, so it needs saturation when narrowing.
+ }
+ }
+ }
+ break;
+ }
+}
+
+static void dcPredFilter(const pixel* above, const pixel* left, pixel* dst, intptr_t dststride, int size)
+{
+ // boundary pixels processing
+ pixel topLeft = (pixel)((above[0] + left[0] + 2 * dst[0] + 2) >> 2);
+ pixel * pdst = dst;
+
+ switch (size) {
+ case 32:
+ case 16:
+ case 8:
+ {
+ uint16x8_t vconst_3 = vdupq_n_u16(3);
+ uint16x8_t vconst_2 = vdupq_n_u16(2);
+ for (int x = 0; x < size; x += 8) {
+ uint16x8_t vabo = { (uint16_t)(above[x + 0]),
+ (uint16_t)(above[x + 1]),
+ (uint16_t)(above[x + 2]),
+ (uint16_t)(above[x + 3]),
+ (uint16_t)(above[x + 4]),
+ (uint16_t)(above[x + 5]),
+ (uint16_t)(above[x + 6]),
+ (uint16_t)(above[x + 7]) };
+
+ uint16x8_t vdst = { (uint16_t)(dst[x + 0]),
+ (uint16_t)(dst[x + 1]),
+ (uint16_t)(dst[x + 2]),
+ (uint16_t)(dst[x + 3]),
+ (uint16_t)(dst[x + 4]),
+ (uint16_t)(dst[x + 5]),
+ (uint16_t)(dst[x + 6]),
+ (uint16_t)(dst[x + 7]) };
+ // dst[x] = (pixel)((above[x] + 3 * dst[x] + 2) >> 2);
+ vdst = vmulq_u16(vdst, vconst_3);
+ vdst = vaddq_u16(vdst, vabo);
+ vdst = vaddq_u16(vdst, vconst_2);
+ vdst = vshrq_n_u16(vdst, 2);
MC: uqrshrun
...
--
2.36.0.windows.1
__________________________
Karam Singh
Ph.D. IIT Guwahati
Senior Software (Video Coding) Engineer
Mobile: +91 8011279030
Block 9A, 6th floor, DLF Cyber City
Manapakkam, Chennai 600 089
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20240914/54c89b89/attachment-0001.htm>
More information about the x265-devel
mailing list