[x265] [PATCH v3 4/5] Unroll C implementation of pelFilterChroma_V
Micro Daryl Robles
microdaryl.robles at arm.com
Tue Feb 18 14:10:22 UTC 2025
Unrolling the four-row loop provides a 9-14% performance uplift on Arm Neoverse server platforms.
---
source/common/loopfilter.cpp | 49 +++++++++++++++++++++++++++++++++++-
1 file changed, 48 insertions(+), 1 deletion(-)
diff --git a/source/common/loopfilter.cpp b/source/common/loopfilter.cpp
index f4cd65389..9e3fe5aea 100644
--- a/source/common/loopfilter.cpp
+++ b/source/common/loopfilter.cpp
@@ -28,6 +28,8 @@
#define PIXEL_MIN 0
+using namespace X265_NS;
+
namespace {
static void calSign(int8_t *dst, const pixel *src1, const pixel *src2, const int endX)
@@ -172,6 +174,51 @@ static void pelFilterChroma_c(pixel* src, intptr_t srcStep, intptr_t offset, int
src[0] = x265_clip(m4 - (delta & maskQ));
}
}
+// Vertical-edge chroma filter written out explicitly for four rows (src + 0..3 * srcStep) instead of a loop.
+void pelFilterChroma_V_c(pixel *src, intptr_t srcStep, intptr_t offset, int32_t tc,
+ int32_t maskP, int32_t maskQ)
+{
+ X265_CHECK(offset == 1, "Offset value must be 1 for Chroma Vertical\n");
+ // offset is only read by the check above. NOTE(review): the cast below presumably silences an unused-parameter warning when X265_CHECK expands to nothing - confirm.
+ (void)offset;
+ // Row 0: read the four samples at columns -2, -1, 0 and +1 relative to src.
+ int16_t m2 = (int16_t)src[0 * srcStep - 2];
+ int16_t m3 = (int16_t)src[0 * srcStep - 1];
+ int16_t m4 = (int16_t)src[0 * srcStep + 0];
+ int16_t m5 = (int16_t)src[0 * srcStep + 1];
+ // delta is produced by x265_clip3 with bounds -tc and tc; it is AND-ed with maskP/maskQ, then added at column -1 and subtracted at column 0.
+ int32_t delta = x265_clip3(-tc, tc, ((m4 - m3) * 4 + m2 - m5 + 4) >> 3);
+ src[0 * srcStep - 1] = x265_clip(m3 + (delta & maskP)); // NOTE(review): masks are presumably 0 or all-ones so the AND disables/enables the update - confirm with caller
+ src[0 * srcStep + 0] = x265_clip(m4 - (delta & maskQ));
+ // Row 1: same reads, delta computation and two writes, one srcStep further on.
+ m2 = (int16_t)src[1 * srcStep - 2];
+ m3 = (int16_t)src[1 * srcStep - 1];
+ m4 = (int16_t)src[1 * srcStep + 0];
+ m5 = (int16_t)src[1 * srcStep + 1];
+ // Only columns -1 and 0 are written; columns -2 and +1 are read-only inputs.
+ delta = x265_clip3(-tc, tc, ((m4 - m3) * 4 + m2 - m5 + 4) >> 3);
+ src[1 * srcStep - 1] = x265_clip(m3 + (delta & maskP));
+ src[1 * srcStep + 0] = x265_clip(m4 - (delta & maskQ));
+ // Row 2: same computation at src + 2 * srcStep.
+ m2 = (int16_t)src[2 * srcStep - 2];
+ m3 = (int16_t)src[2 * srcStep - 1];
+ m4 = (int16_t)src[2 * srcStep + 0];
+ m5 = (int16_t)src[2 * srcStep + 1];
+ // m2..m5 and delta are reused across rows; each row is computed from its own freshly loaded samples.
+ delta = x265_clip3(-tc, tc, ((m4 - m3) * 4 + m2 - m5 + 4) >> 3);
+ src[2 * srcStep - 1] = x265_clip(m3 + (delta & maskP));
+ src[2 * srcStep + 0] = x265_clip(m4 - (delta & maskQ));
+ // Row 3: last of the four rows handled by this function.
+ m2 = (int16_t)src[3 * srcStep - 2];
+ m3 = (int16_t)src[3 * srcStep - 1];
+ m4 = (int16_t)src[3 * srcStep + 0];
+ m5 = (int16_t)src[3 * srcStep + 1];
+ // Results go through x265_clip before being stored back into src.
+ delta = x265_clip3(-tc, tc, ((m4 - m3) * 4 + m2 - m5 + 4) >> 3);
+ src[3 * srcStep - 1] = x265_clip(m3 + (delta & maskP));
+ src[3 * srcStep + 0] = x265_clip(m4 - (delta & maskQ));
+}
+
}
namespace X265_NS {
@@ -190,7 +237,7 @@ void setupLoopFilterPrimitives_c(EncoderPrimitives &p)
// C code is same for EDGE_VER and EDGE_HOR only asm code is different
p.pelFilterLumaStrong[0] = pelFilterLumaStrong_c;
p.pelFilterLumaStrong[1] = pelFilterLumaStrong_c;
- p.pelFilterChroma[0] = pelFilterChroma_c;
+ p.pelFilterChroma[0] = pelFilterChroma_V_c;
p.pelFilterChroma[1] = pelFilterChroma_c;
}
}
--
2.34.1
-------------- next part --------------
A non-text attachment was scrubbed...
Name: v3-0004-Unroll-C-implementation-of-pelFilterChroma_V.patch
Type: text/x-diff
Size: 3355 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20250218/467d9805/attachment.patch>
More information about the x265-devel
mailing list