[x265] [PATCH v3 5/5] Unroll C implementation of pelFilterChroma_H
Micro Daryl Robles
microdaryl.robles at arm.com
Tue Feb 18 14:10:28 UTC 2025
Unrolling improves performance by 7-9% on Arm Neoverse server platforms.
---
source/common/loopfilter.cpp | 67 ++++++++++++++++++++++++------------
1 file changed, 45 insertions(+), 22 deletions(-)
diff --git a/source/common/loopfilter.cpp b/source/common/loopfilter.cpp
index 9e3fe5aea..d378fa9b2 100644
--- a/source/common/loopfilter.cpp
+++ b/source/common/loopfilter.cpp
@@ -154,27 +154,6 @@ static void pelFilterLumaStrong_c(pixel* src, intptr_t srcStep, intptr_t offset,
}
}
-/* Deblocking of one line/column for the chrominance component
-* \param src pointer to picture data
-* \param offset offset value for picture data
-* \param tc tc value
-* \param maskP indicator to disable filtering on partP
-* \param maskQ indicator to disable filtering on partQ */
-static void pelFilterChroma_c(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ)
-{
- for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep)
- {
- int16_t m4 = (int16_t)src[0];
- int16_t m3 = (int16_t)src[-offset];
- int16_t m5 = (int16_t)src[offset];
- int16_t m2 = (int16_t)src[-offset * 2];
-
- int32_t delta = x265_clip3(-tc, tc, ((((m4 - m3) * 4) + m2 - m5 + 4) >> 3));
- src[-offset] = x265_clip(m3 + (delta & maskP));
- src[0] = x265_clip(m4 - (delta & maskQ));
- }
-}
-
void pelFilterChroma_V_c(pixel *src, intptr_t srcStep, intptr_t offset, int32_t tc,
int32_t maskP, int32_t maskQ)
{
@@ -219,6 +198,50 @@ void pelFilterChroma_V_c(pixel *src, intptr_t srcStep, intptr_t offset, int32_t
src[3 * srcStep + 0] = x265_clip(m4 - (delta & maskQ));
}
+void pelFilterChroma_H_c(pixel *src, intptr_t srcStep, intptr_t offset, int32_t tc,
+ int32_t maskP, int32_t maskQ)
+{
+ X265_CHECK(srcStep == 1, "srcStep value must be 1 for Chroma Horizontal\n");
+
+ (void)srcStep;
+
+ int16_t m2 = (int16_t)src[0 - offset * 2];
+ int16_t m3 = (int16_t)src[0 - offset * 1];
+ int16_t m4 = (int16_t)src[0 + offset * 0];
+ int16_t m5 = (int16_t)src[0 + offset * 1];
+
+ int32_t delta = x265_clip3(-tc, tc, ((m4 - m3) * 4 + m2 - m5 + 4) >> 3);
+ src[0 - offset * 1] = x265_clip(m3 + (delta & maskP));
+ src[0 + offset * 0] = x265_clip(m4 - (delta & maskQ));
+
+ m2 = (int16_t)src[1 - offset * 2];
+ m3 = (int16_t)src[1 - offset * 1];
+ m4 = (int16_t)src[1 + offset * 0];
+ m5 = (int16_t)src[1 + offset * 1];
+
+ delta = x265_clip3(-tc, tc, ((m4 - m3) * 4 + m2 - m5 + 4) >> 3);
+ src[1 - offset * 1] = x265_clip(m3 + (delta & maskP));
+ src[1 + offset * 0] = x265_clip(m4 - (delta & maskQ));
+
+ m2 = (int16_t)src[2 - offset * 2];
+ m3 = (int16_t)src[2 - offset * 1];
+ m4 = (int16_t)src[2 + offset * 0];
+ m5 = (int16_t)src[2 + offset * 1];
+
+ delta = x265_clip3(-tc, tc, ((m4 - m3) * 4 + m2 - m5 + 4) >> 3);
+ src[2 - offset * 1] = x265_clip(m3 + (delta & maskP));
+ src[2 + offset * 0] = x265_clip(m4 - (delta & maskQ));
+
+ m2 = (int16_t)src[3 - offset * 2];
+ m3 = (int16_t)src[3 - offset * 1];
+ m4 = (int16_t)src[3 + offset * 0];
+ m5 = (int16_t)src[3 + offset * 1];
+
+ delta = x265_clip3(-tc, tc, ((m4 - m3) * 4 + m2 - m5 + 4) >> 3);
+ src[3 - offset * 1] = x265_clip(m3 + (delta & maskP));
+ src[3 + offset * 0] = x265_clip(m4 - (delta & maskQ));
+}
+
}
namespace X265_NS {
@@ -238,6 +261,6 @@ void setupLoopFilterPrimitives_c(EncoderPrimitives &p)
p.pelFilterLumaStrong[0] = pelFilterLumaStrong_c;
p.pelFilterLumaStrong[1] = pelFilterLumaStrong_c;
p.pelFilterChroma[0] = pelFilterChroma_V_c;
- p.pelFilterChroma[1] = pelFilterChroma_c;
+ p.pelFilterChroma[1] = pelFilterChroma_H_c;
}
}
--
2.34.1
-------------- next part --------------
A non-text attachment was scrubbed...
Name: v3-0005-Unroll-C-implementation-of-pelFilterChroma_H.patch
Type: text/x-diff
Size: 4234 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20250218/d5b14bca/attachment.patch>
More information about the x265-devel mailing list