[x265] [PATCH 2/3] AArch64: Refactor loop unroll pragmas in dct primitives
Hari Limaye
hari.limaye at arm.com
Tue Aug 20 17:42:12 UTC 2024
Make the #pragma unroll directives portable between Clang and GCC, as GCC
currently just ignores the unsupported Clang-style directives.
---
source/common/aarch64/dct-prim.cpp | 36 ++++++++++++++++++------------
1 file changed, 22 insertions(+), 14 deletions(-)
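Note (not part of the commit message): the portability trick is the usual _Pragma + stringize pattern, so the unroll factor passed to X265_PRAGMA_UNROLL(n) is substituted first and then turned into the pragma string. A minimal, self-contained sketch of the same idea follows; the DEMO_* macros and demo_sum are illustrative names, not part of x265.

    #include <stdio.h>

    /* Stringize the pragma text so the substituted unroll factor ends up
     * inside the _Pragma() string literal. */
    #define DEMO_PRAGMA(text) _Pragma(#text)
    #if defined(__clang__)
    #define DEMO_UNROLL(n) DEMO_PRAGMA(unroll(n))       /* Clang: #pragma unroll(n)     */
    #elif defined(__GNUC__)
    #define DEMO_UNROLL(n) DEMO_PRAGMA(GCC unroll (n))  /* GCC: #pragma GCC unroll (n)  */
    #else
    #define DEMO_UNROLL(n)                              /* other compilers: no-op       */
    #endif

    static int demo_sum(const int *v)
    {
        int acc = 0;
        /* The pragma must directly precede the loop it applies to. */
        DEMO_UNROLL(4)
        for (int i = 0; i < 16; i++)
        {
            acc += v[i];
        }
        return acc;
    }

    int main(void)
    {
        int v[16];
        for (int i = 0; i < 16; i++)
        {
            v[i] = i;
        }
        printf("%d\n", demo_sum(v));
        return 0;
    }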
diff --git a/source/common/aarch64/dct-prim.cpp b/source/common/aarch64/dct-prim.cpp
index 416532e54..acc50d4f4 100644
--- a/source/common/aarch64/dct-prim.cpp
+++ b/source/common/aarch64/dct-prim.cpp
@@ -5,6 +5,14 @@
#include <arm_neon.h>
+#define X265_PRAGMA(text) _Pragma(#text)
+#if defined(__clang__)
+#define X265_PRAGMA_UNROLL(n) X265_PRAGMA(unroll(n))
+#elif defined(__GNUC__)
+#define X265_PRAGMA_UNROLL(n) X265_PRAGMA(GCC unroll (n))
+#else
+#define X265_PRAGMA_UNROLL(n)
+#endif
namespace
{
@@ -472,12 +480,12 @@ static void partialButterflyInverse16_neon(const int16_t *src, int16_t *orig_dst
const int add = 1 << (shift - 1);
-#pragma unroll(4)
+X265_PRAGMA_UNROLL(4)
for (j = 0; j < line; j += 4)
{
/* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
-#pragma unroll(2)
+X265_PRAGMA_UNROLL(2)
for (k = 0; k < 2; k++)
{
int32x4_t s;
@@ -496,7 +504,7 @@ static void partialButterflyInverse16_neon(const int16_t *src, int16_t *orig_dst
EE[3] = vsubq_s32(EEE[0] , EEO[0]);
-#pragma unroll(1)
+X265_PRAGMA_UNROLL(1)
for (k = 0; k < 4; k += 4)
{
int32x4_t s[4];
@@ -522,14 +530,14 @@ static void partialButterflyInverse16_neon(const int16_t *src, int16_t *orig_dst
static const int32x4_t max = vdupq_n_s32(32767);
const int32x4_t minus_shift = vdupq_n_s32(-shift);
-#pragma unroll(4)
+X265_PRAGMA_UNROLL(4)
for (k = 0; k < 4; k++)
{
E[k] = vaddq_s32(EE[k] , EO[k]);
E[k + 4] = vsubq_s32(EE[3 - k] , EO[3 - k]);
}
-#pragma unroll(2)
+X265_PRAGMA_UNROLL(2)
for (k = 0; k < 8; k += 4)
{
int32x4_t s[4];
@@ -584,7 +592,7 @@ static void partialButterflyInverse16_neon(const int16_t *src, int16_t *orig_dst
}
-#pragma unroll(2)
+X265_PRAGMA_UNROLL(2)
for (k = 0; k < 8; k += 4)
{
int32x4_t t;
@@ -657,10 +665,10 @@ static void partialButterflyInverse32_neon(const int16_t *src, int16_t *orig_dst
int16x4_t dst[32];
int add = 1 << (shift - 1);
-#pragma unroll (8)
+X265_PRAGMA_UNROLL(8)
for (j = 0; j < line; j += 4)
{
-#pragma unroll (4)
+X265_PRAGMA_UNROLL(4)
for (k = 0; k < 16; k += 4)
{
int32x4_t s[4];
@@ -681,7 +689,7 @@ static void partialButterflyInverse32_neon(const int16_t *src, int16_t *orig_dst
}
-#pragma unroll (2)
+X265_PRAGMA_UNROLL(2)
for (k = 0; k < 8; k += 4)
{
int32x4_t s[4];
@@ -721,7 +729,7 @@ static void partialButterflyInverse32_neon(const int16_t *src, int16_t *orig_dst
EEO[k + 3] = s[3];
}
-#pragma unroll (2)
+X265_PRAGMA_UNROLL(2)
for (k = 0; k < 2; k++)
{
int32x4_t s;
@@ -736,14 +744,14 @@ static void partialButterflyInverse32_neon(const int16_t *src, int16_t *orig_dst
EEE[1] = vaddq_s32(EEEE[1], EEEO[1]);
EEE[2] = vsubq_s32(EEEE[1], EEEO[1]);
-#pragma unroll (4)
+X265_PRAGMA_UNROLL(4)
for (k = 0; k < 4; k++)
{
EE[k] = vaddq_s32(EEE[k], EEO[k]);
EE[k + 4] = vsubq_s32((EEE[3 - k]), (EEO[3 - k]));
}
-#pragma unroll (8)
+X265_PRAGMA_UNROLL(8)
for (k = 0; k < 8; k++)
{
E[k] = vaddq_s32(EE[k], EO[k]);
@@ -755,7 +763,7 @@ static void partialButterflyInverse32_neon(const int16_t *src, int16_t *orig_dst
-#pragma unroll (16)
+X265_PRAGMA_UNROLL(16)
for (k = 0; k < 16; k++)
{
int32x4_t adde = vaddq_s32(vdupq_n_s32(add), E[k]);
@@ -777,7 +785,7 @@ static void partialButterflyInverse32_neon(const int16_t *src, int16_t *orig_dst
}
-#pragma unroll (8)
+X265_PRAGMA_UNROLL(8)
for (k = 0; k < 32; k += 4)
{
int16x4_t x0 = dst[k + 0];
--
2.42.1