[vlc-devel] [PATCH 18/19] i420_yuy2, i422_yuy2, i420_rgb: optimise AVX2 use with vzeroupper use
Lyndon Brown
jnqnfe at gmail.com
Thu Sep 24 21:42:36 CEST 2020
From: Lyndon Brown <jnqnfe at gmail.com>
Date: Thu, 4 Apr 2019 07:51:04 +0100
Subject: i420_yuy2,i422_yuy2,i420_rgb: optimise AVX2 use with vzeroupper use
This helps performance when legacy SSE code is also in use (say, from a filter), since vzeroupper clears the upper YMM state before that code runs.
diff --git a/modules/video_chroma/i420_rgb_avx2.h b/modules/video_chroma/i420_rgb_avx2.h
index 42e12fd504..5aed858e16 100644
--- a/modules/video_chroma/i420_rgb_avx2.h
+++ b/modules/video_chroma/i420_rgb_avx2.h
@@ -36,7 +36,10 @@
"ymm4", "ymm5", "ymm6", "ymm7" ); \
} while(0)
-#define AVX2_END __asm__ __volatile__ ( "sfence" ::: "memory" )
+#define AVX2_END __asm__ __volatile__ ( " \
+ sfence \n\
+ vzeroupper \n\
+ " ::: "memory" ) /* Ends an AVX2 section: sfence, then vzeroupper so SSE code run afterwards (e.g. a filter) is not slowed down. NOTE(review): vzeroupper alters the upper YMM halves but only "memory" is declared clobbered -- confirm no 256-bit value is live here. */
#define AVX2_INIT_16_ALIGNED " \n\
vmovdqa (%[u]), %%xmm0 # Load 16 Cb into lower half ... u2 u1 u0 \n\
@@ -390,7 +393,9 @@ vmovdqu %%ymm3, 96(%[b]) # Store ABGR31 ... ABGR24
AVX2_INSTRUCTIONS \
} while(0)
-#define AVX2_END _mm_sfence()
+#define AVX2_END do { \
+ _mm_sfence(); _mm256_zeroupper(); \
+ } while(0) /* Ends an AVX2 section: store fence, then clear upper YMM halves for later SSE code. Wrapped in do/while(0) so "AVX2_END;" stays a single statement, as it was before vzeroupper was added. */
#define AVX2_INIT_16_ALIGNED \
ymm0 = _mm256_inserti128_si256(ymm0, *((__m128i*)p_u), 0); \
diff --git a/modules/video_chroma/i420_yuy2.h b/modules/video_chroma/i420_yuy2.h
index 406d8f094b..c96ca5730f 100644
--- a/modules/video_chroma/i420_yuy2.h
+++ b/modules/video_chroma/i420_yuy2.h
@@ -306,7 +306,10 @@ movdqu %%xmm1, 16(%1) # Store high UYVY \n\
p_u += 16; p_v += 16; \
} while(0)
-#define AVX2_END __asm__ __volatile__ ( "sfence" ::: "memory" )
+#define AVX2_END __asm__ __volatile__ ( " \
+ sfence \n\
+ vzeroupper \n\
+ " ::: "memory" ) /* Ends an AVX2 section: sfence, then vzeroupper so SSE code run afterwards (e.g. a filter) is not slowed down. NOTE(review): vzeroupper alters the upper YMM halves but only "memory" is declared clobbered -- confirm no 256-bit value is live here. */
#define AVX2_INIT_ALIGNED " \n\
vmovdqa (%[y1]), %%ymm0 # Load 32 Y1 ... y2 y1 y0 \n\
@@ -411,7 +414,9 @@ vmovdqu %%ymm4, 32(%[l2]) # Store high UYVY
p_u += 16; p_v += 16; \
} while(0)
-#define AVX2_END _mm_sfence()
+#define AVX2_END do { \
+ _mm_sfence(); _mm256_zeroupper(); \
+ } while(0) /* Ends an AVX2 section: store fence, then clear upper YMM halves for later SSE code. Wrapped in do/while(0) so "AVX2_END;" stays a single statement, as it was before vzeroupper was added. */
#define AVX2_INIT_ALIGNED \
ymm0 = _mm256_load_si256((__m256i *)p_y1); \
diff --git a/modules/video_chroma/i422_yuy2.h b/modules/video_chroma/i422_yuy2.h
index 72bea03284..e3984ae635 100644
--- a/modules/video_chroma/i422_yuy2.h
+++ b/modules/video_chroma/i422_yuy2.h
@@ -221,7 +221,10 @@ movdqu %%xmm1, 16(%0) # Store high UYVY \n\
p_u += 16; p_v += 16; \
} while(0)
-#define AVX2_END __asm__ __volatile__ ( "sfence" ::: "memory" )
+#define AVX2_END __asm__ __volatile__ ( " \
+ sfence \n\
+ vzeroupper \n\
+ " ::: "memory" ) /* Ends an AVX2 section: sfence, then vzeroupper so SSE code run afterwards (e.g. a filter) is not slowed down. NOTE(review): vzeroupper alters the upper YMM halves but only "memory" is declared clobbered -- confirm no 256-bit value is live here. */
#define AVX2_INIT_ALIGNED " \n\
vmovdqa (%[y]), %%ymm0 # Load 32 Y ... y2 y1 y0 \n\
@@ -298,7 +301,9 @@ vmovdqu %%ymm1, 32(%[l]) # Store high UYVY
p_u += 16; p_v += 16; \
} while(0)
-#define AVX2_END _mm_sfence()
+#define AVX2_END do { \
+ _mm_sfence(); _mm256_zeroupper(); \
+ } while(0) /* Ends an AVX2 section: store fence, then clear upper YMM halves for later SSE code. Wrapped in do/while(0) so "AVX2_END;" stays a single statement, as it was before vzeroupper was added. */
#define AVX2_INIT_ALIGNED \
ymm0 = _mm256_load_si256((__m256i *)p_y); \
More information about the vlc-devel
mailing list