[vlc-devel] [PATCH 18/19] i420_yuy2, i422_yuy2, i420_rgb: optimise AVX2 use with vzeroupper use

Lyndon Brown jnqnfe at gmail.com
Thu Sep 24 21:42:36 CEST 2020


From: Lyndon Brown <jnqnfe at gmail.com>
Date: Thu, 4 Apr 2019 07:51:04 +0100
Subject: i420_yuy2,i422_yuy2,i420_rgb: optimise AVX2 use with vzeroupper use

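Issue vzeroupper (after the existing sfence) at the end of the AVX2
conversion routines. Clearing the upper halves of the YMM registers helps
performance should SSE code also be in use, say from a filter.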

diff --git a/modules/video_chroma/i420_rgb_avx2.h b/modules/video_chroma/i420_rgb_avx2.h
index 42e12fd504..5aed858e16 100644
--- a/modules/video_chroma/i420_rgb_avx2.h
+++ b/modules/video_chroma/i420_rgb_avx2.h
@@ -36,7 +36,10 @@
                  "ymm4", "ymm5", "ymm6", "ymm7" ); \
     } while(0)
 
-#define AVX2_END  __asm__ __volatile__ ( "sfence" ::: "memory" )
+#define AVX2_END __asm__ __volatile__ ( " \
+    sfence                              \n\
+    vzeroupper                          \n\
+    " ::: "memory" ) /* Epilogue for the AVX2 path: store fence, then vzeroupper so SSE code that runs afterwards is not slowed down. NOTE(review): vzeroupper alters the YMM registers but only "memory" is listed as clobbered - confirm this is safe. */
 
 #define AVX2_INIT_16_ALIGNED "                                                \n\
 vmovdqa     (%[u]), %%xmm0   # Load 16 Cb into lower half     ... u2  u1  u0  \n\
@@ -390,7 +393,9 @@ vmovdqu    %%ymm3, 96(%[b])        # Store ABGR31 ... ABGR24
         AVX2_INSTRUCTIONS                   \
     } while(0)
 
-#define AVX2_END  _mm_sfence()
+#define AVX2_END  \
+    _mm_sfence(); \
+    _mm256_zeroupper();
 
 #define AVX2_INIT_16_ALIGNED                       \
     ymm0 = _mm256_inserti128_si256(ymm0, *((__m128i*)p_u), 0); \
diff --git a/modules/video_chroma/i420_yuy2.h b/modules/video_chroma/i420_yuy2.h
index 406d8f094b..c96ca5730f 100644
--- a/modules/video_chroma/i420_yuy2.h
+++ b/modules/video_chroma/i420_yuy2.h
@@ -306,7 +306,10 @@ movdqu    %%xmm1, 16(%1)  # Store high UYVY                             \n\
         p_u += 16; p_v += 16;            \
     } while(0)
 
-#define AVX2_END  __asm__ __volatile__ ( "sfence" ::: "memory" )
+#define AVX2_END __asm__ __volatile__ ( " \
+    sfence                              \n\
+    vzeroupper                          \n\
+    " ::: "memory" ) /* Epilogue for the AVX2 path: store fence, then vzeroupper so SSE code that runs afterwards is not slowed down. NOTE(review): vzeroupper alters the YMM registers but only "memory" is listed as clobbered - confirm this is safe. */
 
 #define AVX2_INIT_ALIGNED "                                                   \n\
 vmovdqa     (%[y1]), %%ymm0  # Load 32 Y1                     ... y2  y1  y0  \n\
@@ -411,7 +414,9 @@ vmovdqu    %%ymm4, 32(%[l2])       # Store high UYVY
         p_u += 16; p_v += 16;                   \
     } while(0)
 
-#define AVX2_END  _mm_sfence()
+#define AVX2_END  \
+    _mm_sfence(); \
+    _mm256_zeroupper();
 
 #define AVX2_INIT_ALIGNED                       \
     ymm0 = _mm256_load_si256((__m256i *)p_y1);  \
diff --git a/modules/video_chroma/i422_yuy2.h b/modules/video_chroma/i422_yuy2.h
index 72bea03284..e3984ae635 100644
--- a/modules/video_chroma/i422_yuy2.h
+++ b/modules/video_chroma/i422_yuy2.h
@@ -221,7 +221,10 @@ movdqu    %%xmm1, 16(%0)  # Store high UYVY                             \n\
         p_u += 16; p_v += 16;               \
     } while(0)
 
-#define AVX2_END  __asm__ __volatile__ ( "sfence" ::: "memory" )
+#define AVX2_END __asm__ __volatile__ ( " \
+    sfence                              \n\
+    vzeroupper                          \n\
+    " ::: "memory" ) /* Epilogue for the AVX2 path: store fence, then vzeroupper so SSE code that runs afterwards is not slowed down. NOTE(review): vzeroupper alters the YMM registers but only "memory" is listed as clobbered - confirm this is safe. */
 
 #define AVX2_INIT_ALIGNED "                                                    \n\
 vmovdqa      (%[y]), %%ymm0  # Load 32 Y                      ...  y2  y1  y0  \n\
@@ -298,7 +301,9 @@ vmovdqu    %%ymm1, 32(%[l])        # Store high UYVY
         p_u += 16; p_v += 16;           \
     } while(0)
 
-#define AVX2_END  _mm_sfence()
+#define AVX2_END  \
+    _mm_sfence(); \
+    _mm256_zeroupper();
 
 #define AVX2_INIT_ALIGNED                      \
     ymm0 = _mm256_load_si256((__m256i *)p_y);  \



More information about the vlc-devel mailing list