[vlc-devel] [PATCH 6/19] i420_yuy2, i422_yuy2, i420_rgb: add AVX2 acceleration

Lyndon Brown jnqnfe at gmail.com
Thu Sep 24 21:36:57 CEST 2020


From: Lyndon Brown <jnqnfe at gmail.com>
Date: Sun, 20 Jan 2019 23:46:37 +0000
Subject: i420_yuy2,i422_yuy2,i420_rgb: add AVX2 acceleration

[revised edition]
 - corrected the misunderstanding that NASM syntax could be used
   directly in inline asm by switching back to AT&T (GAS) syntax
   (GCC apparently does support Intel syntax, but its support is
   not up to scratch); a proper switch to NASM, which requires
   moving the asm into separate files, can perhaps be done later.
 - fixed a few small bugs
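
For reference, a minimal sketch (illustrative only, not part of the patch)
of the syntax distinction referred to above: GCC's extended inline asm
expects the AT&T/GAS dialect by default, which is what the AVX2_* macros
below use; Intel/NASM-style operand order is only accepted when the Intel
dialect is explicitly selected (e.g. -masm=intel), so keeping NASM-syntax
sources would mean moving the asm out into files assembled by NASM itself.

    /* sketch.c -- illustrative only, not part of the patch */
    #include <stdint.h>

    /* Default AT&T/GAS dialect, as used by the AVX2_* macros below:
     * memory source first, "%%" register prefixes, "$" immediates. */
    static inline void load_y_att(const uint8_t *p_y)
    {
        __asm__ __volatile__ (
            "vmovdqu (%[y]), %%ymm6"   /* destination register comes last */
            :
            : [y]"r"(p_y)
            : "ymm6" );
    }

    #if 0 /* Intel-dialect equivalent; assembles only with -masm=intel */
    static inline void load_y_intel(const uint8_t *p_y)
    {
        __asm__ __volatile__ (
            "vmovdqu ymm6, [%[y]]"     /* destination first, no prefixes */
            :
            : [y]"r"(p_y)
            : "ymm6" );
    }
    #endif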

diff --git a/modules/video_chroma/Makefile.am b/modules/video_chroma/Makefile.am
index 30974e589e..4b51140677 100644
--- a/modules/video_chroma/Makefile.am
+++ b/modules/video_chroma/Makefile.am
@@ -110,6 +110,26 @@ chroma_LTLIBRARIES += \
 	libi422_yuy2_sse2_plugin.la
 endif
 
+# AVX2
+libi420_rgb_avx2_plugin_la_SOURCES = video_chroma/i420_rgb.c video_chroma/i420_rgb.h \
+	video_chroma/i420_rgb16_x86.c video_chroma/i420_rgb_avx2.h
+libi420_rgb_avx2_plugin_la_CPPFLAGS = $(AM_CPPFLAGS) -DAVX2
+
+libi420_yuy2_avx2_plugin_la_SOURCES = video_chroma/i420_yuy2.c video_chroma/i420_yuy2.h
+libi420_yuy2_avx2_plugin_la_CPPFLAGS = $(AM_CPPFLAGS) \
+	-DMODULE_NAME_IS_i420_yuy2_avx2
+
+libi422_yuy2_avx2_plugin_la_SOURCES = video_chroma/i422_yuy2.c video_chroma/i422_yuy2.h
+libi422_yuy2_avx2_plugin_la_CPPFLAGS = $(AM_CPPFLAGS) \
+	-DMODULE_NAME_IS_i422_yuy2_avx2
+
+if HAVE_AVX2
+chroma_LTLIBRARIES += \
+	libi420_rgb_avx2_plugin.la \
+	libi420_yuy2_avx2_plugin.la \
+	libi422_yuy2_avx2_plugin.la
+endif
+
 libcvpx_plugin_la_SOURCES = codec/vt_utils.c codec/vt_utils.h video_chroma/cvpx.c
 if HAVE_IOS
 libcvpx_plugin_la_CFLAGS = $(AM_CFLAGS) -miphoneos-version-min=8.0
diff --git a/modules/video_chroma/i420_rgb.c b/modules/video_chroma/i420_rgb.c
index 24cc324731..b1ed2ac57f 100644
--- a/modules/video_chroma/i420_rgb.c
+++ b/modules/video_chroma/i420_rgb.c
@@ -68,7 +68,12 @@ static int  Activate   ( vlc_object_t * );
 static void Deactivate ( vlc_object_t * );
 
 vlc_module_begin ()
-#if defined (SSE2)
+#if defined (AVX2)
+    set_description( N_( "AVX2 I420,IYUV,YV12 to "
+                        "RV15,RV16,RV32 conversions") )
+    set_capability( "video converter", 130 )
+# define vlc_CPU_capable() vlc_CPU_AVX2()
+#elif defined (SSE2)
     set_description( N_( "SSE2 I420,IYUV,YV12 to "
                         "RV15,RV16,RV24,RV32 conversions") )
     set_capability( "video converter", 120 )
diff --git a/modules/video_chroma/i420_rgb.h b/modules/video_chroma/i420_rgb.h
index 8c612e3091..aeb9ac1c2a 100644
--- a/modules/video_chroma/i420_rgb.h
+++ b/modules/video_chroma/i420_rgb.h
@@ -21,7 +21,7 @@
  *****************************************************************************/
 #include <limits.h>
 
-#if !defined (SSE2) && !defined (MMX)
+#if !defined (AVX2) && !defined (SSE2) && !defined (MMX)
 # define PLAIN
 #endif
 
diff --git a/modules/video_chroma/i420_rgb16_x86.c b/modules/video_chroma/i420_rgb16_x86.c
index 35d3272baf..037ed5ad1b 100644
--- a/modules/video_chroma/i420_rgb16_x86.c
+++ b/modules/video_chroma/i420_rgb16_x86.c
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * i420_rgb16_x86.c : YUV to bitmap RGB conversion module for vlc
  *****************************************************************************
- * Copyright (C) 2000 VLC authors and VideoLAN
+ * Copyright (C) 2000, 2019 VLC authors and VideoLAN
  *
  * Authors: Samuel Hocevar <sam at zoy.org>
  *          Damien Fouilleul <damienf at videolan.org>
@@ -31,12 +31,17 @@
 #include <vlc_cpu.h>
 
 #include "i420_rgb.h"
-#ifdef SSE2
-# include "i420_rgb_sse2.h"
-# define VLC_TARGET VLC_SSE
+#ifdef AVX2
+# include "i420_rgb_avx2.h"
+# define VLC_TARGET VLC_AVX
 #else
-# include "i420_rgb_mmx.h"
-# define VLC_TARGET VLC_MMX
+# ifdef SSE2
+#  include "i420_rgb_sse2.h"
+#  define VLC_TARGET VLC_SSE
+# else
+#  include "i420_rgb_mmx.h"
+#  define VLC_TARGET VLC_MMX
+# endif
 #endif
 
 /*****************************************************************************
@@ -161,16 +166,127 @@ void I420_R5G5B5( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
                     (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height) :
                     (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height);
 
-#ifdef SSE2
+    p_buffer = b_hscale ? p_buffer_start : p_pic;
 
-    i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 15;
+#if defined (AVX2)
 
-    /*
-    ** SSE2 128 bits fetch/store instructions are faster
-    ** if memory access is 16 bytes aligned
-    */
+    i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 31;
 
-    p_buffer = b_hscale ? p_buffer_start : p_pic;
+    /* AVX2 aligned store/load requires 32-byte alignment */
+
+    if( 0 == (31 & (p_src->p[Y_PLANE].i_pitch|
+                    p_dest->p->i_pitch|
+                    ((intptr_t)p_y)|
+                    ((intptr_t)p_buffer))) )
+    {
+        /* use AVX2 aligned fetch and store */
+        for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
+        {
+            p_pic_start = p_pic;
+
+            for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)/32; i_x--; )
+            {
+                AVX2_CALL (
+                    AVX2_INIT_16_ALIGNED
+                    AVX2_YUV_MUL
+                    AVX2_YUV_ADD
+                    AVX2_UNPACK_15_ALIGNED
+                );
+                p_y += 32;
+                p_u += 16;
+                p_v += 16;
+                p_buffer += 32;
+            }
+            /* Here we do some unaligned reads and duplicate conversions, but
+             * at least we have all the pixels */
+            if( i_rewind )
+            {
+                p_y -= i_rewind;
+                p_u -= i_rewind >> 1;
+                p_v -= i_rewind >> 1;
+                p_buffer -= i_rewind;
+
+                AVX2_CALL (
+                    AVX2_INIT_16_UNALIGNED
+                    AVX2_YUV_MUL
+                    AVX2_YUV_ADD
+                    AVX2_UNPACK_15_UNALIGNED
+                );
+                p_y += 32;
+                p_u += 16;
+                p_v += 16;
+            }
+            SCALE_WIDTH;
+            SCALE_HEIGHT( 420, 2 );
+
+            p_y += i_source_margin;
+            if( i_y % 2 )
+            {
+                p_u += i_source_margin_c;
+                p_v += i_source_margin_c;
+            }
+            p_buffer = b_hscale ? p_buffer_start : p_pic;
+        }
+    }
+    else {
+        /* use AVX2 unaligned fetch and store */
+        for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
+        {
+            p_pic_start = p_pic;
+
+            for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)/32; i_x--; )
+            {
+                AVX2_CALL (
+                    AVX2_INIT_16_UNALIGNED
+                    AVX2_YUV_MUL
+                    AVX2_YUV_ADD
+                    AVX2_UNPACK_15_UNALIGNED
+                );
+                p_y += 32;
+                p_u += 16;
+                p_v += 16;
+                p_buffer += 32;
+            }
+            /* Here we do some unaligned reads and duplicate conversions, but
+             * at least we have all the pixels */
+            if( i_rewind )
+            {
+                p_y -= i_rewind;
+                p_u -= i_rewind >> 1;
+                p_v -= i_rewind >> 1;
+                p_buffer -= i_rewind;
+
+                AVX2_CALL (
+                    AVX2_INIT_16_UNALIGNED
+                    AVX2_YUV_MUL
+                    AVX2_YUV_ADD
+                    AVX2_UNPACK_15_UNALIGNED
+                );
+                p_y += 32;
+                p_u += 16;
+                p_v += 16;
+            }
+            SCALE_WIDTH;
+            SCALE_HEIGHT( 420, 2 );
+
+            p_y += i_source_margin;
+            if( i_y % 2 )
+            {
+                p_u += i_source_margin_c;
+                p_v += i_source_margin_c;
+            }
+            p_buffer = b_hscale ? p_buffer_start : p_pic;
+        }
+    }
+
+    /* make sure all AVX2 stores are visible thereafter */
+    AVX2_END;
+
+#elif defined (SSE2)
+
+    i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 15;
+
+    /* SSE2 aligned store/load is faster, requires 16-byte alignment */
 
     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
                     p_dest->p->i_pitch|
@@ -281,7 +397,7 @@ void I420_R5G5B5( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
     /* make sure all SSE2 stores are visible thereafter */
     SSE2_END;
 
-#else /* SSE2 */
+#else /* MMX */
 
     i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 7;
 
@@ -335,8 +451,7 @@ void I420_R5G5B5( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
     }
     /* re-enable FPU registers */
     MMX_END;
-
-#endif /* SSE2 */
+#endif
 }
 
 VLC_TARGET
@@ -401,16 +516,127 @@ void I420_R5G6B5( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
                     (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height) :
                     (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height);
 
-#ifdef SSE2
+    p_buffer = b_hscale ? p_buffer_start : p_pic;
 
-    i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 15;
+#if defined (AVX2)
 
-    /*
-    ** SSE2 128 bits fetch/store instructions are faster
-    ** if memory access is 16 bytes aligned
-    */
+    i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 31;
 
-    p_buffer = b_hscale ? p_buffer_start : p_pic;
+    /* AVX2 aligned store/load requires 32-byte alignment */
+
+    if( 0 == (31 & (p_src->p[Y_PLANE].i_pitch|
+                    p_dest->p->i_pitch|
+                    ((intptr_t)p_y)|
+                    ((intptr_t)p_buffer))) )
+    {
+        /* use AVX2 aligned fetch and store */
+        for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
+        {
+            p_pic_start = p_pic;
+
+            for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)/32; i_x--; )
+            {
+                AVX2_CALL(
+                    AVX2_INIT_16_ALIGNED
+                    AVX2_YUV_MUL
+                    AVX2_YUV_ADD
+                    AVX2_UNPACK_16_ALIGNED
+                );
+                p_y += 32;
+                p_u += 16;
+                p_v += 16;
+                p_buffer += 32;
+            }
+            /* Here we do some unaligned reads and duplicate conversions, but
+             * at least we have all the pixels */
+            if( i_rewind )
+            {
+                p_y -= i_rewind;
+                p_u -= i_rewind >> 1;
+                p_v -= i_rewind >> 1;
+                p_buffer -= i_rewind;
+
+                AVX2_CALL(
+                    AVX2_INIT_16_UNALIGNED
+                    AVX2_YUV_MUL
+                    AVX2_YUV_ADD
+                    AVX2_UNPACK_16_UNALIGNED
+                );
+                p_y += 32;
+                p_u += 16;
+                p_v += 16;
+            }
+            SCALE_WIDTH;
+            SCALE_HEIGHT( 420, 2 );
+
+            p_y += i_source_margin;
+            if( i_y % 2 )
+            {
+                p_u += i_source_margin_c;
+                p_v += i_source_margin_c;
+            }
+            p_buffer = b_hscale ? p_buffer_start : p_pic;
+        }
+    }
+    else {
+        /* use AVX2 unaligned fetch and store */
+        for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
+        {
+            p_pic_start = p_pic;
+
+            for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)/32; i_x--; )
+            {
+                AVX2_CALL(
+                    AVX2_INIT_16_UNALIGNED
+                    AVX2_YUV_MUL
+                    AVX2_YUV_ADD
+                    AVX2_UNPACK_16_UNALIGNED
+                );
+                p_y += 32;
+                p_u += 16;
+                p_v += 16;
+                p_buffer += 32;
+            }
+            /* Here we do some unaligned reads and duplicate conversions, but
+             * at least we have all the pixels */
+            if( i_rewind )
+            {
+                p_y -= i_rewind;
+                p_u -= i_rewind >> 1;
+                p_v -= i_rewind >> 1;
+                p_buffer -= i_rewind;
+
+                AVX2_CALL(
+                    AVX2_INIT_16_UNALIGNED
+                    AVX2_YUV_MUL
+                    AVX2_YUV_ADD
+                    AVX2_UNPACK_16_UNALIGNED
+                );
+                p_y += 32;
+                p_u += 16;
+                p_v += 16;
+            }
+            SCALE_WIDTH;
+            SCALE_HEIGHT( 420, 2 );
+
+            p_y += i_source_margin;
+            if( i_y % 2 )
+            {
+                p_u += i_source_margin_c;
+                p_v += i_source_margin_c;
+            }
+            p_buffer = b_hscale ? p_buffer_start : p_pic;
+        }
+    }
+
+    /* make sure all AVX2 stores are visible thereafter */
+    AVX2_END;
+
+#elif defined (SSE2)
+
+    i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 15;
+
+    /* SSE2 aligned store/load is faster, requires 16-byte alignment */
 
     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
                     p_dest->p->i_pitch|
@@ -521,7 +747,7 @@ void I420_R5G6B5( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
     /* make sure all SSE2 stores are visible thereafter */
     SSE2_END;
 
-#else /* SSE2 */
+#else /* MMX */
 
     i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 7;
 
@@ -575,8 +801,7 @@ void I420_R5G6B5( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
     }
     /* re-enable FPU registers */
     MMX_END;
-
-#endif /* SSE2 */
+#endif
 }
 
 VLC_TARGET
@@ -641,16 +866,127 @@ void I420_A8R8G8B8( filter_t *p_filter, picture_t *p_src,
                     (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height) :
                     (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height);
 
-#ifdef SSE2
+    p_buffer = b_hscale ? p_buffer_start : p_pic;
 
-    i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 15;
+#if defined (AVX2)
 
-    /*
-    ** SSE2 128 bits fetch/store instructions are faster
-    ** if memory access is 16 bytes aligned
-    */
+    i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 31;
 
-    p_buffer = b_hscale ? p_buffer_start : p_pic;
+    /* AVX2 aligned store/load requires 32-byte alignment */
+
+    if( 0 == (31 & (p_src->p[Y_PLANE].i_pitch|
+                    p_dest->p->i_pitch|
+                    ((intptr_t)p_y)|
+                    ((intptr_t)p_buffer))) )
+    {
+        /* use AVX2 aligned fetch and store */
+        for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
+        {
+            p_pic_start = p_pic;
+
+            for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32; i_x--; )
+            {
+                AVX2_CALL (
+                    AVX2_INIT_32_ALIGNED
+                    AVX2_YUV_MUL
+                    AVX2_YUV_ADD
+                    AVX2_UNPACK_32_ARGB_ALIGNED
+                );
+                p_y += 32;
+                p_u += 16;
+                p_v += 16;
+                p_buffer += 32;
+            }
+
+            /* Here we do some unaligned reads and duplicate conversions, but
+             * at least we have all the pixels */
+            if( i_rewind )
+            {
+                p_y -= i_rewind;
+                p_u -= i_rewind >> 1;
+                p_v -= i_rewind >> 1;
+                p_buffer -= i_rewind;
+                AVX2_CALL (
+                    AVX2_INIT_32_UNALIGNED
+                    AVX2_YUV_MUL
+                    AVX2_YUV_ADD
+                    AVX2_UNPACK_32_ARGB_UNALIGNED
+                );
+                p_y += 32;
+                p_u += 16;
+                p_v += 16;
+            }
+            SCALE_WIDTH;
+            SCALE_HEIGHT( 420, 4 );
+
+            p_y += i_source_margin;
+            if( i_y % 2 )
+            {
+                p_u += i_source_margin_c;
+                p_v += i_source_margin_c;
+            }
+            p_buffer = b_hscale ? p_buffer_start : p_pic;
+        }
+    }
+    else {
+        /* use AVX2 unaligned fetch and store */
+        for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
+        {
+            p_pic_start = p_pic;
+
+            for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32; i_x--; )
+            {
+                AVX2_CALL (
+                    AVX2_INIT_32_UNALIGNED
+                    AVX2_YUV_MUL
+                    AVX2_YUV_ADD
+                    AVX2_UNPACK_32_ARGB_UNALIGNED
+                );
+                p_y += 32;
+                p_u += 16;
+                p_v += 16;
+                p_buffer += 32;
+            }
+
+            /* Here we do some unaligned reads and duplicate conversions, but
+             * at least we have all the pixels */
+            if( i_rewind )
+            {
+                p_y -= i_rewind;
+                p_u -= i_rewind >> 1;
+                p_v -= i_rewind >> 1;
+                p_buffer -= i_rewind;
+                AVX2_CALL (
+                    AVX2_INIT_32_UNALIGNED
+                    AVX2_YUV_MUL
+                    AVX2_YUV_ADD
+                    AVX2_UNPACK_32_ARGB_UNALIGNED
+                );
+                p_y += 32;
+                p_u += 16;
+                p_v += 16;
+            }
+            SCALE_WIDTH;
+            SCALE_HEIGHT( 420, 4 );
+
+            p_y += i_source_margin;
+            if( i_y % 2 )
+            {
+                p_u += i_source_margin_c;
+                p_v += i_source_margin_c;
+            }
+            p_buffer = b_hscale ? p_buffer_start : p_pic;
+        }
+    }
+
+    /* make sure all AVX2 stores are visible thereafter */
+    AVX2_END;
+
+#elif defined (SSE2)
+
+    i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 15;
+
+    /* SSE2 aligned store/load is faster, requires 16-byte alignment */
 
     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
                     p_dest->p->i_pitch|
@@ -761,7 +1097,7 @@ void I420_A8R8G8B8( filter_t *p_filter, picture_t *p_src,
     /* make sure all SSE2 stores are visible thereafter */
     SSE2_END;
 
-#else
+#else /* MMX */
 
     i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 7;
 
@@ -815,7 +1151,6 @@ void I420_A8R8G8B8( filter_t *p_filter, picture_t *p_src,
 
     /* re-enable FPU registers */
     MMX_END;
-
 #endif
 }
 
@@ -880,16 +1215,127 @@ void I420_R8G8B8A8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
                     (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height) :
                     (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height);
 
-#ifdef SSE2
+    p_buffer = b_hscale ? p_buffer_start : p_pic;
 
-    i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 15;
+#if defined (AVX2)
 
-    /*
-    ** SSE2 128 bits fetch/store instructions are faster
-    ** if memory access is 16 bytes aligned
-    */
+    i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 31;
 
-    p_buffer = b_hscale ? p_buffer_start : p_pic;
+    /* AVX2 aligned store/load requires 32-byte alignment */
+
+    if( 0 == (31 & (p_src->p[Y_PLANE].i_pitch|
+                    p_dest->p->i_pitch|
+                    ((intptr_t)p_y)|
+                    ((intptr_t)p_buffer))) )
+    {
+        /* use AVX2 aligned fetch and store */
+        for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
+        {
+            p_pic_start = p_pic;
+
+            for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32; i_x--; )
+            {
+                AVX2_CALL (
+                    AVX2_INIT_32_ALIGNED
+                    AVX2_YUV_MUL
+                    AVX2_YUV_ADD
+                    AVX2_UNPACK_32_RGBA_ALIGNED
+                );
+                p_y += 32;
+                p_u += 16;
+                p_v += 16;
+                p_buffer += 32;
+            }
+
+            /* Here we do some unaligned reads and duplicate conversions, but
+             * at least we have all the pixels */
+            if( i_rewind )
+            {
+                p_y -= i_rewind;
+                p_u -= i_rewind >> 1;
+                p_v -= i_rewind >> 1;
+                p_buffer -= i_rewind;
+                AVX2_CALL (
+                    AVX2_INIT_32_UNALIGNED
+                    AVX2_YUV_MUL
+                    AVX2_YUV_ADD
+                    AVX2_UNPACK_32_RGBA_UNALIGNED
+                );
+                p_y += 32;
+                p_u += 16;
+                p_v += 16;
+            }
+            SCALE_WIDTH;
+            SCALE_HEIGHT( 420, 4 );
+
+            p_y += i_source_margin;
+            if( i_y % 2 )
+            {
+                p_u += i_source_margin_c;
+                p_v += i_source_margin_c;
+            }
+            p_buffer = b_hscale ? p_buffer_start : p_pic;
+        }
+    }
+    else {
+        /* use AVX2 unaligned fetch and store */
+        for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
+        {
+            p_pic_start = p_pic;
+
+            for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32; i_x--; )
+            {
+                AVX2_CALL (
+                    AVX2_INIT_32_UNALIGNED
+                    AVX2_YUV_MUL
+                    AVX2_YUV_ADD
+                    AVX2_UNPACK_32_RGBA_UNALIGNED
+                );
+                p_y += 32;
+                p_u += 16;
+                p_v += 16;
+                p_buffer += 32;
+            }
+
+            /* Here we do some unaligned reads and duplicate conversions, but
+             * at least we have all the pixels */
+            if( i_rewind )
+            {
+                p_y -= i_rewind;
+                p_u -= i_rewind >> 1;
+                p_v -= i_rewind >> 1;
+                p_buffer -= i_rewind;
+                AVX2_CALL (
+                    AVX2_INIT_32_UNALIGNED
+                    AVX2_YUV_MUL
+                    AVX2_YUV_ADD
+                    AVX2_UNPACK_32_RGBA_UNALIGNED
+                );
+                p_y += 32;
+                p_u += 16;
+                p_v += 16;
+            }
+            SCALE_WIDTH;
+            SCALE_HEIGHT( 420, 4 );
+
+            p_y += i_source_margin;
+            if( i_y % 2 )
+            {
+                p_u += i_source_margin_c;
+                p_v += i_source_margin_c;
+            }
+            p_buffer = b_hscale ? p_buffer_start : p_pic;
+        }
+    }
+
+    /* make sure all AVX2 stores are visible thereafter */
+    AVX2_END;
+
+#elif defined (SSE2)
+
+    i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 15;
+
+    /* SSE2 aligned store/load is faster, requires 16-byte alignment */
 
     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
                     p_dest->p->i_pitch|
@@ -1000,7 +1446,7 @@ void I420_R8G8B8A8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
     /* make sure all SSE2 stores are visible thereafter */
     SSE2_END;
 
-#else
+#else /* MMX */
 
     i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 7;
 
@@ -1054,7 +1500,6 @@ void I420_R8G8B8A8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
 
     /* re-enable FPU registers */
     MMX_END;
-
 #endif
 }
 
@@ -1119,16 +1564,127 @@ void I420_B8G8R8A8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
                     (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height) :
                     (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height);
 
-#ifdef SSE2
+    p_buffer = b_hscale ? p_buffer_start : p_pic;
 
-    i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 15;
+#if defined (AVX2)
 
-    /*
-    ** SSE2 128 bits fetch/store instructions are faster
-    ** if memory access is 16 bytes aligned
-    */
+    i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 31;
 
-    p_buffer = b_hscale ? p_buffer_start : p_pic;
+    /* AVX2 aligned store/load requires 32-byte alignment */
+
+    if( 0 == (31 & (p_src->p[Y_PLANE].i_pitch|
+                    p_dest->p->i_pitch|
+                    ((intptr_t)p_y)|
+                    ((intptr_t)p_buffer))) )
+    {
+        /* use AVX2 aligned fetch and store */
+        for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
+        {
+            p_pic_start = p_pic;
+
+            for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32; i_x--; )
+            {
+                AVX2_CALL (
+                    AVX2_INIT_32_ALIGNED
+                    AVX2_YUV_MUL
+                    AVX2_YUV_ADD
+                    AVX2_UNPACK_32_BGRA_ALIGNED
+                );
+                p_y += 32;
+                p_u += 16;
+                p_v += 16;
+                p_buffer += 32;
+            }
+
+            /* Here we do some unaligned reads and duplicate conversions, but
+             * at least we have all the pixels */
+            if( i_rewind )
+            {
+                p_y -= i_rewind;
+                p_u -= i_rewind >> 1;
+                p_v -= i_rewind >> 1;
+                p_buffer -= i_rewind;
+                AVX2_CALL (
+                    AVX2_INIT_32_UNALIGNED
+                    AVX2_YUV_MUL
+                    AVX2_YUV_ADD
+                    AVX2_UNPACK_32_BGRA_UNALIGNED
+                );
+                p_y += 32;
+                p_u += 16;
+                p_v += 16;
+            }
+            SCALE_WIDTH;
+            SCALE_HEIGHT( 420, 4 );
+
+            p_y += i_source_margin;
+            if( i_y % 2 )
+            {
+                p_u += i_source_margin_c;
+                p_v += i_source_margin_c;
+            }
+            p_buffer = b_hscale ? p_buffer_start : p_pic;
+        }
+    }
+    else {
+        /* use AVX2 unaligned fetch and store */
+        for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
+        {
+            p_pic_start = p_pic;
+
+            for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32; i_x--; )
+            {
+                AVX2_CALL (
+                    AVX2_INIT_32_UNALIGNED
+                    AVX2_YUV_MUL
+                    AVX2_YUV_ADD
+                    AVX2_UNPACK_32_BGRA_UNALIGNED
+                );
+                p_y += 32;
+                p_u += 16;
+                p_v += 16;
+                p_buffer += 32;
+            }
+
+            /* Here we do some unaligned reads and duplicate conversions, but
+             * at least we have all the pixels */
+            if( i_rewind )
+            {
+                p_y -= i_rewind;
+                p_u -= i_rewind >> 1;
+                p_v -= i_rewind >> 1;
+                p_buffer -= i_rewind;
+                AVX2_CALL (
+                    AVX2_INIT_32_UNALIGNED
+                    AVX2_YUV_MUL
+                    AVX2_YUV_ADD
+                    AVX2_UNPACK_32_BGRA_UNALIGNED
+                );
+                p_y += 32;
+                p_u += 16;
+                p_v += 16;
+            }
+            SCALE_WIDTH;
+            SCALE_HEIGHT( 420, 4 );
+
+            p_y += i_source_margin;
+            if( i_y % 2 )
+            {
+                p_u += i_source_margin_c;
+                p_v += i_source_margin_c;
+            }
+            p_buffer = b_hscale ? p_buffer_start : p_pic;
+        }
+    }
+
+    /* make sure all AVX2 stores are visible thereafter */
+    AVX2_END;
+
+#elif defined (SSE2)
+
+    i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 15;
+
+    /* SSE2 aligned store/load is faster, requires 16-byte alignment */
 
     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
                     p_dest->p->i_pitch|
@@ -1239,7 +1795,7 @@ void I420_B8G8R8A8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
     /* make sure all SSE2 stores are visible thereafter */
     SSE2_END;
 
-#else
+#else /* MMX */
 
     i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 7;
 
@@ -1293,7 +1849,6 @@ void I420_B8G8R8A8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
 
     /* re-enable FPU registers */
     MMX_END;
-
 #endif
 }
 
@@ -1358,16 +1913,127 @@ void I420_A8B8G8R8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
                     (p_filter->fmt_out.video.i_y_offset + p_filter->fmt_out.video.i_visible_height) :
                     (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height);
 
-#ifdef SSE2
+    p_buffer = b_hscale ? p_buffer_start : p_pic;
 
-    i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 15;
+#if defined (AVX2)
 
-    /*
-    ** SSE2 128 bits fetch/store instructions are faster
-    ** if memory access is 16 bytes aligned
-    */
+    i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 31;
 
-    p_buffer = b_hscale ? p_buffer_start : p_pic;
+    /* AVX2 aligned store/load requires 32-byte alignment */
+
+    if( 0 == (31 & (p_src->p[Y_PLANE].i_pitch|
+                    p_dest->p->i_pitch|
+                    ((intptr_t)p_y)|
+                    ((intptr_t)p_buffer))) )
+    {
+        /* use AVX2 aligned fetch and store */
+        for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
+        {
+            p_pic_start = p_pic;
+
+            for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32; i_x--; )
+            {
+                AVX2_CALL (
+                    AVX2_INIT_32_ALIGNED
+                    AVX2_YUV_MUL
+                    AVX2_YUV_ADD
+                    AVX2_UNPACK_32_ABGR_ALIGNED
+                );
+                p_y += 32;
+                p_u += 16;
+                p_v += 16;
+                p_buffer += 32;
+            }
+
+            /* Here we do some unaligned reads and duplicate conversions, but
+             * at least we have all the pixels */
+            if( i_rewind )
+            {
+                p_y -= i_rewind;
+                p_u -= i_rewind >> 1;
+                p_v -= i_rewind >> 1;
+                p_buffer -= i_rewind;
+                AVX2_CALL (
+                    AVX2_INIT_32_UNALIGNED
+                    AVX2_YUV_MUL
+                    AVX2_YUV_ADD
+                    AVX2_UNPACK_32_ABGR_UNALIGNED
+                );
+                p_y += 32;
+                p_u += 16;
+                p_v += 16;
+            }
+            SCALE_WIDTH;
+            SCALE_HEIGHT( 420, 4 );
+
+            p_y += i_source_margin;
+            if( i_y % 2 )
+            {
+                p_u += i_source_margin_c;
+                p_v += i_source_margin_c;
+            }
+            p_buffer = b_hscale ? p_buffer_start : p_pic;
+        }
+    }
+    else {
+        /* use AVX2 unaligned fetch and store */
+        for( i_y = 0; i_y < (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height); i_y++ )
+        {
+            p_pic_start = p_pic;
+
+            for ( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32; i_x--; )
+            {
+                AVX2_CALL (
+                    AVX2_INIT_32_UNALIGNED
+                    AVX2_YUV_MUL
+                    AVX2_YUV_ADD
+                    AVX2_UNPACK_32_ABGR_UNALIGNED
+                );
+                p_y += 32;
+                p_u += 16;
+                p_v += 16;
+                p_buffer += 32;
+            }
+
+            /* Here we do some unaligned reads and duplicate conversions, but
+             * at least we have all the pixels */
+            if( i_rewind )
+            {
+                p_y -= i_rewind;
+                p_u -= i_rewind >> 1;
+                p_v -= i_rewind >> 1;
+                p_buffer -= i_rewind;
+                AVX2_CALL (
+                    AVX2_INIT_32_UNALIGNED
+                    AVX2_YUV_MUL
+                    AVX2_YUV_ADD
+                    AVX2_UNPACK_32_ABGR_UNALIGNED
+                );
+                p_y += 32;
+                p_u += 16;
+                p_v += 16;
+            }
+            SCALE_WIDTH;
+            SCALE_HEIGHT( 420, 4 );
+
+            p_y += i_source_margin;
+            if( i_y % 2 )
+            {
+                p_u += i_source_margin_c;
+                p_v += i_source_margin_c;
+            }
+            p_buffer = b_hscale ? p_buffer_start : p_pic;
+        }
+    }
+
+    /* make sure all AVX2 stores are visible thereafter */
+    AVX2_END;
+
+#elif defined (SSE2)
+
+    i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 15;
+
+    /* SSE2 aligned store/load is faster, requires 16-byte alignment */
 
     if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch|
                     p_dest->p->i_pitch|
@@ -1478,7 +2144,7 @@ void I420_A8B8G8R8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
     /* make sure all SSE2 stores are visible thereafter */
     SSE2_END;
 
-#else
+#else /* MMX */
 
     i_rewind = (-(p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width)) & 7;
 
@@ -1532,6 +2198,5 @@ void I420_A8B8G8R8( filter_t *p_filter, picture_t *p_src, picture_t *p_dest )
 
     /* re-enable FPU registers */
     MMX_END;
-
 #endif
 }
diff --git a/modules/video_chroma/i420_rgb_avx2.h b/modules/video_chroma/i420_rgb_avx2.h
new file mode 100644
index 0000000000..42e12fd504
--- /dev/null
+++ b/modules/video_chroma/i420_rgb_avx2.h
@@ -0,0 +1,670 @@
+/*****************************************************************************
+ * i420_rgb_avx2.h: AVX2 YUV transformation assembly
+ *****************************************************************************
+ * Copyright (C) 1999-2012, 2019 VLC authors and VideoLAN
+ *
+ * Authors: Damien Fouilleul <damienf at videolan.org>
+ *          Lyndon Brown <jnqnfe at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ *****************************************************************************/
+#if defined(CAN_COMPILE_AVX2)
+
+/* AVX2 assembly */
+
+#define AVX2_CALL(AVX2_INSTRUCTIONS)    \
+    do {                                \
+    __asm__ __volatile__(               \
+        ".p2align 3 \n\t"               \
+        AVX2_INSTRUCTIONS               \
+        :                               \
+        : [y]"r"(p_y), [u]"r"(p_u),     \
+          [v]"r"(p_v), [b]"r"(p_buffer) \
+        : "eax", "ymm0", "ymm1", "ymm2", "ymm3", \
+                 "ymm4", "ymm5", "ymm6", "ymm7" ); \
+    } while(0)
+
+#define AVX2_END  __asm__ __volatile__ ( "sfence" ::: "memory" )
+
+#define AVX2_INIT_16_ALIGNED "                                                \n\
+vmovdqa     (%[u]), %%xmm0   # Load 16 Cb into lower half     ... u2  u1  u0  \n\
+vmovdqa     (%[v]), %%xmm1   # Load 16 Cr into lower half     ... v2  v1  v0  \n\
+vmovdqa     (%[y]), %%ymm6   # Load 32 Y                      ... y2  y1  y0  \n\
+"
+
+#define AVX2_INIT_16_UNALIGNED "                                              \n\
+vmovdqu     (%[u]), %%xmm0   # Load 16 Cb into lower half     ... u2  u1  u0  \n\
+vmovdqu     (%[v]), %%xmm1   # Load 16 Cr into lower half     ... v2  v1  v0  \n\
+vmovdqu     (%[y]), %%ymm6   # Load 32 Y                      ... y2  y1  y0  \n\
+prefetchnta (%[b])           # Tell CPU not to cache output RGB data          \n\
+"
+
+#define AVX2_INIT_32_ALIGNED "                                                \n\
+vmovdqa     (%[u]), %%xmm0   # Load 16 Cb into lower half     ... u2  u1  u0  \n\
+vmovdqa     (%[v]), %%xmm1   # Load 16 Cr into lower half     ... v2  v1  v0  \n\
+vmovdqa     (%[y]), %%ymm6   # Load 32 Y                      ... y2  y1  y0  \n\
+"
+
+#define AVX2_INIT_32_UNALIGNED "                                              \n\
+vmovdqu     (%[u]), %%xmm0   # Load 16 Cb into lower half     ... u2  u1  u0  \n\
+vmovdqu     (%[v]), %%xmm1   # Load 16 Cr into lower half     ... v2  v1  v0  \n\
+vmovdqu     (%[y]), %%ymm6   # Load 32 Y                      ... y2  y1  y0  \n\
+prefetchnta (%[b])           # Tell CPU not to cache output RGB data          \n\
+"
+
+#define AVX2_YUV_MUL "                                                              \n\
+# convert the chroma part                                                           \n\
+vpmovzxbw  %%xmm0, %%ymm0          # Zero extend u                 ... 00 u1 00 u0  \n\
+vpmovzxbw  %%xmm1, %%ymm1          # Zero extend v                 ... 00 v1 00 v0  \n\
+mov        $0x00800080, %%eax      #                                                \n\
+vmovd      %%eax, %%xmm5           #                                                \n\
+vpshufd    $0, %%ymm5, %%ymm5      # Set ymm5 to                   ... 00 80 00 80  \n\
+vpsubsw    %%ymm5, %%ymm0, %%ymm0  # Cb -= 128                                      \n\
+vpsubsw    %%ymm5, %%ymm1, %%ymm1  # Cr -= 128                                      \n\
+vpsllw     $3, %%ymm0, %%ymm0      # Promote precision                              \n\
+vpsllw     $3, %%ymm1, %%ymm1      # Promote precision                              \n\
+mov        $0xf37df37d, %%eax      #                                                \n\
+vmovd      %%eax, %%xmm4           #                                                \n\
+vpshufd    $0, %%ymm4, %%ymm4      # Set ymm4 to                   ... f3 7d f3 7d  \n\
+vpmulhw    %%ymm4, %%ymm0, %%ymm2  # Mul Cb with green coeff -> Cb green            \n\
+mov        $0xe5fce5fc, %%eax      #                                                \n\
+vmovd      %%eax, %%xmm5           #                                                \n\
+vpshufd    $0, %%ymm5, %%ymm5      # Set ymm5 to                   ... e5 fc e5 fc  \n\
+vpmulhw    %%ymm5, %%ymm1, %%ymm3  # Mul Cr with green coeff -> Cr green            \n\
+mov        $0x40934093, %%eax      #                                                \n\
+vmovd      %%eax, %%xmm4           #                                                \n\
+vpshufd    $0, %%ymm4, %%ymm4      # Set ymm4 to                   ... 40 93 40 93  \n\
+vpmulhw    %%ymm4, %%ymm0, %%ymm0  # Mul Cb -> Cblue               ... 00 b1 00 b0  \n\
+mov        $0x33123312, %%eax      #                                                \n\
+vmovd      %%eax, %%xmm5           #                                                \n\
+vpshufd    $0, %%ymm5, %%ymm5      # Set ymm5 to                   ... 33 12 33 12  \n\
+vpmulhw    %%ymm5, %%ymm1, %%ymm1  # Mul Cr -> Cred                ... 00 r1 00 r0  \n\
+vpaddsw    %%ymm3, %%ymm2, %%ymm2  # Cb green + Cr green -> Cgreen                  \n\
+                                                                                    \n\
+# convert the luma part                                                             \n\
+mov        $0x10101010, %%eax      #                                                \n\
+vmovd      %%eax, %%xmm5           #                                                \n\
+vpshufd    $0, %%ymm5, %%ymm5      # Set ymm5 to                   ... 10 10 10 10  \n\
+vpsubusb   %%ymm5, %%ymm6, %%ymm6  # Y -= 16                                        \n\
+vpsrlw     $8, %%ymm6, %%ymm7      # get Y odd                     ... 00 y3 00 y1  \n\
+mov        $0x00ff00ff, %%eax      #                                                \n\
+vmovd      %%eax, %%xmm5           #                                                \n\
+vpshufd    $0, %%ymm5, %%ymm5      # set ymm5 to                   ... 00 ff 00 ff  \n\
+vpand      %%ymm5, %%ymm6, %%ymm6  # get Y even                    ... 00 y2 00 y0  \n\
+vpsllw     $3, %%ymm6, %%ymm6      # Promote precision                              \n\
+vpsllw     $3, %%ymm7, %%ymm7      # Promote precision                              \n\
+mov        $0x253f253f, %%eax      #                                                \n\
+vmovd      %%eax, %%xmm5           #                                                \n\
+vpshufd    $0, %%ymm5, %%ymm5      # set ymm5 to                   ... 25 3f 25 3f  \n\
+vpmulhw    %%ymm5, %%ymm6, %%ymm6  # Mul 16 Y even                 ... 00 y2 00 y0  \n\
+vpmulhw    %%ymm5, %%ymm7, %%ymm7  # Mul 16 Y odd                  ... 00 y3 00 y1  \n\
+"
+
+#define AVX2_YUV_ADD "                                                              \n\
+# Do horizontal and vertical scaling                                                \n\
+vpaddsw    %%ymm7, %%ymm0, %%ymm3  # Y odd  + Cblue                ... 00 B3 00 B1  \n\
+vpaddsw    %%ymm6, %%ymm0, %%ymm0  # Y even + Cblue                ... 00 B2 00 B0  \n\
+vpaddsw    %%ymm7, %%ymm1, %%ymm4  # Y odd  + Cred                 ... 00 R3 00 R1  \n\
+vpaddsw    %%ymm6, %%ymm1, %%ymm1  # Y even + Cred                 ... 00 R2 00 R0  \n\
+vpaddsw    %%ymm7, %%ymm2, %%ymm5  # Y odd  + Cgreen               ... 00 G3 00 G1  \n\
+vpaddsw    %%ymm6, %%ymm2, %%ymm2  # Y even + Cgreen               ... 00 G2 00 G0  \n\
+                                                                                    \n\
+# Limit RGB even to 0..255                                                          \n\
+vpackuswb  %%ymm0, %%ymm0, %%ymm0  # Saturate and pack   ... B4 B2 B0 ... B4 B2 B0  \n\
+vpackuswb  %%ymm1, %%ymm1, %%ymm1  # Saturate and pack   ... R4 R2 R0 ... R4 R2 R0  \n\
+vpackuswb  %%ymm2, %%ymm2, %%ymm2  # Saturate and pack   ... G4 G2 G0 ... G4 G2 G0  \n\
+                                                                                    \n\
+# Limit RGB odd to 0..255                                                           \n\
+vpackuswb  %%ymm3, %%ymm3, %%ymm3  # Saturate and pack   ... B5 B3 B1 ... B5 B3 B1  \n\
+vpackuswb  %%ymm4, %%ymm4, %%ymm4  # Saturate and pack   ... R5 R3 R1 ... R5 R3 R1  \n\
+vpackuswb  %%ymm5, %%ymm5, %%ymm5  # Saturate and pack   ... G5 G3 G1 ... G5 G3 G1  \n\
+                                                                                    \n\
+# Interleave RGB even and odd                                                       \n\
+vpunpcklbw %%ymm3, %%ymm0, %%ymm0  #                                  ... B2 B1 B0  \n\
+vpunpcklbw %%ymm4, %%ymm1, %%ymm1  #                                  ... R2 R1 R0  \n\
+vpunpcklbw %%ymm5, %%ymm2, %%ymm2  #                                  ... G2 G1 G0  \n\
+"
+
+#define AVX2_UNPACK_15_ALIGNED "   # Note, much of this shows bit patterns (of a pair of bytes) \n\
+# mask unneeded bits off                                                    \n\
+mov        $0xf8f8f8f8, %%eax      #                                        \n\
+vmovd      %%eax, %%xmm5           #                                        \n\
+vpshufd    $0, %%ymm5, %%ymm5      # set ymm5 to     f8 f8 ... f8 f8 f8 f8  \n\
+vpand      %%ymm5, %%ymm0, %%ymm0  # b7b6b5b4 b3______ b7b6b5b4 b3______    \n\
+vpsrlw     $3, %%ymm0, %%ymm0      # ______b7 b6b5b4b3 ______b7 b6b5b4b3    \n\
+vpand      %%ymm5, %%ymm2, %%ymm2  # g7g6g5g4 g3______ g7g6g5g4 g3______    \n\
+vpand      %%ymm5, %%ymm1, %%ymm1  # r7r6r5r4 r3______ r7r6r5r4 r3______    \n\
+vpsrlw     $1, %%ymm1, %%ymm1      # __r7r6r5 r4r3____ __r7r6r5 r4r3____    \n\
+                                                                            \n\
+# pack the 3 separate RGB bytes into 2 for pixels 0-15                      \n\
+vpmovzxbw  %%xmm2, %%ymm5          # ________ ________ g7g6g5g4 g3______    \n\
+vpunpcklbw %%ymm1, %%ymm0, %%ymm4  # __r7r6r5 r4r3____ ______b7 b6b5b4b3    \n\
+vpsllw     $2, %%ymm5, %%ymm5      # ________ ____g7g6 g5g4g3__ ________    \n\
+vpor       %%ymm5, %%ymm4, %%ymm4  # __r7r6r5 r4r3g7g6 g5g4g3b7 b6b5b4b3    \n\
+vmovntdq   %%ymm4, (%[b])          # store pixels 0-15                      \n\
+                                                                            \n\
+# pack the 3 separate RGB bytes into 2 for pixels 16-31                     \n\
+vpxor      %%ymm3, %%ymm3, %%ymm3  # zero ymm3                              \n\
+vpunpckhbw %%ymm3, %%ymm2, %%ymm7  # ________ ________ g7g6g5g4 g3______    \n\
+vpunpckhbw %%ymm1, %%ymm0, %%ymm6  # __r7r6r5 r4r3____ ______b7 b6b5b4b3    \n\
+vpsllw     $2, %%ymm7, %%ymm7      # ________ ____g7g6 g5g4g3__ ________    \n\
+vpor       %%ymm7, %%ymm6, %%ymm6  # __r7r6r5 r4r3g7g6 g5g4g3b7 b6b5b4b3    \n\
+vmovntdq   %%ymm6, 32(%[b])        # store pixels 16-31                     \n\
+"
+
+#define AVX2_UNPACK_15_UNALIGNED " # Note, much of this shows bit patterns (of a pair of bytes) \n\
+# mask unneeded bits off                                                    \n\
+mov        $0xf8f8f8f8, %%eax      #                                        \n\
+vmovd      %%eax, %%xmm5           #                                        \n\
+vpshufd    $0, %%ymm5, %%ymm5      # set ymm5 to     f8 f8 ... f8 f8 f8 f8  \n\
+vpand      %%ymm5, %%ymm0, %%ymm0  # b7b6b5b4 b3______ b7b6b5b4 b3______    \n\
+vpsrlw     $3, %%ymm0, %%ymm0      # ______b7 b6b5b4b3 ______b7 b6b5b4b3    \n\
+vpand      %%ymm5, %%ymm2, %%ymm2  # g7g6g5g4 g3______ g7g6g5g4 g3______    \n\
+vpand      %%ymm5, %%ymm1, %%ymm1  # r7r6r5r4 r3______ r7r6r5r4 r3______    \n\
+vpsrlw     $1, %%ymm1, %%ymm1      # __r7r6r5 r4r3____ __r7r6r5 r4r3____    \n\
+                                                                            \n\
+# pack the 3 separate RGB bytes into 2 for pixels 0-15                      \n\
+vpmovzxbw  %%xmm2, %%ymm5          # ________ ________ g7g6g5g4 g3______    \n\
+vpunpcklbw %%ymm1, %%ymm0, %%ymm4  # __r7r6r5 r4r3____ ______b7 b6b5b4b3    \n\
+vpsllw     $2, %%ymm5, %%ymm5      # ________ ____g7g6 g5g4g3__ ________    \n\
+vpor       %%ymm5, %%ymm4, %%ymm4  # __r7r6r5 r4r3g7g6 g5g4g3b7 b6b5b4b3    \n\
+vmovdqu    %%ymm4, (%[b])          # store pixels 0-15                      \n\
+                                                                            \n\
+# pack the 3 separate RGB bytes into 2 for pixels 16-31                     \n\
+vpxor      %%ymm3, %%ymm3, %%ymm3  # zero ymm3                              \n\
+vpunpckhbw %%ymm3, %%ymm2, %%ymm7  # ________ ________ g7g6g5g4 g3______    \n\
+vpunpckhbw %%ymm1, %%ymm0, %%ymm6  # __r7r6r5 r4r3____ ______b7 b6b5b4b3    \n\
+vpsllw     $2, %%ymm7, %%ymm7      # ________ ____g7g6 g5g4g3__ ________    \n\
+vpor       %%ymm7, %%ymm6, %%ymm6  # __r7r6r5 r4r3g7g6 g5g4g3b7 b6b5b4b3    \n\
+vmovdqu    %%ymm6, 32(%[b])        # store pixels 16-31                     \n\
+"
+
+#define AVX2_UNPACK_16_ALIGNED "   # Note, much of this shows bit patterns (of a pair of bytes) \n\
+# mask unneeded bits off                                                    \n\
+mov        $0xf8f8f8f8, %%eax      #                                        \n\
+vmovd      %%eax, %%xmm5           #                                        \n\
+vpshufd    $0, %%ymm5, %%ymm5      # set ymm5 to     f8 f8 ... f8 f8 f8 f8  \n\
+vpand      %%ymm5, %%ymm0, %%ymm0  # b7b6b5b4 b3______ b7b6b5b4 b3______    \n\
+vpand      %%ymm5, %%ymm1, %%ymm1  # r7r6r5r4 r3______ r7r6r5r4 r3______    \n\
+mov        $0xfcfcfcfc, %%eax      #                                        \n\
+vmovd      %%eax, %%xmm6           #                                        \n\
+vpshufd    $0, %%ymm6, %%ymm6      # set ymm6 to     fc fc ... fc fc fc fc  \n\
+vpand      %%ymm6, %%ymm2, %%ymm2  # g7g6g5g4 g3g2____ g7g6g5g4 g3g2____    \n\
+vpsrlw     $3, %%ymm0, %%ymm0      # ______b7 b6b5b4b3 ______b7 b6b5b4b3    \n\
+                                                                            \n\
+# pack the 3 separate RGB bytes into 2 for pixels 0-15                      \n\
+vpmovzxbw  %%xmm2, %%ymm5          # ________ ________ g7g6g5g4 g3g2____    \n\
+vpunpcklbw %%ymm1, %%ymm0, %%ymm4  # r7r6r5r4 r3______ ______b7 b6b5b4b3    \n\
+vpsllw     $3, %%ymm5, %%ymm5      # ________ __g7g6g5 g4g3g2__ ________    \n\
+vpor       %%ymm5, %%ymm4, %%ymm4  # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3    \n\
+vmovntdq   %%ymm4, (%[b])          # store pixels 0-15                      \n\
+                                                                            \n\
+# pack the 3 separate RGB bytes into 2 for pixels 16-31                     \n\
+vpxor      %%ymm3, %%ymm3, %%ymm3  # zero ymm3                              \n\
+vpunpckhbw %%ymm3, %%ymm2, %%ymm7  # ________ ________ g7g6g5g4 g3g2____    \n\
+vpunpckhbw %%ymm1, %%ymm0, %%ymm6  # r7r6r5r4 r3______ ______b7 b6b5b4b3    \n\
+vpsllw     $3, %%ymm7, %%ymm7      # ________ __g7g6g5 g4g3g2__ ________    \n\
+vpor       %%ymm7, %%ymm6, %%ymm6  # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3    \n\
+vmovntdq   %%ymm6, 32(%[b])        # store pixels 16-31                     \n\
+"
+
+#define AVX2_UNPACK_16_UNALIGNED " # Note, much of this shows bit patterns (of a pair of bytes) \n\
+# mask unneeded bits off                                                    \n\
+mov        $0xf8f8f8f8, %%eax      #                                        \n\
+vmovd      %%eax, %%xmm5           #                                        \n\
+vpshufd    $0, %%ymm5, %%ymm5      # set ymm5 to     f8 f8 ... f8 f8 f8 f8  \n\
+vpand      %%ymm5, %%ymm0, %%ymm0  # b7b6b5b4 b3______ b7b6b5b4 b3______    \n\
+vpand      %%ymm5, %%ymm1, %%ymm1  # r7r6r5r4 r3______ r7r6r5r4 r3______    \n\
+mov        $0xfcfcfcfc, %%eax      #                                        \n\
+vmovd      %%eax, %%xmm6           #                                        \n\
+vpshufd    $0, %%ymm6, %%ymm6      # set ymm6 to     fc fc ... fc fc fc fc  \n\
+vpand      %%ymm6, %%ymm2, %%ymm2  # g7g6g5g4 g3g2____ g7g6g5g4 g3g2____    \n\
+vpsrlw     $3, %%ymm0, %%ymm0      # ______b7 b6b5b4b3 ______b7 b6b5b4b3    \n\
+                                                                            \n\
+# pack the 3 separate RGB bytes into 2 for pixels 0-15                      \n\
+vpmovzxbw  %%xmm2, %%ymm5          # ________ ________ g7g6g5g4 g3g2____    \n\
+vpunpcklbw %%ymm1, %%ymm0, %%ymm4  # r7r6r5r4 r3______ ______b7 b6b5b4b3    \n\
+vpsllw     $3, %%ymm5, %%ymm5      # ________ __g7g6g5 g4g3g2__ ________    \n\
+vpor       %%ymm5, %%ymm4, %%ymm4  # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3    \n\
+vmovdqu    %%ymm4, (%[b])          # store pixels 0-15                      \n\
+                                                                            \n\
+# pack the 3 separate RGB bytes into 2 for pixels 16-31                     \n\
+vpxor      %%ymm3, %%ymm3, %%ymm3  # zero ymm3                              \n\
+vpunpckhbw %%ymm3, %%ymm2, %%ymm7  # ________ ________ g7g6g5g4 g3g2____    \n\
+vpunpckhbw %%ymm1, %%ymm0, %%ymm6  # r7r6r5r4 r3______ ______b7 b6b5b4b3    \n\
+vpsllw     $3, %%ymm7, %%ymm7      # ________ __g7g6g5 g4g3g2__ ________    \n\
+vpor       %%ymm7, %%ymm6, %%ymm6  # r7r6r5r4 r3g7g6g5 g4g3g2b7 b6b5b4b3    \n\
+vmovdqu    %%ymm6, 32(%[b])        # store pixels 16-31                     \n\
+"
+
+#define AVX2_UNPACK_32_ARGB_ALIGNED "                                           \n\
+vpxor      %%ymm3, %%ymm3, %%ymm3  # zero ymm3                                  \n\
+vpunpcklbw %%ymm2, %%ymm0, %%ymm4  # Interleave low b,g    ...  G1  B1  G0  B0  \n\
+vpmovzxbw  %%xmm1, %%ymm5          # Zero extend low r     ...  00  R1  00  R0  \n\
+vpunpcklwd %%ymm5, %%ymm4, %%ymm6  # Interleave b,g,r,0    ...  00  R0  G0  B0  \n\
+vmovntdq   %%ymm6, (%[b])          # Store ARGB7 ... ARGB0                      \n\
+vpunpckhwd %%ymm5, %%ymm4, %%ymm7  # Interleave b,g,r,0    ...  00  R8  G8  B8  \n\
+vmovntdq   %%ymm7, 32(%[b])        # Store ARGB15 ... ARGB8                     \n\
+vpunpckhbw %%ymm2, %%ymm0, %%ymm0  # Interleave high b,g   ... G17 B17 G16 B16  \n\
+vpunpckhbw %%ymm3, %%ymm1, %%ymm1  # Interleave high r,0   ...  00 R17  00 R16  \n\
+vpunpcklwd %%ymm1, %%ymm0, %%ymm2  # Interleave b,g,r,0    ...  00 R16 G16 B16  \n\
+vmovntdq   %%ymm2, 64(%[b])        # Store ARGB23 ... ARGB16                    \n\
+vpunpckhwd %%ymm1, %%ymm0, %%ymm3  # Interleave b,g,r,0    ...  00 R24 G24 B24  \n\
+vmovntdq   %%ymm3, 96(%[b])        # Store ARGB31 ... ARGB24                    \n\
+"
+
+#define AVX2_UNPACK_32_ARGB_UNALIGNED "                                         \n\
+vpxor      %%ymm3, %%ymm3, %%ymm3  # zero ymm3                                  \n\
+vpunpcklbw %%ymm2, %%ymm0, %%ymm4  # Interleave low b,g    ...  G1  B1  G0  B0  \n\
+vpmovzxbw  %%xmm1, %%ymm5          # Zero extend low r     ...  00  R1  00  R0  \n\
+vpunpcklwd %%ymm5, %%ymm4, %%ymm6  # Interleave b,g,r,0    ...  00  R0  G0  B0  \n\
+vmovdqu    %%ymm6, (%[b])          # Store ARGB7 ... ARGB0                      \n\
+vpunpckhwd %%ymm5, %%ymm4, %%ymm7  # Interleave b,g,r,0    ...  00  R8  G8  B8  \n\
+vmovdqu    %%ymm7, 32(%[b])        # Store ARGB15 ... ARGB8                     \n\
+vpunpckhbw %%ymm2, %%ymm0, %%ymm0  # Interleave high b,g   ... G17 B17 G16 B16  \n\
+vpunpckhbw %%ymm3, %%ymm1, %%ymm1  # Interleave high r,0   ...  00 R17  00 R16  \n\
+vpunpcklwd %%ymm1, %%ymm0, %%ymm2  # Interleave b,g,r,0    ...  00 R16 G16 B16  \n\
+vmovdqu    %%ymm2, 64(%[b])        # Store ARGB23 ... ARGB16                    \n\
+vpunpckhwd %%ymm1, %%ymm0, %%ymm3  # Interleave b,g,r,0    ...  00 R24 G24 B24  \n\
+vmovdqu    %%ymm3, 96(%[b])        # Store ARGB31 ... ARGB24                    \n\
+"
+
+#define AVX2_UNPACK_32_RGBA_ALIGNED "                                           \n\
+vpxor      %%ymm3, %%ymm3, %%ymm3  # zero ymm3                                  \n\
+vpunpcklbw %%ymm1, %%ymm2, %%ymm4  # Interleave low g,r    ...  R1  G1  R0  G0  \n\
+vpunpcklbw %%ymm0, %%ymm3, %%ymm5  # Interleave low 0,b    ...  B1  00  B0  00  \n\
+vpunpcklwd %%ymm4, %%ymm5, %%ymm6  # Interleave 0,b,g,r    ...  R0  B0  G0  00  \n\
+vmovntdq   %%ymm6, (%[b])          # Store RGBA7 ... RGBA0                      \n\
+vpunpckhwd %%ymm4, %%ymm5, %%ymm7  # Interleave 0,b,g,r    ...  R8  G8  B8  00  \n\
+vmovntdq   %%ymm7, 32(%[b])        # Store RGBA15 ... RGBA8                     \n\
+vpunpckhbw %%ymm1, %%ymm2, %%ymm1  # Interleave high g,r   ... R17 G17 R16 G16  \n\
+vpunpckhbw %%ymm0, %%ymm3, %%ymm0  # Interleave high 0,b   ... B17  00 B16  00  \n\
+vpunpcklwd %%ymm1, %%ymm0, %%ymm2  # Interleave 0,b,g,r    ... R16 G16 B16  00  \n\
+vmovntdq   %%ymm2, 64(%[b])        # Store RGBA23 ... RGBA16                    \n\
+vpunpckhwd %%ymm1, %%ymm0, %%ymm3  # Interleave 0,b,g,r    ... R24 G24 B24  00  \n\
+vmovntdq   %%ymm3, 96(%[b])        # Store RGBA31 ... RGBA24                    \n\
+"
+
+#define AVX2_UNPACK_32_RGBA_UNALIGNED "                                         \n\
+vpxor      %%ymm3, %%ymm3, %%ymm3  # zero ymm3                                  \n\
+vpunpcklbw %%ymm1, %%ymm2, %%ymm4  # Interleave low g,r    ...  R1  G1  R0  G0  \n\
+vpunpcklbw %%ymm0, %%ymm3, %%ymm5  # Interleave low 0,b    ...  B1  00  B0  00  \n\
+vpunpcklwd %%ymm4, %%ymm5, %%ymm6  # Interleave 0,b,g,r    ...  R0  G0  B0  00  \n\
+vmovdqu    %%ymm6, (%[b])          # Store RGBA7 ... RGBA0                      \n\
+vpunpckhwd %%ymm4, %%ymm5, %%ymm7  # Interleave 0,b,g,r    ...  R8  G8  B8  00  \n\
+vmovdqu    %%ymm7, 32(%[b])        # Store RGBA15 ... RGBA8                     \n\
+vpunpckhbw %%ymm1, %%ymm2, %%ymm1  # Interleave high g,r   ... R17 G17 R16 G16  \n\
+vpunpckhbw %%ymm0, %%ymm3, %%ymm0  # Interleave high 0,b   ... B17  00 B16  00  \n\
+vpunpcklwd %%ymm1, %%ymm0, %%ymm2  # Interleave 0,b,g,r    ... R16 G16 B16  00  \n\
+vmovdqu    %%ymm2, 64(%[b])        # Store RGBA23 ... RGBA16                    \n\
+vpunpckhwd %%ymm1, %%ymm0, %%ymm3  # Interleave 0,b,g,r    ... R24 G24 B24  00  \n\
+vmovdqu    %%ymm3, 96(%[b])        # Store RGBA31 ... RGBA24                    \n\
+"
+
+#define AVX2_UNPACK_32_BGRA_ALIGNED "                                           \n\
+vpxor      %%ymm3, %%ymm3, %%ymm3  # zero ymm3                                  \n\
+vpunpcklbw %%ymm0, %%ymm2, %%ymm4  # Interleave low g,b    ...  B1  G1  B0  G0  \n\
+vpunpcklbw %%ymm1, %%ymm3, %%ymm5  # Interleave low 0,r    ...  R1  00  R0  00  \n\
+vpunpcklwd %%ymm4, %%ymm5, %%ymm6  # Interleave 0,r,g,b    ...  B0  G0  R0  00  \n\
+vmovntdq   %%ymm6, (%[b])          # Store BGRA7 ... BGRA0                      \n\
+vpunpckhwd %%ymm4, %%ymm5, %%ymm7  # Interleave 0,r,g,b    ...  B8  G8  R8  00  \n\
+vmovntdq   %%ymm7, 32(%[b])        # Store BGRA15 ... BGRA8                     \n\
+vpunpckhbw %%ymm0, %%ymm2, %%ymm0  # Interleave high g,b   ... B17 G17 B16 G16  \n\
+vpunpckhbw %%ymm1, %%ymm3, %%ymm1  # Interleave high 0,r   ... R17  00 R16  00  \n\
+vpunpcklwd %%ymm0, %%ymm1, %%ymm2  # Interleave 0,r,g,b    ... B16 G16 R16  00  \n\
+vmovntdq   %%ymm2, 64(%[b])        # Store BGRA23 ... BGRA16                    \n\
+vpunpckhwd %%ymm0, %%ymm1, %%ymm3  # Interleave 0,r,g,b    ... B24 G24 R24  00  \n\
+vmovntdq   %%ymm3, 96(%[b])        # Store BGRA31 ... BGRA24                    \n\
+"
+
+#define AVX2_UNPACK_32_BGRA_UNALIGNED "                                         \n\
+vpxor      %%ymm3, %%ymm3, %%ymm3  # zero ymm3                                  \n\
+vpunpcklbw %%ymm0, %%ymm2, %%ymm4  # Interleave low g,b    ...  B1  G1  B0  G0  \n\
+vpunpcklbw %%ymm1, %%ymm3, %%ymm5  # Interleave low 0,r    ...  R1  00  R0  00  \n\
+vpunpcklwd %%ymm4, %%ymm5, %%ymm6  # Interleave 0,r,g,b    ...  B0  G0  R0  00  \n\
+vmovdqu    %%ymm6, (%[b])          # Store BGRA7 ... BGRA0                      \n\
+vpunpckhwd %%ymm4, %%ymm5, %%ymm7  # Interleave 0,r,g,b    ...  B8  G8  R8  00  \n\
+vmovdqu    %%ymm7, 32(%[b])        # Store BGRA15 ... BGRA8                     \n\
+vpunpckhbw %%ymm0, %%ymm2, %%ymm0  # Interleave high g,b   ... B17 G17 B16 G16  \n\
+vpunpckhbw %%ymm1, %%ymm3, %%ymm1  # Interleave high 0,r   ... R17  00 R16  00  \n\
+vpunpcklwd %%ymm0, %%ymm1, %%ymm2  # Interleave 0,r,g,b    ... B16 G16 R16  00  \n\
+vmovdqu    %%ymm2, 64(%[b])        # Store BGRA23 ... BGRA16                    \n\
+vpunpckhwd %%ymm0, %%ymm1, %%ymm3  # Interleave 0,r,g,b    ... B24 G24 R24  00  \n\
+vmovdqu    %%ymm3, 96(%[b])        # Store BGRA31 ... BGRA24                    \n\
+"
+
+#define AVX2_UNPACK_32_ABGR_ALIGNED "                                           \n\
+vpxor      %%ymm3, %%ymm3, %%ymm3  # zero ymm3                                  \n\
+vpunpcklbw %%ymm2, %%ymm1, %%ymm4  # Interleave low r,g    ...  G1  R1  G0  R0  \n\
+vpmovzxbw  %%xmm0, %%ymm5          # Zero extend low b     ...  00  B1  00  B0  \n\
+vpunpcklwd %%ymm5, %%ymm4, %%ymm6  # Interleave r,g,b,0    ...  00  B0  G0  R0  \n\
+vmovntdq   %%ymm6, (%[b])          # Store ABGR7 ... ABGR0                      \n\
+vpunpckhwd %%ymm5, %%ymm4, %%ymm7  # Interleave r,g,b,0    ...  00  B8  G8  R8  \n\
+vmovntdq   %%ymm7, 32(%[b])        # Store ABGR15 ... ABGR8                     \n\
+vpunpckhbw %%ymm2, %%ymm1, %%ymm1  # Interleave high r,g   ... G17 R17 G16 R16  \n\
+vpunpckhbw %%ymm3, %%ymm0, %%ymm0  # Interleave high b,0   ...  00 B17  00 B16  \n\
+vpunpcklwd %%ymm0, %%ymm1, %%ymm2  # Interleave r,g,b,0    ...  00 B16 G16 R16  \n\
+vmovntdq   %%ymm2, 64(%[b])        # Store ABGR23 ... ABGR16                    \n\
+vpunpckhwd %%ymm0, %%ymm1, %%ymm3  # Interleave r,g,b,0    ...  00 B24 G24 R24  \n\
+vmovntdq   %%ymm3, 96(%[b])        # Store ABGR31 ... ABGR24                    \n\
+"
+
+#define AVX2_UNPACK_32_ABGR_UNALIGNED "                                         \n\
+vpxor      %%ymm3, %%ymm3, %%ymm3  # zero ymm3                                  \n\
+vpunpcklbw %%ymm2, %%ymm1, %%ymm4  # Interleave low r,g    ...  G1  R1  G0  R0  \n\
+vpmovzxbw  %%xmm0, %%ymm5          # Zero extend low b     ...  00  B1  00  B0  \n\
+vpunpcklwd %%ymm5, %%ymm4, %%ymm6  # Interleave r,g,b,0    ...  00  B0  G0  R0  \n\
+vmovdqu    %%ymm6, (%[b])          # Store ABGR7 ... ABGR0                      \n\
+vpunpckhwd %%ymm5, %%ymm4, %%ymm7  # Interleave r,g,b,0    ...  00  B8  G8  R8  \n\
+vmovdqu    %%ymm7, 32(%[b])        # Store ABGR15 ... ABGR8                     \n\
+vpunpckhbw %%ymm2, %%ymm1, %%ymm1  # Interleave high r,g   ... G17 R17 G16 R16  \n\
+vpunpckhbw %%ymm3, %%ymm0, %%ymm0  # Interleave high b,0   ...  00 B17  00 B16  \n\
+vpunpcklwd %%ymm0, %%ymm1, %%ymm2  # Interleave r,g,b,0    ...  00 B16 G16 R16  \n\
+vmovdqu    %%ymm2, 64(%[b])        # Store ABGR23 ... ABGR16                    \n\
+vpunpckhwd %%ymm0, %%ymm1, %%ymm3  # Interleave r,g,b,0    ...  00 B24 G24 R24  \n\
+vmovdqu    %%ymm3, 96(%[b])        # Store ABGR31 ... ABGR24                    \n\
+"
+
+#elif defined(HAVE_AVX2_INTRINSICS)
+
+/* AVX2 intrinsics */
+
+#include <immintrin.h>
+
+#define AVX2_CALL(AVX2_INSTRUCTIONS)        \
+    do {                                    \
+        __m256i ymm0, ymm1, ymm2, ymm3,     \
+                ymm4, ymm5, ymm6, ymm7;     \
+        AVX2_INSTRUCTIONS                   \
+    } while(0)
+
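+/* AVX2_END issues a store fence so that the non-temporal (streaming) stores
+   above are globally visible before the converted picture is used */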
+#define AVX2_END  _mm_sfence()
+
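+/* The INIT macros load 32 luma bytes into ymm6 and 16 Cb/Cr bytes into the
+   low halves of ymm0/ymm1 (one chroma sample per two luma samples) */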
+#define AVX2_INIT_16_ALIGNED                       \
+    ymm0 = _mm256_inserti128_si256(ymm0, *((__m128i*)p_u), 0); \
+    ymm1 = _mm256_inserti128_si256(ymm1, *((__m128i*)p_v), 0); \
+    ymm6 = _mm256_load_si256((__m256i *)p_y);
+
+#define AVX2_INIT_16_UNALIGNED                     \
+    ymm0 = _mm256_inserti128_si256(ymm0, _mm_loadu_si128((__m128i*)p_u), 0); \
+    ymm1 = _mm256_inserti128_si256(ymm1, _mm_loadu_si128((__m128i*)p_v), 0); \
+    ymm6 = _mm256_loadu_si256((__m256i *)p_y);     \
+    _mm_prefetch(p_buffer, _MM_HINT_NTA);
+
+#define AVX2_INIT_32_ALIGNED                       \
+    ymm0 = _mm256_inserti128_si256(ymm0, *((__m128i*)p_u), 0); \
+    ymm1 = _mm256_inserti128_si256(ymm1, *((__m128i*)p_v), 0); \
+    ymm6 = _mm256_load_si256((__m256i *)p_y);
+
+#define AVX2_INIT_32_UNALIGNED                     \
+    ymm0 = _mm256_inserti128_si256(ymm0, _mm_loadu_si128((__m128i*)p_u), 0); \
+    ymm1 = _mm256_inserti128_si256(ymm1, _mm_loadu_si128((__m128i*)p_v), 0); \
+    ymm6 = _mm256_loadu_si256((__m256i *)p_y);     \
+    _mm_prefetch(p_buffer, _MM_HINT_NTA);
+
+#define AVX2_YUV_MUL                           \
+    ymm0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(ymm0));  \
+    ymm1 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(ymm1));  \
+    ymm5 = _mm256_set1_epi32(0x00800080UL);    \
+    ymm0 = _mm256_subs_epi16(ymm0, ymm5);      \
+    ymm1 = _mm256_subs_epi16(ymm1, ymm5);      \
+    ymm0 = _mm256_slli_epi16(ymm0, 3);         \
+    ymm1 = _mm256_slli_epi16(ymm1, 3);         \
+    ymm4 = _mm256_set1_epi32(0xf37df37dUL);    \
+    ymm2 = _mm256_mulhi_epi16(ymm0, ymm4);     \
+    ymm5 = _mm256_set1_epi32(0xe5fce5fcUL);    \
+    ymm3 = _mm256_mulhi_epi16(ymm1, ymm5);     \
+    ymm4 = _mm256_set1_epi32(0x40934093UL);    \
+    ymm0 = _mm256_mulhi_epi16(ymm0, ymm4);     \
+    ymm5 = _mm256_set1_epi32(0x33123312UL);    \
+    ymm1 = _mm256_mulhi_epi16(ymm1, ymm5);     \
+    ymm2 = _mm256_adds_epi16(ymm2, ymm3);      \
+    \
+    ymm5 = _mm256_set1_epi32(0x10101010UL);    \
+    ymm6 = _mm256_subs_epu8(ymm6, ymm5);       \
+    ymm7 = _mm256_srli_epi16(ymm6, 8);         \
+    ymm5 = _mm256_set1_epi32(0x00ff00ffUL);    \
+    ymm6 = _mm256_and_si256(ymm6, ymm5);       \
+    ymm6 = _mm256_slli_epi16(ymm6, 3);         \
+    ymm7 = _mm256_slli_epi16(ymm7, 3);         \
+    ymm5 = _mm256_set1_epi32(0x253f253fUL);    \
+    ymm6 = _mm256_mulhi_epi16(ymm6, ymm5);     \
+    ymm7 = _mm256_mulhi_epi16(ymm7, ymm5);
+
+#define AVX2_YUV_ADD                           \
+    ymm3 = _mm256_adds_epi16(ymm0, ymm7);      \
+    ymm0 = _mm256_adds_epi16(ymm0, ymm6);      \
+    ymm4 = _mm256_adds_epi16(ymm1, ymm7);      \
+    ymm1 = _mm256_adds_epi16(ymm1, ymm6);      \
+    ymm5 = _mm256_adds_epi16(ymm2, ymm7);      \
+    ymm2 = _mm256_adds_epi16(ymm2, ymm6);      \
+    \
+    ymm0 = _mm256_packus_epi16(ymm0, ymm0);    \
+    ymm1 = _mm256_packus_epi16(ymm1, ymm1);    \
+    ymm2 = _mm256_packus_epi16(ymm2, ymm2);    \
+    \
+    ymm3 = _mm256_packus_epi16(ymm3, ymm3);    \
+    ymm4 = _mm256_packus_epi16(ymm4, ymm4);    \
+    ymm5 = _mm256_packus_epi16(ymm5, ymm5);    \
+    \
+    ymm0 = _mm256_unpacklo_epi8(ymm0, ymm3);   \
+    ymm1 = _mm256_unpacklo_epi8(ymm1, ymm4);   \
+    ymm2 = _mm256_unpacklo_epi8(ymm2, ymm5);
+
+#define AVX2_UNPACK_15_ALIGNED                         \
+    ymm5 = _mm256_set1_epi32(0xf8f8f8f8UL);            \
+    ymm0 = _mm256_and_si256(ymm0, ymm5);               \
+    ymm0 = _mm256_srli_epi16(ymm0, 3);                 \
+    ymm2 = _mm256_and_si256(ymm2, ymm5);               \
+    ymm1 = _mm256_and_si256(ymm1, ymm5);               \
+    ymm1 = _mm256_srli_epi16(ymm1, 1);                 \
+    \
+    ymm5 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(ymm2)); \
+    ymm4 = _mm256_unpacklo_epi8(ymm0, ymm1);           \
+    ymm5 = _mm256_slli_epi16(ymm5, 2);                 \
+    ymm4 = _mm256_or_si256(ymm4, ymm5);                \
+    _mm256_stream_si256((__m256i*)p_buffer, ymm4);     \
+    \
+    ymm3 = _mm256_setzero_si256();                     \
+    ymm7 = _mm256_unpackhi_epi8(ymm2, ymm3);           \
+    ymm6 = _mm256_unpackhi_epi8(ymm0, ymm1);           \
+    ymm7 = _mm256_slli_epi16(ymm7, 2);                 \
+    ymm6 = _mm256_or_si256(ymm6, ymm7);                \
+    _mm256_stream_si256((__m256i*)(p_buffer+16), ymm6);
+
+#define AVX2_UNPACK_15_UNALIGNED                       \
+    ymm5 = _mm256_set1_epi32(0xf8f8f8f8UL);            \
+    ymm0 = _mm256_and_si256(ymm0, ymm5);               \
+    ymm0 = _mm256_srli_epi16(ymm0, 3);                 \
+    ymm2 = _mm256_and_si256(ymm2, ymm5);               \
+    ymm1 = _mm256_and_si256(ymm1, ymm5);               \
+    ymm1 = _mm256_srli_epi16(ymm1, 1);                 \
+    \
+    ymm5 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(ymm2)); \
+    ymm4 = _mm256_unpacklo_epi8(ymm0, ymm1);           \
+    ymm5 = _mm256_slli_epi16(ymm5, 2);                 \
+    ymm4 = _mm256_or_si256(ymm4, ymm5);                \
+    _mm256_storeu_si256((__m256i*)p_buffer, ymm4);     \
+    \
+    ymm3 = _mm256_setzero_si256();                     \
+    ymm7 = _mm256_unpackhi_epi8(ymm2, ymm3);           \
+    ymm6 = _mm256_unpackhi_epi8(ymm0, ymm1);           \
+    ymm7 = _mm256_slli_epi16(ymm7, 2);                 \
+    ymm6 = _mm256_or_si256(ymm6, ymm7);                \
+    _mm256_storeu_si256((__m256i*)(p_buffer+16), ymm6);
+
+#define AVX2_UNPACK_16_ALIGNED                         \
+    ymm5 = _mm256_set1_epi32(0xf8f8f8f8UL);            \
+    ymm0 = _mm256_and_si256(ymm0, ymm5);               \
+    ymm1 = _mm256_and_si256(ymm1, ymm5);               \
+    ymm6 = _mm256_set1_epi32(0xfcfcfcfcUL);            \
+    ymm2 = _mm256_and_si256(ymm2, ymm6);               \
+    ymm0 = _mm256_srli_epi16(ymm0, 3);                 \
+    \
+    ymm5 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(ymm2)); \
+    ymm4 = _mm256_unpacklo_epi8(ymm0, ymm1);           \
+    ymm5 = _mm256_slli_epi16(ymm5, 3);                 \
+    ymm4 = _mm256_or_si256(ymm4, ymm5);                \
+    _mm256_stream_si256((__m256i*)p_buffer, ymm4);     \
+    \
+    ymm3 = _mm256_setzero_si256();                     \
+    ymm7 = _mm256_unpackhi_epi8(ymm2, ymm3);           \
+    ymm6 = _mm256_unpackhi_epi8(ymm0, ymm1);           \
+    ymm7 = _mm256_slli_epi16(ymm7, 3);                 \
+    ymm6 = _mm256_or_si256(ymm6, ymm7);                \
+    _mm256_stream_si256((__m256i*)(p_buffer+16), ymm6);
+
+#define AVX2_UNPACK_16_UNALIGNED                       \
+    ymm5 = _mm256_set1_epi32(0xf8f8f8f8UL);            \
+    ymm0 = _mm256_and_si256(ymm0, ymm5);               \
+    ymm1 = _mm256_and_si256(ymm1, ymm5);               \
+    ymm6 = _mm256_set1_epi32(0xfcfcfcfcUL);            \
+    ymm2 = _mm256_and_si256(ymm2, ymm6);               \
+    ymm0 = _mm256_srli_epi16(ymm0, 3);                 \
+    \
+    ymm5 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(ymm2)); \
+    ymm4 = _mm256_unpacklo_epi8(ymm0, ymm1);           \
+    ymm5 = _mm256_slli_epi16(ymm5, 3);                 \
+    ymm4 = _mm256_or_si256(ymm4, ymm5);                \
+    _mm256_storeu_si256((__m256i*)p_buffer, ymm4);     \
+    \
+    ymm3 = _mm256_setzero_si256();                     \
+    ymm7 = _mm256_unpackhi_epi8(ymm2, ymm3);           \
+    ymm6 = _mm256_unpackhi_epi8(ymm0, ymm1);           \
+    ymm7 = _mm256_slli_epi16(ymm7, 3);                 \
+    ymm6 = _mm256_or_si256(ymm6, ymm7);                \
+    _mm256_storeu_si256((__m256i*)(p_buffer+16), ymm6);
+
+#define AVX2_UNPACK_32_ARGB_ALIGNED                    \
+    ymm3 = _mm256_setzero_si256();                     \
+    ymm4 = _mm256_unpacklo_epi8(ymm0, ymm2);           \
+    ymm5 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(ymm1)); \
+    ymm6 = _mm256_unpacklo_epi16(ymm4, ymm5);          \
+    _mm256_stream_si256((__m256i*)(p_buffer), ymm6);   \
+    ymm7 = _mm256_unpackhi_epi16(ymm4, ymm5);          \
+    _mm256_stream_si256((__m256i*)(p_buffer+8), ymm7); \
+    ymm0 = _mm256_unpackhi_epi8(ymm0, ymm2);           \
+    ymm1 = _mm256_unpackhi_epi8(ymm1, ymm3);           \
+    ymm2 = _mm256_unpacklo_epi16(ymm0, ymm1);          \
+    _mm256_stream_si256((__m256i*)(p_buffer+16), ymm2);\
+    ymm3 = _mm256_unpackhi_epi16(ymm0, ymm1);          \
+    _mm256_stream_si256((__m256i*)(p_buffer+24), ymm3);
+
+#define AVX2_UNPACK_32_ARGB_UNALIGNED                  \
+    ymm3 = _mm256_setzero_si256();                     \
+    ymm4 = _mm256_unpacklo_epi8(ymm0, ymm2);           \
+    ymm5 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(ymm1)); \
+    ymm6 = _mm256_unpacklo_epi16(ymm4, ymm5);          \
+    _mm256_storeu_si256((__m256i*)(p_buffer), ymm6);   \
+    ymm7 = _mm256_unpackhi_epi16(ymm4, ymm5);          \
+    _mm256_storeu_si256((__m256i*)(p_buffer+8), ymm7); \
+    ymm0 = _mm256_unpackhi_epi8(ymm0, ymm2);           \
+    ymm1 = _mm256_unpackhi_epi8(ymm1, ymm3);           \
+    ymm2 = _mm256_unpacklo_epi16(ymm0, ymm1);          \
+    _mm256_storeu_si256((__m256i*)(p_buffer+16), ymm2);\
+    ymm3 = _mm256_unpackhi_epi16(ymm0, ymm1);          \
+    _mm256_storeu_si256((__m256i*)(p_buffer+24), ymm3);
+
+#define AVX2_UNPACK_32_RGBA_ALIGNED                    \
+    ymm3 = _mm256_setzero_si256();                     \
+    ymm4 = _mm256_unpacklo_epi8(ymm2, ymm1);           \
+    ymm5 = _mm256_unpacklo_epi8(ymm3, ymm0);           \
+    ymm6 = _mm256_unpacklo_epi16(ymm5, ymm4);          \
+    _mm256_stream_si256((__m256i*)(p_buffer), ymm6);   \
+    ymm7 = _mm256_unpackhi_epi16(ymm5, ymm4);          \
+    _mm256_stream_si256((__m256i*)(p_buffer+8), ymm7); \
+    ymm1 = _mm256_unpackhi_epi8(ymm2, ymm1);           \
+    ymm0 = _mm256_unpackhi_epi8(ymm3, ymm0);           \
+    ymm2 = _mm256_unpacklo_epi16(ymm0, ymm1);          \
+    _mm256_stream_si256((__m256i*)(p_buffer+16), ymm2);\
+    ymm3 = _mm256_unpackhi_epi16(ymm0, ymm1);          \
+    _mm256_stream_si256((__m256i*)(p_buffer+24), ymm3);
+
+#define AVX2_UNPACK_32_RGBA_UNALIGNED                  \
+    ymm3 = _mm256_setzero_si256();                     \
+    ymm4 = _mm256_unpacklo_epi8(ymm2, ymm1);           \
+    ymm5 = _mm256_unpacklo_epi8(ymm3, ymm0);           \
+    ymm6 = _mm256_unpacklo_epi16(ymm5, ymm4);          \
+    _mm256_storeu_si256((__m256i*)(p_buffer), ymm6);   \
+    ymm7 = _mm256_unpackhi_epi16(ymm5, ymm4);          \
+    _mm256_storeu_si256((__m256i*)(p_buffer+8), ymm7); \
+    ymm1 = _mm256_unpackhi_epi8(ymm2, ymm1);           \
+    ymm0 = _mm256_unpackhi_epi8(ymm3, ymm0);           \
+    ymm2 = _mm256_unpacklo_epi16(ymm0, ymm1);          \
+    _mm256_storeu_si256((__m256i*)(p_buffer+16), ymm2);\
+    ymm3 = _mm256_unpackhi_epi16(ymm0, ymm1);          \
+    _mm256_storeu_si256((__m256i*)(p_buffer+24), ymm3);
+
+#define AVX2_UNPACK_32_BGRA_ALIGNED                    \
+    ymm3 = _mm256_setzero_si256();                     \
+    ymm4 = _mm256_unpacklo_epi8(ymm2, ymm0);           \
+    ymm5 = _mm256_unpacklo_epi8(ymm3, ymm1);           \
+    ymm6 = _mm256_unpacklo_epi16(ymm5, ymm4);          \
+    _mm256_stream_si256((__m256i*)(p_buffer), ymm6);   \
+    ymm7 = _mm256_unpackhi_epi16(ymm5, ymm4);          \
+    _mm256_stream_si256((__m256i*)(p_buffer+8), ymm7); \
+    ymm0 = _mm256_unpackhi_epi8(ymm2, ymm0);           \
+    ymm1 = _mm256_unpackhi_epi8(ymm3, ymm1);           \
+    ymm2 = _mm256_unpacklo_epi16(ymm1, ymm0);          \
+    _mm256_stream_si256((__m256i*)(p_buffer+16), ymm2);\
+    ymm3 = _mm256_unpackhi_epi16(ymm1, ymm0);          \
+    _mm256_stream_si256((__m256i*)(p_buffer+24), ymm3);
+
+#define AVX2_UNPACK_32_BGRA_UNALIGNED                  \
+    ymm3 = _mm256_setzero_si256();                     \
+    ymm4 = _mm256_unpacklo_epi8(ymm2, ymm0);           \
+    ymm5 = _mm256_unpacklo_epi8(ymm3, ymm1);           \
+    ymm6 = _mm256_unpacklo_epi16(ymm5, ymm4);          \
+    _mm256_storeu_si256((__m256i*)(p_buffer), ymm6);   \
+    ymm7 = _mm256_unpackhi_epi16(ymm5, ymm4);          \
+    _mm256_storeu_si256((__m256i*)(p_buffer+8), ymm7); \
+    ymm0 = _mm256_unpackhi_epi8(ymm2, ymm0);           \
+    ymm1 = _mm256_unpackhi_epi8(ymm3, ymm1);           \
+    ymm2 = _mm256_unpacklo_epi16(ymm1, ymm0);          \
+    _mm256_storeu_si256((__m256i*)(p_buffer+16), ymm2);\
+    ymm3 = _mm256_unpackhi_epi16(ymm1, ymm0);          \
+    _mm256_storeu_si256((__m256i*)(p_buffer+24), ymm3);
+
+#define AVX2_UNPACK_32_ABGR_ALIGNED                    \
+    ymm3 = _mm256_setzero_si256();                     \
+    ymm4 = _mm256_unpacklo_epi8(ymm1, ymm2);           \
+    ymm5 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(ymm0)); \
+    ymm6 = _mm256_unpacklo_epi16(ymm4, ymm5);          \
+    _mm256_stream_si256((__m256i*)(p_buffer), ymm6);   \
+    ymm7 = _mm256_unpackhi_epi16(ymm4, ymm5);          \
+    _mm256_stream_si256((__m256i*)(p_buffer+8), ymm7); \
+    ymm1 = _mm256_unpackhi_epi8(ymm1, ymm2);           \
+    ymm0 = _mm256_unpackhi_epi8(ymm0, ymm3);           \
+    ymm2 = _mm256_unpacklo_epi16(ymm1, ymm0);          \
+    _mm256_stream_si256((__m256i*)(p_buffer+16), ymm2);\
+    ymm3 = _mm256_unpackhi_epi16(ymm1, ymm0);          \
+    _mm256_stream_si256((__m256i*)(p_buffer+24), ymm3);
+
+#define AVX2_UNPACK_32_ABGR_UNALIGNED                  \
+    ymm3 = _mm256_setzero_si256();                     \
+    ymm4 = _mm256_unpacklo_epi8(ymm1, ymm2);           \
+    ymm5 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(ymm0)); \
+    ymm6 = _mm256_unpacklo_epi16(ymm4, ymm5);          \
+    _mm256_storeu_si256((__m256i*)(p_buffer), ymm6);   \
+    ymm7 = _mm256_unpackhi_epi16(ymm4, ymm5);          \
+    _mm256_storeu_si256((__m256i*)(p_buffer+8), ymm7); \
+    ymm1 = _mm256_unpackhi_epi8(ymm1, ymm2);           \
+    ymm0 = _mm256_unpackhi_epi8(ymm0, ymm3);           \
+    ymm2 = _mm256_unpacklo_epi16(ymm1, ymm0);          \
+    _mm256_storeu_si256((__m256i*)(p_buffer+16), ymm2);\
+    ymm3 = _mm256_unpackhi_epi16(ymm1, ymm0);          \
+    _mm256_storeu_si256((__m256i*)(p_buffer+24), ymm3);
+
+#endif
diff --git a/modules/video_chroma/i420_yuy2.c b/modules/video_chroma/i420_yuy2.c
index 966204f517..f3353e49ce 100644
--- a/modules/video_chroma/i420_yuy2.c
+++ b/modules/video_chroma/i420_yuy2.c
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * i420_yuy2.c : YUV to YUV conversion module for vlc
  *****************************************************************************
- * Copyright (C) 2000, 2001 VLC authors and VideoLAN
+ * Copyright (C) 2000, 2001, 2019 VLC authors and VideoLAN
  *
  * Authors: Samuel Hocevar <sam at zoy.org>
  *          Damien Fouilleul <damien at videolan.org>
@@ -55,6 +55,9 @@
 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec)
 #    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422"
 #    define VLC_TARGET
+#elif defined (MODULE_NAME_IS_i420_yuy2_avx2)
+#    define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV"
+#    define VLC_TARGET VLC_AVX
 #endif
 
 /*****************************************************************************
@@ -98,6 +101,10 @@ vlc_module_begin ()
             _("AltiVec conversions from " SRC_FOURCC " to " DEST_FOURCC) );
     set_capability( "video converter", 250 )
 # define vlc_CPU_capable() vlc_CPU_ALTIVEC()
+#elif defined (MODULE_NAME_IS_i420_yuy2_avx2)
+    set_description( N_("AVX2 conversions from " SRC_FOURCC " to " DEST_FOURCC) )
+    set_capability( "video converter", 260 )
+# define vlc_CPU_capable() vlc_CPU_AVX2()
 #endif
     set_callback( Activate )
 vlc_module_end ()
@@ -294,7 +301,7 @@ static void I420_YUY2( filter_t *p_filter, picture_t *p_source,
                                - p_dest->p->i_visible_pitch
                                - ( p_filter->fmt_out.video.i_x_offset * 2 );
 
-#if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
+#if !defined(MODULE_NAME_IS_i420_yuy2_sse2) && !defined(MODULE_NAME_IS_i420_yuy2_avx2)
     for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 2 ; i_y-- ; )
     {
         p_line1 = p_line2;
@@ -334,11 +341,9 @@ static void I420_YUY2( filter_t *p_filter, picture_t *p_source,
     }
 #endif
 
-#else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
-    /*
-    ** SSE2 128 bits fetch/store instructions are faster
-    ** if memory access is 16 bytes aligned
-    */
+#elif defined(MODULE_NAME_IS_i420_yuy2_sse2)
+
+    /* SSE2 aligned load/store instructions are faster, but require 16-byte alignment */
 
     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
         ((intptr_t)p_line2|(intptr_t)p_y2))) )
@@ -396,7 +401,71 @@ static void I420_YUY2( filter_t *p_filter, picture_t *p_source,
     /* make sure all SSE2 stores are visible thereafter */
     SSE2_END;
 
-#endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
+#elif defined(MODULE_NAME_IS_i420_yuy2_avx2)
+
+    /* AVX2 aligned load/store instructions require 32-byte alignment */
+
+    if( 0 == (31 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
+        ((intptr_t)p_line2|(intptr_t)p_y2))) )
+    {
+        /* use AVX2 aligned fetch and store */
+        for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 2 ; i_y-- ; )
+        {
+            p_line1 = p_line2;
+            p_line2 += p_dest->p->i_pitch;
+
+            p_y1 = p_y2;
+            p_y2 += p_source->p[Y_PLANE].i_pitch;
+
+            for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32 ; i_x-- ; )
+            {
+                AVX2_CALL(
+                    AVX2_INIT_ALIGNED
+                    AVX2_YUV420_YUYV_ALIGNED
+                );
+            }
+            for( i_x = ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 32 ) / 2; i_x-- ; )
+            {
+                C_YUV420_YUYV( );
+            }
+
+            p_y2 += i_source_margin;
+            p_u += i_source_margin_c;
+            p_v += i_source_margin_c;
+            p_line2 += i_dest_margin;
+        }
+    }
+    else {
+        /* use AVX2 unaligned fetch and store */
+        for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 2 ; i_y-- ; )
+        {
+            p_line1 = p_line2;
+            p_line2 += p_dest->p->i_pitch;
+
+            p_y1 = p_y2;
+            p_y2 += p_source->p[Y_PLANE].i_pitch;
+
+            for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32 ; i_x-- ; )
+            {
+                AVX2_CALL(
+                    AVX2_INIT_UNALIGNED
+                    AVX2_YUV420_YUYV_UNALIGNED
+                );
+            }
+            for( i_x = ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 32 ) / 2; i_x-- ; )
+            {
+                C_YUV420_YUYV( );
+            }
+
+            p_y2 += i_source_margin;
+            p_u += i_source_margin_c;
+            p_v += i_source_margin_c;
+            p_line2 += i_dest_margin;
+        }
+    }
+    /* make sure all AVX2 stores are visible thereafter */
+    AVX2_END;
+#endif // defined(MODULE_NAME_IS_i420_yuy2_avx2)
 }
 
 /*****************************************************************************
@@ -503,7 +572,7 @@ static void I420_YVYU( filter_t *p_filter, picture_t *p_source,
                                - p_dest->p->i_visible_pitch
                                - ( p_filter->fmt_out.video.i_x_offset * 2 );
 
-#if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
+#if !defined(MODULE_NAME_IS_i420_yuy2_sse2) && !defined(MODULE_NAME_IS_i420_yuy2_avx2)
     for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 2 ; i_y-- ; )
     {
         p_line1 = p_line2;
@@ -543,11 +612,10 @@ static void I420_YVYU( filter_t *p_filter, picture_t *p_source,
     }
 #endif
 
-#else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
-    /*
-    ** SSE2 128 bits fetch/store instructions are faster
-    ** if memory access is 16 bytes aligned
-    */
+#elif defined(MODULE_NAME_IS_i420_yuy2_sse2)
+
+    /* SSE2 aligned load/store instructions are faster, but require 16-byte alignment */
+
     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
         ((intptr_t)p_line2|(intptr_t)p_y2))) )
     {
@@ -603,7 +671,72 @@ static void I420_YVYU( filter_t *p_filter, picture_t *p_source,
     }
     /* make sure all SSE2 stores are visible thereafter */
     SSE2_END;
-#endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
+
+#elif defined(MODULE_NAME_IS_i420_yuy2_avx2)
+
+    /* AVX2 aligned load/store instructions require 32-byte alignment */
+
+    if( 0 == (31 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
+        ((intptr_t)p_line2|(intptr_t)p_y2))) )
+    {
+        /* use AVX2 aligned fetch and store */
+        for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 2 ; i_y-- ; )
+        {
+            p_line1 = p_line2;
+            p_line2 += p_dest->p->i_pitch;
+
+            p_y1 = p_y2;
+            p_y2 += p_source->p[Y_PLANE].i_pitch;
+
+            for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32 ; i_x-- ; )
+            {
+                AVX2_CALL(
+                    AVX2_INIT_ALIGNED
+                    AVX2_YUV420_YVYU_ALIGNED
+                );
+            }
+            for( i_x = ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 32 ) / 2; i_x-- ; )
+            {
+                C_YUV420_YVYU( );
+            }
+
+            p_y2 += i_source_margin;
+            p_u += i_source_margin_c;
+            p_v += i_source_margin_c;
+            p_line2 += i_dest_margin;
+        }
+    }
+    else {
+        /* use AVX2 unaligned fetch and store */
+        for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 2 ; i_y-- ; )
+        {
+            p_line1 = p_line2;
+            p_line2 += p_dest->p->i_pitch;
+
+            p_y1 = p_y2;
+            p_y2 += p_source->p[Y_PLANE].i_pitch;
+
+            for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32 ; i_x-- ; )
+            {
+                AVX2_CALL(
+                    AVX2_INIT_UNALIGNED
+                    AVX2_YUV420_YVYU_UNALIGNED
+                );
+            }
+            for( i_x = ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 32 ) / 2; i_x-- ; )
+            {
+                C_YUV420_YVYU( );
+            }
+
+            p_y2 += i_source_margin;
+            p_u += i_source_margin_c;
+            p_v += i_source_margin_c;
+            p_line2 += i_dest_margin;
+        }
+    }
+    /* make sure all AVX2 stores are visible thereafter */
+    AVX2_END;
+#endif // defined(MODULE_NAME_IS_i420_yuy2_avx2)
 }
 
 /*****************************************************************************
@@ -710,7 +843,7 @@ static void I420_UYVY( filter_t *p_filter, picture_t *p_source,
                                - p_dest->p->i_visible_pitch
                                - ( p_filter->fmt_out.video.i_x_offset * 2 );
 
-#if !defined(MODULE_NAME_IS_i420_yuy2_sse2)
+#if !defined(MODULE_NAME_IS_i420_yuy2_sse2) && !defined(MODULE_NAME_IS_i420_yuy2_avx2)
     for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 2 ; i_y-- ; )
     {
         p_line1 = p_line2;
@@ -750,11 +883,10 @@ static void I420_UYVY( filter_t *p_filter, picture_t *p_source,
     }
 #endif
 
-#else // defined(MODULE_NAME_IS_i420_yuy2_sse2)
-    /*
-    ** SSE2 128 bits fetch/store instructions are faster
-    ** if memory access is 16 bytes aligned
-    */
+#elif defined(MODULE_NAME_IS_i420_yuy2_sse2)
+
+    /* SSE2 aligned load/store instructions are faster, but require 16-byte alignment */
+
     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
         ((intptr_t)p_line2|(intptr_t)p_y2))) )
     {
@@ -810,7 +942,72 @@ static void I420_UYVY( filter_t *p_filter, picture_t *p_source,
     }
     /* make sure all SSE2 stores are visible thereafter */
     SSE2_END;
-#endif // defined(MODULE_NAME_IS_i420_yuy2_sse2)
+
+#elif defined(MODULE_NAME_IS_i420_yuy2_avx2)
+
+    /* AVX2 aligned load/store instructions require 32-byte alignment */
+
+    if( 0 == (31 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
+        ((intptr_t)p_line2|(intptr_t)p_y2))) )
+    {
+        /* use AVX2 aligned fetch and store */
+        for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 2 ; i_y-- ; )
+        {
+            p_line1 = p_line2;
+            p_line2 += p_dest->p->i_pitch;
+
+            p_y1 = p_y2;
+            p_y2 += p_source->p[Y_PLANE].i_pitch;
+
+            for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32 ; i_x-- ; )
+            {
+                AVX2_CALL(
+                    AVX2_INIT_ALIGNED
+                    AVX2_YUV420_UYVY_ALIGNED
+                );
+            }
+            for( i_x = ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 32 ) / 2; i_x-- ; )
+            {
+                C_YUV420_UYVY( );
+            }
+
+            p_y2 += i_source_margin;
+            p_u += i_source_margin_c;
+            p_v += i_source_margin_c;
+            p_line2 += i_dest_margin;
+        }
+    }
+    else {
+        /* use AVX2 unaligned fetch and store */
+        for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) / 2 ; i_y-- ; )
+        {
+            p_line1 = p_line2;
+            p_line2 += p_dest->p->i_pitch;
+
+            p_y1 = p_y2;
+            p_y2 += p_source->p[Y_PLANE].i_pitch;
+
+            for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32 ; i_x-- ; )
+            {
+                AVX2_CALL(
+                    AVX2_INIT_UNALIGNED
+                    AVX2_YUV420_UYVY_UNALIGNED
+                );
+            }
+            for( i_x = ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 32 ) / 2; i_x-- ; )
+            {
+                C_YUV420_UYVY( );
+            }
+
+            p_y2 += i_source_margin;
+            p_u += i_source_margin_c;
+            p_v += i_source_margin_c;
+            p_line2 += i_dest_margin;
+        }
+    }
+    /* make sure all AVX2 stores are visible thereafter */
+    AVX2_END;
+#endif // defined(MODULE_NAME_IS_i420_yuy2_avx2)
 }
 
 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec)
diff --git a/modules/video_chroma/i420_yuy2.h b/modules/video_chroma/i420_yuy2.h
index 736fcab90a..dc18879b8e 100644
--- a/modules/video_chroma/i420_yuy2.h
+++ b/modules/video_chroma/i420_yuy2.h
@@ -1,10 +1,11 @@
 /*****************************************************************************
  * i420_yuy2.h : YUV to YUV conversion module for vlc
  *****************************************************************************
- * Copyright (C) 2000, 2001 VLC authors and VideoLAN
+ * Copyright (C) 2000, 2001, 2019 VLC authors and VideoLAN
  *
  * Authors: Samuel Hocevar <sam at zoy.org>
  *          Damien Fouilleul <damien at videolan.org>
+ *          Lyndon Brown <jnqnfe at gmail.com>
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU Lesser General Public License as published by
@@ -33,8 +34,8 @@
         ".p2align 3 \n\t                    \
 movd       (%0), %%mm1  # Load 4 Cb           00 00 00 00 u3 u2 u1 u0     \n\
 movd       (%1), %%mm2  # Load 4 Cr           00 00 00 00 v3 v2 v1 v0     \n\
-movq       (%2), %%mm0  # Load 8 Y            y7 y6 y5 y4 y3 y2 y1 y0     \n\
-movq       (%3), %%mm3  # Load 8 Y            Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0     \n\
+movq       (%2), %%mm0  # Load 8 Y1           y7 y6 y5 y4 y3 y2 y1 y0     \n\
+movq       (%3), %%mm3  # Load 8 Y2           Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0     \n\
 " \
         :                                   \
         : "r" (p_u), "r" (p_v),             \
@@ -198,8 +199,8 @@ movq        (%1), %%xmm2  # Load 8 Cr         00 00 00 ... v2 v1 v0   \n\
 #define SSE2_END  __asm__ __volatile__ ( "sfence" ::: "memory" )
 
 #define SSE2_YUV420_YUYV_ALIGNED "                                     \n\
-movdqa      (%2), %%xmm0  # Load 16 Y          yF yE yD ... y2 y1 y0   \n\
-movdqa      (%3), %%xmm3  # Load 16 Y          YF YE YD ... Y2 Y1 Y0   \n\
+movdqa      (%2), %%xmm0  # Load 16 Y1         yF yE yD ... y2 y1 y0   \n\
+movdqa      (%3), %%xmm3  # Load 16 Y2         YF YE YD ... Y2 Y1 Y0   \n\
 punpcklbw %%xmm2, %%xmm1  #                    00 00 ... v1 u1 v0 u0   \n\
 movdqa    %%xmm0, %%xmm2  #                    yF yE yD ... y2 y1 y0   \n\
 punpcklbw %%xmm1, %%xmm2  #                    v3 y7 ... v0 y1 u0 y0   \n\
@@ -214,8 +215,8 @@ movntdq   %%xmm3, 16(%1)  # Store high YUYV                            \n\
 "
 
 #define SSE2_YUV420_YUYV_UNALIGNED "                                   \n\
-movdqu      (%2), %%xmm0  # Load 16 Y          yF yE yD ... y2 y1 y0   \n\
-movdqu      (%3), %%xmm3  # Load 16 Y          YF YE YD ... Y2 Y1 Y0   \n\
+movdqu      (%2), %%xmm0  # Load 16 Y1         yF yE yD ... y2 y1 y0   \n\
+movdqu      (%3), %%xmm3  # Load 16 Y2         YF YE YD ... Y2 Y1 Y0   \n\
 prefetchnta (%0)          # Tell CPU not to cache output YUYV data     \n\
 prefetchnta (%1)          # Tell CPU not to cache output YUYV data     \n\
 punpcklbw %%xmm2, %%xmm1  #                    00 00 ... v1 u1 v0 u0   \n\
@@ -232,8 +233,8 @@ movdqu    %%xmm3, 16(%1)  # Store high YUYV                            \n\
 "
 
 #define SSE2_YUV420_YVYU_ALIGNED "                                     \n\
-movdqa      (%2), %%xmm0  # Load 16 Y          yF yE yD ... y2 y1 y0   \n\
-movdqa      (%3), %%xmm3  # Load 16 Y          YF YE YD ... Y2 Y1 Y0   \n\
+movdqa      (%2), %%xmm0  # Load 16 Y1         yF yE yD ... y2 y1 y0   \n\
+movdqa      (%3), %%xmm3  # Load 16 Y2         YF YE YD ... Y2 Y1 Y0   \n\
 punpcklbw %%xmm1, %%xmm2  #                    u7 v7 ... u1 v1 u0 v0   \n\
 movdqa    %%xmm0, %%xmm1  #                    yF yE yD ... y2 y1 y0   \n\
 punpcklbw %%xmm2, %%xmm1  #                    u3 y7 ... u0 y1 v0 y0   \n\
@@ -248,8 +249,8 @@ movntdq   %%xmm3, 16(%1)  # Store high YUYV                            \n\
 "
 
 #define SSE2_YUV420_YVYU_UNALIGNED "                                    \n\
-movdqu      (%2), %%xmm0  # Load 16 Y           yF yE yD ... y2 y1 y0   \n\
-movdqu      (%3), %%xmm3  # Load 16 Y           YF YE YD ... Y2 Y1 Y0   \n\
+movdqu      (%2), %%xmm0  # Load 16 Y1          yF yE yD ... y2 y1 y0   \n\
+movdqu      (%3), %%xmm3  # Load 16 Y2          YF YE YD ... Y2 Y1 Y0   \n\
 prefetchnta (%0)          # Tell CPU not to cache output YVYU data      \n\
 prefetchnta (%1)          # Tell CPU not to cache output YVYU data      \n\
 punpcklbw %%xmm1, %%xmm2  #                     u7 v7 ... u1 v1 u0 v0   \n\
@@ -266,8 +267,8 @@ movdqu    %%xmm3, 16(%1)  # Store high YUYV                             \n\
 "
 
 #define SSE2_YUV420_UYVY_ALIGNED "                                      \n\
-movdqa      (%2), %%xmm0  # Load 16 Y           yF yE yD ... y2 y1 y0   \n\
-movdqa      (%3), %%xmm3  # Load 16 Y           YF YE YD ... Y2 Y1 Y0   \n\
+movdqa      (%2), %%xmm0  # Load 16 Y1          yF yE yD ... y2 y1 y0   \n\
+movdqa      (%3), %%xmm3  # Load 16 Y2          YF YE YD ... Y2 Y1 Y0   \n\
 punpcklbw %%xmm2, %%xmm1  #                     v7 u7 ... v1 u1 v0 u0   \n\
 movdqa    %%xmm1, %%xmm2  #                     v7 u7 ... v1 u1 v0 u0   \n\
 punpcklbw %%xmm0, %%xmm2  #                     y7 v3 ... y1 v0 y0 u0   \n\
@@ -283,8 +284,8 @@ movntdq   %%xmm1, 16(%1)  # Store high UYVY                             \n\
 "
 
 #define SSE2_YUV420_UYVY_UNALIGNED "                                    \n\
-movdqu      (%2), %%xmm0  # Load 16 Y           yF yE yD ... y2 y1 y0   \n\
-movdqu      (%3), %%xmm3  # Load 16 Y           YF YE YD ... Y2 Y1 Y0   \n\
+movdqu      (%2), %%xmm0  # Load 16 Y1          yF yE yD ... y2 y1 y0   \n\
+movdqu      (%3), %%xmm3  # Load 16 Y2          YF YE YD ... Y2 Y1 Y0   \n\
 prefetchnta (%0)          # Tell CPU not to cache output UYVY data      \n\
 prefetchnta (%1)          # Tell CPU not to cache output UYVY data      \n\
 punpcklbw %%xmm2, %%xmm1  #                     v7 u7 ... v1 u1 v0 u0   \n\
@@ -430,6 +431,216 @@ movdqu    %%xmm1, 16(%1)  # Store high UYVY                             \n\
 
 #endif
 
+#elif defined( MODULE_NAME_IS_i420_yuy2_avx2 )
+
+#if defined(CAN_COMPILE_AVX2)
+
+/* AVX2 assembly */
+
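+/* Each AVX2_CALL converts a 32-pixel span of two output lines; the pointers
+   advance by 64 output bytes, 32 luma bytes and 16 chroma bytes per call */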
+#define AVX2_CALL(AVX2_INSTRUCTIONS)     \
+    do {                                 \
+    __asm__ __volatile__(                \
+        ".p2align 3 \n\t"                \
+        AVX2_INSTRUCTIONS                \
+        :                                \
+        : [l1]"r"(p_line1), [l2]"r"(p_line2), \
+          [y1]"r"(p_y1),  [y2]"r"(p_y2), \
+          [u]"r"(p_u),  [v]"r"(p_v)      \
+        : "ymm0", "ymm1", "ymm2", "ymm3", "ymm4"); \
+        p_line1 += 64; p_line2 += 64;    \
+        p_y1 += 32; p_y2 += 32;          \
+        p_u += 16; p_v += 16;            \
+    } while(0)
+
+#define AVX2_END  __asm__ __volatile__ ( "sfence" ::: "memory" )
+
+#define AVX2_INIT_ALIGNED "                                                   \n\
+vmovdqa     (%[y1]), %%ymm0  # Load 32 Y1                     ... y2  y1  y0  \n\
+vmovdqa     (%[y2]), %%ymm1  # Load 32 Y2                     ... Y2  Y1  Y0  \n\
+vmovdqa     (%[u]), %%xmm2   # Load 16 Cb into lower half     ... u2  u1  u0  \n\
+vmovdqa     (%[v]), %%xmm3   # Load 16 Cr into lower half     ... v2  v1  v0  \n\
+"
+
+#define AVX2_INIT_UNALIGNED "                                                 \n\
+vmovdqu     (%[y1]), %%ymm0  # Load 32 Y1                     ... y2  y1  y0  \n\
+vmovdqu     (%[y2]), %%ymm1  # Load 32 Y2                     ... Y2  Y1  Y0  \n\
+vmovdqu     (%[u]), %%xmm2   # Load 16 Cb into lower half     ... u2  u1  u0  \n\
+vmovdqu     (%[v]), %%xmm3   # Load 16 Cr into lower half     ... v2  v1  v0  \n\
+prefetchnta (%[l1])          # Tell CPU not to cache output data              \n\
+prefetchnta (%[l2])          # Tell CPU not to cache output data              \n\
+"
+
+#define AVX2_YUV420_YUYV_ALIGNED "                                                  \n\
+vpunpcklbw %%ymm3, %%ymm2, %%ymm2  # Interleave u,v             ... v1  u1  v0  u0  \n\
+vpunpcklbw %%ymm2, %%ymm0, %%ymm3  # Interleave (low) y1,uv     ... v0  y1  u0  y0  \n\
+vmovntdq   %%ymm3, (%[l1])         # Store low YUYV                                 \n\
+vpunpckhbw %%ymm2, %%ymm0, %%ymm4  # Interleave (high) y1,uv    ... v4 y17  u4 y16  \n\
+vmovntdq   %%ymm4, 32(%[l1])       # Store high YUYV                                \n\
+vpunpcklbw %%ymm2, %%ymm1, %%ymm3  # Interleave (low) y2,uv     ... v0  Y1  u0  Y0  \n\
+vmovntdq   %%ymm3, (%[l2])         # Store low YUYV                                 \n\
+vpunpckhbw %%ymm2, %%ymm1, %%ymm4  # Interleave (high) y2,uv    ... v4 Y17  u4 Y16  \n\
+vmovntdq   %%ymm4, 32(%[l2])       # Store high YUYV                                \n\
+"
+
+#define AVX2_YUV420_YUYV_UNALIGNED "                                                \n\
+vpunpcklbw %%ymm3, %%ymm2, %%ymm2  # Interleave u,v             ... v1  u1  v0  u0  \n\
+vpunpcklbw %%ymm2, %%ymm0, %%ymm3  # Interleave (low) y1,uv     ... v0  y1  u0  y0  \n\
+vmovdqu    %%ymm3, (%[l1])         # Store low YUYV                                 \n\
+vpunpckhbw %%ymm2, %%ymm0, %%ymm4  # Interleave (high) y1,uv    ... v4 y17  u4 y16  \n\
+vmovdqu    %%ymm4, 32(%[l1])       # Store high YUYV                                \n\
+vpunpcklbw %%ymm2, %%ymm1, %%ymm3  # Interleave (low) y2,uv     ... v0  Y1  u0  Y0  \n\
+vmovdqu    %%ymm3, (%[l2])         # Store low YUYV                                 \n\
+vpunpckhbw %%ymm2, %%ymm1, %%ymm4  # Interleave (high) y2,uv    ... v4 Y17  u4 Y16  \n\
+vmovdqu    %%ymm4, 32(%[l2])       # Store high YUYV                                \n\
+"
+
+#define AVX2_YUV420_YVYU_ALIGNED "                                                  \n\
+vpunpcklbw %%ymm2, %%ymm3, %%ymm2  # Interleave v,u             ... u1  v1  u0  v0  \n\
+vpunpcklbw %%ymm2, %%ymm0, %%ymm3  # Interleave (low) y1,vu     ... u0  y1  v0  y0  \n\
+vmovntdq   %%ymm3, (%[l1])         # Store low YVYU                                 \n\
+vpunpckhbw %%ymm2, %%ymm0, %%ymm4  # Interleave (high) y1,vu    ... u4 y17  v4 y16  \n\
+vmovntdq   %%ymm4, 32(%[l1])       # Store high YVYU                                \n\
+vpunpcklbw %%ymm2, %%ymm1, %%ymm3  # Interleave (low) y2,vu     ... u0  Y1  v0  Y0  \n\
+vmovntdq   %%ymm3, (%[l2])         # Store low YVYU                                 \n\
+vpunpckhbw %%ymm2, %%ymm1, %%ymm4  # Interleave (high) y2,vu    ... u4 Y17  v4 Y16  \n\
+vmovntdq   %%ymm4, 32(%[l2])       # Store high YVYU                                \n\
+"
+
+#define AVX2_YUV420_YVYU_UNALIGNED "                                                \n\
+vpunpcklbw %%ymm2, %%ymm3, %%ymm2  # Interleave v,u             ... u1  v1  u0  v0  \n\
+vpunpcklbw %%ymm2, %%ymm0, %%ymm3  # Interleave (low) y1,vu     ... u0  y1  v0  y0  \n\
+vmovdqu    %%ymm3, (%[l1])         # Store low YVYU                                 \n\
+vpunpckhbw %%ymm2, %%ymm0, %%ymm4  # Interleave (high) y1,vu    ... u4 y17  v4 y16  \n\
+vmovdqu    %%ymm4, 32(%[l1])       # Store high YVYU                                \n\
+vpunpcklbw %%ymm2, %%ymm1, %%ymm3  # Interleave (low) y2,vu     ... u0  Y1  v0  Y0  \n\
+vmovdqu    %%ymm3, (%[l2])         # Store low YVYU                                 \n\
+vpunpckhbw %%ymm2, %%ymm1, %%ymm4  # Interleave (high) y2,vu    ... u4 Y17  v4 Y16  \n\
+vmovdqu    %%ymm4, 32(%[l2])       # Store high YVYU                                \n\
+"
+
+#define AVX2_YUV420_UYVY_ALIGNED "                                                  \n\
+vpunpcklbw %%ymm3, %%ymm2, %%ymm2  # Interleave u,v             ... v1  u1  v0  u0  \n\
+vpunpcklbw %%ymm0, %%ymm2, %%ymm3  # Interleave (low) uv,y1     ... y1  v0  y0  u0  \n\
+vmovntdq   %%ymm3, (%[l1])         # Store low UYVY                                 \n\
+vpunpckhbw %%ymm0, %%ymm2, %%ymm4  # Interleave (high) uv,y1   ... y17  v8 y16  u8  \n\
+vmovntdq   %%ymm4, 32(%[l1])       # Store high UYVY                                \n\
+vpunpcklbw %%ymm1, %%ymm2, %%ymm3  # Interleave (low) uv,y2     ... Y1  v0  Y0  u0  \n\
+vmovntdq   %%ymm3, (%[l2])         # Store low UYVY                                 \n\
+vpunpckhbw %%ymm1, %%ymm2, %%ymm4  # Interleave (high) uv,y2   ... Y17  v8 Y16  u8  \n\
+vmovntdq   %%ymm4, 32(%[l2])       # Store high UYVY                                \n\
+"
+
+#define AVX2_YUV420_UYVY_UNALIGNED "                                                \n\
+vpunpcklbw %%ymm3, %%ymm2, %%ymm2  # Interleave u,v             ... v1  u1  v0  u0  \n\
+vpunpcklbw %%ymm0, %%ymm2, %%ymm3  # Interleave (low) uv,y1     ... y1  v0  y0  u0  \n\
+vmovdqu    %%ymm3, (%[l1])         # Store low UYVY                                 \n\
+vpunpckhbw %%ymm0, %%ymm2, %%ymm4  # Interleave (high) uv,y1   ... y17  v8 y16  u8  \n\
+vmovdqu    %%ymm4, 32(%[l1])       # Store high UYVY                                \n\
+vpunpcklbw %%ymm1, %%ymm2, %%ymm3  # Interleave (low) uv,y2     ... Y1  v0  Y0  u0  \n\
+vmovdqu    %%ymm3, (%[l2])         # Store low UYVY                                 \n\
+vpunpckhbw %%ymm1, %%ymm2, %%ymm4  # Interleave (high) uv,y2   ... Y17  v8 Y16  u8  \n\
+vmovdqu    %%ymm4, 32(%[l2])       # Store high UYVY                                \n\
+"
+
+#elif defined(HAVE_AVX2_INTRINSICS)
+
+/* AVX2 intrinsics */
+
+#include <immintrin.h>
+
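+/* Intrinsics fallback: the ymmN locals stand in for the registers used by
+   the inline-asm variant above; the pointer increments are identical */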
+#define AVX2_CALL(AVX2_INSTRUCTIONS)            \
+    do {                                        \
+        __m256i ymm0, ymm1, ymm2, ymm3, ymm4;   \
+        AVX2_INSTRUCTIONS                       \
+        p_line1 += 64; p_line2 += 64;           \
+        p_y1 += 32; p_y2 += 32;                 \
+        p_u += 16; p_v += 16;                   \
+    } while(0)
+
+#define AVX2_END  _mm_sfence()
+
+#define AVX2_INIT_ALIGNED                       \
+    ymm0 = _mm256_load_si256((__m256i *)p_y1);  \
+    ymm1 = _mm256_load_si256((__m256i *)p_y2);  \
+    ymm2 = _mm256_inserti128_si256(ymm2, *((__m128i*)p_u), 0); \
+    ymm3 = _mm256_inserti128_si256(ymm3, *((__m128i*)p_v), 0);
+
+#define AVX2_INIT_UNALIGNED                     \
+    ymm0 = _mm256_loadu_si256((__m256i *)p_y1); \
+    ymm1 = _mm256_loadu_si256((__m256i *)p_y2); \
+    ymm2 = _mm256_inserti128_si256(ymm2, _mm_loadu_si128((__m128i*)p_u), 0); \
+    ymm3 = _mm256_inserti128_si256(ymm3, _mm_loadu_si128((__m128i*)p_v), 0); \
+    _mm_prefetch(p_line1, _MM_HINT_NTA);        \
+    _mm_prefetch(p_line2, _MM_HINT_NTA);
+
+#define AVX2_YUV420_YUYV_ALIGNED                       \
+    ymm2 = _mm256_unpacklo_epi8(ymm2, ymm3);           \
+    ymm3 = _mm256_unpacklo_epi8(ymm0, ymm2);           \
+    _mm256_stream_si256((__m256i*)(p_line1), ymm3);    \
+    ymm4 = _mm256_unpackhi_epi8(ymm0, ymm2);           \
+    _mm256_stream_si256((__m256i*)(p_line1+32), ymm4); \
+    ymm3 = _mm256_unpacklo_epi8(ymm1, ymm2);           \
+    _mm256_stream_si256((__m256i*)(p_line2), ymm3);    \
+    ymm4 = _mm256_unpackhi_epi8(ymm1, ymm2);           \
+    _mm256_stream_si256((__m256i*)(p_line2+32), ymm4);
+
+#define AVX2_YUV420_YUYV_UNALIGNED                     \
+    ymm2 = _mm256_unpacklo_epi8(ymm2, ymm3);           \
+    ymm3 = _mm256_unpacklo_epi8(ymm0, ymm2);           \
+    _mm256_storeu_si256((__m256i*)(p_line1), ymm3);    \
+    ymm4 = _mm256_unpackhi_epi8(ymm0, ymm2);           \
+    _mm256_storeu_si256((__m256i*)(p_line1+32), ymm4); \
+    ymm3 = _mm256_unpacklo_epi8(ymm1, ymm2);           \
+    _mm256_storeu_si256((__m256i*)(p_line2), ymm3);    \
+    ymm4 = _mm256_unpackhi_epi8(ymm1, ymm2);           \
+    _mm256_storeu_si256((__m256i*)(p_line2+32), ymm4);
+
+#define AVX2_YUV420_YVYU_ALIGNED                       \
+    ymm2 = _mm256_unpacklo_epi8(ymm3, ymm2);           \
+    ymm3 = _mm256_unpacklo_epi8(ymm0, ymm2);           \
+    _mm256_stream_si256((__m256i*)(p_line1), ymm3);    \
+    ymm4 = _mm256_unpackhi_epi8(ymm0, ymm2);           \
+    _mm256_stream_si256((__m256i*)(p_line1+32), ymm4); \
+    ymm3 = _mm256_unpacklo_epi8(ymm1, ymm2);           \
+    _mm256_stream_si256((__m256i*)(p_line2), ymm3);    \
+    ymm4 = _mm256_unpackhi_epi8(ymm1, ymm2);           \
+    _mm256_stream_si256((__m256i*)(p_line2+32), ymm4);
+
+#define AVX2_YUV420_YVYU_UNALIGNED                     \
+    ymm2 = _mm256_unpacklo_epi8(ymm3, ymm2);           \
+    ymm3 = _mm256_unpacklo_epi8(ymm0, ymm2);           \
+    _mm256_storeu_si256((__m256i*)(p_line1), ymm3);    \
+    ymm4 = _mm256_unpackhi_epi8(ymm0, ymm2);           \
+    _mm256_storeu_si256((__m256i*)(p_line1+32), ymm4); \
+    ymm3 = _mm256_unpacklo_epi8(ymm1, ymm2);           \
+    _mm256_storeu_si256((__m256i*)(p_line2), ymm3);    \
+    ymm4 = _mm256_unpackhi_epi8(ymm1, ymm2);           \
+    _mm256_storeu_si256((__m256i*)(p_line2+32), ymm4);
+
+#define AVX2_YUV420_UYVY_ALIGNED                       \
+    ymm2 = _mm256_unpacklo_epi8(ymm2, ymm3);           \
+    ymm3 = _mm256_unpacklo_epi8(ymm2, ymm0);           \
+    _mm256_stream_si256((__m256i*)(p_line1), ymm3);    \
+    ymm4 = _mm256_unpackhi_epi8(ymm2, ymm0);           \
+    _mm256_stream_si256((__m256i*)(p_line1+32), ymm4); \
+    ymm3 = _mm256_unpacklo_epi8(ymm2, ymm1);           \
+    _mm256_stream_si256((__m256i*)(p_line2), ymm3);    \
+    ymm4 = _mm256_unpackhi_epi8(ymm2, ymm1);           \
+    _mm256_stream_si256((__m256i*)(p_line2+32), ymm4);
+
+#define AVX2_YUV420_UYVY_UNALIGNED                     \
+    ymm2 = _mm256_unpacklo_epi8(ymm2, ymm3);           \
+    ymm3 = _mm256_unpacklo_epi8(ymm2, ymm0);           \
+    _mm256_storeu_si256((__m256i*)(p_line1), ymm3);    \
+    ymm4 = _mm256_unpackhi_epi8(ymm2, ymm0);           \
+    _mm256_storeu_si256((__m256i*)(p_line1+32), ymm4); \
+    ymm3 = _mm256_unpacklo_epi8(ymm2, ymm1);           \
+    _mm256_storeu_si256((__m256i*)(p_line2), ymm3);    \
+    ymm4 = _mm256_unpackhi_epi8(ymm2, ymm1);           \
+    _mm256_storeu_si256((__m256i*)(p_line2+32), ymm4);
+
+#endif
+
 #endif
 
 /* Used in both accelerated and C modules */
diff --git a/modules/video_chroma/i422_yuy2.c b/modules/video_chroma/i422_yuy2.c
index 880be2cff7..1695c850d8 100644
--- a/modules/video_chroma/i422_yuy2.c
+++ b/modules/video_chroma/i422_yuy2.c
@@ -1,7 +1,7 @@
 /*****************************************************************************
  * i422_yuy2.c : Planar YUV 4:2:2 to Packed YUV conversion module for vlc
  *****************************************************************************
- * Copyright (C) 2000, 2001 VLC authors and VideoLAN
+ * Copyright (C) 2000, 2001, 2019 VLC authors and VideoLAN
  *
  * Authors: Samuel Hocevar <sam at zoy.org>
  *          Damien Fouilleul <damienf at videolan.org>
@@ -81,6 +81,11 @@ vlc_module_begin ()
     set_capability( "video converter", 120 )
 # define vlc_CPU_capable() vlc_CPU_SSE2()
 # define VLC_TARGET VLC_SSE
+#elif defined (MODULE_NAME_IS_i422_yuy2_avx2)
+    set_description( N_("AVX2 conversions from " SRC_FOURCC " to " DEST_FOURCC) )
+    set_capability( "video converter", 130 )
+# define vlc_CPU_capable() vlc_CPU_AVX2()
+# define VLC_TARGET VLC_AVX
 #endif
     set_callback( Activate )
 vlc_module_end ()
@@ -179,7 +184,59 @@ static void I422_YUY2( filter_t *p_filter, picture_t *p_source,
                                - p_dest->p->i_visible_pitch
                                - ( p_filter->fmt_out.video.i_x_offset * 2 );
 
-#if defined (MODULE_NAME_IS_i422_yuy2_sse2)
+#if defined (MODULE_NAME_IS_i422_yuy2_avx2)
+
+    /* AVX2 aligned load/store instructions require 32-byte alignment */
+
+    if( 0 == (31 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
+        ((intptr_t)p_line|(intptr_t)p_y))) )
+    {
+        /* use AVX2 aligned fetch and store */
+        for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) ; i_y-- ; )
+        {
+            for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32 ; i_x-- ; )
+            {
+                AVX2_CALL(
+                    AVX2_INIT_ALIGNED
+                    AVX2_YUV422_YUYV_ALIGNED
+                );
+            }
+            for( i_x = ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 32 ) / 2; i_x-- ; )
+            {
+                C_YUV422_YUYV( p_line, p_y, p_u, p_v );
+            }
+            p_y += i_source_margin;
+            p_u += i_source_margin_c;
+            p_v += i_source_margin_c;
+            p_line += i_dest_margin;
+        }
+    }
+    else {
+        /* use AVX2 unaligned fetch and store */
+        for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) ; i_y-- ; )
+        {
+            for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32 ; i_x-- ; )
+            {
+                AVX2_CALL(
+                    AVX2_INIT_UNALIGNED
+                    AVX2_YUV422_YUYV_UNALIGNED
+                );
+            }
+            for( i_x = ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 32 ) / 2; i_x-- ; )
+            {
+                C_YUV422_YUYV( p_line, p_y, p_u, p_v );
+            }
+            p_y += i_source_margin;
+            p_u += i_source_margin_c;
+            p_v += i_source_margin_c;
+            p_line += i_dest_margin;
+        }
+    }
+    AVX2_END;
+
+#elif defined (MODULE_NAME_IS_i422_yuy2_sse2)
+
+    /* SSE2 aligned load/store instructions are faster, but require 16-byte alignment */
 
     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
         ((intptr_t)p_line|(intptr_t)p_y))) )
@@ -276,7 +333,59 @@ static void I422_YVYU( filter_t *p_filter, picture_t *p_source,
                                - p_dest->p->i_visible_pitch
                                - ( p_filter->fmt_out.video.i_x_offset * 2 );
 
-#if defined (MODULE_NAME_IS_i422_yuy2_sse2)
+#if defined (MODULE_NAME_IS_i422_yuy2_avx2)
+
+    /* AVX2 aligned load/store instructions require 32-byte alignment */
+
+    if( 0 == (31 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
+        ((intptr_t)p_line|(intptr_t)p_y))) )
+    {
+        /* use AVX2 aligned fetch and store */
+        for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) ; i_y-- ; )
+        {
+            for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32 ; i_x-- ; )
+            {
+                AVX2_CALL(
+                    AVX2_INIT_ALIGNED
+                    AVX2_YUV422_YVYU_ALIGNED
+                );
+            }
+            for( i_x = ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 32 ) / 2; i_x-- ; )
+            {
+                C_YUV422_YVYU( p_line, p_y, p_u, p_v );
+            }
+            p_y += i_source_margin;
+            p_u += i_source_margin_c;
+            p_v += i_source_margin_c;
+            p_line += i_dest_margin;
+        }
+    }
+    else {
+        /* use AVX2 unaligned fetch and store */
+        for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) ; i_y-- ; )
+        {
+            for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32 ; i_x-- ; )
+            {
+                AVX2_CALL(
+                    AVX2_INIT_UNALIGNED
+                    AVX2_YUV422_YVYU_UNALIGNED
+                );
+            }
+            for( i_x = ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 32 ) / 2; i_x-- ; )
+            {
+                C_YUV422_YVYU( p_line, p_y, p_u, p_v );
+            }
+            p_y += i_source_margin;
+            p_u += i_source_margin_c;
+            p_v += i_source_margin_c;
+            p_line += i_dest_margin;
+        }
+    }
+    AVX2_END;
+
+#elif defined (MODULE_NAME_IS_i422_yuy2_sse2)
+
+    /* SSE2 aligned load/store instructions are faster, but require 16-byte alignment */
 
     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
         ((intptr_t)p_line|(intptr_t)p_y))) )
@@ -373,7 +482,59 @@ static void I422_UYVY( filter_t *p_filter, picture_t *p_source,
                                - p_dest->p->i_visible_pitch
                                - ( p_filter->fmt_out.video.i_x_offset * 2 );
 
-#if defined (MODULE_NAME_IS_i422_yuy2_sse2)
+#if defined (MODULE_NAME_IS_i422_yuy2_avx2)
+
+    /* AVX2 aligned store/load is faster but requires 32-byte alignment */
+
+    if( 0 == (31 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
+        ((intptr_t)p_line|(intptr_t)p_y))) )
+    {
+        /* use AVX2 aligned fetch and store */
+        for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) ; i_y-- ; )
+        {
+            for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32 ; i_x-- ; )
+            {
+                AVX2_CALL(
+                    AVX2_INIT_ALIGNED
+                    AVX2_YUV422_UYVY_ALIGNED
+                );
+            }
+            for( i_x = ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 32 ) / 2; i_x-- ; )
+            {
+                C_YUV422_UYVY( p_line, p_y, p_u, p_v );
+            }
+            p_y += i_source_margin;
+            p_u += i_source_margin_c;
+            p_v += i_source_margin_c;
+            p_line += i_dest_margin;
+        }
+    }
+    else {
+        /* use AVX2 unaligned fetch and store */
+        for( i_y = (p_filter->fmt_in.video.i_y_offset + p_filter->fmt_in.video.i_visible_height) ; i_y-- ; )
+        {
+            for( i_x = (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) / 32 ; i_x-- ; )
+            {
+                AVX2_CALL(
+                    AVX2_INIT_UNALIGNED
+                    AVX2_YUV422_UYVY_UNALIGNED
+                );
+            }
+            for( i_x = ( (p_filter->fmt_in.video.i_x_offset + p_filter->fmt_in.video.i_visible_width) % 32 ) / 2; i_x-- ; )
+            {
+                C_YUV422_UYVY( p_line, p_y, p_u, p_v );
+            }
+            p_y += i_source_margin;
+            p_u += i_source_margin_c;
+            p_v += i_source_margin_c;
+            p_line += i_dest_margin;
+        }
+    }
+    AVX2_END;
+
+#elif defined (MODULE_NAME_IS_i422_yuy2_sse2)
+
+    /* SSE2 aligned store/load is faster but requires 16-byte alignment */
 
     if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch|
         ((intptr_t)p_line|(intptr_t)p_y))) )
diff --git a/modules/video_chroma/i422_yuy2.h b/modules/video_chroma/i422_yuy2.h
index 424d52280d..8135fe1853 100644
--- a/modules/video_chroma/i422_yuy2.h
+++ b/modules/video_chroma/i422_yuy2.h
@@ -1,10 +1,11 @@
 /*****************************************************************************
  * i422_yuy2.h : YUV to YUV conversion module for vlc
  *****************************************************************************
- * Copyright (C) 2002 VLC authors and VideoLAN
+ * Copyright (C) 2002, 2019 VLC authors and VideoLAN
  *
  * Authors: Samuel Hocevar <sam at zoy.org>
  *          Damien Fouilleul <damienf at videolan.org>
+ *          Lyndon Brown <jnqnfe at gmail.com>
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU Lesser General Public License as published by
@@ -309,6 +310,159 @@ movdqu    %%xmm1, 16(%0)  # Store high UYVY                             \n\
 
 #endif
 
+#elif defined( MODULE_NAME_IS_i422_yuy2_avx2 )
+
+#if defined(CAN_COMPILE_AVX2)
+
+/* AVX2 assembly */
+
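+/* Each AVX2_CALL invocation converts 32 pixels: it reads 32 Y, 16 Cb and 16 Cr
+ * bytes and writes 64 bytes of packed 4:2:2 output, then advances the four
+ * pointers accordingly.  Note that the 256-bit vpunpck* instructions interleave
+ * each 128-bit lane separately, hence the vpermq in the INIT macros and the
+ * vperm2i128 re-ordering before the stores. */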
+#define AVX2_CALL(AVX2_INSTRUCTIONS)        \
+    do {                                    \
+    __asm__ __volatile__(                   \
+        ".p2align 3 \n\t"                   \
+        AVX2_INSTRUCTIONS                   \
+        :                                   \
+        : [l]"r"(p_line), [y]"r"(p_y),      \
+          [u]"r"(p_u), [v]"r"(p_v)          \
+        : "ymm0", "ymm1", "ymm2" );         \
+        p_line += 64; p_y += 32;            \
+        p_u += 16; p_v += 16;               \
+    } while(0)
+
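+/* The aligned variants use non-temporal stores (vmovntdq), which are weakly
+ * ordered; each converter therefore ends with AVX2_END to fence them. */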
+#define AVX2_END  __asm__ __volatile__ ( "sfence" ::: "memory" )
+
+#define AVX2_INIT_ALIGNED "                                                    \n\
+vmovdqa      (%[y]), %%ymm0  # Load 32 Y                      ...  y2  y1  y0  \n\
+vmovdqa      (%[u]), %%xmm1  # Load 16 Cb into low half       ...  u2  u1  u0  \n\
+vmovdqa      (%[v]), %%xmm2  # Load 16 Cr into low half       ...  v2  v1  v0  \n\
+vpermq       $0xd8, %%ymm1, %%ymm1  # u0..u7 to low lane, u8..u15 to high lane \n\
+vpermq       $0xd8, %%ymm2, %%ymm2  # v0..v7 to low lane, v8..v15 to high lane \n\
+"
+
+#define AVX2_INIT_UNALIGNED "                                                  \n\
+vmovdqu      (%[y]), %%ymm0  # Load 32 Y                      ...  y2  y1  y0  \n\
+vmovdqu      (%[u]), %%xmm1  # Load 16 Cb into low half       ...  u2  u1  u0  \n\
+vmovdqu      (%[v]), %%xmm2  # Load 16 Cr into low half       ...  v2  v1  v0  \n\
+vpermq       $0xd8, %%ymm1, %%ymm1  # u0..u7 to low lane, u8..u15 to high lane \n\
+vpermq       $0xd8, %%ymm2, %%ymm2  # v0..v7 to low lane, v8..v15 to high lane \n\
+prefetchnta  (%[l])          # Tell CPU not to cache output data               \n\
+"
+
+#define AVX2_YUV422_YUYV_ALIGNED "                                                   \n\
+vpunpcklbw %%ymm2, %%ymm1, %%ymm1  # Interleave u,v per lane     ... v1  u1  v0  u0  \n\
+vpunpcklbw %%ymm1, %%ymm0, %%ymm2  # Interleave y,uv: pixels 0-7 / 16-23             \n\
+vpunpckhbw %%ymm1, %%ymm0, %%ymm0  # Interleave y,uv: pixels 8-15 / 24-31            \n\
+vperm2i128 $0x20, %%ymm0, %%ymm2, %%ymm1  # Gather pixels 0-15                       \n\
+vmovntdq   %%ymm1, (%[l])          # Store low YUYV                                  \n\
+vperm2i128 $0x31, %%ymm0, %%ymm2, %%ymm1  # Gather pixels 16-31                      \n\
+vmovntdq   %%ymm1, 32(%[l])        # Store high YUYV                                 \n\
+"
+
+#define AVX2_YUV422_YUYV_UNALIGNED "                                                 \n\
+vpunpcklbw %%ymm2, %%ymm1, %%ymm1  # Interleave u,v per lane     ... v1  u1  v0  u0  \n\
+vpunpcklbw %%ymm1, %%ymm0, %%ymm2  # Interleave y,uv: pixels 0-7 / 16-23             \n\
+vpunpckhbw %%ymm1, %%ymm0, %%ymm0  # Interleave y,uv: pixels 8-15 / 24-31            \n\
+vperm2i128 $0x20, %%ymm0, %%ymm2, %%ymm1  # Gather pixels 0-15                       \n\
+vmovdqu    %%ymm1, (%[l])          # Store low YUYV                                  \n\
+vperm2i128 $0x31, %%ymm0, %%ymm2, %%ymm1  # Gather pixels 16-31                      \n\
+vmovdqu    %%ymm1, 32(%[l])        # Store high YUYV                                 \n\
+"
+
+#define AVX2_YUV422_YVYU_ALIGNED "                                                   \n\
+vpunpcklbw %%ymm1, %%ymm2, %%ymm1  # Interleave v,u per lane     ... u1  v1  u0  v0  \n\
+vpunpcklbw %%ymm1, %%ymm0, %%ymm2  # Interleave y,vu: pixels 0-7 / 16-23             \n\
+vpunpckhbw %%ymm1, %%ymm0, %%ymm0  # Interleave y,vu: pixels 8-15 / 24-31            \n\
+vperm2i128 $0x20, %%ymm0, %%ymm2, %%ymm1  # Gather pixels 0-15                       \n\
+vmovntdq   %%ymm1, (%[l])          # Store low YVYU                                  \n\
+vperm2i128 $0x31, %%ymm0, %%ymm2, %%ymm1  # Gather pixels 16-31                      \n\
+vmovntdq   %%ymm1, 32(%[l])        # Store high YVYU                                 \n\
+"
+
+#define AVX2_YUV422_YVYU_UNALIGNED "                                                 \n\
+vpunpcklbw %%ymm1, %%ymm2, %%ymm1  # Interleave v,u per lane     ... u1  v1  u0  v0  \n\
+vpunpcklbw %%ymm1, %%ymm0, %%ymm2  # Interleave y,vu: pixels 0-7 / 16-23             \n\
+vpunpckhbw %%ymm1, %%ymm0, %%ymm0  # Interleave y,vu: pixels 8-15 / 24-31            \n\
+vperm2i128 $0x20, %%ymm0, %%ymm2, %%ymm1  # Gather pixels 0-15                       \n\
+vmovdqu    %%ymm1, (%[l])          # Store low YVYU                                  \n\
+vperm2i128 $0x31, %%ymm0, %%ymm2, %%ymm1  # Gather pixels 16-31                      \n\
+vmovdqu    %%ymm1, 32(%[l])        # Store high YVYU                                 \n\
+"
+
+#define AVX2_YUV422_UYVY_ALIGNED "                                                   \n\
+vpunpcklbw %%ymm2, %%ymm1, %%ymm1  # Interleave u,v per lane     ... v1  u1  v0  u0  \n\
+vpunpcklbw %%ymm0, %%ymm1, %%ymm2  # Interleave uv,y: pixels 0-7 / 16-23             \n\
+vpunpckhbw %%ymm0, %%ymm1, %%ymm0  # Interleave uv,y: pixels 8-15 / 24-31            \n\
+vperm2i128 $0x20, %%ymm0, %%ymm2, %%ymm1  # Gather pixels 0-15                       \n\
+vmovntdq   %%ymm1, (%[l])          # Store low UYVY                                  \n\
+vperm2i128 $0x31, %%ymm0, %%ymm2, %%ymm1  # Gather pixels 16-31                      \n\
+vmovntdq   %%ymm1, 32(%[l])        # Store high UYVY                                 \n\
+"
+
+#define AVX2_YUV422_UYVY_UNALIGNED "                                                 \n\
+vpunpcklbw %%ymm2, %%ymm1, %%ymm1  # Interleave u,v per lane     ... v1  u1  v0  u0  \n\
+vpunpcklbw %%ymm0, %%ymm1, %%ymm2  # Interleave uv,y: pixels 0-7 / 16-23             \n\
+vpunpckhbw %%ymm0, %%ymm1, %%ymm0  # Interleave uv,y: pixels 8-15 / 24-31            \n\
+vperm2i128 $0x20, %%ymm0, %%ymm2, %%ymm1  # Gather pixels 0-15                       \n\
+vmovdqu    %%ymm1, (%[l])          # Store low UYVY                                  \n\
+vperm2i128 $0x31, %%ymm0, %%ymm2, %%ymm1  # Gather pixels 16-31                      \n\
+vmovdqu    %%ymm1, 32(%[l])        # Store high UYVY                                 \n\
+"
+
+#elif defined(HAVE_AVX2_INTRINSICS)
+
+/* AVX2 intrinsics */
+
+#include <immintrin.h>
+
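+/* Intrinsics variant of the routines above: the local __m256i variables play
+ * the role of the ymm registers used by the inline assembly, and each
+ * AVX2_CALL likewise converts 32 pixels per invocation. */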
+#define AVX2_CALL(AVX2_INSTRUCTIONS)    \
+    do {                                \
+        __m256i ymm0, ymm1, ymm2;       \
+        AVX2_INSTRUCTIONS               \
+        p_line += 64; p_y += 32;        \
+        p_u += 16; p_v += 16;           \
+    } while(0)
+
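+/* As in the assembly variant: fence the non-temporal (streaming) stores. */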
+#define AVX2_END  _mm_sfence()
+
+#define AVX2_INIT_ALIGNED                                                    \
+    ymm0 = _mm256_load_si256((__m256i *)p_y);          /* load 32 Y       */ \
+    ymm1 = _mm256_inserti128_si256(_mm256_setzero_si256(),                   \
+                                   _mm_load_si128((__m128i *)p_u), 0);       \
+    ymm1 = _mm256_permute4x64_epi64(ymm1, 0xd8);       /* split Cb lanes  */ \
+    ymm2 = _mm256_inserti128_si256(_mm256_setzero_si256(),                   \
+                                   _mm_load_si128((__m128i *)p_v), 0);       \
+    ymm2 = _mm256_permute4x64_epi64(ymm2, 0xd8);       /* split Cr lanes  */
+
+#define AVX2_INIT_UNALIGNED                                                  \
+    ymm0 = _mm256_loadu_si256((__m256i *)p_y);         /* load 32 Y       */ \
+    ymm1 = _mm256_inserti128_si256(_mm256_setzero_si256(),                   \
+                                   _mm_loadu_si128((__m128i *)p_u), 0);      \
+    ymm1 = _mm256_permute4x64_epi64(ymm1, 0xd8);       /* split Cb lanes  */ \
+    ymm2 = _mm256_inserti128_si256(_mm256_setzero_si256(),                   \
+                                   _mm_loadu_si128((__m128i *)p_v), 0);      \
+    ymm2 = _mm256_permute4x64_epi64(ymm2, 0xd8);       /* split Cr lanes  */ \
+    _mm_prefetch((char const *)p_line, _MM_HINT_NTA);
+
+#define AVX2_YUV422_YUYV_ALIGNED                                               \
+    ymm1 = _mm256_unpacklo_epi8(ymm1, ymm2);   /* interleave u,v per lane   */ \
+    ymm2 = _mm256_unpacklo_epi8(ymm0, ymm1);   /* y,uv: pixels 0-7 / 16-23  */ \
+    ymm0 = _mm256_unpackhi_epi8(ymm0, ymm1);   /* y,uv: pixels 8-15 / 24-31 */ \
+    ymm1 = _mm256_permute2x128_si256(ymm2, ymm0, 0x20); /* pixels 0-15      */ \
+    _mm256_stream_si256((__m256i*)(p_line), ymm1);                             \
+    ymm1 = _mm256_permute2x128_si256(ymm2, ymm0, 0x31); /* pixels 16-31     */ \
+    _mm256_stream_si256((__m256i*)(p_line+32), ymm1);
+
+#define AVX2_YUV422_YUYV_UNALIGNED                                             \
+    ymm1 = _mm256_unpacklo_epi8(ymm1, ymm2);   /* interleave u,v per lane   */ \
+    ymm2 = _mm256_unpacklo_epi8(ymm0, ymm1);   /* y,uv: pixels 0-7 / 16-23  */ \
+    ymm0 = _mm256_unpackhi_epi8(ymm0, ymm1);   /* y,uv: pixels 8-15 / 24-31 */ \
+    ymm1 = _mm256_permute2x128_si256(ymm2, ymm0, 0x20); /* pixels 0-15      */ \
+    _mm256_storeu_si256((__m256i*)(p_line), ymm1);                             \
+    ymm1 = _mm256_permute2x128_si256(ymm2, ymm0, 0x31); /* pixels 16-31     */ \
+    _mm256_storeu_si256((__m256i*)(p_line+32), ymm1);
+
+#define AVX2_YUV422_YVYU_ALIGNED                                               \
+    ymm1 = _mm256_unpacklo_epi8(ymm2, ymm1);   /* interleave v,u per lane   */ \
+    ymm2 = _mm256_unpacklo_epi8(ymm0, ymm1);   /* y,vu: pixels 0-7 / 16-23  */ \
+    ymm0 = _mm256_unpackhi_epi8(ymm0, ymm1);   /* y,vu: pixels 8-15 / 24-31 */ \
+    ymm1 = _mm256_permute2x128_si256(ymm2, ymm0, 0x20); /* pixels 0-15      */ \
+    _mm256_stream_si256((__m256i*)(p_line), ymm1);                             \
+    ymm1 = _mm256_permute2x128_si256(ymm2, ymm0, 0x31); /* pixels 16-31     */ \
+    _mm256_stream_si256((__m256i*)(p_line+32), ymm1);
+
+#define AVX2_YUV422_YVYU_UNALIGNED                                             \
+    ymm1 = _mm256_unpacklo_epi8(ymm2, ymm1);   /* interleave v,u per lane   */ \
+    ymm2 = _mm256_unpacklo_epi8(ymm0, ymm1);   /* y,vu: pixels 0-7 / 16-23  */ \
+    ymm0 = _mm256_unpackhi_epi8(ymm0, ymm1);   /* y,vu: pixels 8-15 / 24-31 */ \
+    ymm1 = _mm256_permute2x128_si256(ymm2, ymm0, 0x20); /* pixels 0-15      */ \
+    _mm256_storeu_si256((__m256i*)(p_line), ymm1);                             \
+    ymm1 = _mm256_permute2x128_si256(ymm2, ymm0, 0x31); /* pixels 16-31     */ \
+    _mm256_storeu_si256((__m256i*)(p_line+32), ymm1);
+
+#define AVX2_YUV422_UYVY_ALIGNED                                               \
+    ymm1 = _mm256_unpacklo_epi8(ymm1, ymm2);   /* interleave u,v per lane   */ \
+    ymm2 = _mm256_unpacklo_epi8(ymm1, ymm0);   /* uv,y: pixels 0-7 / 16-23  */ \
+    ymm0 = _mm256_unpackhi_epi8(ymm1, ymm0);   /* uv,y: pixels 8-15 / 24-31 */ \
+    ymm1 = _mm256_permute2x128_si256(ymm2, ymm0, 0x20); /* pixels 0-15      */ \
+    _mm256_stream_si256((__m256i*)(p_line), ymm1);                             \
+    ymm1 = _mm256_permute2x128_si256(ymm2, ymm0, 0x31); /* pixels 16-31     */ \
+    _mm256_stream_si256((__m256i*)(p_line+32), ymm1);
+
+#define AVX2_YUV422_UYVY_UNALIGNED                                             \
+    ymm1 = _mm256_unpacklo_epi8(ymm1, ymm2);   /* interleave u,v per lane   */ \
+    ymm2 = _mm256_unpacklo_epi8(ymm1, ymm0);   /* uv,y: pixels 0-7 / 16-23  */ \
+    ymm0 = _mm256_unpackhi_epi8(ymm1, ymm0);   /* uv,y: pixels 8-15 / 24-31 */ \
+    ymm1 = _mm256_permute2x128_si256(ymm2, ymm0, 0x20); /* pixels 0-15      */ \
+    _mm256_storeu_si256((__m256i*)(p_line), ymm1);                             \
+    ymm1 = _mm256_permute2x128_si256(ymm2, ymm0, 0x31); /* pixels 16-31     */ \
+    _mm256_storeu_si256((__m256i*)(p_line+32), ymm1);
+
+#endif
+
 #endif
 
 #define C_YUV422_YUYV( p_line, p_y, p_u, p_v )                              \


