[vlc-devel] [PATCH 1/1] deinterlace: x86: convert merge inline asm to x86inc/yasm

Mon Nov 28 10:28:26 CET 2016

Adds an AVX2 merge implementation. The AVX2 version softly depends on
the 32-byte alignment patch, since it is unfortunately slower than the
SSE2 version if the buffers only have 16-byte alignment.

On a full HD 4:2:0 video I get the following cycle counts on Haswell:
Merge8BitGeneric:        >4000
vlcpriv_merge_8bit_sse2:   425 (16 and 32-byte alignment)
vlcpriv_merge_8bit_avx2:   297 (32-byte alignment)
vlcpriv_merge_8bit_avx2:   554 (16-byte alignment)
---
 modules/video_filter/Makefile.am                   |   1 +
 modules/video_filter/deinterlace/common.h          |   1 +
 modules/video_filter/deinterlace/deinterlace.c     |  43 ++++---
 .../video_filter/deinterlace/deinterlace_x86.asm   |   4 +
 modules/video_filter/deinterlace/merge.c           | 132 ---------------------
 modules/video_filter/deinterlace/merge.h           |  75 ++++++------
 modules/video_filter/deinterlace/merge_x86.asm     |  67 +++++++++++
 7 files changed, 134 insertions(+), 189 deletions(-)
 create mode 100644 modules/video_filter/deinterlace/merge_x86.asm

diff --git a/modules/video_filter/Makefile.am b/modules/video_filter/Makefile.am
index 56b141f93b..35f908a37e 100644
--- a/modules/video_filter/Makefile.am
+++ b/modules/video_filter/Makefile.am
@@ -123,6 +123,7 @@ libdeinterlace_plugin_la_CFLAGS = $(AM_CFLAGS) -O2
 if HAVE_YASM
 libdeinterlace_plugin_la_CFLAGS += -DHAVE_YASM
 libdeinterlace_plugin_la_SOURCES += video_filter/deinterlace/deinterlace_x86.asm
+libdeinterlace_plugin_la_SOURCES += video_filter/deinterlace/merge_x86.asm
 libdeinterlace_plugin_la_SOURCES += video_filter/deinterlace/yadif_x86.asm
 endif
 if HAVE_NEON
diff --git a/modules/video_filter/deinterlace/common.h b/modules/video_filter/deinterlace/common.h
index 4f6c5462b2..cc980714ea 100644
--- a/modules/video_filter/deinterlace/common.h
+++ b/modules/video_filter/deinterlace/common.h
@@ -37,6 +37,7 @@
 
 #ifdef HAVE_YASM
 void vlcpriv_emms_ext_asm(void);
+void vlcpriv_femms_ext_asm(void);
 #endif
 
 #endif
diff --git a/modules/video_filter/deinterlace/deinterlace.c b/modules/video_filter/deinterlace/deinterlace.c
index 9123addab7..1cf709daa8 100644
--- a/modules/video_filter/deinterlace/deinterlace.c
+++ b/modules/video_filter/deinterlace/deinterlace.c
@@ -43,6 +43,7 @@
 #include <vlc_cpu.h>
 #include <vlc_mouse.h>
 
+#include "common.h"
 #include "deinterlace.h"
 #include "helpers.h"
 #include "merge.h"
@@ -669,28 +670,44 @@ notsupp:
         p_sys->pf_merge = MergeAltivec;
     else
 #endif
-#if defined(CAN_COMPILE_SSE2)
-    if( vlc_CPU_SSE2() )
+#if defined(HAVE_YASM)
+    if( vlc_CPU_AVX2() )
     {
-        p_sys->pf_merge = pixel_size == 1 ? Merge8BitSSE2 : Merge16BitSSE2;
-        p_sys->pf_end_merge = EndMMX;
+        if (pixel_size == 1)
+            p_sys->pf_merge = vlcpriv_merge_8bit_avx2;
+        else
+            p_sys->pf_merge = vlcpriv_merge_16bit_avx2;
+#if defined(__i386__)
+        /* YMM code leaves no MMX state; EndMerge() must find NULL here. */
+        p_sys->pf_end_merge = NULL;
+#endif
     }
-    else
-#endif
-#if defined(CAN_COMPILE_MMXEXT)
-    if( pixel_size == 1 && vlc_CPU_MMXEXT() )
+    else if( vlc_CPU_SSE2() )
     {
-        p_sys->pf_merge = MergeMMXEXT;
-        p_sys->pf_end_merge = EndMMX;
+        if (pixel_size == 1)
+            p_sys->pf_merge = vlcpriv_merge_8bit_sse2;
+        else
+            p_sys->pf_merge = vlcpriv_merge_16bit_sse2;
+#if defined(__i386__)
+        /* XMM code leaves no MMX state; EndMerge() must find NULL here. */
+        p_sys->pf_end_merge = NULL;
+#endif
     }
-    else
-#endif
-#if defined(CAN_COMPILE_3DNOW)
-    if( pixel_size == 1 && vlc_CPU_3dNOW() )
+#if defined(__i386__)
+    else if( vlc_CPU_MMXEXT() )
     {
-        p_sys->pf_merge = Merge3DNow;
-        p_sys->pf_end_merge = End3DNow;
+        if (pixel_size == 1)
+            p_sys->pf_merge = vlcpriv_merge_8bit_mmx2;
+        else
+            p_sys->pf_merge = vlcpriv_merge_16bit_mmx2;
+        p_sys->pf_end_merge = vlcpriv_emms_ext_asm;
     }
+    else if( pixel_size == 1 && vlc_CPU_3dNOW() )
+    {
+        p_sys->pf_merge = vlcpriv_merge_8bit_3dnow;
+        p_sys->pf_end_merge = vlcpriv_femms_ext_asm;
+    }
+#endif
     else
 #endif
 #if defined(CAN_COMPILE_ARM)
@@ -708,7 +717,7 @@ notsupp:
 #endif
     {
         p_sys->pf_merge = pixel_size == 1 ? Merge8BitGeneric : Merge16BitGeneric;
-#if defined(__i386__) || defined(__x86_64__)
+#if defined(__i386__)
         p_sys->pf_end_merge = NULL;
 #endif
     }
diff --git a/modules/video_filter/deinterlace/deinterlace_x86.asm b/modules/video_filter/deinterlace/deinterlace_x86.asm
index 60ce829b0d..322c268138 100644
--- a/modules/video_filter/deinterlace/deinterlace_x86.asm
+++ b/modules/video_filter/deinterlace/deinterlace_x86.asm
@@ -32,3 +32,7 @@ SECTION .text
 cglobal emms_ext_asm, 0,0,0
         emms
         RET
+
+cglobal femms_ext_asm, 0,0,0
+        femms
+        RET
diff --git a/modules/video_filter/deinterlace/merge.c b/modules/video_filter/deinterlace/merge.c
index 94cdd775ac..ec34f47bc6 100644
--- a/modules/video_filter/deinterlace/merge.c
+++ b/modules/video_filter/deinterlace/merge.c
@@ -34,10 +34,6 @@
 #include <vlc_cpu.h>
 #include "merge.h"
 
-#ifdef CAN_COMPILE_MMXEXT
-#   include "mmx.h"
-#endif
-
 #ifdef HAVE_ALTIVEC_H
 #   include <altivec.h>
 #endif
@@ -68,116 +64,6 @@ void Merge16BitGeneric( void *_p_dest, const void *_p_s1,
         *p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1;
 }
 
-#if defined(CAN_COMPILE_MMXEXT)
-VLC_MMX
-void MergeMMXEXT( void *_p_dest, const void *_p_s1, const void *_p_s2,
-                  size_t i_bytes )
-{
-    uint8_t *p_dest = _p_dest;
-    const uint8_t *p_s1 = _p_s1;
-    const uint8_t *p_s2 = _p_s2;
-
-    for( ; i_bytes >= 8; i_bytes -= 8 )
-    {
-        __asm__  __volatile__( "movq %2,%%mm1;"
-                               "pavgb %1, %%mm1;"
-                               "movq %%mm1, %0" :"=m" (*p_dest):
-                                                 "m" (*p_s1),
-                                                 "m" (*p_s2) : "mm1" );
-        p_dest += 8;
-        p_s1 += 8;
-        p_s2 += 8;
-    }
-
-    for( ; i_bytes > 0; i_bytes-- )
-        *p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1;
-}
-#endif
-
-#if defined(CAN_COMPILE_3DNOW)
-VLC_MMX
-void Merge3DNow( void *_p_dest, const void *_p_s1, const void *_p_s2,
-                 size_t i_bytes )
-{
-    uint8_t *p_dest = _p_dest;
-    const uint8_t *p_s1 = _p_s1;
-    const uint8_t *p_s2 = _p_s2;
-
-    for( ; i_bytes >= 8; i_bytes -= 8 )
-    {
-        __asm__  __volatile__( "movq %2,%%mm1;"
-                               "pavgusb %1, %%mm1;"
-                               "movq %%mm1, %0" :"=m" (*p_dest):
-                                                 "m" (*p_s1),
-                                                 "m" (*p_s2) : "mm1" );
-        p_dest += 8;
-        p_s1 += 8;
-        p_s2 += 8;
-    }
-
-    for( ; i_bytes > 0; i_bytes-- )
-        *p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1;
-}
-#endif
-
-#if defined(CAN_COMPILE_SSE)
-VLC_SSE
-void Merge8BitSSE2( void *_p_dest, const void *_p_s1, const void *_p_s2,
-                    size_t i_bytes )
-{
-    uint8_t *p_dest = _p_dest;
-    const uint8_t *p_s1 = _p_s1;
-    const uint8_t *p_s2 = _p_s2;
-
-    for( ; i_bytes > 0 && ((uintptr_t)p_s1 & 15); i_bytes-- )
-        *p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1;
-
-    for( ; i_bytes >= 16; i_bytes -= 16 )
-    {
-        __asm__  __volatile__( "movdqu %2,%%xmm1;"
-                               "pavgb %1, %%xmm1;"
-                               "movdqu %%xmm1, %0" :"=m" (*p_dest):
-                                                 "m" (*p_s1),
-                                                 "m" (*p_s2) : "xmm1" );
-        p_dest += 16;
-        p_s1 += 16;
-        p_s2 += 16;
-    }
-
-    for( ; i_bytes > 0; i_bytes-- )
-        *p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1;
-}
-
-VLC_SSE
-void Merge16BitSSE2( void *_p_dest, const void *_p_s1, const void *_p_s2,
-                     size_t i_bytes )
-{
-    uint16_t *p_dest = _p_dest;
-    const uint16_t *p_s1 = _p_s1;
-    const uint16_t *p_s2 = _p_s2;
-
-    size_t i_words = i_bytes / 2;
-    for( ; i_words > 0 && ((uintptr_t)p_s1 & 15); i_words-- )
-        *p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1;
-
-    for( ; i_words >= 8; i_words -= 8 )
-    {
-        __asm__  __volatile__( "movdqu %2,%%xmm1;"
-                               "pavgw %1, %%xmm1;"
-                               "movdqu %%xmm1, %0" :"=m" (*p_dest):
-                                                 "m" (*p_s1),
-                                                 "m" (*p_s2) : "xmm1" );
-        p_dest += 8;
-        p_s1 += 8;
-        p_s2 += 8;
-    }
-
-    for( ; i_words > 0; i_words-- )
-        *p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1;
-}
-
-#endif
-
 #ifdef CAN_COMPILE_C_ALTIVEC
 void MergeAltivec( void *_p_dest, const void *_p_s1,
                    const void *_p_s2, size_t i_bytes )
@@ -245,21 +131,3 @@ void MergeAltivec( void *_p_dest, const void *_p_s1,
         *p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1;
 }
 #endif
-
-/*****************************************************************************
- * EndMerge routines
- *****************************************************************************/
-
-#if defined(CAN_COMPILE_MMXEXT) || defined(CAN_COMPILE_SSE)
-void EndMMX( void )
-{
-    __asm__ __volatile__( "emms" :: );
-}
-#endif
-
-#if defined(CAN_COMPILE_3DNOW)
-void End3DNow( void )
-{
-    __asm__ __volatile__( "femms" :: );
-}
-#endif
diff --git a/modules/video_filter/deinterlace/merge.h b/modules/video_filter/deinterlace/merge.h
index 74b5ab57ff..2f79d053e3 100644
--- a/modules/video_filter/deinterlace/merge.h
+++ b/modules/video_filter/deinterlace/merge.h
@@ -64,7 +64,7 @@
  * EndMerge() macro, which must be called after the merge is
  * finished, if the Merge() macro was used to perform the merge.
  */
-#if defined(__i386__) || defined(__x86_64__)
+#if defined(__i386__)
 # define EndMerge() \
     if(p_filter->p_sys->pf_end_merge) (p_filter->p_sys->pf_end_merge)()
 #else
@@ -113,7 +113,7 @@ void Merge16BitGeneric( void *_p_dest, const void *_p_s1, const void *_p_s2,
 void MergeAltivec ( void *, const void *, const void *, size_t );
 #endif
 
-#if defined(CAN_COMPILE_MMXEXT)
+#if defined(HAVE_YASM)
 /**
  * MMXEXT routine to blend pixels from two picture lines.
  *
@@ -122,10 +122,18 @@ void MergeAltivec ( void *, const void *, const void *, size_t );
  * @param _p_s2 Source line B
  * @param i_bytes Number of bytes to merge
  */
-void MergeMMXEXT  ( void *, const void *, const void *, size_t );
-#endif
+void vlcpriv_merge_8bit_mmx2( void *, const void *, const void *, size_t );
+/**
+ * MMXEXT routine to blend 16-bit pixels from two picture lines.
+ *
+ * @param _p_dest Target
+ * @param _p_s1 Source line A
+ * @param _p_s2 Source line B
+ * @param i_bytes Number of bytes to merge
+ */
+void vlcpriv_merge_16bit_mmx2( void *, const void *, const void *, size_t );
+
 
-#if defined(CAN_COMPILE_3DNOW)
 /**
  * 3DNow routine to blend pixels from two picture lines.
  *
@@ -134,10 +142,8 @@ void MergeMMXEXT  ( void *, const void *, const void *, size_t );
  * @param _p_s2 Source line B
  * @param i_bytes Number of bytes to merge
  */
-void Merge3DNow   ( void *, const void *, const void *, size_t );
-#endif
+void vlcpriv_merge_8bit_3dnow( void *, const void *, const void *, size_t );
 
-#if defined(CAN_COMPILE_SSE)
 /**
  * SSE2 routine to blend pixels from two picture lines.
  *
@@ -146,7 +152,7 @@ void Merge3DNow   ( void *, const void *, const void *, size_t );
  * @param _p_s2 Source line B
  * @param i_bytes Number of bytes to merge
  */
-void Merge8BitSSE2( void *, const void *, const void *, size_t );
+void vlcpriv_merge_8bit_sse2( void *, const void *, const void *, size_t );
 /**
  * SSE2 routine to blend pixels from two picture lines.
  *
@@ -155,7 +161,26 @@ void Merge8BitSSE2( void *, const void *, const void *, size_t );
  * @param _p_s2 Source line B
  * @param i_bytes Number of bytes to merge
  */
-void Merge16BitSSE2( void *, const void *, const void *, size_t );
+void vlcpriv_merge_16bit_sse2( void *, const void *, const void *, size_t );
+
+/**
+ * AVX2 routine to blend 8-bit pixels from two picture lines.
+ *
+ * @param _p_dest Target
+ * @param _p_s1 Source line A
+ * @param _p_s2 Source line B
+ * @param i_bytes Number of bytes to merge
+ */
+void vlcpriv_merge_8bit_avx2( void *, const void *, const void *, size_t );
+/**
+ * AVX2 routine to blend 16-bit pixels from two picture lines.
+ *
+ * @param _p_dest Target
+ * @param _p_s1 Source line A
+ * @param _p_s2 Source line B
+ * @param i_bytes Number of bytes to merge
+ */
+void vlcpriv_merge_16bit_avx2( void *, const void *, const void *, size_t );
 #endif
 
 #if defined(CAN_COMPILE_ARM)
@@ -181,34 +206,4 @@ void merge16_arm64_neon (void *, const void *, const void *, size_t);
 
 #endif
 
-/*****************************************************************************
- * EndMerge routines
- *****************************************************************************/
-
-#if defined(CAN_COMPILE_MMXEXT) || defined(CAN_COMPILE_SSE)
-/**
- * MMX merge finalization routine.
- *
- * Must be called after an MMX merge is finished.
- * This exits MMX mode (by executing the "emms" instruction).
- *
- * The EndMerge() macro detects whether this is needed, and calls if it is,
- * so just use that.
- */
-void EndMMX       ( void );
-#endif
-
-#if defined(CAN_COMPILE_3DNOW)
-/**
- * 3DNow merge finalization routine.
- *
- * Must be called after a 3DNow merge is finished.
- * This exits 3DNow mode (by executing the "femms" instruction).
- *
- * The EndMerge() macro detects whether this is needed, and calls if it is,
- * so just use that.
- */
-void End3DNow     ( void );
-#endif
-
 #endif
diff --git a/modules/video_filter/deinterlace/merge_x86.asm b/modules/video_filter/deinterlace/merge_x86.asm
new file mode 100644
index 0000000000..630f9bda39
--- /dev/null
+++ b/modules/video_filter/deinterlace/merge_x86.asm
@@ -0,0 +1,67 @@
+;****************************************************************************
+;* merge_x86.asm : Merge (line blending) routines for the VLC deinterlacer
+;*****************************************************************************
+;* Copyright (C) 2011 VLC authors and VideoLAN
+;* $Id: 94cdd775ac70a548f23b3efbca65d6259d77ca39 $
+;*
+;* Author: Sigmund Augdal Helberg <sigmunau at videolan.org> (MMXEXT, 3DNow, SSE2)
+;*         Janne Grunau <janne-vlc at jannau.net> (AVX2 and x86inc.asm adaptation)
+;*
+;* This program is free software; you can redistribute it and/or modify it
+;* under the terms of the GNU Lesser General Public License as published by
+;* the Free Software Foundation; either version 2.1 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public License
+;* along with this program; if not, write to the Free Software Foundation,
+;* Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+;*****************************************************************************/
+
+%include "x86util.asm"
+
+SECTION .text
+
+%macro MERGE 2 ; %1 = sample width in bits (symbol name only), %2 = averaging instruction
+cglobal merge_%1bit, 4,4,2, 0, dest, src1, src2, bytes
+%if mmsize > 16 ; wide-register build only: peel one 16-byte chunk off the end first
+        test            bytesq, 16
+        je              .loop           ; bit 4 of bytes clear: nothing to peel
+        movu            xm0, [src1q + bytesq - 16] ; last 16 bytes of each source line
+        movu            xm1, [src2q + bytesq - 16]
+        %2              xm0, xm1
+        movu            [destq + bytesq - 16], xm0
+        sub             bytesq, 16
+        je              .end            ; that chunk was the whole line
+%endif
+.loop: ; walks the line from its end towards its start, mmsize bytes per pass
+        sub             bytesq, mmsize  ; the flags set here drive the jg below
+        movu            m0, [src1q + bytesq] ; NOTE(review): no scalar tail; if bytes is 0
+        movu            m1, [src2q + bytesq] ; or not a multiple of mmsize (of 16 in the
+        %2              m0, m1               ; wide build), bytesq is negative on the last
+        movu            [destq + bytesq], m0 ; pass - confirm callers guarantee the size
+        jg              .loop           ; moves/averages above leave EFLAGS untouched
+.end:
+        REP_RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx2
+MERGE  8, pavgb
+MERGE 16, pavgw
+
+INIT_MMX 3dnow
+MERGE  8, pavgusb
+%endif
+
+INIT_XMM sse2
+MERGE  8, pavgb
+MERGE 16, pavgw
+
+INIT_YMM avx2
+MERGE  8, pavgb
+MERGE 16, pavgw
-- 
2.11.0.rc2