[vlc-devel] [PATCH 1/1] deinterlace: x86: convert merge inline asm to x86inc/yasm
Janne Grunau
janne-vlc at jannau.net
Mon Nov 28 10:28:26 CET 2016
Adds an AVX2 merge implementation. The AVX2 version softly depends on
the 32-byte alignment patch, since it is unfortunately slower than the
SSE2 version if the buffers have only 16-byte alignment.
On a full HD 4:2:0 video I get the following cycle counts on Haswell:
Merge8BitGeneric: >4000
vlcpriv_merge_8bit_sse2: 425 (16 and 32-byte alignment)
vlcpriv_merge_8bit_avx2: 297 (32-byte alignment)
vlcpriv_merge_8bit_avx2: 554 (16-byte alignment)
---
modules/video_filter/Makefile.am | 1 +
modules/video_filter/deinterlace/common.h | 1 +
modules/video_filter/deinterlace/deinterlace.c | 43 ++++---
.../video_filter/deinterlace/deinterlace_x86.asm | 4 +
modules/video_filter/deinterlace/merge.c | 132 ---------------------
modules/video_filter/deinterlace/merge.h | 75 ++++++------
modules/video_filter/deinterlace/merge_x86.asm | 67 +++++++++++
7 files changed, 134 insertions(+), 189 deletions(-)
create mode 100644 modules/video_filter/deinterlace/merge_x86.asm
diff --git a/modules/video_filter/Makefile.am b/modules/video_filter/Makefile.am
index 56b141f93b..35f908a37e 100644
--- a/modules/video_filter/Makefile.am
+++ b/modules/video_filter/Makefile.am
@@ -123,6 +123,7 @@ libdeinterlace_plugin_la_CFLAGS = $(AM_CFLAGS) -O2
if HAVE_YASM
libdeinterlace_plugin_la_CFLAGS += -DHAVE_YASM
libdeinterlace_plugin_la_SOURCES += video_filter/deinterlace/deinterlace_x86.asm
+libdeinterlace_plugin_la_SOURCES += video_filter/deinterlace/merge_x86.asm
libdeinterlace_plugin_la_SOURCES += video_filter/deinterlace/yadif_x86.asm
endif
if HAVE_NEON
diff --git a/modules/video_filter/deinterlace/common.h b/modules/video_filter/deinterlace/common.h
index 4f6c5462b2..cc980714ea 100644
--- a/modules/video_filter/deinterlace/common.h
+++ b/modules/video_filter/deinterlace/common.h
@@ -37,6 +37,7 @@
#ifdef HAVE_YASM
void vlcpriv_emms_ext_asm(void);
+void vlcpriv_femms_ext_asm(void);
#endif
#endif
diff --git a/modules/video_filter/deinterlace/deinterlace.c b/modules/video_filter/deinterlace/deinterlace.c
index 9123addab7..1cf709daa8 100644
--- a/modules/video_filter/deinterlace/deinterlace.c
+++ b/modules/video_filter/deinterlace/deinterlace.c
@@ -43,6 +43,7 @@
#include <vlc_cpu.h>
#include <vlc_mouse.h>
+#include "common.h"
#include "deinterlace.h"
#include "helpers.h"
#include "merge.h"
@@ -669,28 +670,36 @@ notsupp:
p_sys->pf_merge = MergeAltivec;
else
#endif
-#if defined(CAN_COMPILE_SSE2)
- if( vlc_CPU_SSE2() )
+#if defined(HAVE_YASM)
+ if( vlc_CPU_AVX2() )
{
- p_sys->pf_merge = pixel_size == 1 ? Merge8BitSSE2 : Merge16BitSSE2;
- p_sys->pf_end_merge = EndMMX;
+ if (pixel_size == 1)
+ p_sys->pf_merge = vlcpriv_merge_8bit_avx2;
+ else
+ p_sys->pf_merge = vlcpriv_merge_16bit_avx2;
}
- else
-#endif
-#if defined(CAN_COMPILE_MMXEXT)
- if( pixel_size == 1 && vlc_CPU_MMXEXT() )
+ else if( vlc_CPU_SSE2() )
{
- p_sys->pf_merge = MergeMMXEXT;
- p_sys->pf_end_merge = EndMMX;
+ if (pixel_size == 1)
+ p_sys->pf_merge = vlcpriv_merge_8bit_sse2;
+ else
+ p_sys->pf_merge = vlcpriv_merge_16bit_sse2;
}
- else
-#endif
-#if defined(CAN_COMPILE_3DNOW)
- if( pixel_size == 1 && vlc_CPU_3dNOW() )
+#if defined(__i386__)
+ else if( vlc_CPU_MMXEXT() ) /* no pixel_size test here: the body picks the 8-bit or 16-bit mmx2 routine */
{
- p_sys->pf_merge = Merge3DNow;
- p_sys->pf_end_merge = End3DNow;
+ if (pixel_size == 1)
+ p_sys->pf_merge = vlcpriv_merge_8bit_mmx2;
+ else
+ p_sys->pf_merge = vlcpriv_merge_16bit_mmx2;
+ p_sys->pf_end_merge = vlcpriv_emms_ext_asm;
}
+ else if( pixel_size == 1 && vlc_CPU_3dNOW() )
+ {
+ p_sys->pf_merge = vlcpriv_merge_8bit_3dnow;
+ p_sys->pf_end_merge = vlcpriv_femms_ext_asm;
+ }
+#endif
else
#endif
#if defined(CAN_COMPILE_ARM)
@@ -708,7 +717,7 @@ notsupp:
#endif
{
p_sys->pf_merge = pixel_size == 1 ? Merge8BitGeneric : Merge16BitGeneric;
-#if defined(__i386__) || defined(__x86_64__)
+#if defined(__i386__)
p_sys->pf_end_merge = NULL;
#endif
}
diff --git a/modules/video_filter/deinterlace/deinterlace_x86.asm b/modules/video_filter/deinterlace/deinterlace_x86.asm
index 60ce829b0d..322c268138 100644
--- a/modules/video_filter/deinterlace/deinterlace_x86.asm
+++ b/modules/video_filter/deinterlace/deinterlace_x86.asm
@@ -32,3 +32,7 @@ SECTION .text
cglobal emms_ext_asm, 0,0,0
emms
RET
+
+cglobal femms_ext_asm, 0,0,0 ; C-callable helper (declared as vlcpriv_femms_ext_asm in common.h); requests no args or registers
+ femms ; leave 3DNow!/MMX state after a 3DNow merge
+ RET
diff --git a/modules/video_filter/deinterlace/merge.c b/modules/video_filter/deinterlace/merge.c
index 94cdd775ac..ec34f47bc6 100644
--- a/modules/video_filter/deinterlace/merge.c
+++ b/modules/video_filter/deinterlace/merge.c
@@ -34,10 +34,6 @@
#include <vlc_cpu.h>
#include "merge.h"
-#ifdef CAN_COMPILE_MMXEXT
-# include "mmx.h"
-#endif
-
#ifdef HAVE_ALTIVEC_H
# include <altivec.h>
#endif
@@ -68,116 +64,6 @@ void Merge16BitGeneric( void *_p_dest, const void *_p_s1,
*p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1;
}
-#if defined(CAN_COMPILE_MMXEXT)
-VLC_MMX
-void MergeMMXEXT( void *_p_dest, const void *_p_s1, const void *_p_s2,
- size_t i_bytes )
-{
- uint8_t *p_dest = _p_dest;
- const uint8_t *p_s1 = _p_s1;
- const uint8_t *p_s2 = _p_s2;
-
- for( ; i_bytes >= 8; i_bytes -= 8 )
- {
- __asm__ __volatile__( "movq %2,%%mm1;"
- "pavgb %1, %%mm1;"
- "movq %%mm1, %0" :"=m" (*p_dest):
- "m" (*p_s1),
- "m" (*p_s2) : "mm1" );
- p_dest += 8;
- p_s1 += 8;
- p_s2 += 8;
- }
-
- for( ; i_bytes > 0; i_bytes-- )
- *p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1;
-}
-#endif
-
-#if defined(CAN_COMPILE_3DNOW)
-VLC_MMX
-void Merge3DNow( void *_p_dest, const void *_p_s1, const void *_p_s2,
- size_t i_bytes )
-{
- uint8_t *p_dest = _p_dest;
- const uint8_t *p_s1 = _p_s1;
- const uint8_t *p_s2 = _p_s2;
-
- for( ; i_bytes >= 8; i_bytes -= 8 )
- {
- __asm__ __volatile__( "movq %2,%%mm1;"
- "pavgusb %1, %%mm1;"
- "movq %%mm1, %0" :"=m" (*p_dest):
- "m" (*p_s1),
- "m" (*p_s2) : "mm1" );
- p_dest += 8;
- p_s1 += 8;
- p_s2 += 8;
- }
-
- for( ; i_bytes > 0; i_bytes-- )
- *p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1;
-}
-#endif
-
-#if defined(CAN_COMPILE_SSE)
-VLC_SSE
-void Merge8BitSSE2( void *_p_dest, const void *_p_s1, const void *_p_s2,
- size_t i_bytes )
-{
- uint8_t *p_dest = _p_dest;
- const uint8_t *p_s1 = _p_s1;
- const uint8_t *p_s2 = _p_s2;
-
- for( ; i_bytes > 0 && ((uintptr_t)p_s1 & 15); i_bytes-- )
- *p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1;
-
- for( ; i_bytes >= 16; i_bytes -= 16 )
- {
- __asm__ __volatile__( "movdqu %2,%%xmm1;"
- "pavgb %1, %%xmm1;"
- "movdqu %%xmm1, %0" :"=m" (*p_dest):
- "m" (*p_s1),
- "m" (*p_s2) : "xmm1" );
- p_dest += 16;
- p_s1 += 16;
- p_s2 += 16;
- }
-
- for( ; i_bytes > 0; i_bytes-- )
- *p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1;
-}
-
-VLC_SSE
-void Merge16BitSSE2( void *_p_dest, const void *_p_s1, const void *_p_s2,
- size_t i_bytes )
-{
- uint16_t *p_dest = _p_dest;
- const uint16_t *p_s1 = _p_s1;
- const uint16_t *p_s2 = _p_s2;
-
- size_t i_words = i_bytes / 2;
- for( ; i_words > 0 && ((uintptr_t)p_s1 & 15); i_words-- )
- *p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1;
-
- for( ; i_words >= 8; i_words -= 8 )
- {
- __asm__ __volatile__( "movdqu %2,%%xmm1;"
- "pavgw %1, %%xmm1;"
- "movdqu %%xmm1, %0" :"=m" (*p_dest):
- "m" (*p_s1),
- "m" (*p_s2) : "xmm1" );
- p_dest += 8;
- p_s1 += 8;
- p_s2 += 8;
- }
-
- for( ; i_words > 0; i_words-- )
- *p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1;
-}
-
-#endif
-
#ifdef CAN_COMPILE_C_ALTIVEC
void MergeAltivec( void *_p_dest, const void *_p_s1,
const void *_p_s2, size_t i_bytes )
@@ -245,21 +131,3 @@ void MergeAltivec( void *_p_dest, const void *_p_s1,
*p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1;
}
#endif
-
-/*****************************************************************************
- * EndMerge routines
- *****************************************************************************/
-
-#if defined(CAN_COMPILE_MMXEXT) || defined(CAN_COMPILE_SSE)
-void EndMMX( void )
-{
- __asm__ __volatile__( "emms" :: );
-}
-#endif
-
-#if defined(CAN_COMPILE_3DNOW)
-void End3DNow( void )
-{
- __asm__ __volatile__( "femms" :: );
-}
-#endif
diff --git a/modules/video_filter/deinterlace/merge.h b/modules/video_filter/deinterlace/merge.h
index 74b5ab57ff..2f79d053e3 100644
--- a/modules/video_filter/deinterlace/merge.h
+++ b/modules/video_filter/deinterlace/merge.h
@@ -64,7 +64,7 @@
* EndMerge() macro, which must be called after the merge is
* finished, if the Merge() macro was used to perform the merge.
*/
-#if defined(__i386__) || defined(__x86_64__)
+#if defined(__i386__)
# define EndMerge() \
if(p_filter->p_sys->pf_end_merge) (p_filter->p_sys->pf_end_merge)()
#else
@@ -113,7 +113,7 @@ void Merge16BitGeneric( void *_p_dest, const void *_p_s1, const void *_p_s2,
void MergeAltivec ( void *, const void *, const void *, size_t );
#endif
-#if defined(CAN_COMPILE_MMXEXT)
+#if defined(HAVE_YASM)
/**
* MMXEXT routine to blend pixels from two picture lines.
*
@@ -122,10 +122,18 @@ void MergeAltivec ( void *, const void *, const void *, size_t );
* @param _p_s2 Source line B
* @param i_bytes Number of bytes to merge
*/
-void MergeMMXEXT ( void *, const void *, const void *, size_t );
-#endif
+void vlcpriv_merge_8bit_mmx2( void *, const void *, const void *, size_t );
+/**
+ * MMXEXT routine to blend 16-bit pixels from two picture lines.
+ *
+ * @param _p_dest Target
+ * @param _p_s1 Source line A
+ * @param _p_s2 Source line B
+ * @param i_bytes Number of bytes to merge
+ */
+void vlcpriv_merge_16bit_mmx2( void *, const void *, const void *, size_t );
+
-#if defined(CAN_COMPILE_3DNOW)
/**
* 3DNow routine to blend pixels from two picture lines.
*
@@ -134,10 +142,8 @@ void MergeMMXEXT ( void *, const void *, const void *, size_t );
* @param _p_s2 Source line B
* @param i_bytes Number of bytes to merge
*/
-void Merge3DNow ( void *, const void *, const void *, size_t );
-#endif
+void vlcpriv_merge_8bit_3dnow( void *, const void *, const void *, size_t );
-#if defined(CAN_COMPILE_SSE)
/**
* SSE2 routine to blend pixels from two picture lines.
*
@@ -146,7 +152,7 @@ void Merge3DNow ( void *, const void *, const void *, size_t );
* @param _p_s2 Source line B
* @param i_bytes Number of bytes to merge
*/
-void Merge8BitSSE2( void *, const void *, const void *, size_t );
+void vlcpriv_merge_8bit_sse2( void *, const void *, const void *, size_t );
/**
* SSE2 routine to blend pixels from two picture lines.
*
@@ -155,7 +161,26 @@ void Merge8BitSSE2( void *, const void *, const void *, size_t );
* @param _p_s2 Source line B
* @param i_bytes Number of bytes to merge
*/
-void Merge16BitSSE2( void *, const void *, const void *, size_t );
+void vlcpriv_merge_16bit_sse2( void *, const void *, const void *, size_t );
+
+/**
+ * AVX2 routine to blend 8-bit pixels from two picture lines.
+ *
+ * @param _p_dest Target
+ * @param _p_s1 Source line A
+ * @param _p_s2 Source line B
+ * @param i_bytes Number of bytes to merge
+ */
+void vlcpriv_merge_8bit_avx2( void *, const void *, const void *, size_t );
+/**
+ * AVX2 routine to blend 16-bit pixels from two picture lines.
+ *
+ * @param _p_dest Target
+ * @param _p_s1 Source line A
+ * @param _p_s2 Source line B
+ * @param i_bytes Number of bytes to merge
+ */
+void vlcpriv_merge_16bit_avx2( void *, const void *, const void *, size_t );
#endif
#if defined(CAN_COMPILE_ARM)
@@ -181,34 +206,4 @@ void merge16_arm64_neon (void *, const void *, const void *, size_t);
#endif
-/*****************************************************************************
- * EndMerge routines
- *****************************************************************************/
-
-#if defined(CAN_COMPILE_MMXEXT) || defined(CAN_COMPILE_SSE)
-/**
- * MMX merge finalization routine.
- *
- * Must be called after an MMX merge is finished.
- * This exits MMX mode (by executing the "emms" instruction).
- *
- * The EndMerge() macro detects whether this is needed, and calls if it is,
- * so just use that.
- */
-void EndMMX ( void );
-#endif
-
-#if defined(CAN_COMPILE_3DNOW)
-/**
- * 3DNow merge finalization routine.
- *
- * Must be called after a 3DNow merge is finished.
- * This exits 3DNow mode (by executing the "femms" instruction).
- *
- * The EndMerge() macro detects whether this is needed, and calls if it is,
- * so just use that.
- */
-void End3DNow ( void );
-#endif
-
#endif
diff --git a/modules/video_filter/deinterlace/merge_x86.asm b/modules/video_filter/deinterlace/merge_x86.asm
new file mode 100644
index 0000000000..630f9bda39
--- /dev/null
+++ b/modules/video_filter/deinterlace/merge_x86.asm
@@ -0,0 +1,67 @@
+;****************************************************************************
+;* merge_x86.asm : Merge (line blending) routines for the VLC deinterlacer
+;*****************************************************************************
+;* Copyright (C) 2011 VLC authors and VideoLAN
+;* $Id: 94cdd775ac70a548f23b3efbca65d6259d77ca39 $
+;*
+;* Author: Sigmund Augdal Helberg <sigmunau at videolan.org> (MMXEXT, 3DNow, SSE2)
+;* Janne Grunau <janne-vlc at jannau.net> (AVX2 and x86inc.asm adaptation)
+;*
+;* This program is free software; you can redistribute it and/or modify it
+;* under the terms of the GNU Lesser General Public License as published by
+;* the Free Software Foundation; either version 2.1 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public License
+;* along with this program; if not, write to the Free Software Foundation,
+;* Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+;*****************************************************************************/
+
+%include "x86util.asm"
+
+SECTION .text
+
+%macro MERGE 2 ; MERGE bits, avg_insn: emits merge_<bits>bit, which stores avg_insn(src1, src2) into dest
+cglobal merge_%1bit, 4,4,2, 0, dest, src1, src2, bytes ; named args: dest, src1, src2, bytes; presumably 4 args / 4 GPRs / 2 vector regs / no stack per x86inc's cglobal - confirm
+%if mmsize > 16 ; only for vectors wider than 16 bytes: peel one 16-byte chunk off the end first
+ test bytesq, 16 ; is bit 4 of the byte count set?
+ je .loop ; no: go straight to the full-width loop
+ movu xm0, [src1q + bytesq - 16] ; last 16 bytes of src1
+ movu xm1, [src2q + bytesq - 16] ; last 16 bytes of src2
+ %2 xm0, xm1 ; xm0 = avg_insn(xm0, xm1)
+ movu [destq + bytesq - 16], xm0 ; store into the last 16 bytes of dest
+ sub bytesq, 16 ; those 16 bytes are done
+ je .end ; nothing left to merge
+%endif
+.loop: ; walks from the end of the buffers toward the start, mmsize bytes per iteration
+ sub bytesq, mmsize ; step back one vector; NOTE(review): no scalar tail - bytes == 0 or not a multiple of the chunk size makes this offset negative, confirm callers
+ movu m0, [src1q + bytesq] ; unaligned load from src1
+ movu m1, [src2q + bytesq] ; unaligned load from src2
+ %2 m0, m1 ; m0 = avg_insn(m0, m1)
+ movu [destq + bytesq], m0 ; unaligned store to dest
+ jg .loop ; loop while the offset is > 0; NOTE(review): relies on the flags from the sub above surviving the movu/avg instructions
+.end:
+ REP_RET ; return to the caller
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx2
+MERGE 8, pavgb
+MERGE 16, pavgw
+
+INIT_MMX 3dnow
+MERGE 8, pavgusb
+%endif
+
+INIT_XMM sse2
+MERGE 8, pavgb
+MERGE 16, pavgw
+
+INIT_YMM avx2
+MERGE 8, pavgb
+MERGE 16, pavgw
--
2.11.0.rc2
More information about the vlc-devel
mailing list