[vlc-devel] [PATCH 15/25] deinterlace: remove AltiVec, 3dNow! and MMX

Tue Apr 14 12:40:26 CEST 2020

Note that I remove the EndMerge routine even though it is used for SSE2:
using "emms" after using SSE2 instructions is absolutely useless.
The MMX and x87 register files alias one another, which is why the x87
state has to be restored (with "emms") after using MMX instructions.
XMM registers do not suffer from that issue.
---
 modules/video_filter/deinterlace/algo_basic.c |   3 -
 .../video_filter/deinterlace/deinterlace.c    |  29 ----
 modules/video_filter/deinterlace/helpers.c    |   1 -
 modules/video_filter/deinterlace/merge.c      | 146 ------------------
 modules/video_filter/deinterlace/merge.h      |  83 +---------
 5 files changed, 3 insertions(+), 259 deletions(-)

diff --git a/modules/video_filter/deinterlace/algo_basic.c b/modules/video_filter/deinterlace/algo_basic.c
index 8d675e01c6..1d652d634b 100644
--- a/modules/video_filter/deinterlace/algo_basic.c
+++ b/modules/video_filter/deinterlace/algo_basic.c
@@ -180,7 +180,6 @@ int RenderLinear( filter_t *p_filter,
             memcpy( p_out, p_in, p_pic->p[i_plane].i_pitch );
         }
     }
-    EndMerge();
     return VLC_SUCCESS;
 }
 
@@ -215,7 +214,6 @@ int RenderMean( filter_t *p_filter, picture_t *p_outpic, picture_t *p_pic )
             p_in += 2 * p_pic->p[i_plane].i_pitch;
         }
     }
-    EndMerge();
     return VLC_SUCCESS;
 }
 
@@ -254,6 +252,5 @@ int RenderBlend( filter_t *p_filter, picture_t *p_outpic, picture_t *p_pic )
             p_in  += p_pic->p[i_plane].i_pitch;
         }
     }
-    EndMerge();
     return VLC_SUCCESS;
 }
diff --git a/modules/video_filter/deinterlace/deinterlace.c b/modules/video_filter/deinterlace/deinterlace.c
index 01d7ad5e61..9d33d434a9 100644
--- a/modules/video_filter/deinterlace/deinterlace.c
+++ b/modules/video_filter/deinterlace/deinterlace.c
@@ -537,33 +537,9 @@ notsupp:
 
     IVTCClearState( p_filter );
 
-#if defined(CAN_COMPILE_C_ALTIVEC)
-    if( pixel_size == 1 && vlc_CPU_ALTIVEC() )
-        p_sys->pf_merge = MergeAltivec;
-    else
-#endif
 #if defined(CAN_COMPILE_SSE2)
     if( vlc_CPU_SSE2() )
-    {
         p_sys->pf_merge = pixel_size == 1 ? Merge8BitSSE2 : Merge16BitSSE2;
-        p_sys->pf_end_merge = EndMMX;
-    }
-    else
-#endif
-#if defined(CAN_COMPILE_MMXEXT)
-    if( pixel_size == 1 && vlc_CPU_MMXEXT() )
-    {
-        p_sys->pf_merge = MergeMMXEXT;
-        p_sys->pf_end_merge = EndMMX;
-    }
-    else
-#endif
-#if defined(CAN_COMPILE_3DNOW)
-    if( pixel_size == 1 && vlc_CPU_3dNOW() )
-    {
-        p_sys->pf_merge = Merge3DNow;
-        p_sys->pf_end_merge = End3DNow;
-    }
     else
 #endif
 #if defined(CAN_COMPILE_ARM)
@@ -584,12 +560,7 @@ notsupp:
         p_sys->pf_merge = pixel_size == 1 ? merge8_arm64_neon : merge16_arm64_neon;
     else
 #endif
-    {
         p_sys->pf_merge = pixel_size == 1 ? Merge8BitGeneric : Merge16BitGeneric;
-#if defined(__i386__) || defined(__x86_64__)
-        p_sys->pf_end_merge = NULL;
-#endif
-    }
 
     /* */
     video_format_t fmt;
diff --git a/modules/video_filter/deinterlace/helpers.c b/modules/video_filter/deinterlace/helpers.c
index a882a408c1..a259b992d6 100644
--- a/modules/video_filter/deinterlace/helpers.c
+++ b/modules/video_filter/deinterlace/helpers.c
@@ -374,7 +374,6 @@ void ComposeFrame( filter_t *p_filter,
                     p_in_top    += p_inpic_top->p[i_plane].i_pitch;
                     p_in_bottom += p_inpic_bottom->p[i_plane].i_pitch;
                 }
-                EndMerge();
             }
         }
     }
diff --git a/modules/video_filter/deinterlace/merge.c b/modules/video_filter/deinterlace/merge.c
index 8d36479ca1..40d758a377 100644
--- a/modules/video_filter/deinterlace/merge.c
+++ b/modules/video_filter/deinterlace/merge.c
@@ -33,14 +33,6 @@
 #include <vlc_cpu.h>
 #include "merge.h"
 
-#ifdef CAN_COMPILE_MMXEXT
-#   include "mmx.h"
-#endif
-
-#ifdef HAVE_ALTIVEC_H
-#   include <altivec.h>
-#endif
-
 /*****************************************************************************
  * Merge (line blending) routines
  *****************************************************************************/
@@ -67,58 +59,6 @@ void Merge16BitGeneric( void *_p_dest, const void *_p_s1,
         *p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1;
 }
 
-#if defined(CAN_COMPILE_MMXEXT)
-VLC_MMX
-void MergeMMXEXT( void *_p_dest, const void *_p_s1, const void *_p_s2,
-                  size_t i_bytes )
-{
-    uint8_t *p_dest = _p_dest;
-    const uint8_t *p_s1 = _p_s1;
-    const uint8_t *p_s2 = _p_s2;
-
-    for( ; i_bytes >= 8; i_bytes -= 8 )
-    {
-        __asm__  __volatile__( "movq %2,%%mm1;"
-                               "pavgb %1, %%mm1;"
-                               "movq %%mm1, %0" :"=m" (*p_dest):
-                                                 "m" (*p_s1),
-                                                 "m" (*p_s2) : "mm1" );
-        p_dest += 8;
-        p_s1 += 8;
-        p_s2 += 8;
-    }
-
-    for( ; i_bytes > 0; i_bytes-- )
-        *p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1;
-}
-#endif
-
-#if defined(CAN_COMPILE_3DNOW)
-VLC_MMX
-void Merge3DNow( void *_p_dest, const void *_p_s1, const void *_p_s2,
-                 size_t i_bytes )
-{
-    uint8_t *p_dest = _p_dest;
-    const uint8_t *p_s1 = _p_s1;
-    const uint8_t *p_s2 = _p_s2;
-
-    for( ; i_bytes >= 8; i_bytes -= 8 )
-    {
-        __asm__  __volatile__( "movq %2,%%mm1;"
-                               "pavgusb %1, %%mm1;"
-                               "movq %%mm1, %0" :"=m" (*p_dest):
-                                                 "m" (*p_s1),
-                                                 "m" (*p_s2) : "mm1" );
-        p_dest += 8;
-        p_s1 += 8;
-        p_s2 += 8;
-    }
-
-    for( ; i_bytes > 0; i_bytes-- )
-        *p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1;
-}
-#endif
-
 #if defined(CAN_COMPILE_SSE)
 VLC_SSE
 void Merge8BitSSE2( void *_p_dest, const void *_p_s1, const void *_p_s2,
@@ -176,89 +116,3 @@ void Merge16BitSSE2( void *_p_dest, const void *_p_s1, const void *_p_s2,
 }
 
 #endif
-
-#ifdef CAN_COMPILE_C_ALTIVEC
-void MergeAltivec( void *_p_dest, const void *_p_s1,
-                   const void *_p_s2, size_t i_bytes )
-{
-    uint8_t *p_dest = _p_dest;
-    const uint8_t *p_s1 = _p_s1;
-    const uint8_t *p_s2 = _p_s2;
-    uint8_t *p_end  = p_dest + i_bytes - 15;
-
-    /* Use C until the first 16-bytes aligned destination pixel */
-    while( (uintptr_t)p_dest & 0xF )
-    {
-        *p_dest++ = ( (uint16_t)(*p_s1++) + (uint16_t)(*p_s2++) ) >> 1;
-    }
-
-    if( ( (int)p_s1 & 0xF ) | ( (int)p_s2 & 0xF ) )
-    {
-        /* Unaligned source */
-        vector unsigned char s1v, s2v, destv;
-        vector unsigned char s1oldv, s2oldv, s1newv, s2newv;
-        vector unsigned char perm1v, perm2v;
-
-        perm1v = vec_lvsl( 0, p_s1 );
-        perm2v = vec_lvsl( 0, p_s2 );
-        s1oldv = vec_ld( 0, p_s1 );
-        s2oldv = vec_ld( 0, p_s2 );
-
-        while( p_dest < p_end )
-        {
-            s1newv = vec_ld( 16, p_s1 );
-            s2newv = vec_ld( 16, p_s2 );
-            s1v    = vec_perm( s1oldv, s1newv, perm1v );
-            s2v    = vec_perm( s2oldv, s2newv, perm2v );
-            s1oldv = s1newv;
-            s2oldv = s2newv;
-            destv  = vec_avg( s1v, s2v );
-            vec_st( destv, 0, p_dest );
-
-            p_s1   += 16;
-            p_s2   += 16;
-            p_dest += 16;
-        }
-    }
-    else
-    {
-        /* Aligned source */
-        vector unsigned char s1v, s2v, destv;
-
-        while( p_dest < p_end )
-        {
-            s1v   = vec_ld( 0, p_s1 );
-            s2v   = vec_ld( 0, p_s2 );
-            destv = vec_avg( s1v, s2v );
-            vec_st( destv, 0, p_dest );
-
-            p_s1   += 16;
-            p_s2   += 16;
-            p_dest += 16;
-        }
-    }
-
-    p_end += 15;
-
-    while( p_dest < p_end )
-        *p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1;
-}
-#endif
-
-/*****************************************************************************
- * EndMerge routines
- *****************************************************************************/
-
-#if defined(CAN_COMPILE_MMXEXT) || defined(CAN_COMPILE_SSE)
-void EndMMX( void )
-{
-    __asm__ __volatile__( "emms" :: );
-}
-#endif
-
-#if defined(CAN_COMPILE_3DNOW)
-void End3DNow( void )
-{
-    __asm__ __volatile__( "femms" :: );
-}
-#endif
diff --git a/modules/video_filter/deinterlace/merge.h b/modules/video_filter/deinterlace/merge.h
index 26a03fee1f..eb62ffe16c 100644
--- a/modules/video_filter/deinterlace/merge.h
+++ b/modules/video_filter/deinterlace/merge.h
@@ -35,11 +35,11 @@
  * Macros
  *****************************************************************************/
 
-/* Convenient Merge() and EndMerge() macros to pick the most appropriate
-   merge implementation automatically.
+/* Convenient Merge() macro to pick the most appropriate merge implementation
+ * automatically.
 
    Note that you'll need to include vlc_filter.h and deinterlace.h
-   to use these.
+   to use it.
 
  * Note that the Open() call of the deinterlace filter automatically selects
  * the most appropriate merge routine based on the CPU capabilities.
@@ -59,17 +59,6 @@
  */
 #define Merge p_sys->pf_merge
 
-/*
- * EndMerge() macro, which must be called after the merge is
- * finished, if the Merge() macro was used to perform the merge.
- */
-#if defined(__i386__) || defined(__x86_64__)
-# define EndMerge() \
-    if(p_sys->pf_end_merge) (p_sys->pf_end_merge)()
-#else
-# define EndMerge() (void)0
-#endif
-
 /*****************************************************************************
  * Merge routines
  *****************************************************************************/
@@ -100,42 +89,6 @@ void Merge8BitGeneric( void *_p_dest, const void *_p_s1, const void *_p_s2,
 void Merge16BitGeneric( void *_p_dest, const void *_p_s1, const void *_p_s2,
                         size_t i_bytes );
 
-#if defined(CAN_COMPILE_C_ALTIVEC)
-/**
- * Altivec routine to blend pixels from two picture lines.
- *
- * @param _p_dest Target
- * @param _p_s1 Source line A
- * @param _p_s2 Source line B
- * @param i_bytes Number of bytes to merge
- */
-void MergeAltivec ( void *, const void *, const void *, size_t );
-#endif
-
-#if defined(CAN_COMPILE_MMXEXT)
-/**
- * MMXEXT routine to blend pixels from two picture lines.
- *
- * @param _p_dest Target
- * @param _p_s1 Source line A
- * @param _p_s2 Source line B
- * @param i_bytes Number of bytes to merge
- */
-void MergeMMXEXT  ( void *, const void *, const void *, size_t );
-#endif
-
-#if defined(CAN_COMPILE_3DNOW)
-/**
- * 3DNow routine to blend pixels from two picture lines.
- *
- * @param _p_dest Target
- * @param _p_s1 Source line A
- * @param _p_s2 Source line B
- * @param i_bytes Number of bytes to merge
- */
-void Merge3DNow   ( void *, const void *, const void *, size_t );
-#endif
-
 #if defined(CAN_COMPILE_SSE)
 /**
  * SSE2 routine to blend pixels from two picture lines.
@@ -183,34 +136,4 @@ void merge16_arm64_neon (void *, const void *, const void *, size_t);
 void merge8_arm_sve(void *, const void *, const void *, size_t);
 void merge16_arm_sve(void *, const void *, const void *, size_t);
 
-/*****************************************************************************
- * EndMerge routines
- *****************************************************************************/
-
-#if defined(CAN_COMPILE_MMXEXT) || defined(CAN_COMPILE_SSE)
-/**
- * MMX merge finalization routine.
- *
- * Must be called after an MMX merge is finished.
- * This exits MMX mode (by executing the "emms" instruction).
- *
- * The EndMerge() macro detects whether this is needed, and calls if it is,
- * so just use that.
- */
-void EndMMX       ( void );
-#endif
-
-#if defined(CAN_COMPILE_3DNOW)
-/**
- * 3DNow merge finalization routine.
- *
- * Must be called after a 3DNow merge is finished.
- * This exits 3DNow mode (by executing the "femms" instruction).
- *
- * The EndMerge() macro detects whether this is needed, and calls if it is,
- * so just use that.
- */
-void End3DNow     ( void );
-#endif
-
 #endif
-- 
2.24.1