[vlc-commits] [Git][videolan/vlc][master] 2 commits: deinterlace: purge MMX/MMXEXT

François Cartegnie (@fcartegnie) gitlab at videolan.org
Tue Jul 20 15:13:20 UTC 2021



François Cartegnie pushed to branch master at VideoLAN / VLC


Commits:
831fa3cd by Lyndon Brown at 2021-07-20T15:02:06+00:00
deinterlace: purge MMX/MMXEXT

notes:
 - this removes all SIMD acceleration for x86/x86_64. this work originally
   started by converting the MMX code to SSE2 and then purging remaining
   artifacts, but a build error on Android has blocked that work from
   being merged for now. this commit thus takes a different approach of
   simply purging the old MMX/MMXEXT code first, with getting the SSE2
   implementation working to be done as a follow-up.
 - the `EndMMX()` function is retained (renamed to `EndSSE()`) because it
   is still used by the merge code. the `emms` instruction will be
   replaced with an `sfence` instruction separately, as it is more appropriate.

- - - - -
dd38fdc4 by Lyndon Brown at 2021-07-20T15:02:06+00:00
deinterlace: use sfence instead of emms for SSE2

we're purging all MMX/MMXEXT code; `sfence` is more appropriate.

- - - - -


14 changed files:

- modules/video_filter/Makefile.am
- modules/video_filter/deinterlace/algo_ivtc.c
- modules/video_filter/deinterlace/algo_phosphor.c
- modules/video_filter/deinterlace/algo_x.c
- modules/video_filter/deinterlace/algo_x.h
- modules/video_filter/deinterlace/algo_yadif.c
- modules/video_filter/deinterlace/deinterlace.c
- modules/video_filter/deinterlace/deinterlace.h
- modules/video_filter/deinterlace/helpers.c
- modules/video_filter/deinterlace/merge.c
- modules/video_filter/deinterlace/merge.h
- − modules/video_filter/deinterlace/mmx.h
- modules/video_filter/deinterlace/yadif.h
- modules/video_filter/deinterlace/yadif_x86.asm


Changes:

=====================================
modules/video_filter/Makefile.am
=====================================
@@ -159,7 +159,6 @@ noinst_LTLIBRARIES += libdeinterlace_common.la
 
 libdeinterlace_plugin_la_SOURCES = \
 	video_filter/deinterlace/deinterlace.c video_filter/deinterlace/deinterlace.h \
-        video_filter/deinterlace/mmx.h \
 	video_filter/deinterlace/merge.c video_filter/deinterlace/merge.h \
 	video_filter/deinterlace/helpers.c video_filter/deinterlace/helpers.h \
 	video_filter/deinterlace/algo_basic.c video_filter/deinterlace/algo_basic.h \


=====================================
modules/video_filter/deinterlace/algo_ivtc.c
=====================================
@@ -24,10 +24,6 @@
 #   include "config.h"
 #endif
 
-#ifdef CAN_COMPILE_MMXEXT
-#   include "mmx.h"
-#endif
-
 #include <stdint.h>
 #include <assert.h>
 


=====================================
modules/video_filter/deinterlace/algo_phosphor.c
=====================================
@@ -24,11 +24,6 @@
 #   include "config.h"
 #endif
 
-#ifdef CAN_COMPILE_MMXEXT
-#   include "mmx.h"
-#   include <stdalign.h>
-#endif
-
 #include <stdint.h>
 #include <assert.h>
 
@@ -87,7 +82,7 @@ static void DarkenField( picture_t *p_dst,
        For luma, the operation is just a shift + bitwise AND, so we vectorize
        even in the C version.
 
-       There is an MMX version too, because it performs about twice faster.
+       There are SIMD versions too, which perform significantly faster.
     */
     int i_plane = Y_PLANE;
     uint8_t *p_out, *p_out_end;
@@ -120,7 +115,7 @@ static void DarkenField( picture_t *p_dst,
 
        The origin (black) is at YUV = (0, 128, 128) in the uint8 format.
        The chroma processing is a bit more complicated than luma,
-       and needs MMX for vectorization.
+       and needs SIMD for vectorization.
     */
     if( process_chroma )
     {
@@ -148,129 +143,6 @@ static void DarkenField( picture_t *p_dst,
     } /* if process_chroma */
 }
 
-#ifdef CAN_COMPILE_MMXEXT
-VLC_MMX
-static void DarkenFieldMMX( picture_t *p_dst,
-                            const int i_field, const int i_strength,
-                            bool process_chroma )
-{
-    assert( p_dst != NULL );
-    assert( i_field == 0 || i_field == 1 );
-    assert( i_strength >= 1 && i_strength <= 3 );
-
-    uint64_t i_strength_u64 = i_strength; /* needs to know number of bits */
-    const uint8_t  remove_high_u8 = 0xFF >> i_strength;
-    const uint64_t remove_high_u64 = remove_high_u8 *
-                                            INT64_C(0x0101010101010101);
-
-    int i_plane = Y_PLANE;
-    uint8_t *p_out, *p_out_end;
-    int w = p_dst->p[i_plane].i_visible_pitch;
-    p_out = p_dst->p[i_plane].p_pixels;
-    p_out_end = p_out + p_dst->p[i_plane].i_pitch
-                      * p_dst->p[i_plane].i_visible_lines;
-
-    /* skip first line for bottom field */
-    if( i_field == 1 )
-        p_out += p_dst->p[i_plane].i_pitch;
-
-    int wm8 = w % 8;   /* remainder */
-    int w8  = w - wm8; /* part of width that is divisible by 8 */
-    for( ; p_out < p_out_end ; p_out += 2*p_dst->p[i_plane].i_pitch )
-    {
-        uint64_t *po = (uint64_t *)p_out;
-        int x = 0;
-
-        movq_m2r( i_strength_u64,  mm1 );
-        movq_m2r( remove_high_u64, mm2 );
-        for( ; x < w8; x += 8 )
-        {
-            movq_m2r( (*po), mm0 );
-
-            psrlq_r2r( mm1, mm0 );
-            pand_r2r(  mm2, mm0 );
-
-            movq_r2m( mm0, (*po++) );
-        }
-
-        /* handle the width remainder */
-        uint8_t *po_temp = (uint8_t *)po;
-        for( ; x < w; ++x, ++po_temp )
-            (*po_temp) = ( ((*po_temp) >> i_strength) & remove_high_u8 );
-    }
-
-    /* Process chroma if the field chromas are independent.
-
-       The origin (black) is at YUV = (0, 128, 128) in the uint8 format.
-       The chroma processing is a bit more complicated than luma,
-       and needs MMX for vectorization.
-    */
-    if( process_chroma )
-    {
-        for( i_plane++ /* luma already handled */;
-             i_plane < p_dst->i_planes;
-             i_plane++ )
-        {
-            w = p_dst->p[i_plane].i_visible_pitch;
-            wm8 = w % 8;   /* remainder */
-            w8  = w - wm8; /* part of width that is divisible by 8 */
-
-            p_out = p_dst->p[i_plane].p_pixels;
-            p_out_end = p_out + p_dst->p[i_plane].i_pitch
-                              * p_dst->p[i_plane].i_visible_lines;
-
-            /* skip first line for bottom field */
-            if( i_field == 1 )
-                p_out += p_dst->p[i_plane].i_pitch;
-
-            for( ; p_out < p_out_end ; p_out += 2*p_dst->p[i_plane].i_pitch )
-            {
-                int x = 0;
-
-                /* See also easy-to-read C version below. */
-                static alignas (8) const mmx_t b128 = {
-                    .uq = 0x8080808080808080ULL
-                };
-
-                movq_m2r( b128, mm5 );
-                movq_m2r( i_strength_u64,  mm6 );
-                movq_m2r( remove_high_u64, mm7 );
-
-                uint64_t *po8 = (uint64_t *)p_out;
-                for( ; x < w8; x += 8 )
-                {
-                    movq_m2r( (*po8), mm0 );
-
-                    movq_r2r( mm5, mm2 ); /* 128 */
-                    movq_r2r( mm0, mm1 ); /* copy of data */
-                    psubusb_r2r( mm2, mm1 ); /* mm1 = max(data - 128, 0) */
-                    psubusb_r2r( mm0, mm2 ); /* mm2 = max(128 - data, 0) */
-
-                    /* >> i_strength */
-                    psrlq_r2r( mm6, mm1 );
-                    psrlq_r2r( mm6, mm2 );
-                    pand_r2r(  mm7, mm1 );
-                    pand_r2r(  mm7, mm2 );
-
-                    /* collect results from pos./neg. parts */
-                    psubb_r2r( mm2, mm1 );
-                    paddb_r2r( mm5, mm1 );
-
-                    movq_r2m( mm1, (*po8++) );
-                }
-
-                /* C version - handle the width remainder */
-                uint8_t *po = p_out;
-                for( ; x < w; ++x, ++po )
-                    (*po) = 128 + ( ((*po) - 128) / (1 << i_strength) );
-            } /* for p_out... */
-        } /* for i_plane... */
-    } /* if process_chroma */
-
-    emms();
-}
-#endif
-
 /*****************************************************************************
  * Public functions
  *****************************************************************************/
@@ -357,13 +229,6 @@ int RenderPhosphor( filter_t *p_filter,
     */
     if( p_sys->phosphor.i_dimmer_strength > 0 )
     {
-#ifdef CAN_COMPILE_MMXEXT
-        if( vlc_CPU_MMXEXT() )
-            DarkenFieldMMX( p_dst, !i_field, p_sys->phosphor.i_dimmer_strength,
-                p_sys->chroma->p[1].h.num == p_sys->chroma->p[1].h.den &&
-                p_sys->chroma->p[2].h.num == p_sys->chroma->p[2].h.den );
-        else
-#endif
             DarkenField( p_dst, !i_field, p_sys->phosphor.i_dimmer_strength,
                 p_sys->chroma->p[1].h.num == p_sys->chroma->p[1].h.den &&
                 p_sys->chroma->p[2].h.num == p_sys->chroma->p[2].h.den );


=====================================
modules/video_filter/deinterlace/algo_x.c
=====================================
@@ -24,10 +24,6 @@
 #   include "config.h"
 #endif
 
-#ifdef CAN_COMPILE_MMXEXT
-#   include "mmx.h"
-#endif
-
 #include <stdint.h>
 
 #include <vlc_common.h>
@@ -76,71 +72,6 @@ static inline int XDeint8x8DetectC( uint8_t *src, int i_src )
 
     return fc < 1 ? false : true;
 }
-#ifdef CAN_COMPILE_MMXEXT
-VLC_MMX
-static inline int XDeint8x8DetectMMXEXT( uint8_t *src, int i_src )
-{
-
-    int y, x;
-    int32_t ff, fr;
-    int fc;
-
-    /* Detect interlacing */
-    fc = 0;
-    pxor_r2r( mm7, mm7 );
-    for( y = 0; y < 9; y += 2 )
-    {
-        ff = fr = 0;
-        pxor_r2r( mm5, mm5 );
-        pxor_r2r( mm6, mm6 );
-        for( x = 0; x < 8; x+=4 )
-        {
-            movd_m2r( src[        x], mm0 );
-            movd_m2r( src[1*i_src+x], mm1 );
-            movd_m2r( src[2*i_src+x], mm2 );
-            movd_m2r( src[3*i_src+x], mm3 );
-
-            punpcklbw_r2r( mm7, mm0 );
-            punpcklbw_r2r( mm7, mm1 );
-            punpcklbw_r2r( mm7, mm2 );
-            punpcklbw_r2r( mm7, mm3 );
-
-            movq_r2r( mm0, mm4 );
-
-            psubw_r2r( mm1, mm0 );
-            psubw_r2r( mm2, mm4 );
-
-            psubw_r2r( mm1, mm2 );
-            psubw_r2r( mm1, mm3 );
-
-            pmaddwd_r2r( mm0, mm0 );
-            pmaddwd_r2r( mm4, mm4 );
-            pmaddwd_r2r( mm2, mm2 );
-            pmaddwd_r2r( mm3, mm3 );
-            paddd_r2r( mm0, mm2 );
-            paddd_r2r( mm4, mm3 );
-            paddd_r2r( mm2, mm5 );
-            paddd_r2r( mm3, mm6 );
-        }
-
-        movq_r2r( mm5, mm0 );
-        psrlq_i2r( 32, mm0 );
-        paddd_r2r( mm0, mm5 );
-        movd_r2m( mm5, fr );
-
-        movq_r2r( mm6, mm0 );
-        psrlq_i2r( 32, mm0 );
-        paddd_r2r( mm0, mm6 );
-        movd_r2m( mm6, ff );
-
-        if( ff < 6*fr/8 && fr > 32 )
-            fc++;
-
-        src += 2*i_src;
-    }
-    return fc;
-}
-#endif
 
 static inline void XDeint8x8MergeC( uint8_t *dst,  int i_dst,
                                     uint8_t *src1, int i_src1,
@@ -163,49 +94,6 @@ static inline void XDeint8x8MergeC( uint8_t *dst,  int i_dst,
     }
 }
 
-#ifdef CAN_COMPILE_MMXEXT
-VLC_MMX
-static inline void XDeint8x8MergeMMXEXT( uint8_t *dst,  int i_dst,
-                                         uint8_t *src1, int i_src1,
-                                         uint8_t *src2, int i_src2 )
-{
-    static const uint64_t m_4 = INT64_C(0x0004000400040004);
-    int y, x;
-
-    /* Progressive */
-    pxor_r2r( mm7, mm7 );
-    for( y = 0; y < 8; y += 2 )
-    {
-        for( x = 0; x < 8; x +=4 )
-        {
-            movd_m2r( src1[x], mm0 );
-            movd_r2m( mm0, dst[x] );
-
-            movd_m2r( src2[x], mm1 );
-            movd_m2r( src1[i_src1+x], mm2 );
-
-            punpcklbw_r2r( mm7, mm0 );
-            punpcklbw_r2r( mm7, mm1 );
-            punpcklbw_r2r( mm7, mm2 );
-            paddw_r2r( mm1, mm1 );
-            movq_r2r( mm1, mm3 );
-            paddw_r2r( mm3, mm3 );
-            paddw_r2r( mm2, mm0 );
-            paddw_r2r( mm3, mm1 );
-            paddw_m2r( m_4, mm1 );
-            paddw_r2r( mm1, mm0 );
-            psraw_i2r( 3, mm0 );
-            packuswb_r2r( mm7, mm0 );
-            movd_r2m( mm0, dst[i_dst+x] );
-        }
-        dst += 2*i_dst;
-        src1 += i_src1;
-        src2 += i_src2;
-    }
-}
-
-#endif
-
 /* XDeint8x8FieldE: Stupid deinterlacing (1,0,1) for block that miss a
  * neighbour
  * (Use 8x9 pixels)
@@ -229,31 +117,6 @@ static inline void XDeint8x8FieldEC( uint8_t *dst, int i_dst,
     }
 }
 
-#ifdef CAN_COMPILE_MMXEXT
-VLC_MMX
-static inline void XDeint8x8FieldEMMXEXT( uint8_t *dst, int i_dst,
-                                          uint8_t *src, int i_src )
-{
-    int y;
-
-    /* Interlaced */
-    for( y = 0; y < 8; y += 2 )
-    {
-        movq_m2r( src[0], mm0 );
-        movq_r2m( mm0, dst[0] );
-        dst += i_dst;
-
-        movq_m2r( src[2*i_src], mm1 );
-        pavgb_r2r( mm1, mm0 );
-
-        movq_r2m( mm0, dst[0] );
-
-        dst += 1*i_dst;
-        src += 2*i_src;
-    }
-}
-#endif
-
 /* XDeint8x8Field: Edge oriented interpolation
  * (Need -4 and +5 pixels H, +1 line)
  */
@@ -271,7 +134,7 @@ static inline void XDeint8x8FieldC( uint8_t *dst, int i_dst,
         for( x = 0; x < 8; x++ )
         {
             uint8_t *src2 = &src[2*i_src];
-            /* I use 8 pixels just to match the MMX version, but it's overkill
+            /* I use 8 pixels just to match the SIMD version, but it's overkill
              * 5 would be enough (less isn't good) */
             const int c0 = abs(src[x-4]-src2[x-2]) + abs(src[x-3]-src2[x-1]) +
                            abs(src[x-2]-src2[x+0]) + abs(src[x-1]-src2[x+1]) +
@@ -301,50 +164,6 @@ static inline void XDeint8x8FieldC( uint8_t *dst, int i_dst,
     }
 }
 
-#ifdef CAN_COMPILE_MMXEXT
-VLC_MMX
-static inline void XDeint8x8FieldMMXEXT( uint8_t *dst, int i_dst,
-                                         uint8_t *src, int i_src )
-{
-    int y, x;
-
-    /* Interlaced */
-    for( y = 0; y < 8; y += 2 )
-    {
-        memcpy( dst, src, 8 );
-        dst += i_dst;
-
-        for( x = 0; x < 8; x++ )
-        {
-            uint8_t *src2 = &src[2*i_src];
-            int32_t c0, c1, c2;
-
-            movq_m2r( src[x-2], mm0 );
-            movq_m2r( src[x-3], mm1 );
-            movq_m2r( src[x-4], mm2 );
-
-            psadbw_m2r( src2[x-4], mm0 );
-            psadbw_m2r( src2[x-3], mm1 );
-            psadbw_m2r( src2[x-2], mm2 );
-
-            movd_r2m( mm0, c2 );
-            movd_r2m( mm1, c1 );
-            movd_r2m( mm2, c0 );
-
-            if( c0 < c1 && c1 <= c2 )
-                dst[x] = (src[x-1] + src2[x+1]) >> 1;
-            else if( c2 < c1 && c1 <= c0 )
-                dst[x] = (src[x+1] + src2[x-1]) >> 1;
-            else
-                dst[x] = (src[x+0] + src2[x+0]) >> 1;
-        }
-
-        dst += 1*i_dst;
-        src += 2*i_src;
-    }
-}
-#endif
-
 /* NxN arbitray size (and then only use pixel in the NxN block)
  */
 static inline int XDeintNxNDetect( uint8_t *src, int i_src,
@@ -472,41 +291,6 @@ static inline void XDeintBand8x8C( uint8_t *dst, int i_dst,
         XDeintNxN( dst, i_dst, src, i_src, i_modx, 8 );
 }
 
-#ifdef CAN_COMPILE_MMXEXT
-VLC_MMX
-static inline void XDeintBand8x8MMXEXT( uint8_t *dst, int i_dst,
-                                        uint8_t *src, int i_src,
-                                        const int i_mbx, int i_modx )
-{
-    int x;
-
-    /* Reset current line */
-    for( x = 0; x < i_mbx; x++ )
-    {
-        int s;
-        if( ( s = XDeint8x8DetectMMXEXT( src, i_src ) ) )
-        {
-            if( x == 0 || x == i_mbx - 1 )
-                XDeint8x8FieldEMMXEXT( dst, i_dst, src, i_src );
-            else
-                XDeint8x8FieldMMXEXT( dst, i_dst, src, i_src );
-        }
-        else
-        {
-            XDeint8x8MergeMMXEXT( dst, i_dst,
-                                  &src[0*i_src], 2*i_src,
-                                  &src[1*i_src], 2*i_src );
-        }
-
-        dst += 8;
-        src += 8;
-    }
-
-    if( i_modx )
-        XDeintNxN( dst, i_dst, src, i_src, i_modx, 8 );
-}
-#endif
-
 /*****************************************************************************
  * Public functions
  *****************************************************************************/
@@ -515,9 +299,6 @@ int RenderX( filter_t *p_filter, picture_t *p_outpic, picture_t *p_pic )
 {
     VLC_UNUSED(p_filter);
     int i_plane;
-#if defined (CAN_COMPILE_MMXEXT)
-    const bool mmxext = vlc_CPU_MMXEXT();
-#endif
 
     /* Copy image and skip lines */
     for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
@@ -538,12 +319,7 @@ int RenderX( filter_t *p_filter, picture_t *p_outpic, picture_t *p_pic )
             uint8_t *dst = &p_outpic->p[i_plane].p_pixels[8*y*i_dst];
             uint8_t *src = &p_pic->p[i_plane].p_pixels[8*y*i_src];
 
-#ifdef CAN_COMPILE_MMXEXT
-            if( mmxext )
-                XDeintBand8x8MMXEXT( dst, i_dst, src, i_src, i_mbx, i_modx );
-            else
-#endif
-                XDeintBand8x8C( dst, i_dst, src, i_src, i_mbx, i_modx );
+            XDeintBand8x8C( dst, i_dst, src, i_src, i_mbx, i_modx );
         }
 
         /* Last line (C only)*/
@@ -565,9 +341,5 @@ int RenderX( filter_t *p_filter, picture_t *p_outpic, picture_t *p_pic )
         }
     }
 
-#ifdef CAN_COMPILE_MMXEXT
-    if( mmxext )
-        emms();
-#endif
     return VLC_SUCCESS;
 }


=====================================
modules/video_filter/deinterlace/algo_x.h
=====================================
@@ -33,13 +33,13 @@ struct picture_t;
 /**
  * Interpolating deinterlace filter "X".
  *
- * The algorithm works on a 8x8 block basic, it copies the top field
+ * The algorithm works on a 8x8 block basis; It copies the top field
  * and applies a process to recreate the bottom field.
  *
  * If a 8x8 block is classified as :
  *   - progressive: it applies a small blend (1,6,1)
  *   - interlaced:
- *    * in the MMX version: we do a ME between the 2 fields, if there is a
+ *    * in the SIMD version: we do a ME between the 2 fields, if there is a
  *      good match we use MC to recreate the bottom field (with a small
  *      blend (1,6,1) )
  *    * otherwise: it recreates the bottom field by an edge oriented


=====================================
modules/video_filter/deinterlace/algo_yadif.c
=====================================
@@ -119,11 +119,6 @@ int RenderYadif( filter_t *p_filter, picture_t *p_dst, picture_t *p_src,
         if( vlc_CPU_SSE2() )
             filter = vlcpriv_yadif_filter_line_sse2;
         else
-#if defined(__i386__)
-        if( vlc_CPU_MMXEXT() )
-            filter = vlcpriv_yadif_filter_line_mmxext;
-        else
-#endif
 #endif
             filter = yadif_filter_line_c;
 


=====================================
modules/video_filter/deinterlace/deinterlace.c
=====================================
@@ -558,15 +558,7 @@ notsupp:
     if( vlc_CPU_SSE2() )
     {
         p_sys->pf_merge = pixel_size == 1 ? Merge8BitSSE2 : Merge16BitSSE2;
-        p_sys->pf_end_merge = EndMMX;
-    }
-    else
-#endif
-#if defined(CAN_COMPILE_MMXEXT)
-    if( pixel_size == 1 && vlc_CPU_MMXEXT() )
-    {
-        p_sys->pf_merge = MergeMMXEXT;
-        p_sys->pf_end_merge = EndMMX;
+        p_sys->pf_end_merge = EndSSE;
     }
     else
 #endif


=====================================
modules/video_filter/deinterlace/deinterlace.h
=====================================
@@ -68,7 +68,7 @@ typedef struct
 {
     const vlc_chroma_description_t *chroma;
 
-    /** Merge routine: C, MMX, SSE, ALTIVEC, NEON, ... */
+    /** Merge routine: C, SSE, ALTIVEC, NEON, ... */
     void (*pf_merge) ( void *, const void *, const void *, size_t );
 #if defined (__i386__) || defined (__x86_64__)
     /** Merge finalization routine for SSE */


=====================================
modules/video_filter/deinterlace/helpers.c
=====================================
@@ -24,11 +24,6 @@
 #   include "config.h"
 #endif
 
-#ifdef CAN_COMPILE_MMXEXT
-#   include "mmx.h"
-#   include <stdalign.h>
-#endif
-
 #include <stdint.h>
 #include <assert.h>
 
@@ -107,9 +102,6 @@ static void FieldFromPlane( plane_t *p_dst, const plane_t *p_src, int i_field )
  * For interpretation of pi_top and pi_bot, it is assumed that the block
  * starts on an even-numbered line (belonging to the top field).
  *
- * The b_mmx parameter avoids the need to call vlc_CPU() separately
- * for each block.
- *
  * @param[in] p_pix_p Base pointer to the block in previous picture
  * @param[in] p_pix_c Base pointer to the same block in current picture
  * @param i_pitch_prev i_pitch of previous picture
@@ -172,79 +164,6 @@ static int TestForMotionInBlock( uint8_t *p_pix_p, uint8_t *p_pix_c,
        changes "enough". */
     return (i_motion >= 8);
 }
-
-#ifdef CAN_COMPILE_MMXEXT
-VLC_MMX
-static int TestForMotionInBlockMMX( uint8_t *p_pix_p, uint8_t *p_pix_c,
-                                    int i_pitch_prev, int i_pitch_curr,
-                                    int* pi_top, int* pi_bot )
-{
-    int32_t i_motion = 0;
-    int32_t i_top_motion = 0;
-    int32_t i_bot_motion = 0;
-
-    static alignas (8) const mmx_t bT   = { .ub = { T, T, T, T, T, T, T, T } };
-    pxor_r2r( mm6, mm6 ); /* zero, used in psadbw */
-    movq_m2r( bT,  mm5 );
-
-    pxor_r2r( mm3, mm3 ); /* score (top field) */
-    pxor_r2r( mm4, mm4 ); /* score (bottom field) */
-    for( int y = 0; y < 8; y+=2 )
-    {
-        /* top field */
-        movq_m2r( *((uint64_t*)p_pix_c), mm0 );
-        movq_m2r( *((uint64_t*)p_pix_p), mm1 );
-        movq_r2r( mm0, mm2 );
-        psubusb_r2r( mm1, mm2 );
-        psubusb_r2r( mm0, mm1 );
-
-        pcmpgtb_r2r( mm5, mm2 );
-        pcmpgtb_r2r( mm5, mm1 );
-        psadbw_r2r(  mm6, mm2 );
-        psadbw_r2r(  mm6, mm1 );
-
-        paddd_r2r( mm2, mm1 );
-        paddd_r2r( mm1, mm3 ); /* add to top field score */
-
-        p_pix_c += i_pitch_curr;
-        p_pix_p += i_pitch_prev;
-
-        /* bottom field - handling identical to top field, except... */
-        movq_m2r( *((uint64_t*)p_pix_c), mm0 );
-        movq_m2r( *((uint64_t*)p_pix_p), mm1 );
-        movq_r2r( mm0, mm2 );
-        psubusb_r2r( mm1, mm2 );
-        psubusb_r2r( mm0, mm1 );
-
-        pcmpgtb_r2r( mm5, mm2 );
-        pcmpgtb_r2r( mm5, mm1 );
-        psadbw_r2r(  mm6, mm2 );
-        psadbw_r2r(  mm6, mm1 );
-
-        paddd_r2r( mm2, mm1 );
-        paddd_r2r( mm1, mm4 ); /* ...here we add to bottom field score */
-
-        p_pix_c += i_pitch_curr;
-        p_pix_p += i_pitch_prev;
-    }
-    movq_r2r(  mm3, mm7 ); /* score (total) */
-    paddd_r2r( mm4, mm7 );
-    movd_r2m( mm3, i_top_motion );
-    movd_r2m( mm4, i_bot_motion );
-    movd_r2m( mm7, i_motion );
-
-    /* The loop counts actual score * 255. */
-    i_top_motion /= 255;
-    i_bot_motion /= 255;
-    i_motion     /= 255;
-
-    emms();
-
-    (*pi_top) = ( i_top_motion >= 8 );
-    (*pi_bot) = ( i_bot_motion >= 8 );
-    return (i_motion >= 8);
-}
-#endif
 #undef T
 
 /*****************************************************************************
@@ -396,11 +315,6 @@ int EstimateNumBlocksWithMotion( const picture_t* p_prev,
 
     int (*motion_in_block)(uint8_t *, uint8_t *, int , int, int *, int *) =
         TestForMotionInBlock;
-    /* We must tell our inline helper whether to use MMX acceleration. */
-#ifdef CAN_COMPILE_MMXEXT
-    if (vlc_CPU_MMXEXT())
-        motion_in_block = TestForMotionInBlockMMX;
-#endif
 
     int i_score = 0;
     for( int i_plane = 0 ; i_plane < p_prev->i_planes ; i_plane++ )
@@ -451,142 +365,6 @@ int EstimateNumBlocksWithMotion( const picture_t* p_prev,
 /* Threshold (value from Transcode 1.1.5) */
 #define T 100
 
-#ifdef CAN_COMPILE_MMXEXT
-VLC_MMX
-static int CalculateInterlaceScoreMMX( const picture_t* p_pic_top,
-                                       const picture_t* p_pic_bot )
-{
-    assert( p_pic_top->i_planes == p_pic_bot->i_planes );
-
-    /* Amount of bits must be known for MMX, thus int32_t.
-       Doesn't hurt the C implementation. */
-    int32_t i_score_mmx = 0; /* this must be divided by 255 when finished  */
-    int32_t i_score_c   = 0; /* this counts as-is (used for non-MMX parts) */
-
-    pxor_r2r( mm7, mm7 ); /* we will keep score in mm7 */
-
-    for( int i_plane = 0 ; i_plane < p_pic_top->i_planes ; ++i_plane )
-    {
-        /* Sanity check */
-        if( p_pic_top->p[i_plane].i_visible_lines !=
-            p_pic_bot->p[i_plane].i_visible_lines )
-            return -1;
-
-        const int i_lasty = p_pic_top->p[i_plane].i_visible_lines-1;
-        const int w = FFMIN( p_pic_top->p[i_plane].i_visible_pitch,
-                             p_pic_bot->p[i_plane].i_visible_pitch );
-        const int wm8 = w % 8;   /* remainder */
-        const int w8  = w - wm8; /* part of width that is divisible by 8 */
-
-        /* Current line / neighbouring lines picture pointers */
-        const picture_t *cur = p_pic_bot;
-        const picture_t *ngh = p_pic_top;
-        int wc = cur->p[i_plane].i_pitch;
-        int wn = ngh->p[i_plane].i_pitch;
-
-        /* Transcode 1.1.5 only checks every other line. Checking every line
-           works better for anime, which may contain horizontal,
-           one pixel thick cartoon outlines.
-        */
-        for( int y = 1; y < i_lasty; ++y )
-        {
-            uint8_t *p_c = &cur->p[i_plane].p_pixels[y*wc];     /* this line */
-            uint8_t *p_p = &ngh->p[i_plane].p_pixels[(y-1)*wn]; /* prev line */
-            uint8_t *p_n = &ngh->p[i_plane].p_pixels[(y+1)*wn]; /* next line */
-
-            int x = 0;
-
-            /* Easy-to-read C version further below.
-
-               Assumptions: 0 < T < 127
-                            # of pixels < (2^32)/255
-               Note: calculates score * 255
-            */
-            static alignas (8) const mmx_t b0 = {
-                .uq = 0x0000000000000000ULL };
-            static alignas (8) const mmx_t b128 = {
-                .uq = 0x8080808080808080ULL };
-            static alignas (8) const mmx_t bT = {
-                .ub = { T, T, T, T, T, T, T, T } };
-
-            for( ; x < w8; x += 8 )
-            {
-                movq_m2r( *((int64_t*)p_c), mm0 );
-                movq_m2r( *((int64_t*)p_p), mm1 );
-                movq_m2r( *((int64_t*)p_n), mm2 );
-
-                psubb_m2r( b128, mm0 );
-                psubb_m2r( b128, mm1 );
-                psubb_m2r( b128, mm2 );
-
-                psubsb_r2r( mm0, mm1 );
-                psubsb_r2r( mm0, mm2 );
-
-                pxor_r2r( mm3, mm3 );
-                pxor_r2r( mm4, mm4 );
-                pxor_r2r( mm5, mm5 );
-                pxor_r2r( mm6, mm6 );
-
-                punpcklbw_r2r( mm1, mm3 );
-                punpcklbw_r2r( mm2, mm4 );
-                punpckhbw_r2r( mm1, mm5 );
-                punpckhbw_r2r( mm2, mm6 );
-
-                pmulhw_r2r( mm3, mm4 );
-                pmulhw_r2r( mm5, mm6 );
-
-                packsswb_r2r(mm4, mm6);
-                pcmpgtb_m2r( bT, mm6 );
-                psadbw_m2r( b0, mm6 );
-                paddd_r2r( mm6, mm7 );
-
-                p_c += 8;
-                p_p += 8;
-                p_n += 8;
-            }
-
-            for( ; x < w; ++x )
-            {
-                /* Worst case: need 17 bits for "comb". */
-                int_fast32_t C = *p_c;
-                int_fast32_t P = *p_p;
-                int_fast32_t N = *p_n;
-
-                /* Comments in Transcode's filter_ivtc.c attribute this
-                   combing metric to Gunnar Thalin.
-
-                    The idea is that if the picture is interlaced, both
-                    expressions will have the same sign, and this comes
-                    up positive. The value T = 100 has been chosen such
-                    that a pixel difference of 10 (on average) will
-                    trigger the detector.
-                */
-                int_fast32_t comb = (P - C) * (N - C);
-                if( comb > T )
-                    ++i_score_c;
-
-                ++p_c;
-                ++p_p;
-                ++p_n;
-            }
-
-            /* Now the other field - swap current and neighbour pictures */
-            const picture_t *tmp = cur;
-            cur = ngh;
-            ngh = tmp;
-            int tmp_pitch = wc;
-            wc = wn;
-            wn = tmp_pitch;
-        }
-    }
-
-    movd_r2m( mm7, i_score_mmx );
-    emms();
-
-    return i_score_mmx/255 + i_score_c;
-}
-#endif
-
 /* See header for function doc. */
 int CalculateInterlaceScore( const picture_t* p_pic_top,
                              const picture_t* p_pic_bot )
@@ -607,11 +385,6 @@ int CalculateInterlaceScore( const picture_t* p_pic_top,
     if( p_pic_top->i_planes != p_pic_bot->i_planes )
         return -1;
 
-#ifdef CAN_COMPILE_MMXEXT
-    if (vlc_CPU_MMXEXT())
-        return CalculateInterlaceScoreMMX( p_pic_top, p_pic_bot );
-#endif
-
     int32_t i_score = 0;
 
     for( int i_plane = 0 ; i_plane < p_pic_top->i_planes ; ++i_plane )


=====================================
modules/video_filter/deinterlace/merge.c
=====================================
@@ -33,10 +33,6 @@
 #include <vlc_cpu.h>
 #include "merge.h"
 
-#ifdef CAN_COMPILE_MMXEXT
-#   include "mmx.h"
-#endif
-
 #ifdef HAVE_ALTIVEC_H
 #   include <altivec.h>
 #endif
@@ -67,32 +63,6 @@ void Merge16BitGeneric( void *_p_dest, const void *_p_s1,
         *p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1;
 }
 
-#if defined(CAN_COMPILE_MMXEXT)
-VLC_MMX
-void MergeMMXEXT( void *_p_dest, const void *_p_s1, const void *_p_s2,
-                  size_t i_bytes )
-{
-    uint8_t *p_dest = _p_dest;
-    const uint8_t *p_s1 = _p_s1;
-    const uint8_t *p_s2 = _p_s2;
-
-    for( ; i_bytes >= 8; i_bytes -= 8 )
-    {
-        __asm__  __volatile__( "movq %2,%%mm1;"
-                               "pavgb %1, %%mm1;"
-                               "movq %%mm1, %0" :"=m" (*p_dest):
-                                                 "m" (*p_s1),
-                                                 "m" (*p_s2) : "mm1" );
-        p_dest += 8;
-        p_s1 += 8;
-        p_s2 += 8;
-    }
-
-    for( ; i_bytes > 0; i_bytes-- )
-        *p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1;
-}
-#endif
-
 #if defined(CAN_COMPILE_SSE)
 VLC_SSE
 void Merge8BitSSE2( void *_p_dest, const void *_p_s1, const void *_p_s2,
@@ -223,9 +193,9 @@ void MergeAltivec( void *_p_dest, const void *_p_s1,
  * EndMerge routines
  *****************************************************************************/
 
-#if defined(CAN_COMPILE_MMXEXT) || defined(CAN_COMPILE_SSE)
-void EndMMX( void )
+#if defined(CAN_COMPILE_SSE2)
+void EndSSE( void )
 {
-    __asm__ __volatile__( "emms" :: );
+    __asm__ __volatile__( "sfence" ::: "memory" );
 }
 #endif


=====================================
modules/video_filter/deinterlace/merge.h
=====================================
@@ -112,18 +112,6 @@ void Merge16BitGeneric( void *_p_dest, const void *_p_s1, const void *_p_s2,
 void MergeAltivec ( void *, const void *, const void *, size_t );
 #endif
 
-#if defined(CAN_COMPILE_MMXEXT)
-/**
- * MMXEXT routine to blend pixels from two picture lines.
- *
- * @param _p_dest Target
- * @param _p_s1 Source line A
- * @param _p_s2 Source line B
- * @param i_bytes Number of bytes to merge
- */
-void MergeMMXEXT  ( void *, const void *, const void *, size_t );
-#endif
-
 #if defined(CAN_COMPILE_SSE)
 /**
  * SSE2 routine to blend pixels from two picture lines.
@@ -175,17 +163,17 @@ void merge16_arm_sve(void *, const void *, const void *, size_t);
  * EndMerge routines
  *****************************************************************************/
 
-#if defined(CAN_COMPILE_MMXEXT) || defined(CAN_COMPILE_SSE)
+#if defined(CAN_COMPILE_SSE2)
 /**
- * MMX merge finalization routine.
+ * SSE merge finalization routine.
  *
- * Must be called after an MMX merge is finished.
- * This exits MMX mode (by executing the "emms" instruction).
+ * Should be called after an SSE merge is finished.
+ * This exits SSE mode (by executing the "sfence" instruction).
  *
  * The EndMerge() macro detects whether this is needed, and calls if it is,
  * so just use that.
  */
-void EndMMX       ( void );
+void EndSSE( void );
 #endif
 
 #endif


=====================================
modules/video_filter/deinterlace/mmx.h deleted
=====================================
@@ -1,256 +0,0 @@
-/*
- * mmx.h
- * Copyright (C) 1997-1999 H. Dietz and R. Fisher
- *
- * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
- *
- * mpeg2dec is free software; you can redistribute it and/or modify
- * under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * mpeg2dec is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this program; if not, write to the Free Software Foundation,
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
- */
-
-/*
- * The type of an value that fits in an MMX register (note that long
- * long constant values MUST be suffixed by LL and unsigned long long
- * values by ULL, lest they be truncated by the compiler)
- */
-
-#include <stdint.h>
-
-typedef    union {
-    int64_t          q;    /* Quadword (64-bit) value */
-    uint64_t        uq;    /* Unsigned Quadword */
-    int32_t          d[2]; /* 2 Doubleword (32-bit) values */
-    uint32_t        ud[2]; /* 2 Unsigned Doubleword */
-    int16_t          w[4]; /* 4 Word (16-bit) values */
-    uint16_t        uw[4]; /* 4 Unsigned Word */
-    int8_t           b[8]; /* 8 Byte (8-bit) values */
-    uint8_t         ub[8]; /* 8 Unsigned Byte */
-    float            s[2]; /* Single-precision (32-bit) value */
-} mmx_t; /* NOTE: must be on an 8-byte (64-bit) boundary */
-
-
-#define    mmx_i2r(op,imm,reg) \
-    __asm__ __volatile__ (#op " %0, %%" #reg \
-                  : /* nothing */ \
-                  : "i" (imm) \
-                  : #reg)
-
-#define    mmx_m2r(op,mem,reg) \
-    __asm__ __volatile__ (#op " %0, %%" #reg \
-                  : /* nothing */ \
-                  : "m" (mem) \
-                  : #reg)
-
-#define    mmx_r2m(op,reg,mem) \
-    __asm__ __volatile__ (#op " %%" #reg ", %0" \
-                  : "=m" (mem) \
-                  : /* nothing */ \
-                  : "memory")
-
-#define    mmx_r2r(op,regs,regd) \
-    __asm__ __volatile__ (#op " %%" #regs ", %%" #regd ::: #regd)
-
-
-#define    emms() __asm__ __volatile__ ("emms")
-
-#define    movd_m2r(var,reg)    mmx_m2r (movd, var, reg)
-#define    movd_r2m(reg,var)    mmx_r2m (movd, reg, var)
-#define    movd_r2r(regs,regd)    mmx_r2r (movd, regs, regd)
-
-#define    movq_m2r(var,reg)    mmx_m2r (movq, var, reg)
-#define    movq_r2m(reg,var)    mmx_r2m (movq, reg, var)
-#define    movq_r2r(regs,regd)    mmx_r2r (movq, regs, regd)
-
-#define    packssdw_m2r(var,reg)    mmx_m2r (packssdw, var, reg)
-#define    packssdw_r2r(regs,regd) mmx_r2r (packssdw, regs, regd)
-#define    packsswb_m2r(var,reg)    mmx_m2r (packsswb, var, reg)
-#define    packsswb_r2r(regs,regd) mmx_r2r (packsswb, regs, regd)
-
-#define    packuswb_m2r(var,reg)    mmx_m2r (packuswb, var, reg)
-#define    packuswb_r2r(regs,regd) mmx_r2r (packuswb, regs, regd)
-
-#define    paddb_m2r(var,reg)    mmx_m2r (paddb, var, reg)
-#define    paddb_r2r(regs,regd)    mmx_r2r (paddb, regs, regd)
-#define    paddd_m2r(var,reg)    mmx_m2r (paddd, var, reg)
-#define    paddd_r2r(regs,regd)    mmx_r2r (paddd, regs, regd)
-#define    paddw_m2r(var,reg)    mmx_m2r (paddw, var, reg)
-#define    paddw_r2r(regs,regd)    mmx_r2r (paddw, regs, regd)
-
-#define    paddsb_m2r(var,reg)    mmx_m2r (paddsb, var, reg)
-#define    paddsb_r2r(regs,regd)    mmx_r2r (paddsb, regs, regd)
-#define    paddsw_m2r(var,reg)    mmx_m2r (paddsw, var, reg)
-#define    paddsw_r2r(regs,regd)    mmx_r2r (paddsw, regs, regd)
-
-#define    paddusb_m2r(var,reg)    mmx_m2r (paddusb, var, reg)
-#define    paddusb_r2r(regs,regd)    mmx_r2r (paddusb, regs, regd)
-#define    paddusw_m2r(var,reg)    mmx_m2r (paddusw, var, reg)
-#define    paddusw_r2r(regs,regd)    mmx_r2r (paddusw, regs, regd)
-
-#define    pand_m2r(var,reg)    mmx_m2r (pand, var, reg)
-#define    pand_r2r(regs,regd)    mmx_r2r (pand, regs, regd)
-
-#define    pandn_m2r(var,reg)    mmx_m2r (pandn, var, reg)
-#define    pandn_r2r(regs,regd)    mmx_r2r (pandn, regs, regd)
-
-#define    pcmpeqb_m2r(var,reg)    mmx_m2r (pcmpeqb, var, reg)
-#define    pcmpeqb_r2r(regs,regd)    mmx_r2r (pcmpeqb, regs, regd)
-#define    pcmpeqd_m2r(var,reg)    mmx_m2r (pcmpeqd, var, reg)
-#define    pcmpeqd_r2r(regs,regd)    mmx_r2r (pcmpeqd, regs, regd)
-#define    pcmpeqw_m2r(var,reg)    mmx_m2r (pcmpeqw, var, reg)
-#define    pcmpeqw_r2r(regs,regd)    mmx_r2r (pcmpeqw, regs, regd)
-
-#define    pcmpgtb_m2r(var,reg)    mmx_m2r (pcmpgtb, var, reg)
-#define    pcmpgtb_r2r(regs,regd)    mmx_r2r (pcmpgtb, regs, regd)
-#define    pcmpgtd_m2r(var,reg)    mmx_m2r (pcmpgtd, var, reg)
-#define    pcmpgtd_r2r(regs,regd)    mmx_r2r (pcmpgtd, regs, regd)
-#define    pcmpgtw_m2r(var,reg)    mmx_m2r (pcmpgtw, var, reg)
-#define    pcmpgtw_r2r(regs,regd)    mmx_r2r (pcmpgtw, regs, regd)
-
-#define    pmaddwd_m2r(var,reg)    mmx_m2r (pmaddwd, var, reg)
-#define    pmaddwd_r2r(regs,regd)    mmx_r2r (pmaddwd, regs, regd)
-
-#define    pmulhw_m2r(var,reg)    mmx_m2r (pmulhw, var, reg)
-#define    pmulhw_r2r(regs,regd)    mmx_r2r (pmulhw, regs, regd)
-
-#define    pmullw_m2r(var,reg)    mmx_m2r (pmullw, var, reg)
-#define    pmullw_r2r(regs,regd)    mmx_r2r (pmullw, regs, regd)
-
-#define    por_m2r(var,reg)    mmx_m2r (por, var, reg)
-#define    por_r2r(regs,regd)    mmx_r2r (por, regs, regd)
-
-#define    pslld_i2r(imm,reg)    mmx_i2r (pslld, imm, reg)
-#define    pslld_m2r(var,reg)    mmx_m2r (pslld, var, reg)
-#define    pslld_r2r(regs,regd)    mmx_r2r (pslld, regs, regd)
-#define    psllq_i2r(imm,reg)    mmx_i2r (psllq, imm, reg)
-#define    psllq_m2r(var,reg)    mmx_m2r (psllq, var, reg)
-#define    psllq_r2r(regs,regd)    mmx_r2r (psllq, regs, regd)
-#define    psllw_i2r(imm,reg)    mmx_i2r (psllw, imm, reg)
-#define    psllw_m2r(var,reg)    mmx_m2r (psllw, var, reg)
-#define    psllw_r2r(regs,regd)    mmx_r2r (psllw, regs, regd)
-
-#define    psrad_i2r(imm,reg)    mmx_i2r (psrad, imm, reg)
-#define    psrad_m2r(var,reg)    mmx_m2r (psrad, var, reg)
-#define    psrad_r2r(regs,regd)    mmx_r2r (psrad, regs, regd)
-#define    psraw_i2r(imm,reg)    mmx_i2r (psraw, imm, reg)
-#define    psraw_m2r(var,reg)    mmx_m2r (psraw, var, reg)
-#define    psraw_r2r(regs,regd)    mmx_r2r (psraw, regs, regd)
-
-#define    psrld_i2r(imm,reg)    mmx_i2r (psrld, imm, reg)
-#define    psrld_m2r(var,reg)    mmx_m2r (psrld, var, reg)
-#define    psrld_r2r(regs,regd)    mmx_r2r (psrld, regs, regd)
-#define    psrlq_i2r(imm,reg)    mmx_i2r (psrlq, imm, reg)
-#define    psrlq_m2r(var,reg)    mmx_m2r (psrlq, var, reg)
-#define    psrlq_r2r(regs,regd)    mmx_r2r (psrlq, regs, regd)
-#define    psrlw_i2r(imm,reg)    mmx_i2r (psrlw, imm, reg)
-#define    psrlw_m2r(var,reg)    mmx_m2r (psrlw, var, reg)
-#define    psrlw_r2r(regs,regd)    mmx_r2r (psrlw, regs, regd)
-
-#define    psubb_m2r(var,reg)    mmx_m2r (psubb, var, reg)
-#define    psubb_r2r(regs,regd)    mmx_r2r (psubb, regs, regd)
-#define    psubd_m2r(var,reg)    mmx_m2r (psubd, var, reg)
-#define    psubd_r2r(regs,regd)    mmx_r2r (psubd, regs, regd)
-#define    psubw_m2r(var,reg)    mmx_m2r (psubw, var, reg)
-#define    psubw_r2r(regs,regd)    mmx_r2r (psubw, regs, regd)
-
-#define    psubsb_m2r(var,reg)    mmx_m2r (psubsb, var, reg)
-#define    psubsb_r2r(regs,regd)    mmx_r2r (psubsb, regs, regd)
-#define    psubsw_m2r(var,reg)    mmx_m2r (psubsw, var, reg)
-#define    psubsw_r2r(regs,regd)    mmx_r2r (psubsw, regs, regd)
-
-#define    psubusb_m2r(var,reg)    mmx_m2r (psubusb, var, reg)
-#define    psubusb_r2r(regs,regd)    mmx_r2r (psubusb, regs, regd)
-#define    psubusw_m2r(var,reg)    mmx_m2r (psubusw, var, reg)
-#define    psubusw_r2r(regs,regd)    mmx_r2r (psubusw, regs, regd)
-
-#define    punpckhbw_m2r(var,reg)        mmx_m2r (punpckhbw, var, reg)
-#define    punpckhbw_r2r(regs,regd)    mmx_r2r (punpckhbw, regs, regd)
-#define    punpckhdq_m2r(var,reg)        mmx_m2r (punpckhdq, var, reg)
-#define    punpckhdq_r2r(regs,regd)    mmx_r2r (punpckhdq, regs, regd)
-#define    punpckhwd_m2r(var,reg)        mmx_m2r (punpckhwd, var, reg)
-#define    punpckhwd_r2r(regs,regd)    mmx_r2r (punpckhwd, regs, regd)
-
-#define    punpcklbw_m2r(var,reg)         mmx_m2r (punpcklbw, var, reg)
-#define    punpcklbw_r2r(regs,regd)    mmx_r2r (punpcklbw, regs, regd)
-#define    punpckldq_m2r(var,reg)        mmx_m2r (punpckldq, var, reg)
-#define    punpckldq_r2r(regs,regd)    mmx_r2r (punpckldq, regs, regd)
-#define    punpcklwd_m2r(var,reg)        mmx_m2r (punpcklwd, var, reg)
-#define    punpcklwd_r2r(regs,regd)    mmx_r2r (punpcklwd, regs, regd)
-
-#define    pxor_m2r(var,reg)    mmx_m2r (pxor, var, reg)
-#define    pxor_r2r(regs,regd)    mmx_r2r (pxor, regs, regd)
-
-
-/* AMD MMX extensions - also available in intel SSE */
-
-
-#define mmx_m2ri(op,mem,reg,imm) \
-        __asm__ __volatile__ (#op " %1, %0, %%" #reg \
-                              : /* nothing */ \
-                              : "X" (mem), "X" (imm) \
-                              : #reg)
-#define mmx_r2ri(op,regs,regd,imm) \
-        __asm__ __volatile__ (#op " %0, %%" #regs ", %%" #regd \
-                              : /* nothing */ \
-                              : "X" (imm) \
-                              : #regd)
-
-#define    mmx_fetch(mem,hint) \
-    __asm__ __volatile__ ("prefetch" #hint " %0" \
-                  : /* nothing */ \
-                  : "X" (mem))
-
-
-#define    maskmovq(regs,maskreg)        mmx_r2ri (maskmovq, regs, maskreg)
-
-#define    movntq_r2m(mmreg,var)        mmx_r2m (movntq, mmreg, var)
-
-#define    pavgb_m2r(var,reg)        mmx_m2r (pavgb, var, reg)
-#define    pavgb_r2r(regs,regd)        mmx_r2r (pavgb, regs, regd)
-#define    pavgw_m2r(var,reg)        mmx_m2r (pavgw, var, reg)
-#define    pavgw_r2r(regs,regd)        mmx_r2r (pavgw, regs, regd)
-
-#define    pextrw_r2r(mmreg,reg,imm)    mmx_r2ri (pextrw, mmreg, reg, imm)
-
-#define    pinsrw_r2r(reg,mmreg,imm)    mmx_r2ri (pinsrw, reg, mmreg, imm)
-
-#define    pmaxsw_m2r(var,reg)        mmx_m2r (pmaxsw, var, reg)
-#define    pmaxsw_r2r(regs,regd)        mmx_r2r (pmaxsw, regs, regd)
-
-#define    pmaxub_m2r(var,reg)        mmx_m2r (pmaxub, var, reg)
-#define    pmaxub_r2r(regs,regd)        mmx_r2r (pmaxub, regs, regd)
-
-#define    pminsw_m2r(var,reg)        mmx_m2r (pminsw, var, reg)
-#define    pminsw_r2r(regs,regd)        mmx_r2r (pminsw, regs, regd)
-
-#define    pminub_m2r(var,reg)        mmx_m2r (pminub, var, reg)
-#define    pminub_r2r(regs,regd)        mmx_r2r (pminub, regs, regd)
-
-#define    pmovmskb(mmreg,reg) \
-    __asm__ __volatile__ ("movmskps %" #mmreg ", %" #reg : : : #reg)
-
-#define    pmulhuw_m2r(var,reg)        mmx_m2r (pmulhuw, var, reg)
-#define    pmulhuw_r2r(regs,regd)        mmx_r2r (pmulhuw, regs, regd)
-
-#define    prefetcht0(mem)            mmx_fetch (mem, t0)
-#define    prefetcht1(mem)            mmx_fetch (mem, t1)
-#define    prefetcht2(mem)            mmx_fetch (mem, t2)
-#define    prefetchnta(mem)        mmx_fetch (mem, nta)
-
-#define    psadbw_m2r(var,reg)        mmx_m2r (psadbw, var, reg)
-#define    psadbw_r2r(regs,regd)        mmx_r2r (psadbw, regs, regd)
-
-#define    pshufw_m2r(var,reg,imm)        mmx_m2ri(pshufw, var, reg, imm)
-#define    pshufw_r2r(regs,regd,imm)    mmx_r2ri(pshufw, regs, regd, imm)
-
-#define    sfence() __asm__ __volatile__ ("sfence\n\t")


=====================================
modules/video_filter/deinterlace/yadif.h
=====================================
@@ -97,6 +97,3 @@ static void yadif_filter_line_c_16bit(uint8_t *dst8, uint8_t *prev8, uint8_t *cu
 void vlcpriv_yadif_filter_line_ssse3(uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next, int w, int prefs, int mrefs, int parity, int mode);
 void vlcpriv_yadif_filter_line_sse2(uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next, int w, int prefs, int mrefs, int parity, int mode);
 #endif
-#if defined(__i386__)
-void vlcpriv_yadif_filter_line_mmxext(uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next, int w, int prefs, int mrefs, int parity, int mode);
-#endif


=====================================
modules/video_filter/deinterlace/yadif_x86.asm
=====================================
@@ -248,9 +248,6 @@ cglobal yadif_filter_line, 4, 7, 8,  80, dst, prev, cur, next, w, prefs, \
     FILTER 0, curq, nextq
 
 .ret:
-%if mmsize == 8
-    emms
-%endif
     RET
 %if ARCH_X86_32
   %undef pb_1
@@ -262,7 +259,3 @@ INIT_XMM ssse3
 YADIF
 INIT_XMM sse2
 YADIF
-%if ARCH_X86_32
-INIT_MMX mmxext
-YADIF
-%endif



View it on GitLab: https://code.videolan.org/videolan/vlc/-/compare/e6bb48cc1568fdebabba810718c8c235abe0b4ab...dd38fdc4f288f0ae24010ca770e6a1b97949274b

-- 
View it on GitLab: https://code.videolan.org/videolan/vlc/-/compare/e6bb48cc1568fdebabba810718c8c235abe0b4ab...dd38fdc4f288f0ae24010ca770e6a1b97949274b
You're receiving this email because of your account on code.videolan.org.




More information about the vlc-commits mailing list