[vlc-devel] [PATCH 8/19] deinterlace: convert MMXEXT only accelerations to SSE2

Lyndon Brown jnqnfe at gmail.com
Thu Sep 24 21:38:10 CEST 2020


From: Lyndon Brown <jnqnfe at gmail.com>
Date: Sat, 26 Jan 2019 08:51:36 +0000
Subject: deinterlace: convert MMXEXT only accelerations to SSE2

(prep work for purging MMX/MMXEXT)

The code changed here has no acceleration beyond MMX/MMXEXT, so to prepare
for the MMX/MMXEXT purge, convert it to SSE2.

Note that this is a basic conversion only: it *uses* SSE2 vector registers
(XMM) but does not actually take advantage of their greater width; that may
require significant revision of the code and so is left for a separate
commit (and/or person) to tackle later. Compared with leaving the code as
it was, this still has the following benefits:
 - switching to SSE registers avoids the MMX<->FP register clash
 - it therefore avoids the need to issue `emms` instructions
 - with a little more work, the full width of the wider registers could be used
 - it allows the purge of all old MMX/MMXEXT code to proceed

Also, I inlined the asm, removing the dependency on the abstraction macro set.
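
For illustration only (this sketch is not part of the patch; the helper name
is hypothetical, the operand names mirror the DarkenField luma loop below),
the mechanical pattern applied throughout is roughly:

#include <stdint.h>

/* Hypothetical helper sketching the conversion: shift each 8-byte group
   right by i_strength bits, then mask off the bits that crossed byte
   boundaries. */
static void darken_luma_sse2( uint64_t *po, int n,
                              uint64_t i_strength_u64,
                              uint64_t remove_high_u64 )
{
    /* Previously: movq_m2r( i_strength_u64, mm1 ); movq_m2r( remove_high_u64, mm2 ); */
    __asm__ volatile (
        "movq %0, %%xmm1\n"
        "movq %1, %%xmm2\n"
        :: "m" (i_strength_u64), "m" (remove_high_u64)
        : "xmm1", "xmm2"
    );
    for( int i = 0; i < n; i++ )
    {
        /* Previously: movq_m2r / psrlq_r2r / pand_r2r / movq_r2m on mm0..mm2 */
        __asm__ volatile (
            "movq %0, %%xmm0\n"      /* load 8 pixels */
            "psrlq %%xmm1, %%xmm0\n" /* >> i_strength (count in low qword of xmm1) */
            "pand %%xmm2, %%xmm0\n"  /* clear bits shifted across byte boundaries */
            "movq %%xmm0, %0\n"      /* store 8 pixels */
            : "+m" (*po) :: "xmm0"
        );
        po++;
    }
    /* No emms() needed: XMM registers do not alias the x87/MMX register file. */
}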

diff --git a/modules/video_filter/deinterlace/algo_phosphor.c b/modules/video_filter/deinterlace/algo_phosphor.c
index 289eed783b..2223f54e8e 100644
--- a/modules/video_filter/deinterlace/algo_phosphor.c
+++ b/modules/video_filter/deinterlace/algo_phosphor.c
@@ -24,8 +24,7 @@
 #   include "config.h"
 #endif
 
-#ifdef CAN_COMPILE_MMXEXT
-#   include "mmx.h"
+#ifdef CAN_COMPILE_SSE
 #   include <stdalign.h>
 #endif
 
@@ -87,7 +86,7 @@ static void DarkenField( picture_t *p_dst,
        For luma, the operation is just a shift + bitwise AND, so we vectorize
        even in the C version.
 
-       There is an MMX version too, because it performs about twice faster.
+       There are SIMD versions too, which perform significantly faster.
     */
     int i_plane = Y_PLANE;
     uint8_t *p_out, *p_out_end;
@@ -120,7 +119,7 @@ static void DarkenField( picture_t *p_dst,
 
        The origin (black) is at YUV = (0, 128, 128) in the uint8 format.
        The chroma processing is a bit more complicated than luma,
-       and needs MMX for vectorization.
+       and needs SIMD for vectorization.
     */
     if( process_chroma )
     {
@@ -148,9 +147,11 @@ static void DarkenField( picture_t *p_dst,
     } /* if process_chroma */
 }
 
-#ifdef CAN_COMPILE_MMXEXT
-VLC_MMX
-static void DarkenFieldMMX( picture_t *p_dst,
+/* TODO: This is a simple conversion of MMX to using SSE registers,
+   without making use of their expanded width. */
+#ifdef CAN_COMPILE_SSE
+VLC_SSE
+static void DarkenFieldSSE( picture_t *p_dst,
                             const int i_field, const int i_strength,
                             bool process_chroma )
 {
@@ -181,16 +182,22 @@ static void DarkenFieldMMX( picture_t *p_dst,
         uint64_t *po = (uint64_t *)p_out;
         int x = 0;
 
-        movq_m2r( i_strength_u64,  mm1 );
-        movq_m2r( remove_high_u64, mm2 );
+        __asm__ volatile (
+            "movq %0, %%xmm1\n"
+            "movq %1, %%xmm2\n"
+            :: "m" (i_strength_u64), "m" (remove_high_u64)
+            : "xmm1", "xmm2"
+        );
         for( ; x < w8; x += 8 )
         {
-            movq_m2r( (*po), mm0 );
-
-            psrlq_r2r( mm1, mm0 );
-            pand_r2r(  mm2, mm0 );
-
-            movq_r2m( mm0, (*po++) );
+            __asm__ volatile (
+                "movq %0, %%xmm0\n"
+                "psrlq %%xmm1, %%xmm0\n"
+                "pand %%xmm2, %%xmm0\n"
+                "movq %%xmm0, %0\n"
+                : "=m" (*po) :: "xmm0", "memory"
+            );
+            po++;
         }
 
         /* handle the width remainder */
@@ -228,35 +235,42 @@ static void DarkenFieldMMX( picture_t *p_dst,
                 int x = 0;
 
                 /* See also easy-to-read C version below. */
-                static alignas (8) const mmx_t b128 = {
-                    .uq = 0x8080808080808080ULL
-                };
+                const uint64_t b128 =  0x8080808080808080ULL;
 
-                movq_m2r( b128, mm5 );
-                movq_m2r( i_strength_u64,  mm6 );
-                movq_m2r( remove_high_u64, mm7 );
+                __asm__ volatile (
+                    "movq %0, %%xmm5\n"
+                    "movq %1, %%xmm6\n"
+                    "movq %2, %%xmm7\n"
+                    :: "m" (b128), "m" (i_strength_u64), "m" (remove_high_u64)
+                    : "xmm5", "xmm6", "xmm7"
+                );
 
                 uint64_t *po8 = (uint64_t *)p_out;
                 for( ; x < w8; x += 8 )
                 {
-                    movq_m2r( (*po8), mm0 );
+                    __asm__ volatile (
+                        "movq %0, %%xmm0\n"
 
-                    movq_r2r( mm5, mm2 ); /* 128 */
-                    movq_r2r( mm0, mm1 ); /* copy of data */
-                    psubusb_r2r( mm2, mm1 ); /* mm1 = max(data - 128, 0) */
-                    psubusb_r2r( mm0, mm2 ); /* mm2 = max(128 - data, 0) */
+                        "movq %%xmm5, %%xmm2\n" /* 128 */
+                        "movq %%xmm0, %%xmm1\n" /* copy of data */
+                        "psubusb %%xmm2, %%xmm1\n" /* xmm1 = max(data - 128, 0) */
+                        "psubusb %%xmm0, %%xmm2\n" /* xmm2 = max(128 - data, 0) */
 
-                    /* >> i_strength */
-                    psrlq_r2r( mm6, mm1 );
-                    psrlq_r2r( mm6, mm2 );
-                    pand_r2r(  mm7, mm1 );
-                    pand_r2r(  mm7, mm2 );
+                        /* >> i_strength */
+                        "psrlq %%xmm6, %%xmm1\n"
+                        "psrlq %%xmm6, %%xmm2\n"
+                        "pand %%xmm7, %%xmm1\n"
+                        "pand %%xmm7, %%xmm2\n"
 
-                    /* collect results from pos./neg. parts */
-                    psubb_r2r( mm2, mm1 );
-                    paddb_r2r( mm5, mm1 );
+                        /* collect results from pos./neg. parts */
+                        "psubb %%xmm2, %%xmm1\n"
+                        "paddb %%xmm5, %%xmm1\n"
 
-                    movq_r2m( mm1, (*po8++) );
+                        "movq %%xmm1, %0\n"
+
+                        : "=m" (*po8) :: "xmm0", "xmm1", "xmm2", "memory"
+                    );
+                    po8++;
                 }
 
                 /* C version - handle the width remainder */
@@ -266,8 +280,6 @@ static void DarkenFieldMMX( picture_t *p_dst,
             } /* for p_out... */
         } /* for i_plane... */
     } /* if process_chroma */
-
-    emms();
 }
 #endif
 
@@ -357,9 +369,9 @@ int RenderPhosphor( filter_t *p_filter,
     */
     if( p_sys->phosphor.i_dimmer_strength > 0 )
     {
-#ifdef CAN_COMPILE_MMXEXT
-        if( vlc_CPU_MMXEXT() )
-            DarkenFieldMMX( p_dst, !i_field, p_sys->phosphor.i_dimmer_strength,
+#ifdef CAN_COMPILE_SSE
+        if( vlc_CPU_SSE2() )
+            DarkenFieldSSE( p_dst, !i_field, p_sys->phosphor.i_dimmer_strength,
                 p_sys->chroma->p[1].h.num == p_sys->chroma->p[1].h.den &&
                 p_sys->chroma->p[2].h.num == p_sys->chroma->p[2].h.den );
         else
diff --git a/modules/video_filter/deinterlace/algo_x.c b/modules/video_filter/deinterlace/algo_x.c
index 411351d1cd..09cbbc0acd 100644
--- a/modules/video_filter/deinterlace/algo_x.c
+++ b/modules/video_filter/deinterlace/algo_x.c
@@ -24,10 +24,6 @@
 #   include "config.h"
 #endif
 
-#ifdef CAN_COMPILE_MMXEXT
-#   include "mmx.h"
-#endif
-
 #include <stdint.h>
 
 #include <vlc_common.h>
@@ -76,9 +72,13 @@ static inline int XDeint8x8DetectC( uint8_t *src, int i_src )
 
     return fc < 1 ? false : true;
 }
-#ifdef CAN_COMPILE_MMXEXT
-VLC_MMX
-static inline int XDeint8x8DetectMMXEXT( uint8_t *src, int i_src )
+
+/* TODO: This is a simple conversion of MMX to using SSE registers,
+   without making use of their expanded width. Would that require
+   migration to a 16x16 processing model though? */
+#ifdef CAN_COMPILE_SSE
+VLC_SSE
+static inline int XDeint8x8DetectSSE( uint8_t *src, int i_src )
 {
 
     int y, x;
@@ -87,51 +87,66 @@ static inline int XDeint8x8DetectMMXEXT( uint8_t *src, int i_src )
 
     /* Detect interlacing */
     fc = 0;
-    pxor_r2r( mm7, mm7 );
+    __asm__ volatile ("pxor %%xmm7, %%xmm7" ::: "xmm7");
     for( y = 0; y < 9; y += 2 )
     {
         ff = fr = 0;
-        pxor_r2r( mm5, mm5 );
-        pxor_r2r( mm6, mm6 );
+        __asm__ volatile (
+            "pxor %%xmm5, %%xmm5\n"
+            "pxor %%xmm6, %%xmm6\n"
+            ::: "xmm5", "xmm6"
+        );
         for( x = 0; x < 8; x+=4 )
         {
-            movd_m2r( src[        x], mm0 );
-            movd_m2r( src[1*i_src+x], mm1 );
-            movd_m2r( src[2*i_src+x], mm2 );
-            movd_m2r( src[3*i_src+x], mm3 );
-
-            punpcklbw_r2r( mm7, mm0 );
-            punpcklbw_r2r( mm7, mm1 );
-            punpcklbw_r2r( mm7, mm2 );
-            punpcklbw_r2r( mm7, mm3 );
-
-            movq_r2r( mm0, mm4 );
-
-            psubw_r2r( mm1, mm0 );
-            psubw_r2r( mm2, mm4 );
-
-            psubw_r2r( mm1, mm2 );
-            psubw_r2r( mm1, mm3 );
-
-            pmaddwd_r2r( mm0, mm0 );
-            pmaddwd_r2r( mm4, mm4 );
-            pmaddwd_r2r( mm2, mm2 );
-            pmaddwd_r2r( mm3, mm3 );
-            paddd_r2r( mm0, mm2 );
-            paddd_r2r( mm4, mm3 );
-            paddd_r2r( mm2, mm5 );
-            paddd_r2r( mm3, mm6 );
+            __asm__ volatile (
+                "movd %0, %%xmm0\n"
+                "movd %1, %%xmm1\n"
+                "movd %2, %%xmm2\n"
+                "movd %3, %%xmm3\n"
+
+                "punpcklbw %%xmm7, %%xmm0\n"
+                "punpcklbw %%xmm7, %%xmm1\n"
+                "punpcklbw %%xmm7, %%xmm2\n"
+                "punpcklbw %%xmm7, %%xmm3\n"
+
+                "movq %%xmm0, %%xmm4\n"
+
+                "psubw %%xmm2, %%xmm4\n"
+                "psubw %%xmm1, %%xmm0\n"
+                "psubw %%xmm1, %%xmm2\n"
+                "psubw %%xmm1, %%xmm3\n"
+
+                "pmaddwd %%xmm0, %%xmm0\n"
+                "pmaddwd %%xmm2, %%xmm2\n"
+                "pmaddwd %%xmm3, %%xmm3\n"
+                "pmaddwd %%xmm4, %%xmm4\n"
+                "paddd %%xmm0, %%xmm2\n"
+                "paddd %%xmm4, %%xmm3\n"
+                "paddd %%xmm2, %%xmm5\n"
+                "paddd %%xmm3, %%xmm6\n"
+
+                :: "m" (src[        x]),
+                   "m" (src[1*i_src+x]),
+                   "m" (src[2*i_src+x]),
+                   "m" (src[3*i_src+x])
+                : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+            );
         }
 
-        movq_r2r( mm5, mm0 );
-        psrlq_i2r( 32, mm0 );
-        paddd_r2r( mm0, mm5 );
-        movd_r2m( mm5, fr );
+        __asm__ volatile (
+            "movq %%xmm5, %%xmm0\n"
+            "psrlq $32, %%xmm0\n"
+            "paddd %%xmm0, %%xmm5\n"
+            "movd %%xmm5, %0\n"
 
-        movq_r2r( mm6, mm0 );
-        psrlq_i2r( 32, mm0 );
-        paddd_r2r( mm0, mm6 );
-        movd_r2m( mm6, ff );
+            "movq %%xmm6, %%xmm0\n"
+            "psrlq $32, %%xmm0\n"
+            "paddd %%xmm0, %%xmm6\n"
+            "movd %%xmm6, %1\n"
+
+            : "=m" (fr), "=m" (ff)
+            :: "xmm0", "xmm5", "xmm6", "memory"
+        );
 
         if( ff < 6*fr/8 && fr > 32 )
             fc++;
@@ -163,9 +178,12 @@ static inline void XDeint8x8MergeC( uint8_t *dst,  int i_dst,
     }
 }
 
-#ifdef CAN_COMPILE_MMXEXT
-VLC_MMX
-static inline void XDeint8x8MergeMMXEXT( uint8_t *dst,  int i_dst,
+/* TODO: This is a simple conversion of MMX to using SSE registers,
+   without making use of their expanded width. Would that require
+   migration to a 16x16 processing model though? */
+#ifdef CAN_COMPILE_SSE
+VLC_SSE
+static inline void XDeint8x8MergeSSE( uint8_t *dst,  int i_dst,
                                          uint8_t *src1, int i_src1,
                                          uint8_t *src2, int i_src2 )
 {
@@ -173,37 +191,47 @@ static inline void XDeint8x8MergeMMXEXT( uint8_t *dst,  int i_dst,
     int y, x;
 
     /* Progressive */
-    pxor_r2r( mm7, mm7 );
+    __asm__ volatile (
+        "pxor %%xmm7, %%xmm7\n"
+        "movq %0, %%xmm6\n"
+        :: "m" (m_4) : "xmm6", "xmm7"
+    );
     for( y = 0; y < 8; y += 2 )
     {
         for( x = 0; x < 8; x +=4 )
         {
-            movd_m2r( src1[x], mm0 );
-            movd_r2m( mm0, dst[x] );
-
-            movd_m2r( src2[x], mm1 );
-            movd_m2r( src1[i_src1+x], mm2 );
-
-            punpcklbw_r2r( mm7, mm0 );
-            punpcklbw_r2r( mm7, mm1 );
-            punpcklbw_r2r( mm7, mm2 );
-            paddw_r2r( mm1, mm1 );
-            movq_r2r( mm1, mm3 );
-            paddw_r2r( mm3, mm3 );
-            paddw_r2r( mm2, mm0 );
-            paddw_r2r( mm3, mm1 );
-            paddw_m2r( m_4, mm1 );
-            paddw_r2r( mm1, mm0 );
-            psraw_i2r( 3, mm0 );
-            packuswb_r2r( mm7, mm0 );
-            movd_r2m( mm0, dst[i_dst+x] );
+            __asm__ volatile (
+                "movd %2, %%xmm0\n"
+                "movd %%xmm0, %0\n"
+
+                "movd %3, %%xmm1\n"
+                "movd %4, %%xmm2\n"
+
+                "punpcklbw %%xmm7, %%xmm0\n"
+                "punpcklbw %%xmm7, %%xmm1\n"
+                "punpcklbw %%xmm7, %%xmm2\n"
+                "paddw %%xmm1, %%xmm1\n"
+                "movq %%xmm1, %%xmm3\n"
+                "paddw %%xmm3, %%xmm3\n"
+                "paddw %%xmm2, %%xmm0\n"
+                "paddw %%xmm3, %%xmm1\n"
+                "paddw %%xmm6, %%xmm1\n"
+                "paddw %%xmm1, %%xmm0\n"
+                "psraw $3, %%xmm0\n"
+                "packuswb %%xmm7, %%xmm0\n"
+                "movd %%xmm0, %1\n"
+
+                : "=m" (dst[x]), "=m" (dst[i_dst+x])
+                : "m" (src1[x]), "m" (src2[x]), "m" (src1[i_src1+x])
+                : "xmm0", "xmm1", "xmm2", "xmm3",
+                  "memory"
+            );
         }
         dst += 2*i_dst;
         src1 += i_src1;
         src2 += i_src2;
     }
 }
-
 #endif
 
 /* XDeint8x8FieldE: Stupid deinterlacing (1,0,1) for block that miss a
@@ -229,9 +257,12 @@ static inline void XDeint8x8FieldEC( uint8_t *dst, int i_dst,
     }
 }
 
-#ifdef CAN_COMPILE_MMXEXT
-VLC_MMX
-static inline void XDeint8x8FieldEMMXEXT( uint8_t *dst, int i_dst,
+/* TODO: This is a simple conversion of MMX to using SSE registers,
+   without making use of their expanded width. Would that require
+   migration to a 16x16 processing model though? */
+#ifdef CAN_COMPILE_SSE
+VLC_SSE
+static inline void XDeint8x8FieldESSE( uint8_t *dst, int i_dst,
                                           uint8_t *src, int i_src )
 {
     int y;
@@ -239,14 +270,21 @@ static inline void XDeint8x8FieldEMMXEXT( uint8_t *dst, int i_dst,
     /* Interlaced */
     for( y = 0; y < 8; y += 2 )
     {
-        movq_m2r( src[0], mm0 );
-        movq_r2m( mm0, dst[0] );
+        __asm__ volatile (
+            "movq %1, %%xmm0\n"
+            "movq %%xmm0, %0\n"
+            : "=m" (dst[0]) : "m" (src[0])
+            : "xmm0", "memory"
+        );
         dst += i_dst;
 
-        movq_m2r( src[2*i_src], mm1 );
-        pavgb_r2r( mm1, mm0 );
-
-        movq_r2m( mm0, dst[0] );
+        __asm__ volatile (
+            "movq %1, %%xmm1\n"
+            "pavgb %%xmm1, %%xmm0\n"
+            "movq %%xmm0, %0\n"
+            : "=m" (dst[0]) : "m" (src[2*i_src])
+            : "xmm0", "xmm1", "memory"
+        );
 
         dst += 1*i_dst;
         src += 2*i_src;
@@ -301,9 +339,12 @@ static inline void XDeint8x8FieldC( uint8_t *dst, int i_dst,
     }
 }
 
-#ifdef CAN_COMPILE_MMXEXT
-VLC_MMX
-static inline void XDeint8x8FieldMMXEXT( uint8_t *dst, int i_dst,
+/* TODO: This is a simple conversion of MMX to using SSE registers,
+   without making use of their expanded width. Would that require
+   migration to a 16x16 processing model though? */
+#ifdef CAN_COMPILE_SSE
+VLC_SSE
+static inline void XDeint8x8FieldSSE( uint8_t *dst, int i_dst,
                                          uint8_t *src, int i_src )
 {
     int y, x;
@@ -319,17 +360,24 @@ static inline void XDeint8x8FieldMMXEXT( uint8_t *dst, int i_dst,
             uint8_t *src2 = &src[2*i_src];
             int32_t c0, c1, c2;
 
-            movq_m2r( src[x-2], mm0 );
-            movq_m2r( src[x-3], mm1 );
-            movq_m2r( src[x-4], mm2 );
-
-            psadbw_m2r( src2[x-4], mm0 );
-            psadbw_m2r( src2[x-3], mm1 );
-            psadbw_m2r( src2[x-2], mm2 );
-
-            movd_r2m( mm0, c2 );
-            movd_r2m( mm1, c1 );
-            movd_r2m( mm2, c0 );
+            __asm__ volatile (
+                "movq %3, %%xmm0\n"
+                "movq %4, %%xmm1\n"
+                "movq %5, %%xmm2\n"
+                "movq %6, %%xmm3\n"
+                "movq %7, %%xmm4\n"
+                "movq %8, %%xmm5\n"
+                "psadbw %%xmm3, %%xmm0\n"
+                "psadbw %%xmm4, %%xmm1\n"
+                "psadbw %%xmm5, %%xmm2\n"
+                "movd %%xmm0, %2\n"
+                "movd %%xmm1, %1\n"
+                "movd %%xmm2, %0\n"
+                : "=m" (c0), "=m" (c1), "=m" (c2)
+                : "m" (src[x-2]), "m" (src[x-3]), "m" (src[x-4]),
+                  "m" (src2[x-4]), "m" (src2[x-3]), "m" (src2[x-2])
+                : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory"
+            );
 
             if( c0 < c1 && c1 <= c2 )
                 dst[x] = (src[x-1] + src2[x+1]) >> 1;
@@ -472,9 +520,9 @@ static inline void XDeintBand8x8C( uint8_t *dst, int i_dst,
         XDeintNxN( dst, i_dst, src, i_src, i_modx, 8 );
 }
 
-#ifdef CAN_COMPILE_MMXEXT
-VLC_MMX
-static inline void XDeintBand8x8MMXEXT( uint8_t *dst, int i_dst,
+#ifdef CAN_COMPILE_SSE
+VLC_SSE
+static inline void XDeintBand8x8SSE( uint8_t *dst, int i_dst,
                                         uint8_t *src, int i_src,
                                         const int i_mbx, int i_modx )
 {
@@ -484,16 +532,16 @@ static inline void XDeintBand8x8MMXEXT( uint8_t *dst, int i_dst,
     for( x = 0; x < i_mbx; x++ )
     {
         int s;
-        if( ( s = XDeint8x8DetectMMXEXT( src, i_src ) ) )
+        if( ( s = XDeint8x8DetectSSE( src, i_src ) ) )
         {
             if( x == 0 || x == i_mbx - 1 )
-                XDeint8x8FieldEMMXEXT( dst, i_dst, src, i_src );
+                XDeint8x8FieldESSE( dst, i_dst, src, i_src );
             else
-                XDeint8x8FieldMMXEXT( dst, i_dst, src, i_src );
+                XDeint8x8FieldSSE( dst, i_dst, src, i_src );
         }
         else
         {
-            XDeint8x8MergeMMXEXT( dst, i_dst,
+            XDeint8x8MergeSSE( dst, i_dst,
                                   &src[0*i_src], 2*i_src,
                                   &src[1*i_src], 2*i_src );
         }
@@ -515,8 +563,8 @@ int RenderX( filter_t *p_filter, picture_t *p_outpic, picture_t *p_pic )
 {
     VLC_UNUSED(p_filter);
     int i_plane;
-#if defined (CAN_COMPILE_MMXEXT)
-    const bool mmxext = vlc_CPU_MMXEXT();
+#if defined (CAN_COMPILE_SSE)
+    const bool sse = vlc_CPU_SSE2();
 #endif
 
     /* Copy image and skip lines */
@@ -538,9 +586,9 @@ int RenderX( filter_t *p_filter, picture_t *p_outpic, picture_t *p_pic )
             uint8_t *dst = &p_outpic->p[i_plane].p_pixels[8*y*i_dst];
             uint8_t *src = &p_pic->p[i_plane].p_pixels[8*y*i_src];
 
-#ifdef CAN_COMPILE_MMXEXT
-            if( mmxext )
-                XDeintBand8x8MMXEXT( dst, i_dst, src, i_src, i_mbx, i_modx );
+#ifdef CAN_COMPILE_SSE
+            if( sse )
+                XDeintBand8x8SSE( dst, i_dst, src, i_src, i_mbx, i_modx );
             else
 #endif
                 XDeintBand8x8C( dst, i_dst, src, i_src, i_mbx, i_modx );
@@ -565,9 +613,5 @@ int RenderX( filter_t *p_filter, picture_t *p_outpic, picture_t *p_pic )
         }
     }
 
-#ifdef CAN_COMPILE_MMXEXT
-    if( mmxext )
-        emms();
-#endif
     return VLC_SUCCESS;
 }
diff --git a/modules/video_filter/deinterlace/helpers.c b/modules/video_filter/deinterlace/helpers.c
index a882a408c1..759164c32c 100644
--- a/modules/video_filter/deinterlace/helpers.c
+++ b/modules/video_filter/deinterlace/helpers.c
@@ -24,8 +24,7 @@
 #   include "config.h"
 #endif
 
-#ifdef CAN_COMPILE_MMXEXT
-#   include "mmx.h"
+#ifdef CAN_COMPILE_SSE
 #   include <stdalign.h>
 #endif
 
@@ -107,9 +106,6 @@ static void FieldFromPlane( plane_t *p_dst, const plane_t *p_src, int i_field )
  * For interpretation of pi_top and pi_bot, it is assumed that the block
  * starts on an even-numbered line (belonging to the top field).
  *
- * The b_mmx parameter avoids the need to call vlc_CPU() separately
- * for each block.
- *
  * @param[in] p_pix_p Base pointer to the block in previous picture
  * @param[in] p_pix_c Base pointer to the same block in current picture
  * @param i_pitch_prev i_pitch of previous picture
@@ -173,9 +169,11 @@ static int TestForMotionInBlock( uint8_t *p_pix_p, uint8_t *p_pix_c,
     return (i_motion >= 8);
 }
 
-#ifdef CAN_COMPILE_MMXEXT
-VLC_MMX
-static int TestForMotionInBlockMMX( uint8_t *p_pix_p, uint8_t *p_pix_c,
+/* TODO: This is a simple conversion of MMX to using SSE registers,
+   without making use of their expanded width. */
+#ifdef CAN_COMPILE_SSE
+VLC_SSE
+static int TestForMotionInBlockSSE( uint8_t *p_pix_p, uint8_t *p_pix_c,
                                     int i_pitch_prev, int i_pitch_curr,
                                     int* pi_top, int* pi_bot )
 {
@@ -183,63 +181,81 @@ static int TestForMotionInBlockMMX( uint8_t *p_pix_p, uint8_t *p_pix_c,
     int32_t i_top_motion = 0;
     int32_t i_bot_motion = 0;
 
-    static alignas (8) const mmx_t bT   = { .ub = { T, T, T, T, T, T, T, T } };
-    pxor_r2r( mm6, mm6 ); /* zero, used in psadbw */
-    movq_m2r( bT,  mm5 );
+    const uint8_t bT[8] = { T, T, T, T, T, T, T, T };
+    __asm__ volatile (
+        "pxor %%xmm6, %%xmm6\n" /* zero, used in psadbw */
+        "movq %0, %%xmm5\n"
+
+        "pxor %%xmm3, %%xmm3\n" /* score (top field) */
+        "pxor %%xmm4, %%xmm4\n" /* score (bottom field) */
 
-    pxor_r2r( mm3, mm3 ); /* score (top field) */
-    pxor_r2r( mm4, mm4 ); /* score (bottom field) */
+        :: "m" (bT) : "xmm3", "xmm4", "xmm5", "xmm6"
+    );
     for( int y = 0; y < 8; y+=2 )
     {
         /* top field */
-        movq_m2r( *((uint64_t*)p_pix_c), mm0 );
-        movq_m2r( *((uint64_t*)p_pix_p), mm1 );
-        movq_r2r( mm0, mm2 );
-        psubusb_r2r( mm1, mm2 );
-        psubusb_r2r( mm0, mm1 );
+        __asm__ volatile (
+            "movq %0, %%xmm0\n"
+            "movq %1, %%xmm1\n"
+            "movq %%xmm0, %%xmm2\n"
+            "psubusb %%xmm1, %%xmm2\n"
+            "psubusb %%xmm0, %%xmm1\n"
 
-        pcmpgtb_r2r( mm5, mm2 );
-        pcmpgtb_r2r( mm5, mm1 );
-        psadbw_r2r(  mm6, mm2 );
-        psadbw_r2r(  mm6, mm1 );
+            "pcmpgtb %%xmm5, %%xmm2\n"
+            "pcmpgtb %%xmm5, %%xmm1\n"
+            "psadbw %%xmm6, %%xmm2\n"
+            "psadbw %%xmm6, %%xmm1\n"
 
-        paddd_r2r( mm2, mm1 );
-        paddd_r2r( mm1, mm3 ); /* add to top field score */
+            "paddd %%xmm2, %%xmm1\n"
+            "paddd %%xmm1, %%xmm3\n" /* add to top field score */
+
+            :: "m" (*((uint64_t*)p_pix_c)), "m" (*((uint64_t*)p_pix_p))
+            : "xmm0", "xmm1", "xmm2", "xmm3"
+        );
 
         p_pix_c += i_pitch_curr;
         p_pix_p += i_pitch_prev;
 
         /* bottom field - handling identical to top field, except... */
-        movq_m2r( *((uint64_t*)p_pix_c), mm0 );
-        movq_m2r( *((uint64_t*)p_pix_p), mm1 );
-        movq_r2r( mm0, mm2 );
-        psubusb_r2r( mm1, mm2 );
-        psubusb_r2r( mm0, mm1 );
-
-        pcmpgtb_r2r( mm5, mm2 );
-        pcmpgtb_r2r( mm5, mm1 );
-        psadbw_r2r(  mm6, mm2 );
-        psadbw_r2r(  mm6, mm1 );
-
-        paddd_r2r( mm2, mm1 );
-        paddd_r2r( mm1, mm4 ); /* ...here we add to bottom field score */
+        __asm__ volatile (
+            /* top field */
+            "movq %0, %%xmm0\n"
+            "movq %1, %%xmm1\n"
+            "movq %%xmm0, %%xmm2\n"
+            "psubusb %%xmm1, %%xmm2\n"
+            "psubusb %%xmm0, %%xmm1\n"
+
+            "pcmpgtb %%xmm5, %%xmm2\n"
+            "pcmpgtb %%xmm5, %%xmm1\n"
+            "psadbw %%xmm6, %%xmm2\n"
+            "psadbw %%xmm6, %%xmm1\n"
+
+            "paddd %%xmm2, %%xmm1\n"
+            "paddd %%xmm1, %%xmm4\n" /* ...here we add to bottom field score */
+
+            :: "m" (*((uint64_t*)p_pix_c)), "m" (*((uint64_t*)p_pix_p))
+            : "xmm0", "xmm1", "xmm2", "xmm4"
+        );
 
         p_pix_c += i_pitch_curr;
         p_pix_p += i_pitch_prev;
     }
-    movq_r2r(  mm3, mm7 ); /* score (total) */
-    paddd_r2r( mm4, mm7 );
-    movd_r2m( mm3, i_top_motion );
-    movd_r2m( mm4, i_bot_motion );
-    movd_r2m( mm7, i_motion );
+    __asm__ volatile (
+        "movq %%xmm3, %%xmm7\n" /* score (total) */
+        "paddd %%xmm4, %%xmm7\n"
+        "movd %%xmm3, %0\n"
+        "movd %%xmm4, %1\n"
+        "movd %%xmm7, %2\n"
+
+        : "=m" (i_top_motion), "=m" (i_bot_motion), "=m" (i_motion)
+        :: "xmm7", "memory"
+    );
 
     /* The loop counts actual score * 255. */
     i_top_motion /= 255;
     i_bot_motion /= 255;
     i_motion     /= 255;
 
-    emms();
-
     (*pi_top) = ( i_top_motion >= 8 );
     (*pi_bot) = ( i_bot_motion >= 8 );
     return (i_motion >= 8);
@@ -396,10 +412,10 @@ int EstimateNumBlocksWithMotion( const picture_t* p_prev,
 
     int (*motion_in_block)(uint8_t *, uint8_t *, int , int, int *, int *) =
         TestForMotionInBlock;
-    /* We must tell our inline helper whether to use MMX acceleration. */
-#ifdef CAN_COMPILE_MMXEXT
-    if (vlc_CPU_MMXEXT())
-        motion_in_block = TestForMotionInBlockMMX;
+    /* We must tell our inline helper whether to use SSE2 acceleration. */
+#ifdef CAN_COMPILE_SSE
+    if (vlc_CPU_SSE2())
+        motion_in_block = TestForMotionInBlockSSE;
 #endif
 
     int i_score = 0;
@@ -451,19 +467,22 @@ int EstimateNumBlocksWithMotion( const picture_t* p_prev,
 /* Threshold (value from Transcode 1.1.5) */
 #define T 100
 
-#ifdef CAN_COMPILE_MMXEXT
-VLC_MMX
-static int CalculateInterlaceScoreMMX( const picture_t* p_pic_top,
+/* TODO: This is a simple conversion of MMX to using SSE registers,
+   without making use of their expanded width. */
+#ifdef CAN_COMPILE_SSE
+VLC_SSE
+static int CalculateInterlaceScoreSSE( const picture_t* p_pic_top,
                                        const picture_t* p_pic_bot )
 {
     assert( p_pic_top->i_planes == p_pic_bot->i_planes );
 
-    /* Amount of bits must be known for MMX, thus int32_t.
+    /* Amount of bits must be known for SSE, thus int32_t.
        Doesn't hurt the C implementation. */
-    int32_t i_score_mmx = 0; /* this must be divided by 255 when finished  */
-    int32_t i_score_c   = 0; /* this counts as-is (used for non-MMX parts) */
+    int32_t i_score_sse = 0; /* this must be divided by 255 when finished  */
+    int32_t i_score_c   = 0; /* this counts as-is (used for non-SSE parts) */
 
-    pxor_r2r( mm7, mm7 ); /* we will keep score in mm7 */
+    /* we will keep score in mm7 */
+    __asm__ volatile ("pxor %%xmm7, %%xmm7" ::: "xmm7");
 
     for( int i_plane = 0 ; i_plane < p_pic_top->i_planes ; ++i_plane )
     {
@@ -502,43 +521,51 @@ static int CalculateInterlaceScoreMMX( const picture_t* p_pic_top,
                             # of pixels < (2^32)/255
                Note: calculates score * 255
             */
-            static alignas (8) const mmx_t b0 = {
-                .uq = 0x0000000000000000ULL };
-            static alignas (8) const mmx_t b128 = {
-                .uq = 0x8080808080808080ULL };
-            static alignas (8) const mmx_t bT = {
-                .ub = { T, T, T, T, T, T, T, T } };
+            const uint64_t b128 = 0x8080808080808080ULL;
+            const uint8_t bT[8] = { T, T, T, T, T, T, T, T };
 
             for( ; x < w8; x += 8 )
             {
-                movq_m2r( *((int64_t*)p_c), mm0 );
-                movq_m2r( *((int64_t*)p_p), mm1 );
-                movq_m2r( *((int64_t*)p_n), mm2 );
-
-                psubb_m2r( b128, mm0 );
-                psubb_m2r( b128, mm1 );
-                psubb_m2r( b128, mm2 );
-
-                psubsb_r2r( mm0, mm1 );
-                psubsb_r2r( mm0, mm2 );
-
-                pxor_r2r( mm3, mm3 );
-                pxor_r2r( mm4, mm4 );
-                pxor_r2r( mm5, mm5 );
-                pxor_r2r( mm6, mm6 );
-
-                punpcklbw_r2r( mm1, mm3 );
-                punpcklbw_r2r( mm2, mm4 );
-                punpckhbw_r2r( mm1, mm5 );
-                punpckhbw_r2r( mm2, mm6 );
-
-                pmulhw_r2r( mm3, mm4 );
-                pmulhw_r2r( mm5, mm6 );
-
-                packsswb_r2r(mm4, mm6);
-                pcmpgtb_m2r( bT, mm6 );
-                psadbw_m2r( b0, mm6 );
-                paddd_r2r( mm6, mm7 );
+                __asm__ volatile (
+                    "movq %0, %%xmm0\n"
+                    "movq %1, %%xmm1\n"
+                    "movq %2, %%xmm2\n"
+
+                    "movq %3, %%xmm3\n"
+                    "psubb %%xmm3, %%xmm0\n"
+                    "psubb %%xmm3, %%xmm1\n"
+                    "psubb %%xmm3, %%xmm2\n"
+
+                    "psubsb %%xmm0, %%xmm1\n"
+                    "psubsb %%xmm0, %%xmm2\n"
+
+                    "pxor %%xmm3, %%xmm3\n"
+                    "pxor %%xmm4, %%xmm4\n"
+                    "pxor %%xmm5, %%xmm5\n"
+                    "pxor %%xmm6, %%xmm6\n"
+
+                    "punpcklbw %%xmm1, %%xmm3\n"
+                    "punpcklbw %%xmm2, %%xmm4\n"
+                    "punpckhbw %%xmm1, %%xmm5\n"
+                    "punpckhbw %%xmm2, %%xmm6\n"
+
+                    "pmulhw %%xmm3, %%xmm4\n"
+                    "pmulhw %%xmm5, %%xmm6\n"
+
+                    "movq %4, %%xmm0\n"
+                    "pxor %%xmm1, %%xmm1\n"
+
+                    "packsswb %%xmm4, %%xmm6\n"
+                    "pcmpgtb %%xmm0, %%xmm6\n"
+                    "psadbw %%xmm1, %%xmm6\n"
+                    "paddd %%xmm6, %%xmm7\n"
+
+                    :: "m" (*((int64_t*)p_c)),
+                       "m" (*((int64_t*)p_p)),
+                       "m" (*((int64_t*)p_n)),
+                       "m" (b128), "m" (bT)
+                    : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+                );
 
                 p_c += 8;
                 p_p += 8;
@@ -580,10 +607,9 @@ static int CalculateInterlaceScoreMMX( const picture_t* p_pic_top,
         }
     }
 
-    movd_r2m( mm7, i_score_mmx );
-    emms();
+    __asm__ volatile ("movd %%xmm7, %0\n" : "=m" (i_score_sse) :: "memory");
 
-    return i_score_mmx/255 + i_score_c;
+    return i_score_sse/255 + i_score_c;
 }
 #endif
 
@@ -607,9 +633,9 @@ int CalculateInterlaceScore( const picture_t* p_pic_top,
     if( p_pic_top->i_planes != p_pic_bot->i_planes )
         return -1;
 
-#ifdef CAN_COMPILE_MMXEXT
-    if (vlc_CPU_MMXEXT())
-        return CalculateInterlaceScoreMMX( p_pic_top, p_pic_bot );
+#ifdef CAN_COMPILE_SSE
+    if (vlc_CPU_SSE2())
+        return CalculateInterlaceScoreSSE( p_pic_top, p_pic_bot );
 #endif
 
     int32_t i_score = 0;
