[vlc-devel] [PATCH 17/19] deinterlace: finish up SSE2 enhancement

Lyndon Brown jnqnfe at gmail.com
Thu Sep 24 21:42:08 CEST 2020


From: Lyndon Brown <jnqnfe at gmail.com>
Date: Sat, 9 Mar 2019 06:45:22 +0000
Subject: deinterlace: finish up SSE2 enhancement

In a previous patch, the bits of deinterlace that were accelerated only
for MMX/MMXEXT were upgraded to SSE2 in preparation for a complete purge
of MMX/MMXEXT. However, that upgrade was minimal, stopping short of
actually making proper use of the wider SSE2 registers; TODO notes were
left to that effect.

This commit addresses those TODO notes.
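
Where code was widened, the same idiom is applied throughout: 8-byte
movq loads/stores become 16-byte movdqu ones, and replicated byte
patterns are broadcast across the full register with movd + pshufd
rather than read from memory as 64-bit constants. As a rough
illustration only (this helper is not part of the patch, and the name
is made up), the broadcast step corresponds to the following SSE2
intrinsics:

    #include <emmintrin.h>
    #include <stdint.h>

    /* Replicate one byte across all 16 lanes of an XMM register,
       mirroring the movd + pshufd sequence in the asm below. */
    static inline __m128i broadcast_u8( uint8_t b )
    {
        uint32_t pattern = b * 0x01010101u;  /* four copies of b */
        /* movd: pattern into the low 32 bits; pshufd with imm 0:
           copy lane 0 into all four 32-bit lanes */
        return _mm_shuffle_epi32( _mm_cvtsi32_si128( (int)pattern ), 0 );
    }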

With algo-x, the TODOs were simply removed; there, making use of the full
width of the SSE2 registers would mean moving from the 8x8 processing
model to a 16x16/8x16 one, which would change the results of processing,
since the algorithm decides which transformation to apply to each 8x8
block based on the content of that block (sketched below).
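
To make that concrete, the dispatch being preserved looks roughly like
this (a simplified sketch with illustrative variable names, not code
from this patch):

    /* Each 8x8 block is classified independently and the chosen
       reconstruction depends on that block's own content, so a
       16-pixel-wide data path would fuse the decisions of two
       neighbouring blocks and change the output. */
    for( int y = 0; y < i_height; y += 8 )
        for( int x = 0; x < i_width; x += 8 )
        {
            uint8_t *p_blk = &p_src[y * i_src_pitch + x];
            if( XDeint8x8DetectC( p_blk, i_src_pitch ) )
                ;   /* interlaced: ME/MC or edge-oriented recreation */
            else
                ;   /* progressive: small (1,6,1) blend */
        }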

diff --git a/modules/video_filter/deinterlace/algo_phosphor.c b/modules/video_filter/deinterlace/algo_phosphor.c
index 2223f54e8e..e1ff0178ad 100644
--- a/modules/video_filter/deinterlace/algo_phosphor.c
+++ b/modules/video_filter/deinterlace/algo_phosphor.c
@@ -147,8 +147,6 @@ static void DarkenField( picture_t *p_dst,
     } /* if process_chroma */
 }
 
-/* TODO: This is a simple conversion of MMX to using SSE registers,
-   without making use of their expanded width. */
 #ifdef CAN_COMPILE_SSE
 VLC_SSE
 static void DarkenFieldSSE( picture_t *p_dst,
@@ -159,10 +157,8 @@ static void DarkenFieldSSE( picture_t *p_dst,
     assert( i_field == 0 || i_field == 1 );
     assert( i_strength >= 1 && i_strength <= 3 );
 
-    uint64_t i_strength_u64 = i_strength; /* needs to know number of bits */
     const uint8_t  remove_high_u8 = 0xFF >> i_strength;
-    const uint64_t remove_high_u64 = remove_high_u8 *
-                                            INT64_C(0x0101010101010101);
+    const uint32_t remove_high_u32 = remove_high_u8 * 0x01010101;
 
     int i_plane = Y_PLANE;
     uint8_t *p_out, *p_out_end;
@@ -175,26 +171,27 @@ static void DarkenFieldSSE( picture_t *p_dst,
     if( i_field == 1 )
         p_out += p_dst->p[i_plane].i_pitch;
 
-    int wm8 = w % 8;   /* remainder */
-    int w8  = w - wm8; /* part of width that is divisible by 8 */
+    int wm16 = w % 16;   /* remainder */
+    int w16  = w - wm16; /* part of width that is divisible by 16 */
     for( ; p_out < p_out_end ; p_out += 2*p_dst->p[i_plane].i_pitch )
     {
-        uint64_t *po = (uint64_t *)p_out;
+        uint8_t *po = p_out;
         int x = 0;
 
         __asm__ volatile (
-            "movq %0, %%xmm1\n"
-            "movq %1, %%xmm2\n"
-            :: "m" (i_strength_u64), "m" (remove_high_u64)
+            "movd %0, %%xmm1\n"
+            "movd %1, %%xmm2\n"
+            "pshufd $0, %%xmm2, %%xmm2\n" /* duplicate 32-bits across reg */
+            :: "m" (i_strength), "m" (remove_high_u32)
             : "xmm1", "xmm2"
         );
-        for( ; x < w8; x += 8 )
+        for( ; x < w16; x += 16 )
         {
             __asm__ volatile (
-                "movq %0, %%xmm0\n"
+                "movdqu %0, %%xmm0\n"
                 "psrlq %%xmm1, %%xmm0\n"
                 "pand %%xmm2, %%xmm0\n"
-                "movq %%xmm0, %0\n"
+                "movdqu %%xmm0, %0\n"
                 : "=m" (*po) :: "xmm0", "memory"
             );
-            po++;
+            po += 16;
@@ -210,7 +207,7 @@ static void DarkenFieldSSE( picture_t *p_dst,
 
        The origin (black) is at YUV = (0, 128, 128) in the uint8 format.
        The chroma processing is a bit more complicated than luma,
-       and needs MMX for vectorization.
+       and needs SSE2 for vectorization.
     */
     if( process_chroma )
     {
@@ -219,8 +216,8 @@ static void DarkenFieldSSE( picture_t *p_dst,
              i_plane++ )
         {
             w = p_dst->p[i_plane].i_visible_pitch;
-            wm8 = w % 8;   /* remainder */
-            w8  = w - wm8; /* part of width that is divisible by 8 */
+            wm16 = w % 16;   /* remainder */
+            w16  = w - wm16; /* part of width that is divisible by 16 */
 
             p_out = p_dst->p[i_plane].p_pixels;
             p_out_end = p_out + p_dst->p[i_plane].i_pitch
@@ -234,25 +231,25 @@ static void DarkenFieldSSE( picture_t *p_dst,
             {
                 int x = 0;
 
-                /* See also easy-to-read C version below. */
-                const uint64_t b128 =  0x8080808080808080ULL;
-
                 __asm__ volatile (
-                    "movq %0, %%xmm5\n"
-                    "movq %1, %%xmm6\n"
-                    "movq %2, %%xmm7\n"
-                    :: "m" (b128), "m" (i_strength_u64), "m" (remove_high_u64)
-                    : "xmm5", "xmm6", "xmm7"
+                    "mov $0x80808080, %%eax\n"
+                    "movd %%eax, %%xmm5\n"
+                    "pshufd $0, %%xmm5, %%xmm5\n" /* 128 pattern */
+                    "movd %0, %%xmm6\n"
+                    "movd %1, %%xmm7\n"
+                    "pshufd $0, %%xmm7, %%xmm7\n" /* duplicate 32-bits across reg */
+                    :: "m" (i_strength), "m" (remove_high_u32)
+                    : "eax", "xmm5", "xmm6", "xmm7"
                 );
 
-                uint64_t *po8 = (uint64_t *)p_out;
-                for( ; x < w8; x += 8 )
+                uint8_t *po16 = p_out;
+                for( ; x < w16; x += 16 )
                 {
                     __asm__ volatile (
-                        "movq %0, %%xmm0\n"
+                        "movdqu %0, %%xmm0\n"
 
-                        "movq %%xmm5, %%xmm2\n" /* 128 */
-                        "movq %%xmm0, %%xmm1\n" /* copy of data */
+                        "movdqa %%xmm5, %%xmm2\n" /* 128 */
+                        "movdqa %%xmm0, %%xmm1\n" /* copy of data */
                         "psubusb %%xmm2, %%xmm1\n" /* xmm1 = max(data - 128, 0) */
                         "psubusb %%xmm0, %%xmm2\n" /* xmm2 = max(128 - data, 0) */
 
@@ -266,11 +263,11 @@ static void DarkenFieldSSE( picture_t *p_dst,
                         "psubb %%xmm2, %%xmm1\n"
                         "paddb %%xmm5, %%xmm1\n"
 
-                        "movq %%xmm1, %0\n"
+                        "movdqu %%xmm1, %0\n"
 
-                        : "=m" (*po8) :: "xmm0", "xmm1", "xmm2", "memory"
+                        : "=m" (*po16) :: "xmm0", "xmm1", "xmm2", "memory"
                     );
-                    po8++;
+                    po16 += 16;
                 }
 
                 /* C version - handle the width remainder */
diff --git a/modules/video_filter/deinterlace/algo_x.c b/modules/video_filter/deinterlace/algo_x.c
index 09cbbc0acd..6b6e674070 100644
--- a/modules/video_filter/deinterlace/algo_x.c
+++ b/modules/video_filter/deinterlace/algo_x.c
@@ -73,9 +73,6 @@ static inline int XDeint8x8DetectC( uint8_t *src, int i_src )
     return fc < 1 ? false : true;
 }
 
-/* TODO: This is a simple conversion of MMX to using SSE registers,
-   without making use of their expanded width. Would that require
-   migration to a 16x16 processing model though? */
 #ifdef CAN_COMPILE_SSE
 VLC_SSE
 static inline int XDeint8x8DetectSSE( uint8_t *src, int i_src )
@@ -178,9 +175,6 @@ static inline void XDeint8x8MergeC( uint8_t *dst,  int i_dst,
     }
 }
 
-/* TODO: This is a simple conversion of MMX to using SSE registers,
-   without making use of their expanded width. Would that require
-   migration to a 16x16 processing model though? */
 #ifdef CAN_COMPILE_SSE
 VLC_SSE
 static inline void XDeint8x8MergeSSE( uint8_t *dst,  int i_dst,
@@ -257,9 +251,6 @@ static inline void XDeint8x8FieldEC( uint8_t *dst, int i_dst,
     }
 }
 
-/* TODO: This is a simple conversion of MMX to using SSE registers,
-   without making use of their expanded width. Would that require
-   migration to a 16x16 processing model though? */
 #ifdef CAN_COMPILE_SSE
 VLC_SSE
 static inline void XDeint8x8FieldESSE( uint8_t *dst, int i_dst,
@@ -309,7 +300,7 @@ static inline void XDeint8x8FieldC( uint8_t *dst, int i_dst,
         for( x = 0; x < 8; x++ )
         {
             uint8_t *src2 = &src[2*i_src];
-            /* I use 8 pixels just to match the MMX version, but it's overkill
+            /* I use 8 pixels just to match the SIMD version, but it's overkill
              * 5 would be enough (less isn't good) */
             const int c0 = abs(src[x-4]-src2[x-2]) + abs(src[x-3]-src2[x-1]) +
                            abs(src[x-2]-src2[x+0]) + abs(src[x-1]-src2[x+1]) +
@@ -339,9 +330,6 @@ static inline void XDeint8x8FieldC( uint8_t *dst, int i_dst,
     }
 }
 
-/* TODO: This is a simple conversion of MMX to using SSE registers,
-   without making use of their expanded width. Would that require
-   migration to a 16x16 processing model though? */
 #ifdef CAN_COMPILE_SSE
 VLC_SSE
 static inline void XDeint8x8FieldSSE( uint8_t *dst, int i_dst,
diff --git a/modules/video_filter/deinterlace/algo_x.h b/modules/video_filter/deinterlace/algo_x.h
index dd70cbd732..1667d663ba 100644
--- a/modules/video_filter/deinterlace/algo_x.h
+++ b/modules/video_filter/deinterlace/algo_x.h
@@ -33,13 +33,13 @@ struct picture_t;
 /**
  * Interpolating deinterlace filter "X".
  *
- * The algorithm works on a 8x8 block basic, it copies the top field
+ * The algorithm works on an 8x8 block basis; it copies the top field
  * and applies a process to recreate the bottom field.
  *
  * If a 8x8 block is classified as :
  *   - progressive: it applies a small blend (1,6,1)
  *   - interlaced:
- *    * in the MMX version: we do a ME between the 2 fields, if there is a
+ *    * in the SIMD version: we do a ME between the 2 fields, if there is a
  *      good match we use MC to recreate the bottom field (with a small
  *      blend (1,6,1) )
  *    * otherwise: it recreates the bottom field by an edge oriented
diff --git a/modules/video_filter/deinterlace/helpers.c b/modules/video_filter/deinterlace/helpers.c
index 759164c32c..1b02dd8d04 100644
--- a/modules/video_filter/deinterlace/helpers.c
+++ b/modules/video_filter/deinterlace/helpers.c
@@ -169,8 +169,7 @@ static int TestForMotionInBlock( uint8_t *p_pix_p, uint8_t *p_pix_c,
     return (i_motion >= 8);
 }
 
-/* TODO: This is a simple conversion of MMX to using SSE registers,
-   without making use of their expanded width. */
+/* Note: This examines an 8x8 block just like the C equivalent */
 #ifdef CAN_COMPILE_SSE
 VLC_SSE
 static int TestForMotionInBlockSSE( uint8_t *p_pix_p, uint8_t *p_pix_c,
@@ -467,8 +466,6 @@ int EstimateNumBlocksWithMotion( const picture_t* p_prev,
 /* Threshold (value from Transcode 1.1.5) */
 #define T 100
 
-/* TODO: This is a simple conversion of MMX to using SSE registers,
-   without making use of their expanded width. */
 #ifdef CAN_COMPILE_SSE
 VLC_SSE
 static int CalculateInterlaceScoreSSE( const picture_t* p_pic_top,
@@ -494,8 +491,8 @@ static int CalculateInterlaceScoreSSE( const picture_t* p_pic_top,
         const int i_lasty = p_pic_top->p[i_plane].i_visible_lines-1;
         const int w = FFMIN( p_pic_top->p[i_plane].i_visible_pitch,
                              p_pic_bot->p[i_plane].i_visible_pitch );
-        const int wm8 = w % 8;   /* remainder */
-        const int w8  = w - wm8; /* part of width that is divisible by 8 */
+        const int wm16 = w % 16;   /* remainder */
+        const int w16  = w - wm16; /* part of width that is divisible by 16 */
 
         /* Current line / neighbouring lines picture pointers */
         const picture_t *cur = p_pic_bot;
@@ -521,17 +518,20 @@ static int CalculateInterlaceScoreSSE( const picture_t* p_pic_top,
                             # of pixels < (2^32)/255
                Note: calculates score * 255
             */
-            const uint64_t b128 = 0x8080808080808080ULL;
-            const uint8_t bT[8] = { T, T, T, T, T, T, T, T };
+            const uint8_t bT[16] = { T, T, T, T, T, T, T, T,
+                                     T, T, T, T, T, T, T, T };
 
-            for( ; x < w8; x += 8 )
+            for( ; x < w16; x += 16 )
             {
                 __asm__ volatile (
-                    "movq %0, %%xmm0\n"
-                    "movq %1, %%xmm1\n"
-                    "movq %2, %%xmm2\n"
+                    "movdqu %0, %%xmm0\n"
+                    "movdqu %1, %%xmm1\n"
+                    "movdqu %2, %%xmm2\n"
+
+                    "mov $0x80808080, %%eax\n"
+                    "movd %%eax, %%xmm3\n"
+                    "pshufd $0, %%xmm3, %%xmm3\n" /* 128 pattern */
 
-                    "movq %3, %%xmm3\n"
                     "psubb %%xmm3, %%xmm0\n"
                     "psubb %%xmm3, %%xmm1\n"
                     "psubb %%xmm3, %%xmm2\n"
@@ -552,7 +552,7 @@ static int CalculateInterlaceScoreSSE( const picture_t* p_pic_top,
                     "pmulhw %%xmm3, %%xmm4\n"
                     "pmulhw %%xmm5, %%xmm6\n"
 
-                    "movq %4, %%xmm0\n"
+                    "movdqu %3, %%xmm0\n"
                     "pxor %%xmm1, %%xmm1\n"
 
                     "packsswb %%xmm4, %%xmm6\n"
@@ -560,16 +560,14 @@ static int CalculateInterlaceScoreSSE( const picture_t* p_pic_top,
                     "psadbw %%xmm1, %%xmm6\n"
                     "paddd %%xmm6, %%xmm7\n"
 
-                    :: "m" (*((int64_t*)p_c)),
-                       "m" (*((int64_t*)p_p)),
-                       "m" (*((int64_t*)p_n)),
-                       "m" (b128), "m" (bT)
-                    : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+                    :: "m" (*p_c), "m" (*p_p), "m" (*p_n), "m" (bT)
+                    : "eax", "xmm0", "xmm1", "xmm2", "xmm3",
+                      "xmm4", "xmm5", "xmm6", "xmm7"
                 );
 
-                p_c += 8;
-                p_p += 8;
-                p_n += 8;
+                p_c += 16;
+                p_p += 16;
+                p_n += 16;
             }
 
             for( ; x < w; ++x )


