[vlc-devel] [PATCH 1/2] deinterlace: x86: convert TestForMotionInBlockMMX inline asm to x86inc/yasm

Fri Dec 2 00:04:41 CET 2016

Fixes a bug in the inline asm. pcmpgtb is a signed comparison, so the
inline asm treats absolute pixel differences >= 128 as smaller than the
threshold 10. The external asm instead subtracts the threshold with
unsigned saturation and compares the result with zero.
It is probably still faster than the inline asm version. It computes two
blocks at once and actually computes the absolute difference.

The external asm is ~2.6 times faster than C on a haswell CPU:
125.1 cycles in TestForMotionInBlock_mmx2, 33553463 runs, 969 skips
334.8 cycles in TestForMotionInBlock_c, 33552874 runs, 1558 skips
---
 modules/video_filter/deinterlace/algo_ivtc.c       |  4 -
 .../video_filter/deinterlace/deinterlace_x86.asm   | 67 ++++++++++++++++
 modules/video_filter/deinterlace/helpers.c         | 92 ++++------------------
 3 files changed, 83 insertions(+), 80 deletions(-)

diff --git a/modules/video_filter/deinterlace/algo_ivtc.c b/modules/video_filter/deinterlace/algo_ivtc.c
index 35f920beeb..5b2a86034f 100644
--- a/modules/video_filter/deinterlace/algo_ivtc.c
+++ b/modules/video_filter/deinterlace/algo_ivtc.c
@@ -25,10 +25,6 @@
 #   include "config.h"
 #endif
 
-#ifdef CAN_COMPILE_MMXEXT
-#   include "mmx.h"
-#endif
-
 #include <stdint.h>
 #include <assert.h>
 
diff --git a/modules/video_filter/deinterlace/deinterlace_x86.asm b/modules/video_filter/deinterlace/deinterlace_x86.asm
index 322c268138..e102dd850a 100644
--- a/modules/video_filter/deinterlace/deinterlace_x86.asm
+++ b/modules/video_filter/deinterlace/deinterlace_x86.asm
@@ -26,6 +26,10 @@
 %include "x86util.asm"
 
 SECTION_RODATA
+db_threshold:
+times 8 db 10
+dw_motion_thresh:
+times 4 dw (8 * 255 - 1)
 
 SECTION .text
 
@@ -36,3 +40,66 @@ cglobal emms_ext_asm, 0,0,0
 cglobal femms_ext_asm, 0,0,0
         femms
         RET
+
+INIT_MMX mmx2
+; int test_motion_block(p_pix_p, p_pix_c, pitch_prev, pitch_curr, *pi_top, *pi_bot): 8x8 motion test
+%if ARCH_X86_32 || WIN64
+cglobal test_motion_block, 4,5,8
+%define cntr r4d
+%else
+cglobal test_motion_block, 4,7,8
+%define cntr r6d
+%endif
+        pxor            m4,  m4                 ; m4 = score accumulator, 1st row of each pair (top)
+        pxor            m5,  m5                 ; m5 = score accumulator, 2nd row of each pair (bottom)
+        pcmpeqb         m6,  m6                 ; m6 = 0xFF in every byte
+        movq            m7,  [db_threshold]     ; m7 = per-byte threshold (8 x 10)
+        mov             cntr, 4                 ; 4 iterations x 2 rows = 8 rows
+.loop:
+        movq            m0,  [r1]               ; 8 current pixels, top row
+        movq            m1,  [r0]               ; 8 previous pixels, top row
+        movq            m2,  [r1 + r3]          ; 8 current pixels, bottom row
+        movq            m3,  [r0 + r2]          ; 8 previous pixels, bottom row
+        psubusb         m0,  m1                 ; m0 = sat(cur - prev)
+        psubusb         m1,  [r1]               ; m1 = sat(prev - cur)
+        psubusb         m2,  m3                 ; same for the bottom row
+        psubusb         m3,  [r1 + r3]
+        por             m0,  m1                 ; m0 = |cur - prev|, top row
+        por             m2,  m3                 ; m2 = |cur - prev|, bottom row
+        pxor            m1,  m1                 ; m1 = 0 for the compares below
+        psubusb         m0,  m7                 ; non-zero only where |diff| > 10
+        psubusb         m2,  m7
+        pcmpeqb         m0,  m1                 ; 0xFF where |diff| <= 10, 0x00 where > 10
+        pcmpeqb         m2,  m1
+        psadbw          m0,  m6                 ; sum of |x - 0xFF| = 255 * (pixels above threshold)
+        psadbw          m2,  m6
+        paddw           m4,  m0                 ; accumulate top score
+        paddw           m5,  m2                 ; accumulate bottom score
+
+        lea             r1, [r1 + 2 * r3]       ; advance current pointer by two rows
+        lea             r0, [r0 + 2 * r2]       ; advance previous pointer by two rows
+        dec             cntr
+        jg              .loop
+
+        movq            m3,  m4                 ; m3 becomes top + bottom after the paddw below
+        pcmpgtw         m4,  [dw_motion_thresh] ; score > 8*255-1, i.e. at least 8 pixels
+        paddw           m3,  m5
+        pcmpgtw         m5,  [dw_motion_thresh]
+        pcmpgtw         m3,  [dw_motion_thresh]
+        pmovmskb        eax, m4                 ; bit 0 = sign bit of byte 0 = compare result of low word
+        pmovmskb        r1d, m5
+        and             eax, 1                  ; eax = top flag (0 or 1)
+        and             r1d, 1                  ; r1d = bottom flag (0 or 1)
+
+%if ARCH_X86_32 || WIN64
+        mov             r2,  r4m                ; r2 = 5th argument (pi_top), not loaded by cglobal
+        mov             r3,  r5m                ; r3 = 6th argument (pi_bot)
+        mov             [r2], eax               ; *pi_top
+        mov             [r3], r1d               ; *pi_bot
+%else
+        mov             [r4], eax               ; *pi_top
+        mov             [r5], r1d               ; *pi_bot
+%endif
+        pmovmskb        eax, m3                 ; return value: combined top + bottom score test
+        and             eax, 1
+        RET
diff --git a/modules/video_filter/deinterlace/helpers.c b/modules/video_filter/deinterlace/helpers.c
index 3d6e97e895..c24464cc16 100644
--- a/modules/video_filter/deinterlace/helpers.c
+++ b/modules/video_filter/deinterlace/helpers.c
@@ -94,6 +94,7 @@ static void FieldFromPlane( plane_t *p_dst, const plane_t *p_src, int i_field )
         p_dst->p_pixels += p_src->i_pitch;
 }
 
+// this threshold value is hardcoded in db_threshold in deinterlace_x86.asm
 #define T 10
 /**
  * Internal helper function for EstimateNumBlocksWithMotion():
@@ -120,7 +121,7 @@ static void FieldFromPlane( plane_t *p_dst, const plane_t *p_src, int i_field )
  * @see EstimateNumBlocksWithMotion()
  */
 static int TestForMotionInBlock( uint8_t *p_pix_p, uint8_t *p_pix_c,
-                                 int i_pitch_prev, int i_pitch_curr,
+                                 size_t i_pitch_prev, size_t i_pitch_curr,
                                  int* pi_top, int* pi_bot )
 {
 /* Pixel luma/chroma difference threshold to detect motion. */
@@ -164,6 +165,9 @@ static int TestForMotionInBlock( uint8_t *p_pix_p, uint8_t *p_pix_c,
        Stellvia ep. 1, where the cadence would otherwise catch on incorrectly,
        leading to more interlacing artifacts than by just using the emergency
        mode frame composer.
+
+       Please note that these thresholds are hardcoded as dw_motion_thresh
+       in deinterlace_x86.asm
     */
     (*pi_top) = ( i_top_motion >= 8 );
     (*pi_bot) = ( i_bot_motion >= 8 );
@@ -172,78 +176,10 @@ static int TestForMotionInBlock( uint8_t *p_pix_p, uint8_t *p_pix_c,
        changes "enough". */
     return (i_motion >= 8);
 }
-
-#ifdef CAN_COMPILE_MMXEXT
-VLC_MMX
-static int TestForMotionInBlockMMX( uint8_t *p_pix_p, uint8_t *p_pix_c,
-                                    int i_pitch_prev, int i_pitch_curr,
-                                    int* pi_top, int* pi_bot )
-{
-    int32_t i_motion = 0;
-    int32_t i_top_motion = 0;
-    int32_t i_bot_motion = 0;
-
-    static const mmx_t bT   = { .ub = { T, T, T, T, T, T, T, T } };
-    pxor_r2r( mm6, mm6 ); /* zero, used in psadbw */
-    movq_m2r( bT,  mm5 );
-
-    pxor_r2r( mm3, mm3 ); /* score (top field) */
-    pxor_r2r( mm4, mm4 ); /* score (bottom field) */
-    for( int y = 0; y < 8; y+=2 )
-    {
-        /* top field */
-        movq_m2r( *((uint64_t*)p_pix_c), mm0 );
-        movq_m2r( *((uint64_t*)p_pix_p), mm1 );
-        movq_r2r( mm0, mm2 );
-        psubusb_r2r( mm1, mm2 );
-        psubusb_r2r( mm0, mm1 );
-
-        pcmpgtb_r2r( mm5, mm2 );
-        pcmpgtb_r2r( mm5, mm1 );
-        psadbw_r2r(  mm6, mm2 );
-        psadbw_r2r(  mm6, mm1 );
-
-        paddd_r2r( mm2, mm1 );
-        paddd_r2r( mm1, mm3 ); /* add to top field score */
-
-        p_pix_c += i_pitch_curr;
-        p_pix_p += i_pitch_prev;
-
-        /* bottom field - handling identical to top field, except... */
-        movq_m2r( *((uint64_t*)p_pix_c), mm0 );
-        movq_m2r( *((uint64_t*)p_pix_p), mm1 );
-        movq_r2r( mm0, mm2 );
-        psubusb_r2r( mm1, mm2 );
-        psubusb_r2r( mm0, mm1 );
-
-        pcmpgtb_r2r( mm5, mm2 );
-        pcmpgtb_r2r( mm5, mm1 );
-        psadbw_r2r(  mm6, mm2 );
-        psadbw_r2r(  mm6, mm1 );
-
-        paddd_r2r( mm2, mm1 );
-        paddd_r2r( mm1, mm4 ); /* ...here we add to bottom field score */
-
-        p_pix_c += i_pitch_curr;
-        p_pix_p += i_pitch_prev;
-    }
-    movq_r2r(  mm3, mm7 ); /* score (total) */
-    paddd_r2r( mm4, mm7 );
-    movd_r2m( mm3, i_top_motion );
-    movd_r2m( mm4, i_bot_motion );
-    movd_r2m( mm7, i_motion );
-
-    /* The loop counts actual score * 255. */
-    i_top_motion /= 255;
-    i_bot_motion /= 255;
-    i_motion     /= 255;
-
-    emms();
-
-    (*pi_top) = ( i_top_motion >= 8 );
-    (*pi_bot) = ( i_bot_motion >= 8 );
-    return (i_motion >= 8);
-}
+#if HAVE_YASM
+int vlcpriv_test_motion_block_mmx2(uint8_t *p_pix_p, uint8_t *p_pix_c,
+                                   size_t i_pitch_prev, size_t i_pitch_curr,
+                                   int *pi_top, int *pi_bot);
 #endif
 #undef T
 
@@ -392,12 +328,12 @@ int EstimateNumBlocksWithMotion( const picture_t* p_prev,
     if( p_prev->i_planes != p_curr->i_planes )
         return -1;
 
-    int (*motion_in_block)(uint8_t *, uint8_t *, int , int, int *, int *) =
+    int (*motion_in_block)(uint8_t *, uint8_t *, size_t , size_t, int *, int *) =
         TestForMotionInBlock;
     /* We must tell our inline helper whether to use MMX acceleration. */
-#ifdef CAN_COMPILE_MMXEXT
+#ifdef HAVE_YASM
     if (vlc_CPU_MMXEXT())
-        motion_in_block = TestForMotionInBlockMMX;
+        motion_in_block = vlcpriv_test_motion_block_mmx2;
 #endif
 
     int i_score = 0;
@@ -437,6 +373,10 @@ int EstimateNumBlocksWithMotion( const picture_t* p_prev,
             }
         }
     }
+#ifdef HAVE_YASM
+    if (vlc_CPU_MMXEXT())
+        vlcpriv_emms_ext_asm();
+#endif
 
     if( pi_top )
         (*pi_top) = i_score_top;
-- 
2.11.0.rc2