[vlc-devel] [PATCH 2/2] deinterlace: x86: convert CalculateInterlaceScoreMMX inline asm to x86inc/yasm

Fri Dec 2 00:04:42 CET 2016

Probably faster than the inline asm it replaces. On a full HD clip it is
~2.4 times faster than the C code on a Haswell CPU:
 5355071.5 cycles in CalculateInterlaceScoreMMX, 2048 runs, 0 skips
12854904.7 cycles in CalculateInterlaceScore, 2048 runs, 0 skips
---
 .../video_filter/deinterlace/deinterlace_x86.asm   | 151 +++++++++++++++++++++
 modules/video_filter/deinterlace/helpers.c         | 137 ++++---------------
 2 files changed, 177 insertions(+), 111 deletions(-)

diff --git a/modules/video_filter/deinterlace/deinterlace_x86.asm b/modules/video_filter/deinterlace/deinterlace_x86.asm
index e102dd850a..1019d1b2bd 100644
--- a/modules/video_filter/deinterlace/deinterlace_x86.asm
+++ b/modules/video_filter/deinterlace/deinterlace_x86.asm
@@ -26,11 +26,38 @@
 %include "x86util.asm"
 
 SECTION_RODATA
+db_128:
+times 8 db 128
 db_threshold:
 times 8 db 10
+db_threshold_2:
+times 8 db 100
 dw_motion_thresh:
 times 4 dw (8 * 255 - 1)
 
+db_mask1:
+        times 1 db 0xff
+        times 7 db 0x00
+db_mask2:
+        times 2 db 0xff
+        times 6 db 0x00
+db_mask3:
+        times 3 db 0xff
+        times 5 db 0x00
+db_mask4:
+        times 4 db 0xff
+        times 4 db 0x00
+db_mask5:
+        times 5 db 0xff
+        times 3 db 0x00
+db_mask6:
+        times 6 db 0xff
+        times 2 db 0x00
+db_mask7:
+        times 7 db 0xff
+        times 1 db 0x00
+
+
 SECTION .text
 
 cglobal emms_ext_asm, 0,0,0
@@ -103,3 +130,127 @@ cglobal test_motion_block, 4,7,8
         pmovmskb        eax, m3
         and             eax, 1
         RET
+
+%macro CALC_ISCORE 0 ; in: m0 = P, m1 = C (= P'), m2 = N (= C'), m3 = N'; out: m4, m5 = per-byte 0/1 flags
+        psubb           m0,  [db_128] ; subtract 128 from every byte so the values can be handled as signed
+        psubb           m1,  [db_128]
+        psubb           m2,  [db_128]
+        psubb           m3,  [db_128]
+        mova            m4,  m2 ; copy of m2, needed for the (C' - N') difference
+
+        psubsb          m0,  m1 ; (P - C)
+        psubsb          m2,  m1 ; (N - C), (C' - P')
+        psubsb          m4,  m3 ; (C' - N')
+        SWAP            4, 1 ; from here on m1 names (C' - N') and m4 names the former m1
+
+        ;; sign extend to 16 bit
+        punpckhbw       m3,  m0 ; source bytes land in the high byte of each word; the low byte is discarded by psraw
+        punpckhbw       m4,  m1
+        punpckhbw       m5,  m2 ; previous contents of m5 only reach the discarded low bytes
+        punpcklbw       m0,  m0
+        punpcklbw       m1,  m1
+        punpcklbw       m2,  m2
+
+        psraw           m0,  8 ; arithmetic shift right by 8 completes the sign extension
+        psraw           m1,  8
+        psraw           m2,  8
+        psraw           m3,  8
+        psraw           m4,  8
+        psraw           m5,  8
+
+        pmullw          m0,  m2 ; low 4 bytes:  (P - C) * (N - C)
+        pmullw          m2,  m1 ; low 4 bytes:  (C' - P') * (C' - N')
+        pmullw          m3,  m5 ; high 4 bytes: (P - C) * (N - C)
+        pmullw          m5,  m4 ; high 4 bytes: (C' - P') * (C' - N')
+
+        packsswb        m0,  m3 ; saturate the 16-bit products back to signed bytes
+        packsswb        m2,  m5
+        pxor            m4,  m4
+        pxor            m5,  m5
+        pcmpgtb         m0,  [db_threshold_2] ; 0xff in every byte whose saturated product is > 100
+        pcmpgtb         m2,  [db_threshold_2]
+        ;; turn 0xff (-1) to 0x01 (1)
+        psubb           m4,  m0
+        psubb           m5,  m2
+%endmacro
+
+cglobal calc_interlace_score, 6,7,8, 0, curr, neigh, cstride, nstride, w, h ; returns in eax the number of bytes flagged by CALC_ISCORE
+        lea             currq, [currq + cstrideq] ; curr starts one row below neigh
+        pxor            m6,  m6 ; m6 = 0, the second operand for psadbw
+        pxor            m7,  m7 ; m7 = running score
+        shl             cstrideq, 1 ; both strides are doubled: each pointer advances two rows at a time
+        shl             nstrideq, 1
+        sub             hd,  2
+.line_loop:
+        mov             r6d, wd ; NOTE(review): if w == 0, r6d starts at -8 and the loop body still runs once - confirm callers never pass 0
+        sub             r6d, 8 ; r6 = byte offset of the last 8-byte group in the row
+.loop:
+        movq            m0,  [neighq + r6]
+        movq            m1,  [currq  + r6]
+
+        add             currq,  cstrideq ; temporarily move both pointers two rows down
+        add             neighq, nstrideq
+
+        movq            m2,  [neighq + r6]
+        movq            m3,  [currq  + r6]
+
+        CALC_ISCORE
+
+        psadbw          m4,  m6 ; sum of absolute differences against zero = sum of the 0/1 flags
+        psadbw          m5,  m6
+
+        paddw           m4,  m5
+        paddd           m7,  m4
+
+        sub             currq,  cstrideq ; undo the temporary advance before the next 8-byte group
+        sub             neighq, nstrideq
+        sub             r6d, 8 ; groups are visited from the highest offset down to offset 0
+        jge             .loop
+
+        lea             currq,  [currq  + cstrideq] ; permanently advance to the next pair of rows
+        lea             neighq, [neighq + nstrideq]
+
+        sub             hd,  2 ; NOTE(review): for odd h the last pass appears to load the row at index h - confirm the buffers allow that
+        jg              .line_loop
+
+        movd            eax, m7
+        RET
+
+cglobal calc_interlace_score_partial, 6,7,8, 0, curr, neigh, cstride, nstride, w, h ; 7 GPRs: r6 holds the mask pointer below and must be declared so the prologue preserves it
+        lea             currq, [currq + cstrideq] ; curr starts one row below neigh
+        pxor            m6,  m6 ; m6 = 0, the second operand for psadbw / punpcklwd
+        pxor            m7,  m7 ; m7 = running score
+        shl             cstrideq, 1 ; both strides are doubled: each pointer advances two rows at a time
+        shl             nstrideq, 1
+        lea             r6,  [db_mask1]
+        sub             wd,  1 ; masks are 8 bytes each and start at db_mask1, so index with w - 1
+        sub             hd,  2
+        lea             r6,  [r6 + 8 * wq] ; r6 -> mask with the first w bytes set; tables exist only for w = 1..7
+.line_loop:
+        movq            m0,  [neighq]
+        movq            m1,  [currq]
+
+        add             currq,  cstrideq ; advance two rows; unlike the full-width loop this is not undone
+        add             neighq, nstrideq
+
+        movq            m2,  [neighq]
+        movq            m3,  [currq]
+
+        CALC_ISCORE
+
+        ;; select partial reg
+        pand            m4,  [r6] ; keep only the flags of the first w bytes
+        pand            m5,  [r6]
+
+        psadbw          m4,  m6 ; sum of absolute differences against zero = sum of the 0/1 flags
+        psadbw          m5,  m6
+
+        paddw           m4,  m5
+        punpcklwd       m4,  m6 ; zero-extend the 16-bit sum to 32 bits
+        paddd           m7,  m4
+
+        sub             hd,  2 ; NOTE(review): for odd h the last pass appears to load the row at index h - confirm the buffers allow that
+        jg              .line_loop
+
+        movd            eax, m7
+        RET
diff --git a/modules/video_filter/deinterlace/helpers.c b/modules/video_filter/deinterlace/helpers.c
index c24464cc16..14b7a5b619 100644
--- a/modules/video_filter/deinterlace/helpers.c
+++ b/modules/video_filter/deinterlace/helpers.c
@@ -25,10 +25,6 @@
 #   include "config.h"
 #endif
 
-#ifdef CAN_COMPILE_MMXEXT
-#   include "mmx.h"
-#endif
-
 #include <stdint.h>
 #include <assert.h>
 
@@ -389,19 +385,22 @@ int EstimateNumBlocksWithMotion( const picture_t* p_prev,
 /* Threshold (value from Transcode 1.1.5) */
 #define T 100
 
-#ifdef CAN_COMPILE_MMXEXT
-VLC_MMX
+#if HAVE_YASM
+int vlcpriv_calc_interlace_score_mmx2(const uint8_t *p_pic_curr,
+                                      const uint8_t *p_pic_neigh,
+                                      size_t pitch_curr, size_t pitch_neigh,
+                                      int width, int height);
+int vlcpriv_calc_interlace_score_partial_mmx2(const uint8_t *p_pic_curr,
+                                          const uint8_t *p_pic_neigh,
+                                          size_t pitch_curr, size_t pitch_neigh,
+                                          int width, int height);
+
 static int CalculateInterlaceScoreMMX( const picture_t* p_pic_top,
                                        const picture_t* p_pic_bot )
 {
     assert( p_pic_top->i_planes == p_pic_bot->i_planes );
 
-    /* Amount of bits must be known for MMX, thus int32_t.
-       Doesn't hurt the C implementation. */
-    int32_t i_score_mmx = 0; /* this must be divided by 255 when finished  */
-    int32_t i_score_c   = 0; /* this counts as-is (used for non-MMX parts) */
-
-    pxor_r2r( mm7, mm7 ); /* we will keep score in mm7 */
+    int32_t i_score = 0;
 
     for( int i_plane = 0 ; i_plane < p_pic_top->i_planes ; ++i_plane )
     {
@@ -410,7 +409,7 @@ static int CalculateInterlaceScoreMMX( const picture_t* p_pic_top,
             p_pic_bot->p[i_plane].i_visible_lines )
             return -1;
 
-        const int i_lasty = p_pic_top->p[i_plane].i_visible_lines-1;
+        const int i_lasty = p_pic_top->p[i_plane].i_visible_lines;
         const int w = FFMIN( p_pic_top->p[i_plane].i_visible_pitch,
                              p_pic_bot->p[i_plane].i_visible_pitch );
         const int wm8 = w % 8;   /* remainder */
@@ -419,106 +418,22 @@ static int CalculateInterlaceScoreMMX( const picture_t* p_pic_top,
         /* Current line / neighbouring lines picture pointers */
         const picture_t *cur = p_pic_bot;
         const picture_t *ngh = p_pic_top;
-        int wc = cur->p[i_plane].i_pitch;
-        int wn = ngh->p[i_plane].i_pitch;
-
-        /* Transcode 1.1.5 only checks every other line. Checking every line
-           works better for anime, which may contain horizontal,
-           one pixel thick cartoon outlines.
-        */
-        for( int y = 1; y < i_lasty; ++y )
-        {
-            uint8_t *p_c = &cur->p[i_plane].p_pixels[y*wc];     /* this line */
-            uint8_t *p_p = &ngh->p[i_plane].p_pixels[(y-1)*wn]; /* prev line */
-            uint8_t *p_n = &ngh->p[i_plane].p_pixels[(y+1)*wn]; /* next line */
-
-            int x = 0;
-
-            /* Easy-to-read C version further below.
-
-               Assumptions: 0 < T < 127
-                            # of pixels < (2^32)/255
-               Note: calculates score * 255
-            */
-            static const mmx_t b0   = { .uq = 0x0000000000000000ULL };
-            static const mmx_t b128 = { .uq = 0x8080808080808080ULL };
-            static const mmx_t bT   = { .ub = { T, T, T, T, T, T, T, T } };
-
-            for( ; x < w8; x += 8 )
-            {
-                movq_m2r( *((int64_t*)p_c), mm0 );
-                movq_m2r( *((int64_t*)p_p), mm1 );
-                movq_m2r( *((int64_t*)p_n), mm2 );
-
-                psubb_m2r( b128, mm0 );
-                psubb_m2r( b128, mm1 );
-                psubb_m2r( b128, mm2 );
-
-                psubsb_r2r( mm0, mm1 );
-                psubsb_r2r( mm0, mm2 );
-
-                pxor_r2r( mm3, mm3 );
-                pxor_r2r( mm4, mm4 );
-                pxor_r2r( mm5, mm5 );
-                pxor_r2r( mm6, mm6 );
-
-                punpcklbw_r2r( mm1, mm3 );
-                punpcklbw_r2r( mm2, mm4 );
-                punpckhbw_r2r( mm1, mm5 );
-                punpckhbw_r2r( mm2, mm6 );
-
-                pmulhw_r2r( mm3, mm4 );
-                pmulhw_r2r( mm5, mm6 );
-
-                packsswb_r2r(mm4, mm6);
-                pcmpgtb_m2r( bT, mm6 );
-                psadbw_m2r( b0, mm6 );
-                paddd_r2r( mm6, mm7 );
-
-                p_c += 8;
-                p_p += 8;
-                p_n += 8;
-            }
-
-            for( ; x < w; ++x )
-            {
-                /* Worst case: need 17 bits for "comb". */
-                int_fast32_t C = *p_c;
-                int_fast32_t P = *p_p;
-                int_fast32_t N = *p_n;
-
-                /* Comments in Transcode's filter_ivtc.c attribute this
-                   combing metric to Gunnar Thalin.
-
-                    The idea is that if the picture is interlaced, both
-                    expressions will have the same sign, and this comes
-                    up positive. The value T = 100 has been chosen such
-                    that a pixel difference of 10 (on average) will
-                    trigger the detector.
-                */
-                int_fast32_t comb = (P - C) * (N - C);
-                if( comb > T )
-                    ++i_score_c;
-
-                ++p_c;
-                ++p_p;
-                ++p_n;
-            }
-
-            /* Now the other field - swap current and neighbour pictures */
-            const picture_t *tmp = cur;
-            cur = ngh;
-            ngh = tmp;
-            int tmp_pitch = wc;
-            wc = wn;
-            wn = tmp_pitch;
-        }
+        size_t wc = cur->p[i_plane].i_pitch;
+        size_t wn = ngh->p[i_plane].i_pitch;
+
+        i_score += vlcpriv_calc_interlace_score_mmx2(cur->p[i_plane].p_pixels,
+                                                     ngh->p[i_plane].p_pixels,
+                                                     wc, wn, w8, i_lasty);
+        if (wm8)
+            i_score += vlcpriv_calc_interlace_score_partial_mmx2(
+                cur->p[i_plane].p_pixels + w8,
+                ngh->p[i_plane].p_pixels + w8,
+                wc, wn, wm8, i_lasty);
     }
 
-    movd_r2m( mm7, i_score_mmx );
-    emms();
+    vlcpriv_emms_ext_asm();
 
-    return i_score_mmx/255 + i_score_c;
+    return i_score;
 }
 #endif
 
@@ -542,7 +457,7 @@ int CalculateInterlaceScore( const picture_t* p_pic_top,
     if( p_pic_top->i_planes != p_pic_bot->i_planes )
         return -1;
 
-#ifdef CAN_COMPILE_MMXEXT
+#if HAVE_YASM
     if (vlc_CPU_MMXEXT())
         return CalculateInterlaceScoreMMX( p_pic_top, p_pic_bot );
 #endif
-- 
2.11.0.rc2