[vlc-devel] [PATCH 2/2] deinterlace: x86: convert CalculateInterlaceScoreMMX inline asm to x86inc/yasm
Janne Grunau
janne-vlc at jannau.net
Fri Dec 2 00:04:42 CET 2016
Probably faster than the inline asm it replaces. On a full HD clip it is
~2.4 times faster than the C code on a Haswell CPU:
5355071.5 cycles in CalculateInterlaceScoreMMX, 2048 runs, 0 skips
12854904.7 cycles in CalculateInterlaceScore, 2048 runs, 0 skips
---
.../video_filter/deinterlace/deinterlace_x86.asm | 151 +++++++++++++++++++++
modules/video_filter/deinterlace/helpers.c | 137 ++++---------------
2 files changed, 177 insertions(+), 111 deletions(-)
diff --git a/modules/video_filter/deinterlace/deinterlace_x86.asm b/modules/video_filter/deinterlace/deinterlace_x86.asm
index e102dd850a..1019d1b2bd 100644
--- a/modules/video_filter/deinterlace/deinterlace_x86.asm
+++ b/modules/video_filter/deinterlace/deinterlace_x86.asm
@@ -26,11 +26,38 @@
%include "x86util.asm"
SECTION_RODATA
+db_128:
+times 8 db 128
db_threshold:
times 8 db 10
+db_threshold_2:
+times 8 db 100
dw_motion_thresh:
times 4 dw (8 * 255 - 1)
+db_mask1:
+ times 1 db 0xff
+ times 7 db 0x00
+db_mask2:
+ times 2 db 0xff
+ times 6 db 0x00
+db_mask3:
+ times 3 db 0xff
+ times 5 db 0x00
+db_mask4:
+ times 4 db 0xff
+ times 4 db 0x00
+db_mask5:
+ times 5 db 0xff
+ times 3 db 0x00
+db_mask6:
+ times 6 db 0xff
+ times 2 db 0x00
+db_mask7:
+ times 7 db 0xff
+ times 1 db 0x00
+
+
SECTION .text
cglobal emms_ext_asm, 0,0,0
@@ -103,3 +130,127 @@ cglobal test_motion_block, 4,7,8
pmovmskb eax, m3
and eax, 1
RET
+
+%macro CALC_ISCORE 0
+ psubb m0, [db_128]
+ psubb m1, [db_128]
+ psubb m2, [db_128]
+ psubb m3, [db_128]
+ mova m4, m2
+
+ psubsb m0, m1 ; (P - C)
+ psubsb m2, m1 ; (N - C), (C' - P')
+ psubsb m4, m3 ; (C' - N')
+ SWAP 4, 1
+
+ ;; sign extend to 16 bit
+ punpckhbw m3, m0
+ punpckhbw m4, m1
+ punpckhbw m5, m2
+ punpcklbw m0, m0
+ punpcklbw m1, m1
+ punpcklbw m2, m2
+
+ psraw m0, 8
+ psraw m1, 8
+ psraw m2, 8
+ psraw m3, 8
+ psraw m4, 8
+ psraw m5, 8
+
+ pmullw m0, m2
+ pmullw m2, m1
+ pmullw m3, m5
+ pmullw m5, m4
+
+ packsswb m0, m3
+ packsswb m2, m5
+ pxor m4, m4
+ pxor m5, m5
+ pcmpgtb m0, [db_threshold_2]
+ pcmpgtb m2, [db_threshold_2]
+ ;; turn 0xff (-1) to 0x01 (1)
+ psubb m4, m0
+ psubb m5, m2
+%endmacro
+
+cglobal calc_interlace_score, 6,7,8, 0, curr, neigh, cstride, nstride, w, h
+ lea currq, [currq + cstrideq]
+ pxor m6, m6
+ pxor m7, m7
+ shl cstrideq, 1
+ shl nstrideq, 1
+ sub hd, 2
+.line_loop:
+ mov r6d, wd
+ sub r6d, 8
+.loop:
+ movq m0, [neighq + r6]
+ movq m1, [currq + r6]
+
+ add currq, cstrideq
+ add neighq, nstrideq
+
+ movq m2, [neighq + r6]
+ movq m3, [currq + r6]
+
+ CALC_ISCORE
+
+ psadbw m4, m6
+ psadbw m5, m6
+
+ paddw m4, m5
+ paddd m7, m4
+
+ sub currq, cstrideq
+ sub neighq, nstrideq
+ sub r6d, 8
+ jge .loop
+
+ lea currq, [currq + cstrideq]
+ lea neighq, [neighq + nstrideq]
+
+ sub hd, 2
+ jg .line_loop
+
+ movd eax, m7
+ RET
+
+cglobal calc_interlace_score_partial, 6,6,8, 0, curr, neigh, cstride, nstride, w, h
+ lea currq, [currq + cstrideq]
+ pxor m6, m6
+ pxor m7, m7
+ shl cstrideq, 1
+ shl nstrideq, 1
+ lea r6, [db_mask1]
+ sub wd, 1
+ sub hd, 2
+ lea r6, [r6 + 8 * wq]
+.line_loop:
+ movq m0, [neighq]
+ movq m1, [currq]
+
+ add currq, cstrideq
+ add neighq, nstrideq
+
+ movq m2, [neighq]
+ movq m3, [currq]
+
+ CALC_ISCORE
+
+ ;; select partial reg
+ pand m4, [r6]
+ pand m5, [r6]
+
+ psadbw m4, m6
+ psadbw m5, m6
+
+ paddw m4, m5
+ punpcklwd m4, m6
+ paddd m7, m4
+
+ sub hd, 2
+ jg .line_loop
+
+ movd eax, m7
+ RET
diff --git a/modules/video_filter/deinterlace/helpers.c b/modules/video_filter/deinterlace/helpers.c
index c24464cc16..14b7a5b619 100644
--- a/modules/video_filter/deinterlace/helpers.c
+++ b/modules/video_filter/deinterlace/helpers.c
@@ -25,10 +25,6 @@
# include "config.h"
#endif
-#ifdef CAN_COMPILE_MMXEXT
-# include "mmx.h"
-#endif
-
#include <stdint.h>
#include <assert.h>
@@ -389,19 +385,22 @@ int EstimateNumBlocksWithMotion( const picture_t* p_prev,
/* Threshold (value from Transcode 1.1.5) */
#define T 100
-#ifdef CAN_COMPILE_MMXEXT
-VLC_MMX
+#if HAVE_YASM
+int vlcpriv_calc_interlace_score_mmx2(const uint8_t *p_pic_curr,
+ const uint8_t *p_pic_neigh,
+ size_t pitch_curr, size_t pitch_neigh,
+ int width, int height);
+int vlcpriv_calc_interlace_score_partial_mmx2(const uint8_t *p_pic_curr,
+ const uint8_t *p_pic_neigh,
+ size_t pitch_curr, size_t pitch_neigh,
+ int width, int height);
+
static int CalculateInterlaceScoreMMX( const picture_t* p_pic_top,
const picture_t* p_pic_bot )
{
assert( p_pic_top->i_planes == p_pic_bot->i_planes );
- /* Amount of bits must be known for MMX, thus int32_t.
- Doesn't hurt the C implementation. */
- int32_t i_score_mmx = 0; /* this must be divided by 255 when finished */
- int32_t i_score_c = 0; /* this counts as-is (used for non-MMX parts) */
-
- pxor_r2r( mm7, mm7 ); /* we will keep score in mm7 */
+ int32_t i_score = 0;
for( int i_plane = 0 ; i_plane < p_pic_top->i_planes ; ++i_plane )
{
@@ -410,7 +409,7 @@ static int CalculateInterlaceScoreMMX( const picture_t* p_pic_top,
p_pic_bot->p[i_plane].i_visible_lines )
return -1;
- const int i_lasty = p_pic_top->p[i_plane].i_visible_lines-1;
+ const int i_lasty = p_pic_top->p[i_plane].i_visible_lines;
const int w = FFMIN( p_pic_top->p[i_plane].i_visible_pitch,
p_pic_bot->p[i_plane].i_visible_pitch );
const int wm8 = w % 8; /* remainder */
@@ -419,106 +418,22 @@ static int CalculateInterlaceScoreMMX( const picture_t* p_pic_top,
/* Current line / neighbouring lines picture pointers */
const picture_t *cur = p_pic_bot;
const picture_t *ngh = p_pic_top;
- int wc = cur->p[i_plane].i_pitch;
- int wn = ngh->p[i_plane].i_pitch;
-
- /* Transcode 1.1.5 only checks every other line. Checking every line
- works better for anime, which may contain horizontal,
- one pixel thick cartoon outlines.
- */
- for( int y = 1; y < i_lasty; ++y )
- {
- uint8_t *p_c = &cur->p[i_plane].p_pixels[y*wc]; /* this line */
- uint8_t *p_p = &ngh->p[i_plane].p_pixels[(y-1)*wn]; /* prev line */
- uint8_t *p_n = &ngh->p[i_plane].p_pixels[(y+1)*wn]; /* next line */
-
- int x = 0;
-
- /* Easy-to-read C version further below.
-
- Assumptions: 0 < T < 127
- # of pixels < (2^32)/255
- Note: calculates score * 255
- */
- static const mmx_t b0 = { .uq = 0x0000000000000000ULL };
- static const mmx_t b128 = { .uq = 0x8080808080808080ULL };
- static const mmx_t bT = { .ub = { T, T, T, T, T, T, T, T } };
-
- for( ; x < w8; x += 8 )
- {
- movq_m2r( *((int64_t*)p_c), mm0 );
- movq_m2r( *((int64_t*)p_p), mm1 );
- movq_m2r( *((int64_t*)p_n), mm2 );
-
- psubb_m2r( b128, mm0 );
- psubb_m2r( b128, mm1 );
- psubb_m2r( b128, mm2 );
-
- psubsb_r2r( mm0, mm1 );
- psubsb_r2r( mm0, mm2 );
-
- pxor_r2r( mm3, mm3 );
- pxor_r2r( mm4, mm4 );
- pxor_r2r( mm5, mm5 );
- pxor_r2r( mm6, mm6 );
-
- punpcklbw_r2r( mm1, mm3 );
- punpcklbw_r2r( mm2, mm4 );
- punpckhbw_r2r( mm1, mm5 );
- punpckhbw_r2r( mm2, mm6 );
-
- pmulhw_r2r( mm3, mm4 );
- pmulhw_r2r( mm5, mm6 );
-
- packsswb_r2r(mm4, mm6);
- pcmpgtb_m2r( bT, mm6 );
- psadbw_m2r( b0, mm6 );
- paddd_r2r( mm6, mm7 );
-
- p_c += 8;
- p_p += 8;
- p_n += 8;
- }
-
- for( ; x < w; ++x )
- {
- /* Worst case: need 17 bits for "comb". */
- int_fast32_t C = *p_c;
- int_fast32_t P = *p_p;
- int_fast32_t N = *p_n;
-
- /* Comments in Transcode's filter_ivtc.c attribute this
- combing metric to Gunnar Thalin.
-
- The idea is that if the picture is interlaced, both
- expressions will have the same sign, and this comes
- up positive. The value T = 100 has been chosen such
- that a pixel difference of 10 (on average) will
- trigger the detector.
- */
- int_fast32_t comb = (P - C) * (N - C);
- if( comb > T )
- ++i_score_c;
-
- ++p_c;
- ++p_p;
- ++p_n;
- }
-
- /* Now the other field - swap current and neighbour pictures */
- const picture_t *tmp = cur;
- cur = ngh;
- ngh = tmp;
- int tmp_pitch = wc;
- wc = wn;
- wn = tmp_pitch;
- }
+ size_t wc = cur->p[i_plane].i_pitch;
+ size_t wn = ngh->p[i_plane].i_pitch;
+
+ i_score += vlcpriv_calc_interlace_score_mmx2(cur->p[i_plane].p_pixels,
+ ngh->p[i_plane].p_pixels,
+ wc, wn, w8, i_lasty);
+ if (wm8)
+ i_score += vlcpriv_calc_interlace_score_partial_mmx2(
+ cur->p[i_plane].p_pixels + w8,
+ ngh->p[i_plane].p_pixels + w8,
+ wc, wn, wm8, i_lasty);
}
- movd_r2m( mm7, i_score_mmx );
- emms();
+ vlcpriv_emms_ext_asm();
- return i_score_mmx/255 + i_score_c;
+ return i_score;
}
#endif
@@ -542,7 +457,7 @@ int CalculateInterlaceScore( const picture_t* p_pic_top,
if( p_pic_top->i_planes != p_pic_bot->i_planes )
return -1;
-#ifdef CAN_COMPILE_MMXEXT
+#if HAVE_YASM
if (vlc_CPU_MMXEXT())
return CalculateInterlaceScoreMMX( p_pic_top, p_pic_bot );
#endif
--
2.11.0.rc2
More information about the vlc-devel
mailing list