[vlc-devel] [PATCH 1/2] deinterlace: x86: convert TestForMotionInBlockMMX inline asm to x86inc/yasm
Janne Grunau
janne-vlc at jannau.net
Fri Dec 2 00:04:41 CET 2016
Fixes a bug in the inline asm. pcmpgtb is a signed comparison. So the
inline asm handles absolute pixel differences >= 128 as smaller than the
threshold 10. The external asm subtracts the threshold instead with
unsigned saturation and compares the result with zero.
It is probably still faster than the inline asm version. It computes two
blocks at once and actually computes the absolute difference.
The external asm is ~2.6 times faster than C on a haswell CPU:
125.1 cycles in TestForMotionInBlock_mmx2, 33553463 runs, 969 skips
334.8 cycles in TestForMotionInBlock_c, 33552874 runs, 1558 skips
---
modules/video_filter/deinterlace/algo_ivtc.c | 4 -
.../video_filter/deinterlace/deinterlace_x86.asm | 67 ++++++++++++++++
modules/video_filter/deinterlace/helpers.c | 92 ++++------------------
3 files changed, 83 insertions(+), 80 deletions(-)
diff --git a/modules/video_filter/deinterlace/algo_ivtc.c b/modules/video_filter/deinterlace/algo_ivtc.c
index 35f920beeb..5b2a86034f 100644
--- a/modules/video_filter/deinterlace/algo_ivtc.c
+++ b/modules/video_filter/deinterlace/algo_ivtc.c
@@ -25,10 +25,6 @@
# include "config.h"
#endif
-#ifdef CAN_COMPILE_MMXEXT
-# include "mmx.h"
-#endif
-
#include <stdint.h>
#include <assert.h>
diff --git a/modules/video_filter/deinterlace/deinterlace_x86.asm b/modules/video_filter/deinterlace/deinterlace_x86.asm
index 322c268138..e102dd850a 100644
--- a/modules/video_filter/deinterlace/deinterlace_x86.asm
+++ b/modules/video_filter/deinterlace/deinterlace_x86.asm
@@ -26,6 +26,10 @@
%include "x86util.asm"
SECTION_RODATA
+db_threshold:
+times 8 db 10
+dw_motion_thresh:
+times 4 dw (8 * 255 - 1)
SECTION .text
@@ -36,3 +40,66 @@ cglobal emms_ext_asm, 0,0,0
cglobal femms_ext_asm, 0,0,0
femms
RET
+
+INIT_MMX mmx2
+
+%if ARCH_X86_32 || WIN64
+cglobal test_motion_block, 4,5,8
+%define cntr r4d
+%else
+cglobal test_motion_block, 4,7,8
+%define cntr r6d
+%endif
+ pxor m4, m4 ; m4 = score accumulator for the first row of each row pair (pi_top result)
+ pxor m5, m5 ; m5 = score accumulator for the second row of each row pair (pi_bot result)
+ pcmpeqb m6, m6 ; m6 = 0xFF in every byte, used as the psadbw reference below
+ movq m7, [db_threshold] ; m7 = per-byte pixel difference threshold (8 x 10)
+ mov cntr, 4 ; 4 iterations x 2 rows each = 8 rows; cntr is r4d or r6d, see the %if above
+.loop:
+ movq m0, [r1] ; 8 pixels at r1 (2nd argument; p_pix_c in the C prototype)
+ movq m1, [r0] ; 8 pixels at r0 (1st argument; p_pix_p in the C prototype)
+ movq m2, [r1 + r3] ; following row of r1, r3 = 4th argument (i_pitch_curr)
+ movq m3, [r0 + r2] ; following row of r0, r2 = 3rd argument (i_pitch_prev)
+ psubusb m0, m1 ; m0 = saturate([r1] - [r0]), zero where [r0] >= [r1]
+ psubusb m1, [r1] ; m1 = saturate([r0] - [r1]), zero where [r1] >= [r0]
+ psubusb m2, m3 ; same pair of saturated differences for the second row
+ psubusb m3, [r1 + r3]
+ por m0, m1 ; one of the two differences is always 0, so OR gives the absolute difference
+ por m2, m3
+ pxor m1, m1 ; m1 = 0, comparison operand for pcmpeqb
+ psubusb m0, m7 ; non-zero only in bytes whose absolute difference exceeds the threshold
+ psubusb m2, m7
+ pcmpeqb m0, m1 ; 0xFF where difference <= threshold, 0x00 where it is above
+ pcmpeqb m2, m1
+ psadbw m0, m6 ; sum of |byte - 0xFF| = 255 * (number of bytes above threshold), in the low word
+ psadbw m2, m6
+ paddw m4, m0 ; accumulate first-row score
+ paddw m5, m2 ; accumulate second-row score
+
+ lea r1, [r1 + 2 * r3] ; advance both pointers by two rows
+ lea r0, [r0 + 2 * r2]
+ dec cntr
+ jg .loop ; repeat while cntr > 0
+
+ movq m3, m4 ; keep a copy of the first-row score before m4 is turned into a mask
+ pcmpgtw m4, [dw_motion_thresh] ; score > 8*255-1, i.e. at least 8 pixels above threshold
+ paddw m3, m5 ; m3 = combined score; must read m5 before the compare below overwrites it
+ pcmpgtw m5, [dw_motion_thresh]
+ pcmpgtw m3, [dw_motion_thresh]
+ pmovmskb eax, m4 ; bit 0 = MSB of the lowest byte, which belongs to the low (score) word
+ pmovmskb r1d, m5
+ and eax, 1 ; reduce each mask to 0 or 1
+ and r1d, 1
+
+%if ARCH_X86_32 || WIN64
+ mov r2, r4m ; cglobal loaded only 4 arguments here; fetch the 5th (pi_top) from memory
+ mov r3, r5m ; and the 6th (pi_bot); r2/r3 (the pitches) are no longer needed
+ mov [r2], eax ; *pi_top
+ mov [r3], r1d ; *pi_bot
+%else
+ mov [r4], eax ; *pi_top (5th argument)
+ mov [r5], r1d ; *pi_bot (6th argument)
+%endif
+ pmovmskb eax, m3 ; return value: 1 if the combined score passed the threshold, else 0
+ and eax, 1
+ RET
diff --git a/modules/video_filter/deinterlace/helpers.c b/modules/video_filter/deinterlace/helpers.c
index 3d6e97e895..c24464cc16 100644
--- a/modules/video_filter/deinterlace/helpers.c
+++ b/modules/video_filter/deinterlace/helpers.c
@@ -94,6 +94,7 @@ static void FieldFromPlane( plane_t *p_dst, const plane_t *p_src, int i_field )
p_dst->p_pixels += p_src->i_pitch;
}
+// this threshold value is hardcoded in db_threshold in deinterlace_x86.asm
#define T 10
/**
* Internal helper function for EstimateNumBlocksWithMotion():
@@ -120,7 +121,7 @@ static void FieldFromPlane( plane_t *p_dst, const plane_t *p_src, int i_field )
* @see EstimateNumBlocksWithMotion()
*/
static int TestForMotionInBlock( uint8_t *p_pix_p, uint8_t *p_pix_c,
- int i_pitch_prev, int i_pitch_curr,
+ size_t i_pitch_prev, size_t i_pitch_curr,
int* pi_top, int* pi_bot )
{
/* Pixel luma/chroma difference threshold to detect motion. */
@@ -164,6 +165,9 @@ static int TestForMotionInBlock( uint8_t *p_pix_p, uint8_t *p_pix_c,
Stellvia ep. 1, where the cadence would otherwise catch on incorrectly,
leading to more interlacing artifacts than by just using the emergency
mode frame composer.
+
+ Please note that these thresholds are hardcoded as dw_motion_thresh
+ in deinterlace_x86.asm
*/
(*pi_top) = ( i_top_motion >= 8 );
(*pi_bot) = ( i_bot_motion >= 8 );
@@ -172,78 +176,10 @@ static int TestForMotionInBlock( uint8_t *p_pix_p, uint8_t *p_pix_c,
changes "enough". */
return (i_motion >= 8);
}
-
-#ifdef CAN_COMPILE_MMXEXT
-VLC_MMX
-static int TestForMotionInBlockMMX( uint8_t *p_pix_p, uint8_t *p_pix_c,
- int i_pitch_prev, int i_pitch_curr,
- int* pi_top, int* pi_bot )
-{
- int32_t i_motion = 0;
- int32_t i_top_motion = 0;
- int32_t i_bot_motion = 0;
-
- static const mmx_t bT = { .ub = { T, T, T, T, T, T, T, T } };
- pxor_r2r( mm6, mm6 ); /* zero, used in psadbw */
- movq_m2r( bT, mm5 );
-
- pxor_r2r( mm3, mm3 ); /* score (top field) */
- pxor_r2r( mm4, mm4 ); /* score (bottom field) */
- for( int y = 0; y < 8; y+=2 )
- {
- /* top field */
- movq_m2r( *((uint64_t*)p_pix_c), mm0 );
- movq_m2r( *((uint64_t*)p_pix_p), mm1 );
- movq_r2r( mm0, mm2 );
- psubusb_r2r( mm1, mm2 );
- psubusb_r2r( mm0, mm1 );
-
- pcmpgtb_r2r( mm5, mm2 );
- pcmpgtb_r2r( mm5, mm1 );
- psadbw_r2r( mm6, mm2 );
- psadbw_r2r( mm6, mm1 );
-
- paddd_r2r( mm2, mm1 );
- paddd_r2r( mm1, mm3 ); /* add to top field score */
-
- p_pix_c += i_pitch_curr;
- p_pix_p += i_pitch_prev;
-
- /* bottom field - handling identical to top field, except... */
- movq_m2r( *((uint64_t*)p_pix_c), mm0 );
- movq_m2r( *((uint64_t*)p_pix_p), mm1 );
- movq_r2r( mm0, mm2 );
- psubusb_r2r( mm1, mm2 );
- psubusb_r2r( mm0, mm1 );
-
- pcmpgtb_r2r( mm5, mm2 );
- pcmpgtb_r2r( mm5, mm1 );
- psadbw_r2r( mm6, mm2 );
- psadbw_r2r( mm6, mm1 );
-
- paddd_r2r( mm2, mm1 );
- paddd_r2r( mm1, mm4 ); /* ...here we add to bottom field score */
-
- p_pix_c += i_pitch_curr;
- p_pix_p += i_pitch_prev;
- }
- movq_r2r( mm3, mm7 ); /* score (total) */
- paddd_r2r( mm4, mm7 );
- movd_r2m( mm3, i_top_motion );
- movd_r2m( mm4, i_bot_motion );
- movd_r2m( mm7, i_motion );
-
- /* The loop counts actual score * 255. */
- i_top_motion /= 255;
- i_bot_motion /= 255;
- i_motion /= 255;
-
- emms();
-
- (*pi_top) = ( i_top_motion >= 8 );
- (*pi_bot) = ( i_bot_motion >= 8 );
- return (i_motion >= 8);
-}
+#if HAVE_YASM
+int vlcpriv_test_motion_block_mmx2(uint8_t *p_pix_p, uint8_t *p_pix_c,
+ size_t i_pitch_prev, size_t i_pitch_curr,
+ int *pi_top, int *pi_bot);
#endif
#undef T
@@ -392,12 +328,12 @@ int EstimateNumBlocksWithMotion( const picture_t* p_prev,
if( p_prev->i_planes != p_curr->i_planes )
return -1;
- int (*motion_in_block)(uint8_t *, uint8_t *, int , int, int *, int *) =
+ int (*motion_in_block)(uint8_t *, uint8_t *, size_t , size_t, int *, int *) =
TestForMotionInBlock;
/* We must tell our inline helper whether to use MMX acceleration. */
-#ifdef CAN_COMPILE_MMXEXT
+#ifdef HAVE_YASM
if (vlc_CPU_MMXEXT())
- motion_in_block = TestForMotionInBlockMMX;
+ motion_in_block = vlcpriv_test_motion_block_mmx2;
#endif
int i_score = 0;
@@ -437,6 +373,10 @@ int EstimateNumBlocksWithMotion( const picture_t* p_prev,
}
}
}
+#ifdef HAVE_YASM
+ if (vlc_CPU_MMXEXT())
+ vlcpriv_emms_ext_asm();
+#endif
if( pi_top )
(*pi_top) = i_score_top;
--
2.11.0.rc2
More information about the vlc-devel
mailing list