[vlc-commits] [Git][videolan/vlc][master] 2 commits: deinterlace: purge MMX/MMXEXT
François Cartegnie (@fcartegnie)
gitlab at videolan.org
Tue Jul 20 15:13:20 UTC 2021
François Cartegnie pushed to branch master at VideoLAN / VLC
Commits:
831fa3cd by Lyndon Brown at 2021-07-20T15:02:06+00:00
deinterlace: purge MMX/MMXEXT
notes:
- this removes all SIMD acceleration for x86/x86_64. originally this work
started by converting the MMX code to SSE2, then purged remaining
artifacts, but a build error on android has blocked that work from
being merged for now. this commit thus takes a different approach of
simply purging the old MMX/MMXEXT code first, with getting the SSE2
implementation working to be done as a follow up.
- the `EndMMX()` function is retained (renamed to `EndSSE()`) because it
is still used under the merge code. the `emms` instruction will be
replaced with an `sfence` instruction separately, as more appropriate.
- - - - -
dd38fdc4 by Lyndon Brown at 2021-07-20T15:02:06+00:00
deinterlace: use sfence instead of emms for SSE2
we're purging all MMX/MMXEXT code; `sfence` is more appropriate.
- - - - -
14 changed files:
- modules/video_filter/Makefile.am
- modules/video_filter/deinterlace/algo_ivtc.c
- modules/video_filter/deinterlace/algo_phosphor.c
- modules/video_filter/deinterlace/algo_x.c
- modules/video_filter/deinterlace/algo_x.h
- modules/video_filter/deinterlace/algo_yadif.c
- modules/video_filter/deinterlace/deinterlace.c
- modules/video_filter/deinterlace/deinterlace.h
- modules/video_filter/deinterlace/helpers.c
- modules/video_filter/deinterlace/merge.c
- modules/video_filter/deinterlace/merge.h
- − modules/video_filter/deinterlace/mmx.h
- modules/video_filter/deinterlace/yadif.h
- modules/video_filter/deinterlace/yadif_x86.asm
Changes:
=====================================
modules/video_filter/Makefile.am
=====================================
@@ -159,7 +159,6 @@ noinst_LTLIBRARIES += libdeinterlace_common.la
libdeinterlace_plugin_la_SOURCES = \
video_filter/deinterlace/deinterlace.c video_filter/deinterlace/deinterlace.h \
- video_filter/deinterlace/mmx.h \
video_filter/deinterlace/merge.c video_filter/deinterlace/merge.h \
video_filter/deinterlace/helpers.c video_filter/deinterlace/helpers.h \
video_filter/deinterlace/algo_basic.c video_filter/deinterlace/algo_basic.h \
=====================================
modules/video_filter/deinterlace/algo_ivtc.c
=====================================
@@ -24,10 +24,6 @@
# include "config.h"
#endif
-#ifdef CAN_COMPILE_MMXEXT
-# include "mmx.h"
-#endif
-
#include <stdint.h>
#include <assert.h>
=====================================
modules/video_filter/deinterlace/algo_phosphor.c
=====================================
@@ -24,11 +24,6 @@
# include "config.h"
#endif
-#ifdef CAN_COMPILE_MMXEXT
-# include "mmx.h"
-# include <stdalign.h>
-#endif
-
#include <stdint.h>
#include <assert.h>
@@ -87,7 +82,7 @@ static void DarkenField( picture_t *p_dst,
For luma, the operation is just a shift + bitwise AND, so we vectorize
even in the C version.
- There is an MMX version too, because it performs about twice faster.
+ There are SIMD versions too, which perform significantly faster.
*/
int i_plane = Y_PLANE;
uint8_t *p_out, *p_out_end;
@@ -120,7 +115,7 @@ static void DarkenField( picture_t *p_dst,
The origin (black) is at YUV = (0, 128, 128) in the uint8 format.
The chroma processing is a bit more complicated than luma,
- and needs MMX for vectorization.
+ and needs SIMD for vectorization.
*/
if( process_chroma )
{
@@ -148,129 +143,6 @@ static void DarkenField( picture_t *p_dst,
} /* if process_chroma */
}
-#ifdef CAN_COMPILE_MMXEXT
-VLC_MMX
-static void DarkenFieldMMX( picture_t *p_dst,
- const int i_field, const int i_strength,
- bool process_chroma )
-{
- assert( p_dst != NULL );
- assert( i_field == 0 || i_field == 1 );
- assert( i_strength >= 1 && i_strength <= 3 );
-
- uint64_t i_strength_u64 = i_strength; /* needs to know number of bits */
- const uint8_t remove_high_u8 = 0xFF >> i_strength;
- const uint64_t remove_high_u64 = remove_high_u8 *
- INT64_C(0x0101010101010101);
-
- int i_plane = Y_PLANE;
- uint8_t *p_out, *p_out_end;
- int w = p_dst->p[i_plane].i_visible_pitch;
- p_out = p_dst->p[i_plane].p_pixels;
- p_out_end = p_out + p_dst->p[i_plane].i_pitch
- * p_dst->p[i_plane].i_visible_lines;
-
- /* skip first line for bottom field */
- if( i_field == 1 )
- p_out += p_dst->p[i_plane].i_pitch;
-
- int wm8 = w % 8; /* remainder */
- int w8 = w - wm8; /* part of width that is divisible by 8 */
- for( ; p_out < p_out_end ; p_out += 2*p_dst->p[i_plane].i_pitch )
- {
- uint64_t *po = (uint64_t *)p_out;
- int x = 0;
-
- movq_m2r( i_strength_u64, mm1 );
- movq_m2r( remove_high_u64, mm2 );
- for( ; x < w8; x += 8 )
- {
- movq_m2r( (*po), mm0 );
-
- psrlq_r2r( mm1, mm0 );
- pand_r2r( mm2, mm0 );
-
- movq_r2m( mm0, (*po++) );
- }
-
- /* handle the width remainder */
- uint8_t *po_temp = (uint8_t *)po;
- for( ; x < w; ++x, ++po_temp )
- (*po_temp) = ( ((*po_temp) >> i_strength) & remove_high_u8 );
- }
-
- /* Process chroma if the field chromas are independent.
-
- The origin (black) is at YUV = (0, 128, 128) in the uint8 format.
- The chroma processing is a bit more complicated than luma,
- and needs MMX for vectorization.
- */
- if( process_chroma )
- {
- for( i_plane++ /* luma already handled */;
- i_plane < p_dst->i_planes;
- i_plane++ )
- {
- w = p_dst->p[i_plane].i_visible_pitch;
- wm8 = w % 8; /* remainder */
- w8 = w - wm8; /* part of width that is divisible by 8 */
-
- p_out = p_dst->p[i_plane].p_pixels;
- p_out_end = p_out + p_dst->p[i_plane].i_pitch
- * p_dst->p[i_plane].i_visible_lines;
-
- /* skip first line for bottom field */
- if( i_field == 1 )
- p_out += p_dst->p[i_plane].i_pitch;
-
- for( ; p_out < p_out_end ; p_out += 2*p_dst->p[i_plane].i_pitch )
- {
- int x = 0;
-
- /* See also easy-to-read C version below. */
- static alignas (8) const mmx_t b128 = {
- .uq = 0x8080808080808080ULL
- };
-
- movq_m2r( b128, mm5 );
- movq_m2r( i_strength_u64, mm6 );
- movq_m2r( remove_high_u64, mm7 );
-
- uint64_t *po8 = (uint64_t *)p_out;
- for( ; x < w8; x += 8 )
- {
- movq_m2r( (*po8), mm0 );
-
- movq_r2r( mm5, mm2 ); /* 128 */
- movq_r2r( mm0, mm1 ); /* copy of data */
- psubusb_r2r( mm2, mm1 ); /* mm1 = max(data - 128, 0) */
- psubusb_r2r( mm0, mm2 ); /* mm2 = max(128 - data, 0) */
-
- /* >> i_strength */
- psrlq_r2r( mm6, mm1 );
- psrlq_r2r( mm6, mm2 );
- pand_r2r( mm7, mm1 );
- pand_r2r( mm7, mm2 );
-
- /* collect results from pos./neg. parts */
- psubb_r2r( mm2, mm1 );
- paddb_r2r( mm5, mm1 );
-
- movq_r2m( mm1, (*po8++) );
- }
-
- /* C version - handle the width remainder */
- uint8_t *po = p_out;
- for( ; x < w; ++x, ++po )
- (*po) = 128 + ( ((*po) - 128) / (1 << i_strength) );
- } /* for p_out... */
- } /* for i_plane... */
- } /* if process_chroma */
-
- emms();
-}
-#endif
-
/*****************************************************************************
* Public functions
*****************************************************************************/
@@ -357,13 +229,6 @@ int RenderPhosphor( filter_t *p_filter,
*/
if( p_sys->phosphor.i_dimmer_strength > 0 )
{
-#ifdef CAN_COMPILE_MMXEXT
- if( vlc_CPU_MMXEXT() )
- DarkenFieldMMX( p_dst, !i_field, p_sys->phosphor.i_dimmer_strength,
- p_sys->chroma->p[1].h.num == p_sys->chroma->p[1].h.den &&
- p_sys->chroma->p[2].h.num == p_sys->chroma->p[2].h.den );
- else
-#endif
DarkenField( p_dst, !i_field, p_sys->phosphor.i_dimmer_strength,
p_sys->chroma->p[1].h.num == p_sys->chroma->p[1].h.den &&
p_sys->chroma->p[2].h.num == p_sys->chroma->p[2].h.den );
=====================================
modules/video_filter/deinterlace/algo_x.c
=====================================
@@ -24,10 +24,6 @@
# include "config.h"
#endif
-#ifdef CAN_COMPILE_MMXEXT
-# include "mmx.h"
-#endif
-
#include <stdint.h>
#include <vlc_common.h>
@@ -76,71 +72,6 @@ static inline int XDeint8x8DetectC( uint8_t *src, int i_src )
return fc < 1 ? false : true;
}
-#ifdef CAN_COMPILE_MMXEXT
-VLC_MMX
-static inline int XDeint8x8DetectMMXEXT( uint8_t *src, int i_src )
-{
-
- int y, x;
- int32_t ff, fr;
- int fc;
-
- /* Detect interlacing */
- fc = 0;
- pxor_r2r( mm7, mm7 );
- for( y = 0; y < 9; y += 2 )
- {
- ff = fr = 0;
- pxor_r2r( mm5, mm5 );
- pxor_r2r( mm6, mm6 );
- for( x = 0; x < 8; x+=4 )
- {
- movd_m2r( src[ x], mm0 );
- movd_m2r( src[1*i_src+x], mm1 );
- movd_m2r( src[2*i_src+x], mm2 );
- movd_m2r( src[3*i_src+x], mm3 );
-
- punpcklbw_r2r( mm7, mm0 );
- punpcklbw_r2r( mm7, mm1 );
- punpcklbw_r2r( mm7, mm2 );
- punpcklbw_r2r( mm7, mm3 );
-
- movq_r2r( mm0, mm4 );
-
- psubw_r2r( mm1, mm0 );
- psubw_r2r( mm2, mm4 );
-
- psubw_r2r( mm1, mm2 );
- psubw_r2r( mm1, mm3 );
-
- pmaddwd_r2r( mm0, mm0 );
- pmaddwd_r2r( mm4, mm4 );
- pmaddwd_r2r( mm2, mm2 );
- pmaddwd_r2r( mm3, mm3 );
- paddd_r2r( mm0, mm2 );
- paddd_r2r( mm4, mm3 );
- paddd_r2r( mm2, mm5 );
- paddd_r2r( mm3, mm6 );
- }
-
- movq_r2r( mm5, mm0 );
- psrlq_i2r( 32, mm0 );
- paddd_r2r( mm0, mm5 );
- movd_r2m( mm5, fr );
-
- movq_r2r( mm6, mm0 );
- psrlq_i2r( 32, mm0 );
- paddd_r2r( mm0, mm6 );
- movd_r2m( mm6, ff );
-
- if( ff < 6*fr/8 && fr > 32 )
- fc++;
-
- src += 2*i_src;
- }
- return fc;
-}
-#endif
static inline void XDeint8x8MergeC( uint8_t *dst, int i_dst,
uint8_t *src1, int i_src1,
@@ -163,49 +94,6 @@ static inline void XDeint8x8MergeC( uint8_t *dst, int i_dst,
}
}
-#ifdef CAN_COMPILE_MMXEXT
-VLC_MMX
-static inline void XDeint8x8MergeMMXEXT( uint8_t *dst, int i_dst,
- uint8_t *src1, int i_src1,
- uint8_t *src2, int i_src2 )
-{
- static const uint64_t m_4 = INT64_C(0x0004000400040004);
- int y, x;
-
- /* Progressive */
- pxor_r2r( mm7, mm7 );
- for( y = 0; y < 8; y += 2 )
- {
- for( x = 0; x < 8; x +=4 )
- {
- movd_m2r( src1[x], mm0 );
- movd_r2m( mm0, dst[x] );
-
- movd_m2r( src2[x], mm1 );
- movd_m2r( src1[i_src1+x], mm2 );
-
- punpcklbw_r2r( mm7, mm0 );
- punpcklbw_r2r( mm7, mm1 );
- punpcklbw_r2r( mm7, mm2 );
- paddw_r2r( mm1, mm1 );
- movq_r2r( mm1, mm3 );
- paddw_r2r( mm3, mm3 );
- paddw_r2r( mm2, mm0 );
- paddw_r2r( mm3, mm1 );
- paddw_m2r( m_4, mm1 );
- paddw_r2r( mm1, mm0 );
- psraw_i2r( 3, mm0 );
- packuswb_r2r( mm7, mm0 );
- movd_r2m( mm0, dst[i_dst+x] );
- }
- dst += 2*i_dst;
- src1 += i_src1;
- src2 += i_src2;
- }
-}
-
-#endif
-
/* XDeint8x8FieldE: Stupid deinterlacing (1,0,1) for block that miss a
* neighbour
* (Use 8x9 pixels)
@@ -229,31 +117,6 @@ static inline void XDeint8x8FieldEC( uint8_t *dst, int i_dst,
}
}
-#ifdef CAN_COMPILE_MMXEXT
-VLC_MMX
-static inline void XDeint8x8FieldEMMXEXT( uint8_t *dst, int i_dst,
- uint8_t *src, int i_src )
-{
- int y;
-
- /* Interlaced */
- for( y = 0; y < 8; y += 2 )
- {
- movq_m2r( src[0], mm0 );
- movq_r2m( mm0, dst[0] );
- dst += i_dst;
-
- movq_m2r( src[2*i_src], mm1 );
- pavgb_r2r( mm1, mm0 );
-
- movq_r2m( mm0, dst[0] );
-
- dst += 1*i_dst;
- src += 2*i_src;
- }
-}
-#endif
-
/* XDeint8x8Field: Edge oriented interpolation
* (Need -4 and +5 pixels H, +1 line)
*/
@@ -271,7 +134,7 @@ static inline void XDeint8x8FieldC( uint8_t *dst, int i_dst,
for( x = 0; x < 8; x++ )
{
uint8_t *src2 = &src[2*i_src];
- /* I use 8 pixels just to match the MMX version, but it's overkill
+ /* I use 8 pixels just to match the SIMD version, but it's overkill
* 5 would be enough (less isn't good) */
const int c0 = abs(src[x-4]-src2[x-2]) + abs(src[x-3]-src2[x-1]) +
abs(src[x-2]-src2[x+0]) + abs(src[x-1]-src2[x+1]) +
@@ -301,50 +164,6 @@ static inline void XDeint8x8FieldC( uint8_t *dst, int i_dst,
}
}
-#ifdef CAN_COMPILE_MMXEXT
-VLC_MMX
-static inline void XDeint8x8FieldMMXEXT( uint8_t *dst, int i_dst,
- uint8_t *src, int i_src )
-{
- int y, x;
-
- /* Interlaced */
- for( y = 0; y < 8; y += 2 )
- {
- memcpy( dst, src, 8 );
- dst += i_dst;
-
- for( x = 0; x < 8; x++ )
- {
- uint8_t *src2 = &src[2*i_src];
- int32_t c0, c1, c2;
-
- movq_m2r( src[x-2], mm0 );
- movq_m2r( src[x-3], mm1 );
- movq_m2r( src[x-4], mm2 );
-
- psadbw_m2r( src2[x-4], mm0 );
- psadbw_m2r( src2[x-3], mm1 );
- psadbw_m2r( src2[x-2], mm2 );
-
- movd_r2m( mm0, c2 );
- movd_r2m( mm1, c1 );
- movd_r2m( mm2, c0 );
-
- if( c0 < c1 && c1 <= c2 )
- dst[x] = (src[x-1] + src2[x+1]) >> 1;
- else if( c2 < c1 && c1 <= c0 )
- dst[x] = (src[x+1] + src2[x-1]) >> 1;
- else
- dst[x] = (src[x+0] + src2[x+0]) >> 1;
- }
-
- dst += 1*i_dst;
- src += 2*i_src;
- }
-}
-#endif
-
/* NxN arbitray size (and then only use pixel in the NxN block)
*/
static inline int XDeintNxNDetect( uint8_t *src, int i_src,
@@ -472,41 +291,6 @@ static inline void XDeintBand8x8C( uint8_t *dst, int i_dst,
XDeintNxN( dst, i_dst, src, i_src, i_modx, 8 );
}
-#ifdef CAN_COMPILE_MMXEXT
-VLC_MMX
-static inline void XDeintBand8x8MMXEXT( uint8_t *dst, int i_dst,
- uint8_t *src, int i_src,
- const int i_mbx, int i_modx )
-{
- int x;
-
- /* Reset current line */
- for( x = 0; x < i_mbx; x++ )
- {
- int s;
- if( ( s = XDeint8x8DetectMMXEXT( src, i_src ) ) )
- {
- if( x == 0 || x == i_mbx - 1 )
- XDeint8x8FieldEMMXEXT( dst, i_dst, src, i_src );
- else
- XDeint8x8FieldMMXEXT( dst, i_dst, src, i_src );
- }
- else
- {
- XDeint8x8MergeMMXEXT( dst, i_dst,
- &src[0*i_src], 2*i_src,
- &src[1*i_src], 2*i_src );
- }
-
- dst += 8;
- src += 8;
- }
-
- if( i_modx )
- XDeintNxN( dst, i_dst, src, i_src, i_modx, 8 );
-}
-#endif
-
/*****************************************************************************
* Public functions
*****************************************************************************/
@@ -515,9 +299,6 @@ int RenderX( filter_t *p_filter, picture_t *p_outpic, picture_t *p_pic )
{
VLC_UNUSED(p_filter);
int i_plane;
-#if defined (CAN_COMPILE_MMXEXT)
- const bool mmxext = vlc_CPU_MMXEXT();
-#endif
/* Copy image and skip lines */
for( i_plane = 0 ; i_plane < p_pic->i_planes ; i_plane++ )
@@ -538,12 +319,7 @@ int RenderX( filter_t *p_filter, picture_t *p_outpic, picture_t *p_pic )
uint8_t *dst = &p_outpic->p[i_plane].p_pixels[8*y*i_dst];
uint8_t *src = &p_pic->p[i_plane].p_pixels[8*y*i_src];
-#ifdef CAN_COMPILE_MMXEXT
- if( mmxext )
- XDeintBand8x8MMXEXT( dst, i_dst, src, i_src, i_mbx, i_modx );
- else
-#endif
- XDeintBand8x8C( dst, i_dst, src, i_src, i_mbx, i_modx );
+ XDeintBand8x8C( dst, i_dst, src, i_src, i_mbx, i_modx );
}
/* Last line (C only)*/
@@ -565,9 +341,5 @@ int RenderX( filter_t *p_filter, picture_t *p_outpic, picture_t *p_pic )
}
}
-#ifdef CAN_COMPILE_MMXEXT
- if( mmxext )
- emms();
-#endif
return VLC_SUCCESS;
}
=====================================
modules/video_filter/deinterlace/algo_x.h
=====================================
@@ -33,13 +33,13 @@ struct picture_t;
/**
* Interpolating deinterlace filter "X".
*
- * The algorithm works on a 8x8 block basic, it copies the top field
+ * The algorithm works on a 8x8 block basis; It copies the top field
* and applies a process to recreate the bottom field.
*
* If a 8x8 block is classified as :
* - progressive: it applies a small blend (1,6,1)
* - interlaced:
- * * in the MMX version: we do a ME between the 2 fields, if there is a
+ * * in the SIMD version: we do a ME between the 2 fields, if there is a
* good match we use MC to recreate the bottom field (with a small
* blend (1,6,1) )
* * otherwise: it recreates the bottom field by an edge oriented
=====================================
modules/video_filter/deinterlace/algo_yadif.c
=====================================
@@ -119,11 +119,6 @@ int RenderYadif( filter_t *p_filter, picture_t *p_dst, picture_t *p_src,
if( vlc_CPU_SSE2() )
filter = vlcpriv_yadif_filter_line_sse2;
else
-#if defined(__i386__)
- if( vlc_CPU_MMXEXT() )
- filter = vlcpriv_yadif_filter_line_mmxext;
- else
-#endif
#endif
filter = yadif_filter_line_c;
=====================================
modules/video_filter/deinterlace/deinterlace.c
=====================================
@@ -558,15 +558,7 @@ notsupp:
if( vlc_CPU_SSE2() )
{
p_sys->pf_merge = pixel_size == 1 ? Merge8BitSSE2 : Merge16BitSSE2;
- p_sys->pf_end_merge = EndMMX;
- }
- else
-#endif
-#if defined(CAN_COMPILE_MMXEXT)
- if( pixel_size == 1 && vlc_CPU_MMXEXT() )
- {
- p_sys->pf_merge = MergeMMXEXT;
- p_sys->pf_end_merge = EndMMX;
+ p_sys->pf_end_merge = EndSSE;
}
else
#endif
=====================================
modules/video_filter/deinterlace/deinterlace.h
=====================================
@@ -68,7 +68,7 @@ typedef struct
{
const vlc_chroma_description_t *chroma;
- /** Merge routine: C, MMX, SSE, ALTIVEC, NEON, ... */
+ /** Merge routine: C, SSE, ALTIVEC, NEON, ... */
void (*pf_merge) ( void *, const void *, const void *, size_t );
#if defined (__i386__) || defined (__x86_64__)
/** Merge finalization routine for SSE */
=====================================
modules/video_filter/deinterlace/helpers.c
=====================================
@@ -24,11 +24,6 @@
# include "config.h"
#endif
-#ifdef CAN_COMPILE_MMXEXT
-# include "mmx.h"
-# include <stdalign.h>
-#endif
-
#include <stdint.h>
#include <assert.h>
@@ -107,9 +102,6 @@ static void FieldFromPlane( plane_t *p_dst, const plane_t *p_src, int i_field )
* For interpretation of pi_top and pi_bot, it is assumed that the block
* starts on an even-numbered line (belonging to the top field).
*
- * The b_mmx parameter avoids the need to call vlc_CPU() separately
- * for each block.
- *
* @param[in] p_pix_p Base pointer to the block in previous picture
* @param[in] p_pix_c Base pointer to the same block in current picture
* @param i_pitch_prev i_pitch of previous picture
@@ -172,79 +164,6 @@ static int TestForMotionInBlock( uint8_t *p_pix_p, uint8_t *p_pix_c,
changes "enough". */
return (i_motion >= 8);
}
-
-#ifdef CAN_COMPILE_MMXEXT
-VLC_MMX
-static int TestForMotionInBlockMMX( uint8_t *p_pix_p, uint8_t *p_pix_c,
- int i_pitch_prev, int i_pitch_curr,
- int* pi_top, int* pi_bot )
-{
- int32_t i_motion = 0;
- int32_t i_top_motion = 0;
- int32_t i_bot_motion = 0;
-
- static alignas (8) const mmx_t bT = { .ub = { T, T, T, T, T, T, T, T } };
- pxor_r2r( mm6, mm6 ); /* zero, used in psadbw */
- movq_m2r( bT, mm5 );
-
- pxor_r2r( mm3, mm3 ); /* score (top field) */
- pxor_r2r( mm4, mm4 ); /* score (bottom field) */
- for( int y = 0; y < 8; y+=2 )
- {
- /* top field */
- movq_m2r( *((uint64_t*)p_pix_c), mm0 );
- movq_m2r( *((uint64_t*)p_pix_p), mm1 );
- movq_r2r( mm0, mm2 );
- psubusb_r2r( mm1, mm2 );
- psubusb_r2r( mm0, mm1 );
-
- pcmpgtb_r2r( mm5, mm2 );
- pcmpgtb_r2r( mm5, mm1 );
- psadbw_r2r( mm6, mm2 );
- psadbw_r2r( mm6, mm1 );
-
- paddd_r2r( mm2, mm1 );
- paddd_r2r( mm1, mm3 ); /* add to top field score */
-
- p_pix_c += i_pitch_curr;
- p_pix_p += i_pitch_prev;
-
- /* bottom field - handling identical to top field, except... */
- movq_m2r( *((uint64_t*)p_pix_c), mm0 );
- movq_m2r( *((uint64_t*)p_pix_p), mm1 );
- movq_r2r( mm0, mm2 );
- psubusb_r2r( mm1, mm2 );
- psubusb_r2r( mm0, mm1 );
-
- pcmpgtb_r2r( mm5, mm2 );
- pcmpgtb_r2r( mm5, mm1 );
- psadbw_r2r( mm6, mm2 );
- psadbw_r2r( mm6, mm1 );
-
- paddd_r2r( mm2, mm1 );
- paddd_r2r( mm1, mm4 ); /* ...here we add to bottom field score */
-
- p_pix_c += i_pitch_curr;
- p_pix_p += i_pitch_prev;
- }
- movq_r2r( mm3, mm7 ); /* score (total) */
- paddd_r2r( mm4, mm7 );
- movd_r2m( mm3, i_top_motion );
- movd_r2m( mm4, i_bot_motion );
- movd_r2m( mm7, i_motion );
-
- /* The loop counts actual score * 255. */
- i_top_motion /= 255;
- i_bot_motion /= 255;
- i_motion /= 255;
-
- emms();
-
- (*pi_top) = ( i_top_motion >= 8 );
- (*pi_bot) = ( i_bot_motion >= 8 );
- return (i_motion >= 8);
-}
-#endif
#undef T
/*****************************************************************************
@@ -396,11 +315,6 @@ int EstimateNumBlocksWithMotion( const picture_t* p_prev,
int (*motion_in_block)(uint8_t *, uint8_t *, int , int, int *, int *) =
TestForMotionInBlock;
- /* We must tell our inline helper whether to use MMX acceleration. */
-#ifdef CAN_COMPILE_MMXEXT
- if (vlc_CPU_MMXEXT())
- motion_in_block = TestForMotionInBlockMMX;
-#endif
int i_score = 0;
for( int i_plane = 0 ; i_plane < p_prev->i_planes ; i_plane++ )
@@ -451,142 +365,6 @@ int EstimateNumBlocksWithMotion( const picture_t* p_prev,
/* Threshold (value from Transcode 1.1.5) */
#define T 100
-#ifdef CAN_COMPILE_MMXEXT
-VLC_MMX
-static int CalculateInterlaceScoreMMX( const picture_t* p_pic_top,
- const picture_t* p_pic_bot )
-{
- assert( p_pic_top->i_planes == p_pic_bot->i_planes );
-
- /* Amount of bits must be known for MMX, thus int32_t.
- Doesn't hurt the C implementation. */
- int32_t i_score_mmx = 0; /* this must be divided by 255 when finished */
- int32_t i_score_c = 0; /* this counts as-is (used for non-MMX parts) */
-
- pxor_r2r( mm7, mm7 ); /* we will keep score in mm7 */
-
- for( int i_plane = 0 ; i_plane < p_pic_top->i_planes ; ++i_plane )
- {
- /* Sanity check */
- if( p_pic_top->p[i_plane].i_visible_lines !=
- p_pic_bot->p[i_plane].i_visible_lines )
- return -1;
-
- const int i_lasty = p_pic_top->p[i_plane].i_visible_lines-1;
- const int w = FFMIN( p_pic_top->p[i_plane].i_visible_pitch,
- p_pic_bot->p[i_plane].i_visible_pitch );
- const int wm8 = w % 8; /* remainder */
- const int w8 = w - wm8; /* part of width that is divisible by 8 */
-
- /* Current line / neighbouring lines picture pointers */
- const picture_t *cur = p_pic_bot;
- const picture_t *ngh = p_pic_top;
- int wc = cur->p[i_plane].i_pitch;
- int wn = ngh->p[i_plane].i_pitch;
-
- /* Transcode 1.1.5 only checks every other line. Checking every line
- works better for anime, which may contain horizontal,
- one pixel thick cartoon outlines.
- */
- for( int y = 1; y < i_lasty; ++y )
- {
- uint8_t *p_c = &cur->p[i_plane].p_pixels[y*wc]; /* this line */
- uint8_t *p_p = &ngh->p[i_plane].p_pixels[(y-1)*wn]; /* prev line */
- uint8_t *p_n = &ngh->p[i_plane].p_pixels[(y+1)*wn]; /* next line */
-
- int x = 0;
-
- /* Easy-to-read C version further below.
-
- Assumptions: 0 < T < 127
- # of pixels < (2^32)/255
- Note: calculates score * 255
- */
- static alignas (8) const mmx_t b0 = {
- .uq = 0x0000000000000000ULL };
- static alignas (8) const mmx_t b128 = {
- .uq = 0x8080808080808080ULL };
- static alignas (8) const mmx_t bT = {
- .ub = { T, T, T, T, T, T, T, T } };
-
- for( ; x < w8; x += 8 )
- {
- movq_m2r( *((int64_t*)p_c), mm0 );
- movq_m2r( *((int64_t*)p_p), mm1 );
- movq_m2r( *((int64_t*)p_n), mm2 );
-
- psubb_m2r( b128, mm0 );
- psubb_m2r( b128, mm1 );
- psubb_m2r( b128, mm2 );
-
- psubsb_r2r( mm0, mm1 );
- psubsb_r2r( mm0, mm2 );
-
- pxor_r2r( mm3, mm3 );
- pxor_r2r( mm4, mm4 );
- pxor_r2r( mm5, mm5 );
- pxor_r2r( mm6, mm6 );
-
- punpcklbw_r2r( mm1, mm3 );
- punpcklbw_r2r( mm2, mm4 );
- punpckhbw_r2r( mm1, mm5 );
- punpckhbw_r2r( mm2, mm6 );
-
- pmulhw_r2r( mm3, mm4 );
- pmulhw_r2r( mm5, mm6 );
-
- packsswb_r2r(mm4, mm6);
- pcmpgtb_m2r( bT, mm6 );
- psadbw_m2r( b0, mm6 );
- paddd_r2r( mm6, mm7 );
-
- p_c += 8;
- p_p += 8;
- p_n += 8;
- }
-
- for( ; x < w; ++x )
- {
- /* Worst case: need 17 bits for "comb". */
- int_fast32_t C = *p_c;
- int_fast32_t P = *p_p;
- int_fast32_t N = *p_n;
-
- /* Comments in Transcode's filter_ivtc.c attribute this
- combing metric to Gunnar Thalin.
-
- The idea is that if the picture is interlaced, both
- expressions will have the same sign, and this comes
- up positive. The value T = 100 has been chosen such
- that a pixel difference of 10 (on average) will
- trigger the detector.
- */
- int_fast32_t comb = (P - C) * (N - C);
- if( comb > T )
- ++i_score_c;
-
- ++p_c;
- ++p_p;
- ++p_n;
- }
-
- /* Now the other field - swap current and neighbour pictures */
- const picture_t *tmp = cur;
- cur = ngh;
- ngh = tmp;
- int tmp_pitch = wc;
- wc = wn;
- wn = tmp_pitch;
- }
- }
-
- movd_r2m( mm7, i_score_mmx );
- emms();
-
- return i_score_mmx/255 + i_score_c;
-}
-#endif
-
/* See header for function doc. */
int CalculateInterlaceScore( const picture_t* p_pic_top,
const picture_t* p_pic_bot )
@@ -607,11 +385,6 @@ int CalculateInterlaceScore( const picture_t* p_pic_top,
if( p_pic_top->i_planes != p_pic_bot->i_planes )
return -1;
-#ifdef CAN_COMPILE_MMXEXT
- if (vlc_CPU_MMXEXT())
- return CalculateInterlaceScoreMMX( p_pic_top, p_pic_bot );
-#endif
-
int32_t i_score = 0;
for( int i_plane = 0 ; i_plane < p_pic_top->i_planes ; ++i_plane )
=====================================
modules/video_filter/deinterlace/merge.c
=====================================
@@ -33,10 +33,6 @@
#include <vlc_cpu.h>
#include "merge.h"
-#ifdef CAN_COMPILE_MMXEXT
-# include "mmx.h"
-#endif
-
#ifdef HAVE_ALTIVEC_H
# include <altivec.h>
#endif
@@ -67,32 +63,6 @@ void Merge16BitGeneric( void *_p_dest, const void *_p_s1,
*p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1;
}
-#if defined(CAN_COMPILE_MMXEXT)
-VLC_MMX
-void MergeMMXEXT( void *_p_dest, const void *_p_s1, const void *_p_s2,
- size_t i_bytes )
-{
- uint8_t *p_dest = _p_dest;
- const uint8_t *p_s1 = _p_s1;
- const uint8_t *p_s2 = _p_s2;
-
- for( ; i_bytes >= 8; i_bytes -= 8 )
- {
- __asm__ __volatile__( "movq %2,%%mm1;"
- "pavgb %1, %%mm1;"
- "movq %%mm1, %0" :"=m" (*p_dest):
- "m" (*p_s1),
- "m" (*p_s2) : "mm1" );
- p_dest += 8;
- p_s1 += 8;
- p_s2 += 8;
- }
-
- for( ; i_bytes > 0; i_bytes-- )
- *p_dest++ = ( *p_s1++ + *p_s2++ ) >> 1;
-}
-#endif
-
#if defined(CAN_COMPILE_SSE)
VLC_SSE
void Merge8BitSSE2( void *_p_dest, const void *_p_s1, const void *_p_s2,
@@ -223,9 +193,9 @@ void MergeAltivec( void *_p_dest, const void *_p_s1,
* EndMerge routines
*****************************************************************************/
-#if defined(CAN_COMPILE_MMXEXT) || defined(CAN_COMPILE_SSE)
-void EndMMX( void )
+#if defined(CAN_COMPILE_SSE2)
+void EndSSE( void )
{
- __asm__ __volatile__( "emms" :: );
+ __asm__ __volatile__( "sfence" ::: "memory" );
}
#endif
=====================================
modules/video_filter/deinterlace/merge.h
=====================================
@@ -112,18 +112,6 @@ void Merge16BitGeneric( void *_p_dest, const void *_p_s1, const void *_p_s2,
void MergeAltivec ( void *, const void *, const void *, size_t );
#endif
-#if defined(CAN_COMPILE_MMXEXT)
-/**
- * MMXEXT routine to blend pixels from two picture lines.
- *
- * @param _p_dest Target
- * @param _p_s1 Source line A
- * @param _p_s2 Source line B
- * @param i_bytes Number of bytes to merge
- */
-void MergeMMXEXT ( void *, const void *, const void *, size_t );
-#endif
-
#if defined(CAN_COMPILE_SSE)
/**
* SSE2 routine to blend pixels from two picture lines.
@@ -175,17 +163,17 @@ void merge16_arm_sve(void *, const void *, const void *, size_t);
* EndMerge routines
*****************************************************************************/
-#if defined(CAN_COMPILE_MMXEXT) || defined(CAN_COMPILE_SSE)
+#if defined(CAN_COMPILE_SSE2)
/**
- * MMX merge finalization routine.
+ * SSE merge finalization routine.
*
- * Must be called after an MMX merge is finished.
- * This exits MMX mode (by executing the "emms" instruction).
+ * Should be called after an SSE merge is finished.
+ * This exits SSE mode (by executing the "sfence" instruction).
*
* The EndMerge() macro detects whether this is needed, and calls if it is,
* so just use that.
*/
-void EndMMX ( void );
+void EndSSE( void );
#endif
#endif
=====================================
modules/video_filter/deinterlace/mmx.h deleted
=====================================
@@ -1,256 +0,0 @@
-/*
- * mmx.h
- * Copyright (C) 1997-1999 H. Dietz and R. Fisher
- *
- * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
- *
- * mpeg2dec is free software; you can redistribute it and/or modify
- * under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * mpeg2dec is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this program; if not, write to the Free Software Foundation,
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-/*
- * The type of an value that fits in an MMX register (note that long
- * long constant values MUST be suffixed by LL and unsigned long long
- * values by ULL, lest they be truncated by the compiler)
- */
-
-#include <stdint.h>
-
-typedef union {
- int64_t q; /* Quadword (64-bit) value */
- uint64_t uq; /* Unsigned Quadword */
- int32_t d[2]; /* 2 Doubleword (32-bit) values */
- uint32_t ud[2]; /* 2 Unsigned Doubleword */
- int16_t w[4]; /* 4 Word (16-bit) values */
- uint16_t uw[4]; /* 4 Unsigned Word */
- int8_t b[8]; /* 8 Byte (8-bit) values */
- uint8_t ub[8]; /* 8 Unsigned Byte */
- float s[2]; /* Single-precision (32-bit) value */
-} mmx_t; /* NOTE: must be on an 8-byte (64-bit) boundary */
-
-
-#define mmx_i2r(op,imm,reg) \
- __asm__ __volatile__ (#op " %0, %%" #reg \
- : /* nothing */ \
- : "i" (imm) \
- : #reg)
-
-#define mmx_m2r(op,mem,reg) \
- __asm__ __volatile__ (#op " %0, %%" #reg \
- : /* nothing */ \
- : "m" (mem) \
- : #reg)
-
-#define mmx_r2m(op,reg,mem) \
- __asm__ __volatile__ (#op " %%" #reg ", %0" \
- : "=m" (mem) \
- : /* nothing */ \
- : "memory")
-
-#define mmx_r2r(op,regs,regd) \
- __asm__ __volatile__ (#op " %%" #regs ", %%" #regd ::: #regd)
-
-
-#define emms() __asm__ __volatile__ ("emms")
-
-#define movd_m2r(var,reg) mmx_m2r (movd, var, reg)
-#define movd_r2m(reg,var) mmx_r2m (movd, reg, var)
-#define movd_r2r(regs,regd) mmx_r2r (movd, regs, regd)
-
-#define movq_m2r(var,reg) mmx_m2r (movq, var, reg)
-#define movq_r2m(reg,var) mmx_r2m (movq, reg, var)
-#define movq_r2r(regs,regd) mmx_r2r (movq, regs, regd)
-
-#define packssdw_m2r(var,reg) mmx_m2r (packssdw, var, reg)
-#define packssdw_r2r(regs,regd) mmx_r2r (packssdw, regs, regd)
-#define packsswb_m2r(var,reg) mmx_m2r (packsswb, var, reg)
-#define packsswb_r2r(regs,regd) mmx_r2r (packsswb, regs, regd)
-
-#define packuswb_m2r(var,reg) mmx_m2r (packuswb, var, reg)
-#define packuswb_r2r(regs,regd) mmx_r2r (packuswb, regs, regd)
-
-#define paddb_m2r(var,reg) mmx_m2r (paddb, var, reg)
-#define paddb_r2r(regs,regd) mmx_r2r (paddb, regs, regd)
-#define paddd_m2r(var,reg) mmx_m2r (paddd, var, reg)
-#define paddd_r2r(regs,regd) mmx_r2r (paddd, regs, regd)
-#define paddw_m2r(var,reg) mmx_m2r (paddw, var, reg)
-#define paddw_r2r(regs,regd) mmx_r2r (paddw, regs, regd)
-
-#define paddsb_m2r(var,reg) mmx_m2r (paddsb, var, reg)
-#define paddsb_r2r(regs,regd) mmx_r2r (paddsb, regs, regd)
-#define paddsw_m2r(var,reg) mmx_m2r (paddsw, var, reg)
-#define paddsw_r2r(regs,regd) mmx_r2r (paddsw, regs, regd)
-
-#define paddusb_m2r(var,reg) mmx_m2r (paddusb, var, reg)
-#define paddusb_r2r(regs,regd) mmx_r2r (paddusb, regs, regd)
-#define paddusw_m2r(var,reg) mmx_m2r (paddusw, var, reg)
-#define paddusw_r2r(regs,regd) mmx_r2r (paddusw, regs, regd)
-
-#define pand_m2r(var,reg) mmx_m2r (pand, var, reg)
-#define pand_r2r(regs,regd) mmx_r2r (pand, regs, regd)
-
-#define pandn_m2r(var,reg) mmx_m2r (pandn, var, reg)
-#define pandn_r2r(regs,regd) mmx_r2r (pandn, regs, regd)
-
-#define pcmpeqb_m2r(var,reg) mmx_m2r (pcmpeqb, var, reg)
-#define pcmpeqb_r2r(regs,regd) mmx_r2r (pcmpeqb, regs, regd)
-#define pcmpeqd_m2r(var,reg) mmx_m2r (pcmpeqd, var, reg)
-#define pcmpeqd_r2r(regs,regd) mmx_r2r (pcmpeqd, regs, regd)
-#define pcmpeqw_m2r(var,reg) mmx_m2r (pcmpeqw, var, reg)
-#define pcmpeqw_r2r(regs,regd) mmx_r2r (pcmpeqw, regs, regd)
-
-#define pcmpgtb_m2r(var,reg) mmx_m2r (pcmpgtb, var, reg)
-#define pcmpgtb_r2r(regs,regd) mmx_r2r (pcmpgtb, regs, regd)
-#define pcmpgtd_m2r(var,reg) mmx_m2r (pcmpgtd, var, reg)
-#define pcmpgtd_r2r(regs,regd) mmx_r2r (pcmpgtd, regs, regd)
-#define pcmpgtw_m2r(var,reg) mmx_m2r (pcmpgtw, var, reg)
-#define pcmpgtw_r2r(regs,regd) mmx_r2r (pcmpgtw, regs, regd)
-
-#define pmaddwd_m2r(var,reg) mmx_m2r (pmaddwd, var, reg)
-#define pmaddwd_r2r(regs,regd) mmx_r2r (pmaddwd, regs, regd)
-
-#define pmulhw_m2r(var,reg) mmx_m2r (pmulhw, var, reg)
-#define pmulhw_r2r(regs,regd) mmx_r2r (pmulhw, regs, regd)
-
-#define pmullw_m2r(var,reg) mmx_m2r (pmullw, var, reg)
-#define pmullw_r2r(regs,regd) mmx_r2r (pmullw, regs, regd)
-
-#define por_m2r(var,reg) mmx_m2r (por, var, reg)
-#define por_r2r(regs,regd) mmx_r2r (por, regs, regd)
-
-#define pslld_i2r(imm,reg) mmx_i2r (pslld, imm, reg)
-#define pslld_m2r(var,reg) mmx_m2r (pslld, var, reg)
-#define pslld_r2r(regs,regd) mmx_r2r (pslld, regs, regd)
-#define psllq_i2r(imm,reg) mmx_i2r (psllq, imm, reg)
-#define psllq_m2r(var,reg) mmx_m2r (psllq, var, reg)
-#define psllq_r2r(regs,regd) mmx_r2r (psllq, regs, regd)
-#define psllw_i2r(imm,reg) mmx_i2r (psllw, imm, reg)
-#define psllw_m2r(var,reg) mmx_m2r (psllw, var, reg)
-#define psllw_r2r(regs,regd) mmx_r2r (psllw, regs, regd)
-
-#define psrad_i2r(imm,reg) mmx_i2r (psrad, imm, reg)
-#define psrad_m2r(var,reg) mmx_m2r (psrad, var, reg)
-#define psrad_r2r(regs,regd) mmx_r2r (psrad, regs, regd)
-#define psraw_i2r(imm,reg) mmx_i2r (psraw, imm, reg)
-#define psraw_m2r(var,reg) mmx_m2r (psraw, var, reg)
-#define psraw_r2r(regs,regd) mmx_r2r (psraw, regs, regd)
-
-#define psrld_i2r(imm,reg) mmx_i2r (psrld, imm, reg)
-#define psrld_m2r(var,reg) mmx_m2r (psrld, var, reg)
-#define psrld_r2r(regs,regd) mmx_r2r (psrld, regs, regd)
-#define psrlq_i2r(imm,reg) mmx_i2r (psrlq, imm, reg)
-#define psrlq_m2r(var,reg) mmx_m2r (psrlq, var, reg)
-#define psrlq_r2r(regs,regd) mmx_r2r (psrlq, regs, regd)
-#define psrlw_i2r(imm,reg) mmx_i2r (psrlw, imm, reg)
-#define psrlw_m2r(var,reg) mmx_m2r (psrlw, var, reg)
-#define psrlw_r2r(regs,regd) mmx_r2r (psrlw, regs, regd)
-
-#define psubb_m2r(var,reg) mmx_m2r (psubb, var, reg)
-#define psubb_r2r(regs,regd) mmx_r2r (psubb, regs, regd)
-#define psubd_m2r(var,reg) mmx_m2r (psubd, var, reg)
-#define psubd_r2r(regs,regd) mmx_r2r (psubd, regs, regd)
-#define psubw_m2r(var,reg) mmx_m2r (psubw, var, reg)
-#define psubw_r2r(regs,regd) mmx_r2r (psubw, regs, regd)
-
-#define psubsb_m2r(var,reg) mmx_m2r (psubsb, var, reg)
-#define psubsb_r2r(regs,regd) mmx_r2r (psubsb, regs, regd)
-#define psubsw_m2r(var,reg) mmx_m2r (psubsw, var, reg)
-#define psubsw_r2r(regs,regd) mmx_r2r (psubsw, regs, regd)
-
-#define psubusb_m2r(var,reg) mmx_m2r (psubusb, var, reg)
-#define psubusb_r2r(regs,regd) mmx_r2r (psubusb, regs, regd)
-#define psubusw_m2r(var,reg) mmx_m2r (psubusw, var, reg)
-#define psubusw_r2r(regs,regd) mmx_r2r (psubusw, regs, regd)
-
-#define punpckhbw_m2r(var,reg) mmx_m2r (punpckhbw, var, reg)
-#define punpckhbw_r2r(regs,regd) mmx_r2r (punpckhbw, regs, regd)
-#define punpckhdq_m2r(var,reg) mmx_m2r (punpckhdq, var, reg)
-#define punpckhdq_r2r(regs,regd) mmx_r2r (punpckhdq, regs, regd)
-#define punpckhwd_m2r(var,reg) mmx_m2r (punpckhwd, var, reg)
-#define punpckhwd_r2r(regs,regd) mmx_r2r (punpckhwd, regs, regd)
-
-#define punpcklbw_m2r(var,reg) mmx_m2r (punpcklbw, var, reg)
-#define punpcklbw_r2r(regs,regd) mmx_r2r (punpcklbw, regs, regd)
-#define punpckldq_m2r(var,reg) mmx_m2r (punpckldq, var, reg)
-#define punpckldq_r2r(regs,regd) mmx_r2r (punpckldq, regs, regd)
-#define punpcklwd_m2r(var,reg) mmx_m2r (punpcklwd, var, reg)
-#define punpcklwd_r2r(regs,regd) mmx_r2r (punpcklwd, regs, regd)
-
-#define pxor_m2r(var,reg) mmx_m2r (pxor, var, reg)
-#define pxor_r2r(regs,regd) mmx_r2r (pxor, regs, regd)
-
-
-/* AMD MMX extensions - also available in intel SSE */
-
-
-#define mmx_m2ri(op,mem,reg,imm) \
- __asm__ __volatile__ (#op " %1, %0, %%" #reg \
- : /* nothing */ \
- : "X" (mem), "X" (imm) \
- : #reg)
-#define mmx_r2ri(op,regs,regd,imm) \
- __asm__ __volatile__ (#op " %0, %%" #regs ", %%" #regd \
- : /* nothing */ \
- : "X" (imm) \
- : #regd)
-
-#define mmx_fetch(mem,hint) \
- __asm__ __volatile__ ("prefetch" #hint " %0" \
- : /* nothing */ \
- : "X" (mem))
-
-
-#define maskmovq(regs,maskreg) mmx_r2ri (maskmovq, regs, maskreg)
-
-#define movntq_r2m(mmreg,var) mmx_r2m (movntq, mmreg, var)
-
-#define pavgb_m2r(var,reg) mmx_m2r (pavgb, var, reg)
-#define pavgb_r2r(regs,regd) mmx_r2r (pavgb, regs, regd)
-#define pavgw_m2r(var,reg) mmx_m2r (pavgw, var, reg)
-#define pavgw_r2r(regs,regd) mmx_r2r (pavgw, regs, regd)
-
-#define pextrw_r2r(mmreg,reg,imm) mmx_r2ri (pextrw, mmreg, reg, imm)
-
-#define pinsrw_r2r(reg,mmreg,imm) mmx_r2ri (pinsrw, reg, mmreg, imm)
-
-#define pmaxsw_m2r(var,reg) mmx_m2r (pmaxsw, var, reg)
-#define pmaxsw_r2r(regs,regd) mmx_r2r (pmaxsw, regs, regd)
-
-#define pmaxub_m2r(var,reg) mmx_m2r (pmaxub, var, reg)
-#define pmaxub_r2r(regs,regd) mmx_r2r (pmaxub, regs, regd)
-
-#define pminsw_m2r(var,reg) mmx_m2r (pminsw, var, reg)
-#define pminsw_r2r(regs,regd) mmx_r2r (pminsw, regs, regd)
-
-#define pminub_m2r(var,reg) mmx_m2r (pminub, var, reg)
-#define pminub_r2r(regs,regd) mmx_r2r (pminub, regs, regd)
-
-#define pmovmskb(mmreg,reg) \
- __asm__ __volatile__ ("movmskps %" #mmreg ", %" #reg : : : #reg)
-
-#define pmulhuw_m2r(var,reg) mmx_m2r (pmulhuw, var, reg)
-#define pmulhuw_r2r(regs,regd) mmx_r2r (pmulhuw, regs, regd)
-
-#define prefetcht0(mem) mmx_fetch (mem, t0)
-#define prefetcht1(mem) mmx_fetch (mem, t1)
-#define prefetcht2(mem) mmx_fetch (mem, t2)
-#define prefetchnta(mem) mmx_fetch (mem, nta)
-
-#define psadbw_m2r(var,reg) mmx_m2r (psadbw, var, reg)
-#define psadbw_r2r(regs,regd) mmx_r2r (psadbw, regs, regd)
-
-#define pshufw_m2r(var,reg,imm) mmx_m2ri(pshufw, var, reg, imm)
-#define pshufw_r2r(regs,regd,imm) mmx_r2ri(pshufw, regs, regd, imm)
-
-#define sfence() __asm__ __volatile__ ("sfence\n\t")
=====================================
modules/video_filter/deinterlace/yadif.h
=====================================
@@ -97,6 +97,3 @@ static void yadif_filter_line_c_16bit(uint8_t *dst8, uint8_t *prev8, uint8_t *cu
void vlcpriv_yadif_filter_line_ssse3(uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next, int w, int prefs, int mrefs, int parity, int mode);
void vlcpriv_yadif_filter_line_sse2(uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next, int w, int prefs, int mrefs, int parity, int mode);
#endif
-#if defined(__i386__)
-void vlcpriv_yadif_filter_line_mmxext(uint8_t *dst, uint8_t *prev, uint8_t *cur, uint8_t *next, int w, int prefs, int mrefs, int parity, int mode);
-#endif
=====================================
modules/video_filter/deinterlace/yadif_x86.asm
=====================================
@@ -248,9 +248,6 @@ cglobal yadif_filter_line, 4, 7, 8, 80, dst, prev, cur, next, w, prefs, \
FILTER 0, curq, nextq
.ret:
-%if mmsize == 8
- emms
-%endif
RET
%if ARCH_X86_32
%undef pb_1
@@ -262,7 +259,3 @@ INIT_XMM ssse3
YADIF
INIT_XMM sse2
YADIF
-%if ARCH_X86_32
-INIT_MMX mmxext
-YADIF
-%endif
View it on GitLab: https://code.videolan.org/videolan/vlc/-/compare/e6bb48cc1568fdebabba810718c8c235abe0b4ab...dd38fdc4f288f0ae24010ca770e6a1b97949274b
--
View it on GitLab: https://code.videolan.org/videolan/vlc/-/compare/e6bb48cc1568fdebabba810718c8c235abe0b4ab...dd38fdc4f288f0ae24010ca770e6a1b97949274b
You're receiving this email because of your account on code.videolan.org.
More information about the vlc-commits
mailing list