[vlc-devel] [PATCH 8/19] deinterlace: convert MMXEXT-only accelerations to SSE2
Lyndon Brown
jnqnfe at gmail.com
Thu Sep 24 21:38:10 CEST 2020
From: Lyndon Brown <jnqnfe at gmail.com>
Date: Sat, 26 Jan 2019 08:51:36 +0000
Subject: deinterlace: convert MMXEXT-only accelerations to SSE2
(prep work for purging MMX/MMXEXT)
The code changed here has no acceleration beyond MMX/MMXEXT, so to
prepare for the MMX/MMXEXT purge, convert it to SSE2.

Note that this is a basic conversion only: it *uses* SSE2 vector registers
(xmm) but does not actually take advantage of their greater width; doing so
may require significant revision of the code and is thus left for a separate
commit (and/or person) to possibly tackle later. Compared with leaving the
code as it was, note the following benefits:
- switching to SSE registers avoids the MMX<->x87 FP register clash
- it thus avoids the need to issue `emms` instructions
- with a little more work, the full width of the registers could be exploited
- it allows us to proceed with the purge of all old MMX/MMXEXT code

Also, the asm is now inlined, removing the dependency on the mmx.h
abstraction macro set; a rough sketch of the conversion pattern follows.
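
For illustration only (not part of the patch): a minimal sketch of the
pattern applied throughout, where an mmx.h macro sequence such as
movq_m2r / psrlq_r2r / movq_r2m becomes a single inline-asm block on xmm
registers, with no trailing emms() because xmm registers do not alias the
x87 stack. The helper name and the choice to load the shift count inside
the block are assumptions made for the sketch, not code from the patch.

    #include <stdint.h>

    /* Hypothetical helper, for illustration only: shift one 8-byte block
       right by `shift` bits, in the same load / psrlq / store shape used
       by DarkenFieldSSE below (minus the mask step). */
    static inline void shift_qword_right( uint64_t *block, uint64_t shift )
    {
    #if defined(__SSE2__)
        __asm__ volatile (
            "movq  %1, %%xmm1\n"     /* shift count into low 64 bits of xmm1 */
            "movq  %0, %%xmm0\n"     /* load the 8-byte block                */
            "psrlq %%xmm1, %%xmm0\n" /* logical right shift of the qword     */
            "movq  %%xmm0, %0\n"     /* store the result                     */
            : "+m" (*block)
            : "m" (shift)
            : "xmm0", "xmm1" );
    #else
        *block >>= shift;            /* plain C fallback */
    #endif
    }

Calling shift_qword_right( po, i_strength ) would perform the shift core of
one iteration of the luma loop in DarkenFieldSSE; the real loop additionally
masks with remove_high_u64 and keeps the constants resident across iterations.
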
diff --git a/modules/video_filter/deinterlace/algo_phosphor.c b/modules/video_filter/deinterlace/algo_phosphor.c
index 289eed783b..2223f54e8e 100644
--- a/modules/video_filter/deinterlace/algo_phosphor.c
+++ b/modules/video_filter/deinterlace/algo_phosphor.c
@@ -24,8 +24,7 @@
# include "config.h"
#endif
-#ifdef CAN_COMPILE_MMXEXT
-# include "mmx.h"
+#ifdef CAN_COMPILE_SSE
# include <stdalign.h>
#endif
@@ -87,7 +86,7 @@ static void DarkenField( picture_t *p_dst,
For luma, the operation is just a shift + bitwise AND, so we vectorize
even in the C version.
- There is an MMX version too, because it performs about twice faster.
+ There are SIMD versions too, which perform significantly faster.
*/
int i_plane = Y_PLANE;
uint8_t *p_out, *p_out_end;
@@ -120,7 +119,7 @@ static void DarkenField( picture_t *p_dst,
The origin (black) is at YUV = (0, 128, 128) in the uint8 format.
The chroma processing is a bit more complicated than luma,
- and needs MMX for vectorization.
+ and needs SIMD for vectorization.
*/
if( process_chroma )
{
@@ -148,9 +147,11 @@ static void DarkenField( picture_t *p_dst,
} /* if process_chroma */
}
-#ifdef CAN_COMPILE_MMXEXT
-VLC_MMX
-static void DarkenFieldMMX( picture_t *p_dst,
+/* TODO: This is a simple conversion of MMX to using SSE registers,
+ without making use of their expanded width. */
+#ifdef CAN_COMPILE_SSE
+VLC_SSE
+static void DarkenFieldSSE( picture_t *p_dst,
const int i_field, const int i_strength,
bool process_chroma )
{
@@ -181,16 +182,22 @@ static void DarkenFieldMMX( picture_t *p_dst,
uint64_t *po = (uint64_t *)p_out;
int x = 0;
- movq_m2r( i_strength_u64, mm1 );
- movq_m2r( remove_high_u64, mm2 );
+ __asm__ volatile (
+ "movq %0, %%xmm1\n"
+ "movq %1, %%xmm2\n"
+ :: "m" (i_strength_u64), "m" (remove_high_u64)
+ : "xmm1", "xmm2"
+ );
for( ; x < w8; x += 8 )
{
- movq_m2r( (*po), mm0 );
-
- psrlq_r2r( mm1, mm0 );
- pand_r2r( mm2, mm0 );
-
- movq_r2m( mm0, (*po++) );
+ __asm__ volatile (
+ "movq %0, %%xmm0\n"
+ "psrlq %%xmm1, %%xmm0\n"
+ "pand %%xmm2, %%xmm0\n"
+ "movq %%xmm0, %0\n"
+ : "=m" (*po) :: "xmm0", "memory"
+ );
+ po++;
}
/* handle the width remainder */
@@ -228,35 +235,42 @@ static void DarkenFieldMMX( picture_t *p_dst,
int x = 0;
/* See also easy-to-read C version below. */
- static alignas (8) const mmx_t b128 = {
- .uq = 0x8080808080808080ULL
- };
+ const uint64_t b128 = 0x8080808080808080ULL;
- movq_m2r( b128, mm5 );
- movq_m2r( i_strength_u64, mm6 );
- movq_m2r( remove_high_u64, mm7 );
+ __asm__ volatile (
+ "movq %0, %%xmm5\n"
+ "movq %1, %%xmm6\n"
+ "movq %2, %%xmm7\n"
+ :: "m" (b128), "m" (i_strength_u64), "m" (remove_high_u64)
+ : "xmm5", "xmm6", "xmm7"
+ );
uint64_t *po8 = (uint64_t *)p_out;
for( ; x < w8; x += 8 )
{
- movq_m2r( (*po8), mm0 );
+ __asm__ volatile (
+ "movq %0, %%xmm0\n"
- movq_r2r( mm5, mm2 ); /* 128 */
- movq_r2r( mm0, mm1 ); /* copy of data */
- psubusb_r2r( mm2, mm1 ); /* mm1 = max(data - 128, 0) */
- psubusb_r2r( mm0, mm2 ); /* mm2 = max(128 - data, 0) */
+ "movq %%xmm5, %%xmm2\n" /* 128 */
+ "movq %%xmm0, %%xmm1\n" /* copy of data */
+ "psubusb %%xmm2, %%xmm1\n" /* xmm1 = max(data - 128, 0) */
+ "psubusb %%xmm0, %%xmm2\n" /* xmm2 = max(128 - data, 0) */
- /* >> i_strength */
- psrlq_r2r( mm6, mm1 );
- psrlq_r2r( mm6, mm2 );
- pand_r2r( mm7, mm1 );
- pand_r2r( mm7, mm2 );
+ /* >> i_strength */
+ "psrlq %%xmm6, %%xmm1\n"
+ "psrlq %%xmm6, %%xmm2\n"
+ "pand %%xmm7, %%xmm1\n"
+ "pand %%xmm7, %%xmm2\n"
- /* collect results from pos./neg. parts */
- psubb_r2r( mm2, mm1 );
- paddb_r2r( mm5, mm1 );
+ /* collect results from pos./neg. parts */
+ "psubb %%xmm2, %%xmm1\n"
+ "paddb %%xmm5, %%xmm1\n"
- movq_r2m( mm1, (*po8++) );
+ "movq %%xmm1, %0\n"
+
+ : "=m" (*po8) :: "xmm0", "xmm1", "xmm2", "memory"
+ );
+ po8++;
}
/* C version - handle the width remainder */
@@ -266,8 +280,6 @@ static void DarkenFieldMMX( picture_t *p_dst,
} /* for p_out... */
} /* for i_plane... */
} /* if process_chroma */
-
- emms();
}
#endif
@@ -357,9 +369,9 @@ int RenderPhosphor( filter_t *p_filter,
*/
if( p_sys->phosphor.i_dimmer_strength > 0 )
{
-#ifdef CAN_COMPILE_MMXEXT
- if( vlc_CPU_MMXEXT() )
- DarkenFieldMMX( p_dst, !i_field, p_sys->phosphor.i_dimmer_strength,
+#ifdef CAN_COMPILE_SSE
+ if( vlc_CPU_SSE2() )
+ DarkenFieldSSE( p_dst, !i_field, p_sys->phosphor.i_dimmer_strength,
p_sys->chroma->p[1].h.num == p_sys->chroma->p[1].h.den &&
p_sys->chroma->p[2].h.num == p_sys->chroma->p[2].h.den );
else
diff --git a/modules/video_filter/deinterlace/algo_x.c b/modules/video_filter/deinterlace/algo_x.c
index 411351d1cd..09cbbc0acd 100644
--- a/modules/video_filter/deinterlace/algo_x.c
+++ b/modules/video_filter/deinterlace/algo_x.c
@@ -24,10 +24,6 @@
# include "config.h"
#endif
-#ifdef CAN_COMPILE_MMXEXT
-# include "mmx.h"
-#endif
-
#include <stdint.h>
#include <vlc_common.h>
@@ -76,9 +72,13 @@ static inline int XDeint8x8DetectC( uint8_t *src, int i_src )
return fc < 1 ? false : true;
}
-#ifdef CAN_COMPILE_MMXEXT
-VLC_MMX
-static inline int XDeint8x8DetectMMXEXT( uint8_t *src, int i_src )
+
+/* TODO: This is a simple conversion of MMX to using SSE registers,
+ without making use of their expanded width. Would that require
+ migration to a 16x16 processing model though? */
+#ifdef CAN_COMPILE_SSE
+VLC_SSE
+static inline int XDeint8x8DetectSSE( uint8_t *src, int i_src )
{
int y, x;
@@ -87,51 +87,66 @@ static inline int XDeint8x8DetectMMXEXT( uint8_t *src, int i_src )
/* Detect interlacing */
fc = 0;
- pxor_r2r( mm7, mm7 );
+ __asm__ volatile ("pxor %%xmm7, %%xmm7" ::: "xmm7");
for( y = 0; y < 9; y += 2 )
{
ff = fr = 0;
- pxor_r2r( mm5, mm5 );
- pxor_r2r( mm6, mm6 );
+ __asm__ volatile (
+ "pxor %%xmm5, %%xmm5\n"
+ "pxor %%xmm6, %%xmm6\n"
+ ::: "xmm5", "xmm6"
+ );
for( x = 0; x < 8; x+=4 )
{
- movd_m2r( src[ x], mm0 );
- movd_m2r( src[1*i_src+x], mm1 );
- movd_m2r( src[2*i_src+x], mm2 );
- movd_m2r( src[3*i_src+x], mm3 );
-
- punpcklbw_r2r( mm7, mm0 );
- punpcklbw_r2r( mm7, mm1 );
- punpcklbw_r2r( mm7, mm2 );
- punpcklbw_r2r( mm7, mm3 );
-
- movq_r2r( mm0, mm4 );
-
- psubw_r2r( mm1, mm0 );
- psubw_r2r( mm2, mm4 );
-
- psubw_r2r( mm1, mm2 );
- psubw_r2r( mm1, mm3 );
-
- pmaddwd_r2r( mm0, mm0 );
- pmaddwd_r2r( mm4, mm4 );
- pmaddwd_r2r( mm2, mm2 );
- pmaddwd_r2r( mm3, mm3 );
- paddd_r2r( mm0, mm2 );
- paddd_r2r( mm4, mm3 );
- paddd_r2r( mm2, mm5 );
- paddd_r2r( mm3, mm6 );
+ __asm__ volatile (
+ "movd %0, %%xmm0\n"
+ "movd %1, %%xmm1\n"
+ "movd %2, %%xmm2\n"
+ "movd %3, %%xmm3\n"
+
+ "punpcklbw %%xmm7, %%xmm0\n"
+ "punpcklbw %%xmm7, %%xmm1\n"
+ "punpcklbw %%xmm7, %%xmm2\n"
+ "punpcklbw %%xmm7, %%xmm3\n"
+
+ "movq %%xmm0, %%xmm4\n"
+
+ "psubw %%xmm2, %%xmm4\n"
+ "psubw %%xmm1, %%xmm0\n"
+ "psubw %%xmm1, %%xmm2\n"
+ "psubw %%xmm1, %%xmm3\n"
+
+ "pmaddwd %%xmm0, %%xmm0\n"
+ "pmaddwd %%xmm2, %%xmm2\n"
+ "pmaddwd %%xmm3, %%xmm3\n"
+ "pmaddwd %%xmm4, %%xmm4\n"
+ "paddd %%xmm0, %%xmm2\n"
+ "paddd %%xmm4, %%xmm3\n"
+ "paddd %%xmm2, %%xmm5\n"
+ "paddd %%xmm3, %%xmm6\n"
+
+ :: "m" (src[ x]),
+ "m" (src[1*i_src+x]),
+ "m" (src[2*i_src+x]),
+ "m" (src[3*i_src+x])
+ : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
+ );
}
- movq_r2r( mm5, mm0 );
- psrlq_i2r( 32, mm0 );
- paddd_r2r( mm0, mm5 );
- movd_r2m( mm5, fr );
+ __asm__ volatile (
+ "movq %%xmm5, %%xmm0\n"
+ "psrlq $32, %%xmm0\n"
+ "paddd %%xmm0, %%xmm5\n"
+ "movd %%xmm5, %0\n"
- movq_r2r( mm6, mm0 );
- psrlq_i2r( 32, mm0 );
- paddd_r2r( mm0, mm6 );
- movd_r2m( mm6, ff );
+ "movq %%xmm6, %%xmm0\n"
+ "psrlq $32, %%xmm0\n"
+ "paddd %%xmm0, %%xmm6\n"
+ "movd %%xmm6, %1\n"
+
+ : "=m" (fr), "=m" (ff)
+ :: "xmm0", "xmm5", "xmm6", "memory"
+ );
if( ff < 6*fr/8 && fr > 32 )
fc++;
@@ -163,9 +178,12 @@ static inline void XDeint8x8MergeC( uint8_t *dst, int i_dst,
}
}
-#ifdef CAN_COMPILE_MMXEXT
-VLC_MMX
-static inline void XDeint8x8MergeMMXEXT( uint8_t *dst, int i_dst,
+/* TODO: This is a simple conversion of MMX to using SSE registers,
+ without making use of their expanded width. Would that require
+ migration to a 16x16 processing model though? */
+#ifdef CAN_COMPILE_SSE
+VLC_SSE
+static inline void XDeint8x8MergeSSE( uint8_t *dst, int i_dst,
uint8_t *src1, int i_src1,
uint8_t *src2, int i_src2 )
{
@@ -173,37 +191,47 @@ static inline void XDeint8x8MergeMMXEXT( uint8_t *dst, int i_dst,
int y, x;
/* Progressive */
- pxor_r2r( mm7, mm7 );
+ __asm__ volatile (
+ "pxor %%xmm7, %%xmm7\n"
+ "movq %0, %%xmm6\n"
+ :: "m" (m_4) : "xmm6", "xmm7"
+ );
for( y = 0; y < 8; y += 2 )
{
for( x = 0; x < 8; x +=4 )
{
- movd_m2r( src1[x], mm0 );
- movd_r2m( mm0, dst[x] );
-
- movd_m2r( src2[x], mm1 );
- movd_m2r( src1[i_src1+x], mm2 );
-
- punpcklbw_r2r( mm7, mm0 );
- punpcklbw_r2r( mm7, mm1 );
- punpcklbw_r2r( mm7, mm2 );
- paddw_r2r( mm1, mm1 );
- movq_r2r( mm1, mm3 );
- paddw_r2r( mm3, mm3 );
- paddw_r2r( mm2, mm0 );
- paddw_r2r( mm3, mm1 );
- paddw_m2r( m_4, mm1 );
- paddw_r2r( mm1, mm0 );
- psraw_i2r( 3, mm0 );
- packuswb_r2r( mm7, mm0 );
- movd_r2m( mm0, dst[i_dst+x] );
+ __asm__ volatile (
+ "movd %2, %%xmm0\n"
+ "movd %%xmm0, %0\n"
+
+ "movd %3, %%xmm1\n"
+ "movd %4, %%xmm2\n"
+
+ "punpcklbw %%xmm7, %%xmm0\n"
+ "punpcklbw %%xmm7, %%xmm1\n"
+ "punpcklbw %%xmm7, %%xmm2\n"
+ "paddw %%xmm1, %%xmm1\n"
+ "movq %%xmm1, %%xmm3\n"
+ "paddw %%xmm3, %%xmm3\n"
+ "paddw %%xmm2, %%xmm0\n"
+ "paddw %%xmm3, %%xmm1\n"
+ "paddw %%xmm6, %%xmm1\n"
+ "paddw %%xmm1, %%xmm0\n"
+ "psraw $3, %%xmm0\n"
+ "packuswb %%xmm7, %%xmm0\n"
+ "movd %%xmm0, %1\n"
+
+ : "=m" (dst[x]), "=m" (dst[i_dst+x])
+ : "m" (src1[x]), "m" (src2[x]), "m" (src1[i_src1+x])
+ : "xmm0", "xmm1", "xmm2", "xmm3",
+ "memory"
+ );
}
dst += 2*i_dst;
src1 += i_src1;
src2 += i_src2;
}
}
-
#endif
/* XDeint8x8FieldE: Stupid deinterlacing (1,0,1) for block that miss a
@@ -229,9 +257,12 @@ static inline void XDeint8x8FieldEC( uint8_t *dst, int i_dst,
}
}
-#ifdef CAN_COMPILE_MMXEXT
-VLC_MMX
-static inline void XDeint8x8FieldEMMXEXT( uint8_t *dst, int i_dst,
+/* TODO: This is a simple conversion of MMX to using SSE registers,
+ without making use of their expanded width. Would that require
+ migration to a 16x16 processing model though? */
+#ifdef CAN_COMPILE_SSE
+VLC_SSE
+static inline void XDeint8x8FieldESSE( uint8_t *dst, int i_dst,
uint8_t *src, int i_src )
{
int y;
@@ -239,14 +270,21 @@ static inline void XDeint8x8FieldEMMXEXT( uint8_t *dst, int i_dst,
/* Interlaced */
for( y = 0; y < 8; y += 2 )
{
- movq_m2r( src[0], mm0 );
- movq_r2m( mm0, dst[0] );
+ __asm__ volatile (
+ "movq %1, %%xmm0\n"
+ "movq %%xmm0, %0\n"
+ : "=m" (dst[0]) : "m" (src[0])
+ : "xmm0", "memory"
+ );
dst += i_dst;
- movq_m2r( src[2*i_src], mm1 );
- pavgb_r2r( mm1, mm0 );
-
- movq_r2m( mm0, dst[0] );
+ __asm__ volatile (
+ "movq %1, %%xmm1\n"
+ "pavgb %%xmm1, %%xmm0\n"
+ "movq %%xmm0, %0\n"
+ : "=m" (dst[0]) : "m" (src[2*i_src])
+ : "xmm0", "xmm1", "memory"
+ );
dst += 1*i_dst;
src += 2*i_src;
@@ -301,9 +339,12 @@ static inline void XDeint8x8FieldC( uint8_t *dst, int i_dst,
}
}
-#ifdef CAN_COMPILE_MMXEXT
-VLC_MMX
-static inline void XDeint8x8FieldMMXEXT( uint8_t *dst, int i_dst,
+/* TODO: This is a simple conversion of MMX to using SSE registers,
+ without making use of their expanded width. Would that require
+ migration to a 16x16 processing model though? */
+#ifdef CAN_COMPILE_SSE
+VLC_SSE
+static inline void XDeint8x8FieldSSE( uint8_t *dst, int i_dst,
uint8_t *src, int i_src )
{
int y, x;
@@ -319,17 +360,24 @@ static inline void XDeint8x8FieldMMXEXT( uint8_t *dst, int i_dst,
uint8_t *src2 = &src[2*i_src];
int32_t c0, c1, c2;
- movq_m2r( src[x-2], mm0 );
- movq_m2r( src[x-3], mm1 );
- movq_m2r( src[x-4], mm2 );
-
- psadbw_m2r( src2[x-4], mm0 );
- psadbw_m2r( src2[x-3], mm1 );
- psadbw_m2r( src2[x-2], mm2 );
-
- movd_r2m( mm0, c2 );
- movd_r2m( mm1, c1 );
- movd_r2m( mm2, c0 );
+ __asm__ volatile (
+ "movq %3, %%xmm0\n"
+ "movq %4, %%xmm1\n"
+ "movq %5, %%xmm2\n"
+ "movq %6, %%xmm3\n"
+ "movq %7, %%xmm4\n"
+ "movq %8, %%xmm5\n"
+ "psadbw %%xmm3, %%xmm0\n"
+ "psadbw %%xmm4, %%xmm1\n"
+ "psadbw %%xmm5, %%xmm2\n"
+ "movd %%xmm0, %2\n"
+ "movd %%xmm1, %1\n"
+ "movd %%xmm2, %0\n"
+ : "=m" (c0), "=m" (c1), "=m" (c2)
+ : "m" (src[x-2]), "m" (src[x-3]), "m" (src[x-4]),
+ "m" (src2[x-4]), "m" (src2[x-3]), "m" (src2[x-2])
+ : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory"
+ );
if( c0 < c1 && c1 <= c2 )
dst[x] = (src[x-1] + src2[x+1]) >> 1;
@@ -472,9 +520,9 @@ static inline void XDeintBand8x8C( uint8_t *dst, int i_dst,
XDeintNxN( dst, i_dst, src, i_src, i_modx, 8 );
}
-#ifdef CAN_COMPILE_MMXEXT
-VLC_MMX
-static inline void XDeintBand8x8MMXEXT( uint8_t *dst, int i_dst,
+#ifdef CAN_COMPILE_SSE
+VLC_SSE
+static inline void XDeintBand8x8SSE( uint8_t *dst, int i_dst,
uint8_t *src, int i_src,
const int i_mbx, int i_modx )
{
@@ -484,16 +532,16 @@ static inline void XDeintBand8x8MMXEXT( uint8_t *dst, int i_dst,
for( x = 0; x < i_mbx; x++ )
{
int s;
- if( ( s = XDeint8x8DetectMMXEXT( src, i_src ) ) )
+ if( ( s = XDeint8x8DetectSSE( src, i_src ) ) )
{
if( x == 0 || x == i_mbx - 1 )
- XDeint8x8FieldEMMXEXT( dst, i_dst, src, i_src );
+ XDeint8x8FieldESSE( dst, i_dst, src, i_src );
else
- XDeint8x8FieldMMXEXT( dst, i_dst, src, i_src );
+ XDeint8x8FieldSSE( dst, i_dst, src, i_src );
}
else
{
- XDeint8x8MergeMMXEXT( dst, i_dst,
+ XDeint8x8MergeSSE( dst, i_dst,
&src[0*i_src], 2*i_src,
&src[1*i_src], 2*i_src );
}
@@ -515,8 +563,8 @@ int RenderX( filter_t *p_filter, picture_t *p_outpic, picture_t *p_pic )
{
VLC_UNUSED(p_filter);
int i_plane;
-#if defined (CAN_COMPILE_MMXEXT)
- const bool mmxext = vlc_CPU_MMXEXT();
+#if defined (CAN_COMPILE_SSE)
+ const bool sse = vlc_CPU_SSE2();
#endif
/* Copy image and skip lines */
@@ -538,9 +586,9 @@ int RenderX( filter_t *p_filter, picture_t *p_outpic, picture_t *p_pic )
uint8_t *dst = &p_outpic->p[i_plane].p_pixels[8*y*i_dst];
uint8_t *src = &p_pic->p[i_plane].p_pixels[8*y*i_src];
-#ifdef CAN_COMPILE_MMXEXT
- if( mmxext )
- XDeintBand8x8MMXEXT( dst, i_dst, src, i_src, i_mbx, i_modx );
+#ifdef CAN_COMPILE_SSE
+ if( sse )
+ XDeintBand8x8SSE( dst, i_dst, src, i_src, i_mbx, i_modx );
else
#endif
XDeintBand8x8C( dst, i_dst, src, i_src, i_mbx, i_modx );
@@ -565,9 +613,5 @@ int RenderX( filter_t *p_filter, picture_t *p_outpic, picture_t *p_pic )
}
}
-#ifdef CAN_COMPILE_MMXEXT
- if( mmxext )
- emms();
-#endif
return VLC_SUCCESS;
}
diff --git a/modules/video_filter/deinterlace/helpers.c b/modules/video_filter/deinterlace/helpers.c
index a882a408c1..759164c32c 100644
--- a/modules/video_filter/deinterlace/helpers.c
+++ b/modules/video_filter/deinterlace/helpers.c
@@ -24,8 +24,7 @@
# include "config.h"
#endif
-#ifdef CAN_COMPILE_MMXEXT
-# include "mmx.h"
+#ifdef CAN_COMPILE_SSE
# include <stdalign.h>
#endif
@@ -107,9 +106,6 @@ static void FieldFromPlane( plane_t *p_dst, const plane_t *p_src, int i_field )
* For interpretation of pi_top and pi_bot, it is assumed that the block
* starts on an even-numbered line (belonging to the top field).
*
- * The b_mmx parameter avoids the need to call vlc_CPU() separately
- * for each block.
- *
* @param[in] p_pix_p Base pointer to the block in previous picture
* @param[in] p_pix_c Base pointer to the same block in current picture
* @param i_pitch_prev i_pitch of previous picture
@@ -173,9 +169,11 @@ static int TestForMotionInBlock( uint8_t *p_pix_p, uint8_t *p_pix_c,
return (i_motion >= 8);
}
-#ifdef CAN_COMPILE_MMXEXT
-VLC_MMX
-static int TestForMotionInBlockMMX( uint8_t *p_pix_p, uint8_t *p_pix_c,
+/* TODO: This is a simple conversion of MMX to using SSE registers,
+ without making use of their expanded width. */
+#ifdef CAN_COMPILE_SSE
+VLC_SSE
+static int TestForMotionInBlockSSE( uint8_t *p_pix_p, uint8_t *p_pix_c,
int i_pitch_prev, int i_pitch_curr,
int* pi_top, int* pi_bot )
{
@@ -183,63 +181,81 @@ static int TestForMotionInBlockMMX( uint8_t *p_pix_p, uint8_t *p_pix_c,
int32_t i_top_motion = 0;
int32_t i_bot_motion = 0;
- static alignas (8) const mmx_t bT = { .ub = { T, T, T, T, T, T, T, T } };
- pxor_r2r( mm6, mm6 ); /* zero, used in psadbw */
- movq_m2r( bT, mm5 );
+ const uint8_t bT[8] = { T, T, T, T, T, T, T, T };
+ __asm__ volatile (
+ "pxor %%xmm6, %%xmm6\n" /* zero, used in psadbw */
+ "movq %0, %%xmm5\n"
+
+ "pxor %%xmm3, %%xmm3\n" /* score (top field) */
+ "pxor %%xmm4, %%xmm4\n" /* score (bottom field) */
- pxor_r2r( mm3, mm3 ); /* score (top field) */
- pxor_r2r( mm4, mm4 ); /* score (bottom field) */
+ :: "m" (bT) : "xmm3", "xmm4", "xmm5", "xmm6"
+ );
for( int y = 0; y < 8; y+=2 )
{
/* top field */
- movq_m2r( *((uint64_t*)p_pix_c), mm0 );
- movq_m2r( *((uint64_t*)p_pix_p), mm1 );
- movq_r2r( mm0, mm2 );
- psubusb_r2r( mm1, mm2 );
- psubusb_r2r( mm0, mm1 );
+ __asm__ volatile (
+ "movq %0, %%xmm0\n"
+ "movq %1, %%xmm1\n"
+ "movq %%xmm0, %%xmm2\n"
+ "psubusb %%xmm1, %%xmm2\n"
+ "psubusb %%xmm0, %%xmm1\n"
- pcmpgtb_r2r( mm5, mm2 );
- pcmpgtb_r2r( mm5, mm1 );
- psadbw_r2r( mm6, mm2 );
- psadbw_r2r( mm6, mm1 );
+ "pcmpgtb %%xmm5, %%xmm2\n"
+ "pcmpgtb %%xmm5, %%xmm1\n"
+ "psadbw %%xmm6, %%xmm2\n"
+ "psadbw %%xmm6, %%xmm1\n"
- paddd_r2r( mm2, mm1 );
- paddd_r2r( mm1, mm3 ); /* add to top field score */
+ "paddd %%xmm2, %%xmm1\n"
+ "paddd %%xmm1, %%xmm3\n" /* add to top field score */
+
+ :: "m" (*((uint64_t*)p_pix_c)), "m" (*((uint64_t*)p_pix_p))
+ : "xmm0", "xmm1", "xmm2", "xmm3"
+ );
p_pix_c += i_pitch_curr;
p_pix_p += i_pitch_prev;
/* bottom field - handling identical to top field, except... */
- movq_m2r( *((uint64_t*)p_pix_c), mm0 );
- movq_m2r( *((uint64_t*)p_pix_p), mm1 );
- movq_r2r( mm0, mm2 );
- psubusb_r2r( mm1, mm2 );
- psubusb_r2r( mm0, mm1 );
-
- pcmpgtb_r2r( mm5, mm2 );
- pcmpgtb_r2r( mm5, mm1 );
- psadbw_r2r( mm6, mm2 );
- psadbw_r2r( mm6, mm1 );
-
- paddd_r2r( mm2, mm1 );
- paddd_r2r( mm1, mm4 ); /* ...here we add to bottom field score */
+ __asm__ volatile (
+ /* top field */
+ "movq %0, %%xmm0\n"
+ "movq %1, %%xmm1\n"
+ "movq %%xmm0, %%xmm2\n"
+ "psubusb %%xmm1, %%xmm2\n"
+ "psubusb %%xmm0, %%xmm1\n"
+
+ "pcmpgtb %%xmm5, %%xmm2\n"
+ "pcmpgtb %%xmm5, %%xmm1\n"
+ "psadbw %%xmm6, %%xmm2\n"
+ "psadbw %%xmm6, %%xmm1\n"
+
+ "paddd %%xmm2, %%xmm1\n"
+ "paddd %%xmm1, %%xmm4\n" /* ...here we add to bottom field score */
+
+ :: "m" (*((uint64_t*)p_pix_c)), "m" (*((uint64_t*)p_pix_p))
+ : "xmm0", "xmm1", "xmm2", "xmm4"
+ );
p_pix_c += i_pitch_curr;
p_pix_p += i_pitch_prev;
}
- movq_r2r( mm3, mm7 ); /* score (total) */
- paddd_r2r( mm4, mm7 );
- movd_r2m( mm3, i_top_motion );
- movd_r2m( mm4, i_bot_motion );
- movd_r2m( mm7, i_motion );
+ __asm__ volatile (
+ "movq %%xmm3, %%xmm7\n" /* score (total) */
+ "paddd %%xmm4, %%xmm7\n"
+ "movd %%xmm3, %0\n"
+ "movd %%xmm4, %1\n"
+ "movd %%xmm7, %2\n"
+
+ : "=m" (i_top_motion), "=m" (i_bot_motion), "=m" (i_motion)
+ :: "xmm7", "memory"
+ );
/* The loop counts actual score * 255. */
i_top_motion /= 255;
i_bot_motion /= 255;
i_motion /= 255;
- emms();
-
(*pi_top) = ( i_top_motion >= 8 );
(*pi_bot) = ( i_bot_motion >= 8 );
return (i_motion >= 8);
@@ -396,10 +412,10 @@ int EstimateNumBlocksWithMotion( const picture_t* p_prev,
int (*motion_in_block)(uint8_t *, uint8_t *, int , int, int *, int *) =
TestForMotionInBlock;
- /* We must tell our inline helper whether to use MMX acceleration. */
-#ifdef CAN_COMPILE_MMXEXT
- if (vlc_CPU_MMXEXT())
- motion_in_block = TestForMotionInBlockMMX;
+ /* We must tell our inline helper whether to use SSE2 acceleration. */
+#ifdef CAN_COMPILE_SSE
+ if (vlc_CPU_SSE2())
+ motion_in_block = TestForMotionInBlockSSE;
#endif
int i_score = 0;
@@ -451,19 +467,22 @@ int EstimateNumBlocksWithMotion( const picture_t* p_prev,
/* Threshold (value from Transcode 1.1.5) */
#define T 100
-#ifdef CAN_COMPILE_MMXEXT
-VLC_MMX
-static int CalculateInterlaceScoreMMX( const picture_t* p_pic_top,
+/* TODO: This is a simple conversion of MMX to using SSE registers,
+ without making use of their expanded width. */
+#ifdef CAN_COMPILE_SSE
+VLC_SSE
+static int CalculateInterlaceScoreSSE( const picture_t* p_pic_top,
const picture_t* p_pic_bot )
{
assert( p_pic_top->i_planes == p_pic_bot->i_planes );
- /* Amount of bits must be known for MMX, thus int32_t.
+ /* Amount of bits must be known for SSE, thus int32_t.
Doesn't hurt the C implementation. */
- int32_t i_score_mmx = 0; /* this must be divided by 255 when finished */
- int32_t i_score_c = 0; /* this counts as-is (used for non-MMX parts) */
+ int32_t i_score_sse = 0; /* this must be divided by 255 when finished */
+ int32_t i_score_c = 0; /* this counts as-is (used for non-SSE parts) */
- pxor_r2r( mm7, mm7 ); /* we will keep score in mm7 */
+ /* we will keep score in mm7 */
+ __asm__ volatile ("pxor %%xmm7, %%xmm7" ::: "xmm7");
for( int i_plane = 0 ; i_plane < p_pic_top->i_planes ; ++i_plane )
{
@@ -502,43 +521,51 @@ static int CalculateInterlaceScoreMMX( const picture_t* p_pic_top,
# of pixels < (2^32)/255
Note: calculates score * 255
*/
- static alignas (8) const mmx_t b0 = {
- .uq = 0x0000000000000000ULL };
- static alignas (8) const mmx_t b128 = {
- .uq = 0x8080808080808080ULL };
- static alignas (8) const mmx_t bT = {
- .ub = { T, T, T, T, T, T, T, T } };
+ const uint64_t b128 = 0x8080808080808080ULL;
+ const uint8_t bT[8] = { T, T, T, T, T, T, T, T };
for( ; x < w8; x += 8 )
{
- movq_m2r( *((int64_t*)p_c), mm0 );
- movq_m2r( *((int64_t*)p_p), mm1 );
- movq_m2r( *((int64_t*)p_n), mm2 );
-
- psubb_m2r( b128, mm0 );
- psubb_m2r( b128, mm1 );
- psubb_m2r( b128, mm2 );
-
- psubsb_r2r( mm0, mm1 );
- psubsb_r2r( mm0, mm2 );
-
- pxor_r2r( mm3, mm3 );
- pxor_r2r( mm4, mm4 );
- pxor_r2r( mm5, mm5 );
- pxor_r2r( mm6, mm6 );
-
- punpcklbw_r2r( mm1, mm3 );
- punpcklbw_r2r( mm2, mm4 );
- punpckhbw_r2r( mm1, mm5 );
- punpckhbw_r2r( mm2, mm6 );
-
- pmulhw_r2r( mm3, mm4 );
- pmulhw_r2r( mm5, mm6 );
-
- packsswb_r2r(mm4, mm6);
- pcmpgtb_m2r( bT, mm6 );
- psadbw_m2r( b0, mm6 );
- paddd_r2r( mm6, mm7 );
+ __asm__ volatile (
+ "movq %0, %%xmm0\n"
+ "movq %1, %%xmm1\n"
+ "movq %2, %%xmm2\n"
+
+ "movq %3, %%xmm3\n"
+ "psubb %%xmm3, %%xmm0\n"
+ "psubb %%xmm3, %%xmm1\n"
+ "psubb %%xmm3, %%xmm2\n"
+
+ "psubsb %%xmm0, %%xmm1\n"
+ "psubsb %%xmm0, %%xmm2\n"
+
+ "pxor %%xmm3, %%xmm3\n"
+ "pxor %%xmm4, %%xmm4\n"
+ "pxor %%xmm5, %%xmm5\n"
+ "pxor %%xmm6, %%xmm6\n"
+
+ "punpcklbw %%xmm1, %%xmm3\n"
+ "punpcklbw %%xmm2, %%xmm4\n"
+ "punpckhbw %%xmm1, %%xmm5\n"
+ "punpckhbw %%xmm2, %%xmm6\n"
+
+ "pmulhw %%xmm3, %%xmm4\n"
+ "pmulhw %%xmm5, %%xmm6\n"
+
+ "movq %4, %%xmm0\n"
+ "pxor %%xmm1, %%xmm1\n"
+
+ "packsswb %%xmm4, %%xmm6\n"
+ "pcmpgtb %%xmm0, %%xmm6\n"
+ "psadbw %%xmm1, %%xmm6\n"
+ "paddd %%xmm6, %%xmm7\n"
+
+ :: "m" (*((int64_t*)p_c)),
+ "m" (*((int64_t*)p_p)),
+ "m" (*((int64_t*)p_n)),
+ "m" (b128), "m" (bT)
+ : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ );
p_c += 8;
p_p += 8;
@@ -580,10 +607,9 @@ static int CalculateInterlaceScoreMMX( const picture_t* p_pic_top,
}
}
- movd_r2m( mm7, i_score_mmx );
- emms();
+ __asm__ volatile ("movd %%xmm7, %0\n" : "=m" (i_score_sse) :: "memory");
- return i_score_mmx/255 + i_score_c;
+ return i_score_sse/255 + i_score_c;
}
#endif
@@ -607,9 +633,9 @@ int CalculateInterlaceScore( const picture_t* p_pic_top,
if( p_pic_top->i_planes != p_pic_bot->i_planes )
return -1;
-#ifdef CAN_COMPILE_MMXEXT
- if (vlc_CPU_MMXEXT())
- return CalculateInterlaceScoreMMX( p_pic_top, p_pic_bot );
+#ifdef CAN_COMPILE_SSE
+ if (vlc_CPU_SSE2())
+ return CalculateInterlaceScoreSSE( p_pic_top, p_pic_bot );
#endif
int32_t i_score = 0;