[vlc-devel] [PATCH 17/19] deinterlace: finish up SSE2 enhancement
Lyndon Brown
jnqnfe at gmail.com
Thu Sep 24 21:42:08 CEST 2020
From: Lyndon Brown <jnqnfe at gmail.com>
Date: Sat, 9 Mar 2019 06:45:22 +0000
Subject: deinterlace: finish up SSE2 enhancement
In a previous patch, bits of deinterlace that were MMX/MMXEXT accelerated
only were upgraded to SSE2 in preparation for a complete purge of
MMX/MMXEXT, however this upgrade was minimal, not going as far as actually
making proper use of the wider SSE2 registers; todo notes were left
with regard to this.
This commit addresses those todo notes.
With algo-x, the todos were simply removed; in that case, making use of more
of the SSE2 register width would require moving from the 8x8 processing model
to a 16x16/8x16 one, which would change the results of processing, since the
algorithm decides which transformation to apply to each 8x8 block based on
the content of that individual 8x8 block.
diff --git a/modules/video_filter/deinterlace/algo_phosphor.c b/modules/video_filter/deinterlace/algo_phosphor.c
index 2223f54e8e..e1ff0178ad 100644
--- a/modules/video_filter/deinterlace/algo_phosphor.c
+++ b/modules/video_filter/deinterlace/algo_phosphor.c
@@ -147,8 +147,6 @@ static void DarkenField( picture_t *p_dst,
} /* if process_chroma */
}
-/* TODO: This is a simple conversion of MMX to using SSE registers,
- without making use of their expanded width. */
#ifdef CAN_COMPILE_SSE
VLC_SSE
static void DarkenFieldSSE( picture_t *p_dst,
@@ -159,10 +157,8 @@ static void DarkenFieldSSE( picture_t *p_dst,
assert( i_field == 0 || i_field == 1 );
assert( i_strength >= 1 && i_strength <= 3 );
- uint64_t i_strength_u64 = i_strength; /* needs to know number of bits */
const uint8_t remove_high_u8 = 0xFF >> i_strength;
- const uint64_t remove_high_u64 = remove_high_u8 *
- INT64_C(0x0101010101010101);
+ const uint32_t remove_high_u32 = remove_high_u8 * 0x01010101;
int i_plane = Y_PLANE;
uint8_t *p_out, *p_out_end;
@@ -175,26 +171,27 @@ static void DarkenFieldSSE( picture_t *p_dst,
if( i_field == 1 )
p_out += p_dst->p[i_plane].i_pitch;
- int wm8 = w % 8; /* remainder */
- int w8 = w - wm8; /* part of width that is divisible by 8 */
+ int wm16 = w % 16; /* remainder */
+ int w16 = w - wm16; /* part of width that is divisible by 16 */
for( ; p_out < p_out_end ; p_out += 2*p_dst->p[i_plane].i_pitch )
{
- uint64_t *po = (uint64_t *)p_out;
+ uint8_t *po = p_out;
int x = 0;
__asm__ volatile (
- "movq %0, %%xmm1\n"
- "movq %1, %%xmm2\n"
- :: "m" (i_strength_u64), "m" (remove_high_u64)
+ "movd %0, %%xmm1\n"
+ "movd %1, %%xmm2\n"
+ "pshufd $0, %%xmm2, %%xmm2\n" /* duplicate 32-bits across reg */
+ :: "m" (i_strength), "m" (remove_high_u32)
: "xmm1", "xmm2"
);
- for( ; x < w8; x += 8 )
+ for( ; x < w16; x += 16 )
{
__asm__ volatile (
- "movq %0, %%xmm0\n"
+ "movdqu %0, %%xmm0\n"
"psrlq %%xmm1, %%xmm0\n"
"pand %%xmm2, %%xmm0\n"
- "movq %%xmm0, %0\n"
+ "movdqu %%xmm0, %0\n"
: "=m" (*po) :: "xmm0", "memory"
);
po++;
@@ -210,7 +207,7 @@ static void DarkenFieldSSE( picture_t *p_dst,
The origin (black) is at YUV = (0, 128, 128) in the uint8 format.
The chroma processing is a bit more complicated than luma,
- and needs MMX for vectorization.
+ and needs SSE2 for vectorization.
*/
if( process_chroma )
{
@@ -219,8 +216,8 @@ static void DarkenFieldSSE( picture_t *p_dst,
i_plane++ )
{
w = p_dst->p[i_plane].i_visible_pitch;
- wm8 = w % 8; /* remainder */
- w8 = w - wm8; /* part of width that is divisible by 8 */
+ wm16 = w % 16; /* remainder */
+ w16 = w - wm16; /* part of width that is divisible by 16 */
p_out = p_dst->p[i_plane].p_pixels;
p_out_end = p_out + p_dst->p[i_plane].i_pitch
@@ -234,25 +231,25 @@ static void DarkenFieldSSE( picture_t *p_dst,
{
int x = 0;
- /* See also easy-to-read C version below. */
- const uint64_t b128 = 0x8080808080808080ULL;
-
__asm__ volatile (
- "movq %0, %%xmm5\n"
- "movq %1, %%xmm6\n"
- "movq %2, %%xmm7\n"
- :: "m" (b128), "m" (i_strength_u64), "m" (remove_high_u64)
- : "xmm5", "xmm6", "xmm7"
+ "mov $0x80808080, %%eax\n"
+ "movd %%eax, %%xmm5\n"
+ "pshufd $0, %%xmm5, %%xmm5\n" /* 128 pattern */
+ "movd %0, %%xmm6\n"
+ "movd %1, %%xmm7\n"
+ "pshufd $0, %%xmm7, %%xmm7\n" /* duplicate 32-bits across reg */
+ :: "m" (i_strength), "m" (remove_high_u32)
+ : "eax", "xmm5", "xmm6", "xmm7"
);
- uint64_t *po8 = (uint64_t *)p_out;
- for( ; x < w8; x += 8 )
+ uint8_t *po16 = p_out;
+ for( ; x < w16; x += 16 )
{
__asm__ volatile (
- "movq %0, %%xmm0\n"
+ "movdqu %0, %%xmm0\n"
- "movq %%xmm5, %%xmm2\n" /* 128 */
- "movq %%xmm0, %%xmm1\n" /* copy of data */
+ "movdqa %%xmm5, %%xmm2\n" /* 128 */
+ "movdqa %%xmm0, %%xmm1\n" /* copy of data */
"psubusb %%xmm2, %%xmm1\n" /* xmm1 = max(data - 128, 0) */
"psubusb %%xmm0, %%xmm2\n" /* xmm2 = max(128 - data, 0) */
@@ -266,11 +263,11 @@ static void DarkenFieldSSE( picture_t *p_dst,
"psubb %%xmm2, %%xmm1\n"
"paddb %%xmm5, %%xmm1\n"
- "movq %%xmm1, %0\n"
+ "movdqu %%xmm1, %0\n"
- : "=m" (*po8) :: "xmm0", "xmm1", "xmm2", "memory"
+ : "=m" (*po16) :: "xmm0", "xmm1", "xmm2", "memory"
);
- po8++;
+ po16++;
}
/* C version - handle the width remainder */
diff --git a/modules/video_filter/deinterlace/algo_x.c b/modules/video_filter/deinterlace/algo_x.c
index 09cbbc0acd..6b6e674070 100644
--- a/modules/video_filter/deinterlace/algo_x.c
+++ b/modules/video_filter/deinterlace/algo_x.c
@@ -73,9 +73,6 @@ static inline int XDeint8x8DetectC( uint8_t *src, int i_src )
return fc < 1 ? false : true;
}
-/* TODO: This is a simple conversion of MMX to using SSE registers,
- without making use of their expanded width. Would that require
- migration to a 16x16 processing model though? */
#ifdef CAN_COMPILE_SSE
VLC_SSE
static inline int XDeint8x8DetectSSE( uint8_t *src, int i_src )
@@ -178,9 +175,6 @@ static inline void XDeint8x8MergeC( uint8_t *dst, int i_dst,
}
}
-/* TODO: This is a simple conversion of MMX to using SSE registers,
- without making use of their expanded width. Would that require
- migration to a 16x16 processing model though? */
#ifdef CAN_COMPILE_SSE
VLC_SSE
static inline void XDeint8x8MergeSSE( uint8_t *dst, int i_dst,
@@ -257,9 +251,6 @@ static inline void XDeint8x8FieldEC( uint8_t *dst, int i_dst,
}
}
-/* TODO: This is a simple conversion of MMX to using SSE registers,
- without making use of their expanded width. Would that require
- migration to a 16x16 processing model though? */
#ifdef CAN_COMPILE_SSE
VLC_SSE
static inline void XDeint8x8FieldESSE( uint8_t *dst, int i_dst,
@@ -309,7 +300,7 @@ static inline void XDeint8x8FieldC( uint8_t *dst, int i_dst,
for( x = 0; x < 8; x++ )
{
uint8_t *src2 = &src[2*i_src];
- /* I use 8 pixels just to match the MMX version, but it's overkill
+ /* I use 8 pixels just to match the SIMD version, but it's overkill
* 5 would be enough (less isn't good) */
const int c0 = abs(src[x-4]-src2[x-2]) + abs(src[x-3]-src2[x-1]) +
abs(src[x-2]-src2[x+0]) + abs(src[x-1]-src2[x+1]) +
@@ -339,9 +330,6 @@ static inline void XDeint8x8FieldC( uint8_t *dst, int i_dst,
}
}
-/* TODO: This is a simple conversion of MMX to using SSE registers,
- without making use of their expanded width. Would that require
- migration to a 16x16 processing model though? */
#ifdef CAN_COMPILE_SSE
VLC_SSE
static inline void XDeint8x8FieldSSE( uint8_t *dst, int i_dst,
diff --git a/modules/video_filter/deinterlace/algo_x.h b/modules/video_filter/deinterlace/algo_x.h
index dd70cbd732..1667d663ba 100644
--- a/modules/video_filter/deinterlace/algo_x.h
+++ b/modules/video_filter/deinterlace/algo_x.h
@@ -33,13 +33,13 @@ struct picture_t;
/**
* Interpolating deinterlace filter "X".
*
- * The algorithm works on a 8x8 block basic, it copies the top field
+ * The algorithm works on a 8x8 block basis; It copies the top field
* and applies a process to recreate the bottom field.
*
* If a 8x8 block is classified as :
* - progressive: it applies a small blend (1,6,1)
* - interlaced:
- * * in the MMX version: we do a ME between the 2 fields, if there is a
+ * * in the SIMD version: we do a ME between the 2 fields, if there is a
* good match we use MC to recreate the bottom field (with a small
* blend (1,6,1) )
* * otherwise: it recreates the bottom field by an edge oriented
diff --git a/modules/video_filter/deinterlace/helpers.c b/modules/video_filter/deinterlace/helpers.c
index 759164c32c..1b02dd8d04 100644
--- a/modules/video_filter/deinterlace/helpers.c
+++ b/modules/video_filter/deinterlace/helpers.c
@@ -169,8 +169,7 @@ static int TestForMotionInBlock( uint8_t *p_pix_p, uint8_t *p_pix_c,
return (i_motion >= 8);
}
-/* TODO: This is a simple conversion of MMX to using SSE registers,
- without making use of their expanded width. */
+/* Note: This examines an 8x8 block just like the C equivalent */
#ifdef CAN_COMPILE_SSE
VLC_SSE
static int TestForMotionInBlockSSE( uint8_t *p_pix_p, uint8_t *p_pix_c,
@@ -467,8 +466,6 @@ int EstimateNumBlocksWithMotion( const picture_t* p_prev,
/* Threshold (value from Transcode 1.1.5) */
#define T 100
-/* TODO: This is a simple conversion of MMX to using SSE registers,
- without making use of their expanded width. */
#ifdef CAN_COMPILE_SSE
VLC_SSE
static int CalculateInterlaceScoreSSE( const picture_t* p_pic_top,
@@ -494,8 +491,8 @@ static int CalculateInterlaceScoreSSE( const picture_t* p_pic_top,
const int i_lasty = p_pic_top->p[i_plane].i_visible_lines-1;
const int w = FFMIN( p_pic_top->p[i_plane].i_visible_pitch,
p_pic_bot->p[i_plane].i_visible_pitch );
- const int wm8 = w % 8; /* remainder */
- const int w8 = w - wm8; /* part of width that is divisible by 8 */
+ const int wm16 = w % 16; /* remainder */
+ const int w16 = w - wm16; /* part of width that is divisible by 16 */
/* Current line / neighbouring lines picture pointers */
const picture_t *cur = p_pic_bot;
@@ -521,17 +518,20 @@ static int CalculateInterlaceScoreSSE( const picture_t* p_pic_top,
# of pixels < (2^32)/255
Note: calculates score * 255
*/
- const uint64_t b128 = 0x8080808080808080ULL;
- const uint8_t bT[8] = { T, T, T, T, T, T, T, T };
+ const uint8_t bT[16] = { T, T, T, T, T, T, T, T,
+ T, T, T, T, T, T, T, T };
- for( ; x < w8; x += 8 )
+ for( ; x < w16; x += 16 )
{
__asm__ volatile (
- "movq %0, %%xmm0\n"
- "movq %1, %%xmm1\n"
- "movq %2, %%xmm2\n"
+ "movdqu %0, %%xmm0\n"
+ "movdqu %1, %%xmm1\n"
+ "movdqu %2, %%xmm2\n"
+
+ "mov $0x80808080, %%eax\n"
+ "movd %%eax, %%xmm3\n"
+ "pshufd $0, %%xmm3, %%xmm3\n" /* 128 pattern */
- "movq %3, %%xmm3\n"
"psubb %%xmm3, %%xmm0\n"
"psubb %%xmm3, %%xmm1\n"
"psubb %%xmm3, %%xmm2\n"
@@ -552,7 +552,7 @@ static int CalculateInterlaceScoreSSE( const picture_t* p_pic_top,
"pmulhw %%xmm3, %%xmm4\n"
"pmulhw %%xmm5, %%xmm6\n"
- "movq %4, %%xmm0\n"
+ "movq %3, %%xmm0\n"
"pxor %%xmm1, %%xmm1\n"
"packsswb %%xmm4, %%xmm6\n"
@@ -560,16 +560,14 @@ static int CalculateInterlaceScoreSSE( const picture_t* p_pic_top,
"psadbw %%xmm1, %%xmm6\n"
"paddd %%xmm6, %%xmm7\n"
- :: "m" (*((int64_t*)p_c)),
- "m" (*((int64_t*)p_p)),
- "m" (*((int64_t*)p_n)),
- "m" (b128), "m" (bT)
- : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ :: "m" (*p_c), "m" (*p_p), "m" (*p_n), "m" (bT)
+ : "eax", "xmm0", "xmm1", "xmm2", "xmm3",
+ "xmm4", "xmm5", "xmm6", "xmm7"
);
- p_c += 8;
- p_p += 8;
- p_n += 8;
+ p_c += 16;
+ p_p += 16;
+ p_n += 16;
}
for( ; x < w; ++x )
More information about the vlc-devel
mailing list