[vlc-devel] [PATCH 1/2] Sepia video filter cleanup computation formula is changed, slightly different color tone computed directly from YUV data, should be transformed to RGB too, but I unluckily don't have enough time for that right now
gamajun at seznam.cz
gamajun at seznam.cz
Sun Apr 3 23:38:05 CEST 2011
---
modules/video_filter/sepia.c | 301 ++++++++++++++++++++++++++++++++++-------
1 files changed, 249 insertions(+), 52 deletions(-)
diff --git a/modules/video_filter/sepia.c b/modules/video_filter/sepia.c
index 836d470..dac87cd 100644
--- a/modules/video_filter/sepia.c
+++ b/modules/video_filter/sepia.c
@@ -46,7 +46,8 @@ static void RVSepia( picture_t *, picture_t *, int );
static void PlanarI420Sepia( picture_t *, picture_t *, int);
static void PackedYUVSepia( picture_t *, picture_t *, int);
static picture_t *Filter( filter_t *, picture_t * );
-
+inline void Sepia8ySSE41( uint8_t *, const uint8_t *, volatile uint8_t * );
+inline void Memcpy8BMMX( uint8_t *, const uint8_t * );
static const char *const ppsz_filter_options[] = {
"intensity", NULL
};
@@ -212,44 +213,133 @@ static void PlanarI420Sepia( picture_t *p_pic, picture_t *p_outpic,
// prepared values to copy for U and V channels
const uint8_t filling_const_8u = 128 - i_intensity / 6;
const uint8_t filling_const_8v = 128 + i_intensity / 14;
- /* iterate for every two visible line in the frame */
- for( int y = 0; y < p_pic->p[Y_PLANE].i_visible_lines - 1; y += 2)
+
+ #if defined(CAN_COMPILE_SSE4_1) && 1
+ if (vlc_CPU() & CPU_CAPABILITY_SSE4_1)
{
- const int i_dy_line1_start = y * p_outpic->p[Y_PLANE].i_pitch;
- const int i_dy_line2_start = ( y + 1 ) * p_outpic->p[Y_PLANE].i_pitch;
- const int i_du_line_start = (y/2) * p_outpic->p[U_PLANE].i_pitch;
- const int i_dv_line_start = (y/2) * p_outpic->p[V_PLANE].i_pitch;
- // to prevent sigsegv if one pic is smaller (theoretically)
- int i_picture_size_limit = p_pic->p[Y_PLANE].i_visible_pitch
- < p_outpic->p[Y_PLANE].i_visible_pitch
- ? (p_pic->p[Y_PLANE].i_visible_pitch - 1) :
- (p_outpic->p[Y_PLANE].i_visible_pitch - 1);
+ /*prepare array of values to copy with mmx, compute only once
+ to improve speed */
+ volatile uint8_t intensity_array[8] = { i_intensity, i_intensity,
+ i_intensity, i_intensity, i_intensity, i_intensity,
+ i_intensity, i_intensity };
+ const uint8_t filling_array_8u[8] =
+ { filling_const_8u, filling_const_8u, filling_const_8u,
+ filling_const_8u, filling_const_8u, filling_const_8u,
+ filling_const_8u, filling_const_8u };
+ const uint8_t filling_array_8v[8] =
+ { filling_const_8v, filling_const_8v, filling_const_8v
+ filling_const_8v, filling_const_8v, filling_const_8v,
+ filling_const_8v, filling_const_8v };
+
/* iterate for every two visible line in the frame */
- for( int x = 0; x < i_picture_size_limit; x += 2)
+ for (int y = 0; y < p_pic->p[Y_PLANE].i_visible_lines - 1; y += 2)
{
- // y = y - y/4 {to prevent overflow} + intensity / 4
- p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x] =
- p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x] -
- (p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x] >> 2) +
- (i_intensity >> 2);
- p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 1] =
- p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 1] -
- (p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 1] >> 2) +
- (i_intensity >> 2);
- p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x] =
- p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x] -
- (p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x] >> 2) +
- (i_intensity >> 2);
- p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 1] =
- p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 1] -
- (p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 1] >> 2) +
- (i_intensity >> 2);
- // u = 128 {half => B&W} - intensity / 6
- p_outpic->p[U_PLANE].p_pixels[i_du_line_start + (x / 2)] =
- filling_const_8u;
- // v = 128 {half => B&W} + intensity / 14
- p_outpic->p[V_PLANE].p_pixels[i_dv_line_start + (x / 2)] =
- filling_const_8v;
+ const int i_dy_line1_start = y * p_outpic->p[Y_PLANE].i_pitch;
+ const int i_dy_line2_start =
+ (y + 1) * p_outpic->p[Y_PLANE].i_pitch;
+ const int i_du_line_start =
+ (y / 2) * p_outpic->p[U_PLANE].i_pitch;
+ const int i_dv_line_start =
+ (y / 2) * p_outpic->p[V_PLANE].i_pitch;
+ int x = 0;
+ /* iterate for every visible line in the frame (eight values at once) */
+ for (; x < p_pic->p[Y_PLANE].i_visible_pitch - 15; x += 16)
+ {
+ /* Compute yellow channel values with asm function */
+ Sepia8ySSE41(
+ &p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x],
+ &p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x],
+ intensity_array );
+ Sepia8ySSE41(
+ &p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x],
+ &p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x],
+ intensity_array );
+ Sepia8ySSE41(
+ &p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 8],
+ &p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 8],
+ intensity_array );
+ Sepia8ySSE41(
+ &p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 8],
+ &p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 8],
+ intensity_array );
+ /* Copy precomputed values to destination image memory location */
+ Memcpy8BMMX(
+ &p_outpic->p[U_PLANE].p_pixels[i_du_line_start + (x / 2)],
+ filling_array_8u );
+ Memcpy8BMMX(&p_outpic->p[V_PLANE].p_pixels[i_dv_line_start + (x / 2)],
+ filling_array_8v );
+ }
+ /* Completing the job, the cycle above takes really big chunks, so
+ this makes sure the job will be done completely */
+ for (; x < p_pic->p[Y_PLANE].i_visible_pitch - 1; x += 2)
+ {
+ // y = y - y/4 {to prevent overflow} + intensity / 4
+ p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x] =
+ p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x] -
+ (p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x] >> 2) +
+ (i_intensity >> 2);
+ p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 1] =
+ p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 1] -
+ (p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 1] >> 2) +
+ (i_intensity >> 2);
+ p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x] =
+ p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x] -
+ (p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x] >> 2) +
+ (i_intensity >> 2);
+ p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 1] =
+ p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 1] -
+ (p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 1] >> 2) +
+ (i_intensity >> 2);
+ // u = 128 {half => B&W} - intensity / 6
+ p_outpic->p[U_PLANE].p_pixels[i_du_line_start + (x / 2)] =
+ filling_const_8u;
+ // v = 128 {half => B&W} + intensity / 14
+ p_outpic->p[V_PLANE].p_pixels[i_dv_line_start + (x / 2)] =
+ filling_const_8v;
+ }
+ }
+ } else
+#endif
+ {
+ /* iterate for every two visible line in the frame */
+ for( int y = 0; y < p_pic->p[Y_PLANE].i_visible_lines - 1; y += 2)
+ {
+ const int i_dy_line1_start = y * p_outpic->p[Y_PLANE].i_pitch;
+ const int i_dy_line2_start = ( y + 1 ) * p_outpic->p[Y_PLANE].i_pitch;
+ const int i_du_line_start = (y/2) * p_outpic->p[U_PLANE].i_pitch;
+ const int i_dv_line_start = (y/2) * p_outpic->p[V_PLANE].i_pitch;
+ // to prevent sigsegv if one pic is smaller (theoretically)
+ int i_picture_size_limit = p_pic->p[Y_PLANE].i_visible_pitch
+ < p_outpic->p[Y_PLANE].i_visible_pitch
+ ? (p_pic->p[Y_PLANE].i_visible_pitch - 1) :
+ (p_outpic->p[Y_PLANE].i_visible_pitch - 1);
+ /* iterate for every two visible line in the frame */
+ for( int x = 0; x < i_picture_size_limit; x += 2)
+ {
+ // y = y - y/4 {to prevent overflow} + intensity / 4
+ p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x] =
+ p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x] -
+ (p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x] >> 2) +
+ (i_intensity >> 2);
+ p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 1] =
+ p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 1] -
+ (p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 1] >> 2) +
+ (i_intensity >> 2);
+ p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x] =
+ p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x] -
+ (p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x] >> 2) +
+ (i_intensity >> 2);
+ p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 1] =
+ p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 1] -
+ (p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 1] >> 2) +
+ (i_intensity >> 2);
+ // u = 128 {half => B&W} - intensity / 6
+ p_outpic->p[U_PLANE].p_pixels[i_du_line_start + (x / 2)] =
+ filling_const_8u;
+ // v = 128 {half => B&W} + intensity / 14
+ p_outpic->p[V_PLANE].p_pixels[i_dv_line_start + (x / 2)] =
+ filling_const_8v;
+ }
}
}
}
@@ -278,26 +368,89 @@ static void PackedYUVSepia( picture_t *p_pic, picture_t *p_outpic,
p_in_end = p_in + p_pic->p[0].i_visible_lines
* p_pic->p[0].i_pitch;
p_out = p_outpic->p[0].p_pixels;
+#if defined(CAN_COMPILE_SSE4_1)
+ if (vlc_CPU() & CPU_CAPABILITY_SSE4_1)
+ {
+ /*prepare array of values to copy with mmx, compute only once
+ to improve speed */
+ volatile uint8_t intensity_array[8] = { i_intensity, i_intensity,
+ i_intensity, i_intensity, i_intensity, i_intensity,
+ i_intensity,
+ i_intensity
+ };
+ const uint8_t filling_array_8u[8] =
+ { filling_const_8u, filling_const_8u,
+ filling_const_8u, filling_const_8u, filling_const_8u,
+ filling_const_8u,
+ filling_const_8u, filling_const_8u
+ };
+ const uint8_t filling_array_8v[8] =
+ { filling_const_8v, filling_const_8v,
+ filling_const_8v, filling_const_8v, filling_const_8v,
+ filling_const_8v,
+ filling_const_8v, filling_const_8v
+ };
- while( p_in < p_in_end )
+ /* iterate for every two visible line in the frame */
+ while (p_in < p_in_end)
+ {
+ p_line_end = p_in + p_pic->p[0].i_visible_pitch;
+ while (p_in < p_line_end)
+ {
+ Sepia8ySSE41(&p_out[i_yindex], &p_in[i_yindex],
+ intensity_array);
+ Sepia8ySSE41(&p_out[i_yindex + 8], &p_in[i_yindex + 8],
+ intensity_array);
+ Sepia8ySSE41(&p_out[i_yindex + 16], &p_in[i_yindex + 16],
+ intensity_array);
+ Sepia8ySSE41(&p_out[i_yindex + 24], &p_in[i_yindex + 24],
+ intensity_array);
+ Memcpy8BMMX(&p_out[i_uindex], filling_array_8u);
+ Memcpy8BMMX(&p_out[i_vindex], filling_array_8v);
+
+ p_in += 32;
+ p_out += 32;
+ }
+ while (p_in < p_line_end)
+ {
+ p_out[i_yindex] =
+ p_in[i_yindex] - (p_in[i_yindex] >> 2) +
+ (i_intensity >> 2);
+ p_out[i_yindex + 2] =
+ p_in[i_yindex + 2] - (p_in[i_yindex + 2] >> 2) +
+ (i_intensity >> 2);
+ p_out[i_uindex] = filling_const_8u;
+ p_out[i_vindex] = filling_const_8v;
+ p_in += 4;
+ p_out += 4;
+ }
+ p_in += p_pic->p[0].i_pitch - p_pic->p[0].i_visible_pitch;
+ p_out += p_outpic->p[0].i_pitch
+ - p_outpic->p[0].i_visible_pitch;
+ }
+ } else
+#endif
{
- p_line_end = p_in + p_pic->p[0].i_visible_pitch;
- while( p_in < p_line_end )
+ while( p_in < p_in_end )
{
- /* calculate new, sepia values */
- p_out[i_yindex] =
- p_in[i_yindex] - (p_in[i_yindex] >> 2) + (i_intensity >> 2);
- p_out[i_yindex + 2] =
- p_in[i_yindex + 2] - (p_in[i_yindex + 2] >> 2)
- + (i_intensity >> 2);
- p_out[i_uindex] = filling_const_8u;
- p_out[i_vindex] = filling_const_8v;
- p_in += 4;
- p_out += 4;
+ p_line_end = p_in + p_pic->p[0].i_visible_pitch;
+ while( p_in < p_line_end )
+ {
+ /* calculate new, sepia values */
+ p_out[i_yindex] =
+ p_in[i_yindex] - (p_in[i_yindex] >> 2) + (i_intensity >> 2);
+ p_out[i_yindex + 2] =
+ p_in[i_yindex + 2] - (p_in[i_yindex + 2] >> 2)
+ + (i_intensity >> 2);
+ p_out[i_uindex] = filling_const_8u;
+ p_out[i_vindex] = filling_const_8v;
+ p_in += 4;
+ p_out += 4;
+ }
+ p_in += p_pic->p[0].i_pitch - p_pic->p[0].i_visible_pitch;
+ p_out += p_outpic->p[0].i_pitch
+ - p_outpic->p[0].i_visible_pitch;
}
- p_in += p_pic->p[0].i_pitch - p_pic->p[0].i_visible_pitch;
- p_out += p_outpic->p[0].i_pitch
- - p_outpic->p[0].i_visible_pitch;
}
}
@@ -369,6 +522,50 @@ static void RVSepia( picture_t *p_pic, picture_t *p_outpic, int i_intensity )
#undef FIX
}
+/*****************************************************************************
+ * Sepia8ySSE41
+ *****************************************************************************
+ * This function applies sepia effect to eight bytes of yellow using SSE4.1
+ * instructions. It copies those 8 bytes to 128b register and fills the gaps
+ * with zeroes and following operations are made with word-operating instructs.
+ *****************************************************************************/
+inline void Sepia8ySSE41(uint8_t * dst, const uint8_t * src,
+ volatile uint8_t * i_intensity)
+{
+#if defined(CAN_COMPILE_SSE4_1) && 1
+ __asm__ volatile (
+ "pmovzxbw (%1), %%xmm1\n" // y = y - y / 4 + i_intensity / 4
+ "pmovzxbw (%1), %%xmm2\n" // store bytes as words with 0s in between
+ "pmovzxbw (%2), %%xmm3\n"
+ "psrlw $2, %%xmm2\n" // rotate right 2
+ "psubusb %%xmm1, %%xmm2\n" // subtract
+ "psrlw $2, %%xmm3\n"
+ "paddsb %%xmm1, %%xmm3\n" // add
+ "packuswb %%xmm2, %%xmm1\n" // pack back to bytes
+ "movq %%xmm1, (%0) \n" // load to dest
+ :
+ :"r" (dst), "r"(src), "r"(i_intensity)
+ :"memory");
+#endif
+}
+
+/*****************************************************************************
+ * Memcpy8BMMX: Copies 8 bytes of memory in two instructions
+ *****************************************************************************
+ * Not quite clean, but it should be fast.
+ *****************************************************************************/
+inline void Memcpy8BMMX(uint8_t * dst, const uint8_t * src)
+{
+#if defined(CAN_COMPILE_MMX) && 1
+ __asm__ volatile (
+ "movq (%1), %%xmm0\n"
+ "movq %%xmm0, (%0)\n"
+ :
+ :"r" (dst), "r"(src)
+ :"memory");
+#endif
+}
+
static int FilterCallback ( vlc_object_t *p_this, char const *psz_var,
vlc_value_t oldval, vlc_value_t newval,
void *p_data )
--
1.7.1
More information about the vlc-devel
mailing list