[vlc-devel] [PATCH 2/2] Sepia video filter assembly speed up. Added functions Sepia8ySSE41 and Memcpy8BMMX to use the new formulas of sepia computation. Uses SSE4.1 and works with chunks of 8 bytes.

Sun Apr 3 19:35:02 CEST 2011

From: Martin Briza <gamajun at seznam.cz>

---
 modules/video_filter/sepia.c |  202 ++++++++++++++++++++++++++++++++++++++++++
 1 files changed, 202 insertions(+), 0 deletions(-)

diff --git a/modules/video_filter/sepia.c b/modules/video_filter/sepia.c
index cc6df5c..7002229 100644
--- a/modules/video_filter/sepia.c
+++ b/modules/video_filter/sepia.c
@@ -48,6 +48,9 @@ static void RVSepia(picture_t *, picture_t *, int);
 static void PlanarI420Sepia(picture_t *, picture_t *, int);
 static void PackedYUVSepia(picture_t *, picture_t *, int);
 static picture_t *Filter(filter_t *, picture_t *);
+static inline void Sepia8ySSE41(uint8_t * dst, const uint8_t * src,    /* file-local helper: static so a non-inlined call still links */
+               volatile uint8_t * i_intensity);
+static inline void Memcpy8BMMX(uint8_t * dst, const uint8_t * src);    /* file-local helper, same linkage reasoning */
 static const char *const ppsz_filter_options[] = {
     "intensity", NULL
 };
@@ -202,6 +205,44 @@ static picture_t *Filter(filter_t * p_filter, picture_t * p_pic)
 }
 
 /*****************************************************************************
+ * Sepia8ySSE41
+ *****************************************************************************
+ * This function applies sepia effect to eight bytes of luma (Y) using SSE4.1
+ * instructions. It copies those 8 bytes to a 128-bit register, filling the gaps
+ * with zeroes; the following operations use word-operating instructions.
+ *****************************************************************************/
+inline void Sepia8ySSE41(uint8_t * dst, const uint8_t * src,
+               volatile uint8_t * i_intensity)
+{
+#if defined(CAN_COMPILE_SSE4_1) && 1
+    __asm__ volatile ("pmovzxbw      (%1),   %%xmm1\n"    // y = y - y / 4 + i_intensity / 4; xmm1 = 8 src bytes widened to words
+              "pmovzxbw      (%1),   %%xmm2\n"    // second copy of the widened bytes, becomes y / 4
+              "pmovzxbw      (%2),   %%xmm3\n" "psrlw          $2,    %%xmm2\n"    // xmm3 = intensity words; xmm2 = y >> 2 (logical shift)
+              "psubusw       %%xmm2, %%xmm1\n"    // xmm1 = y - y / 4 (AT&T order: destination is the last operand)
+              "psrlw          $2,    %%xmm3\n" "paddusw       %%xmm3, %%xmm1\n"    // xmm1 += intensity >> 2; at most 192 + 63, fits a byte
+              "packuswb      %%xmm1, %%xmm1\n"    // pack the 8 result words back to bytes
+              "movq          %%xmm1, (%0)  \n"    // store the low 8 bytes to dest
+              ::"r" (dst), "r"(src), "r"(i_intensity)
+              :"memory", "xmm1", "xmm2", "xmm3");    // the xmm registers are overwritten, so declare them
+#endif
+}
+
+/*****************************************************************************
+ * Memcpy8BMMX: Copies 8 bytes of memory in two instructions
+ *****************************************************************************
+ * Not quite clean, but it should be fast.
+ *****************************************************************************/
+inline void Memcpy8BMMX(uint8_t * dst, const uint8_t * src)
+{
+#if defined(CAN_COMPILE_SSE4_1) && 1    // movq through %xmm0 is SSE2, not MMX; use the same guard as the call sites
+    __asm__ volatile ("movq       (%1), %%xmm0\n"    // load 8 bytes from src
+              "movq       %%xmm0, (%0)\n"::"r" (dst), "r"(src)    // store those 8 bytes to dst
+              :"memory", "xmm0");    // xmm0 is overwritten, so declare it
+#endif
+}
+
+
+/*****************************************************************************
  * PlanarI420Sepia: Applies sepia to one frame of the planar I420 video
  *****************************************************************************
  * This function applies sepia effect to one frame of the video by iterating
@@ -216,6 +257,104 @@ static void PlanarI420Sepia(picture_t * p_pic, picture_t * p_outpic,
     const uint8_t filling_const_8u = 128 - i_intensity / 6;
     const uint8_t filling_const_8v = 128 + i_intensity / 14;
 
+#if defined(CAN_COMPILE_SSE4_1) && 1
+    if (vlc_CPU() & CPU_CAPABILITY_SSE4_1) {
+    /*prepare array of values to copy with mmx, compute only once
+       to improve speed */
+    volatile uint8_t intensity_array[8] = { i_intensity, i_intensity,
+        i_intensity, i_intensity, i_intensity, i_intensity,
+        i_intensity,
+        i_intensity
+    };
+    const uint8_t filling_array_8u[8] =
+        { filling_const_8u, filling_const_8u,
+        filling_const_8u, filling_const_8u, filling_const_8u,
+        filling_const_8u,
+        filling_const_8u, filling_const_8u
+    };
+    const uint8_t filling_array_8v[8] =
+        { filling_const_8v, filling_const_8v,
+        filling_const_8v, filling_const_8v, filling_const_8v,
+        filling_const_8v,
+        filling_const_8v, filling_const_8v
+    };
+
+    /* iterate for every two visible line in the frame */
+    for (int y = 0; y < p_pic->p[Y_PLANE].i_visible_lines - 1; y += 2) {
+        const int i_dy_line1_start = y * p_outpic->p[Y_PLANE].i_pitch;
+        const int i_dy_line2_start =
+        (y + 1) * p_outpic->p[Y_PLANE].i_pitch;
+        const int i_du_line_start =
+        (y / 2) * p_outpic->p[U_PLANE].i_pitch;
+        const int i_dv_line_start =
+        (y / 2) * p_outpic->p[V_PLANE].i_pitch;
+        int x = 0;
+        /* iterate over the pixels of the two current lines (sixteen luma values per line at once) */
+        for (; x < p_pic->p[Y_PLANE].i_visible_pitch - 15; x += 16) {
+        /* Compute luma (Y) channel values with asm function */
+        Sepia8ySSE41(&p_outpic->p[Y_PLANE].
+                   p_pixels[i_dy_line1_start + x],
+                   &p_pic->p[Y_PLANE].
+                   p_pixels[i_dy_line1_start + x],
+                   intensity_array);
+        Sepia8ySSE41(&p_outpic->p[Y_PLANE].
+                   p_pixels[i_dy_line2_start + x],
+                   &p_pic->p[Y_PLANE].
+                   p_pixels[i_dy_line2_start + x],
+                   intensity_array);
+        Sepia8ySSE41(&p_outpic->p[Y_PLANE].
+                   p_pixels[i_dy_line1_start + x + 8],
+                   &p_pic->p[Y_PLANE].
+                   p_pixels[i_dy_line1_start + x + 8],
+                   intensity_array);
+        Sepia8ySSE41(&p_outpic->p[Y_PLANE].
+                   p_pixels[i_dy_line2_start + x + 8],
+                   &p_pic->p[Y_PLANE].
+                   p_pixels[i_dy_line2_start + x + 8],
+                   intensity_array);
+        /* Copy precomputed values to destination image memory location */
+        Memcpy8BMMX(&p_outpic->p[U_PLANE].
+                  p_pixels[i_du_line_start + (x / 2)],
+                  filling_array_8u);
+        Memcpy8BMMX(&p_outpic->p[V_PLANE].
+                  p_pixels[i_dv_line_start + (x / 2)],
+                  filling_array_8v);
+        }
+        /* Completing the job, the cycle above takes really big chunks, so
+           this makes sure the job will be done completely */
+        for (; x < p_pic->p[Y_PLANE].i_visible_pitch - 1; x += 2) {
+        // y = y - y/4 {to prevent overflow} + intensity / 4
+        p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x] =
+            p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x] -
+            (p_pic->p[Y_PLANE].
+             p_pixels[i_dy_line1_start + x] >> 2) +
+            (i_intensity >> 2);
+        p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 1] =
+            p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 1] -
+            (p_pic->p[Y_PLANE].
+             p_pixels[i_dy_line1_start + x + 1] >> 2) +
+            (i_intensity >> 2);
+        p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x] =
+            p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x] -
+            (p_pic->p[Y_PLANE].
+             p_pixels[i_dy_line2_start + x] >> 2) +
+            (i_intensity >> 2);
+        p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 1] =
+            p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 1] -
+            (p_pic->p[Y_PLANE].
+             p_pixels[i_dy_line2_start + x + 1] >> 2) +
+            (i_intensity >> 2);
+        // u = 128 {half => B&W} - intensity / 6
+        p_outpic->p[U_PLANE].p_pixels[i_du_line_start + (x / 2)] =
+            filling_const_8u;
+        // v = 128 {half => B&W} + intensity / 14
+        p_outpic->p[V_PLANE].p_pixels[i_dv_line_start + (x / 2)] =
+            filling_const_8v;
+        }
+    }
+    } else
+#endif
+    {
     for (int y = 0; y < p_pic->p[Y_PLANE].i_visible_lines - 1; y += 2) {
         const int i_dy_line1_start = y * p_outpic->p[Y_PLANE].i_pitch;
         const int i_dy_line2_start =
@@ -256,6 +395,7 @@ static void PlanarI420Sepia(picture_t * p_pic, picture_t * p_outpic,
             filling_const_8v;
         }
     }
+    }
 }
 
 
@@ -283,6 +423,67 @@ static void PackedYUVSepia(picture_t * p_pic, picture_t * p_outpic,
     p_in_end = p_in + p_pic->p[0].i_visible_lines * p_pic->p[0].i_pitch;
     p_out = p_outpic->p[0].p_pixels;
 
+
+#if defined(CAN_COMPILE_SSE4_1)
+    if (vlc_CPU() & CPU_CAPABILITY_SSE4_1) {
+    /*prepare array of values to copy with mmx, compute only once
+       to improve speed */
+    volatile uint8_t intensity_array[8] = { i_intensity, i_intensity,
+        i_intensity, i_intensity, i_intensity, i_intensity,
+        i_intensity,
+        i_intensity
+    };
+    const uint8_t filling_array_8u[8] =
+        { filling_const_8u, filling_const_8u,
+        filling_const_8u, filling_const_8u, filling_const_8u,
+        filling_const_8u,
+        filling_const_8u, filling_const_8u
+    };
+    const uint8_t filling_array_8v[8] =
+        { filling_const_8v, filling_const_8v,
+        filling_const_8v, filling_const_8v, filling_const_8v,
+        filling_const_8v,
+        filling_const_8v, filling_const_8v
+    };
+
+    /* iterate for every two visible line in the frame */
+    while (p_in < p_in_end) {
+        p_line_end = p_in + p_pic->p[0].i_visible_pitch;
+        while (p_in < p_line_end) {
+        Sepia8ySSE41(&p_out[i_yindex], &p_in[i_yindex],
+                   intensity_array);
+        Sepia8ySSE41(&p_out[i_yindex + 8], &p_in[i_yindex + 8],
+                   intensity_array);
+        Sepia8ySSE41(&p_out[i_yindex + 16], &p_in[i_yindex + 16],
+                   intensity_array);
+        Sepia8ySSE41(&p_out[i_yindex + 24], &p_in[i_yindex + 24],
+                   intensity_array);
+        Memcpy8BMMX(&p_out[i_uindex], filling_array_8u);
+        Memcpy8BMMX(&p_out[i_vindex], filling_array_8v);
+
+        p_in += 32;
+        p_out += 32;
+        }
+        while (p_in < p_line_end) {
+        p_out[i_yindex] =
+            p_in[i_yindex] - (p_in[i_yindex] >> 2) +
+            (i_intensity >> 2);
+        p_out[i_yindex + 2] =
+            p_in[i_yindex + 2] - (p_in[i_yindex + 2] >> 2) +
+            (i_intensity >> 2);
+        p_out[i_uindex] = filling_const_8u;
+        p_out[i_vindex] = filling_const_8v;
+        p_in += 4;
+        p_out += 4;
+        }
+        p_in += p_pic->p[0].i_pitch - p_pic->p[0].i_visible_pitch;
+        p_out += p_outpic->p[0].i_pitch
+        - p_outpic->p[0].i_visible_pitch;
+    }
+    } else
+#endif
+
+    {
     while (p_in < p_in_end) {
         p_line_end = p_in + p_pic->p[0].i_visible_pitch;
         while (p_in < p_line_end) {
@@ -302,6 +503,7 @@ static void PackedYUVSepia(picture_t * p_pic, picture_t * p_outpic,
         p_out += p_outpic->p[0].i_pitch
         - p_outpic->p[0].i_visible_pitch;
     }
+    }
 }
 
 /*****************************************************************************
-- 
1.7.1