[vlc-commits] sepia: fix lack of Y adjustment

Wed Mar 20 15:34:48 CET 2019

vlc | branch: master | Lyndon Brown <jnqnfe at gmail.com> | Sun Mar 17 00:55:19 2019 +0000| [464e2d440348d74904c53c42aeb366dce78e3394] | committer: Thomas Guillem

sepia: fix lack of Y adjustment

The SIMD-accelerated PlanarI420 implementation contained mistakes which
meant that the Y plane was actually being written out unmodified.

Signed-off-by: Thomas Guillem <thomas at gllm.fr>

> http://git.videolan.org/gitweb.cgi/vlc.git/?a=commit;h=464e2d440348d74904c53c42aeb366dce78e3394
---

 modules/video_filter/sepia.c | 36 ++++++++++++++++--------------------
 1 file changed, 16 insertions(+), 20 deletions(-)

diff --git a/modules/video_filter/sepia.c b/modules/video_filter/sepia.c
index 5f4a4c61fb..9ca567c614 100644
--- a/modules/video_filter/sepia.c
+++ b/modules/video_filter/sepia.c
@@ -201,24 +201,22 @@ static picture_t *Filter( filter_t *p_filter, picture_t *p_pic )
  *****************************************************************************/
 VLC_SSE
 static inline void Sepia8ySSE2(uint8_t * dst, const uint8_t * src,
-                         int i_intensity_spread)
+                         int i_intensity_shifted_pair)
 {
     __asm__ volatile (
         // y = y - y / 4 + i_intensity / 4
         "movq            (%1), %%xmm1\n"
-        "punpcklbw     %%xmm7, %%xmm1\n"
-        "movq            (%1), %%xmm2\n" // store bytes as words with 0s in between
-        "punpcklbw     %%xmm7, %%xmm2\n"
+        "punpcklbw     %%xmm7, %%xmm1\n" // zero-extend bytes to words
+        "movdqa        %%xmm1, %%xmm2\n" // copy it
         "movd              %2, %%xmm3\n"
         "pshufd    $0, %%xmm3, %%xmm3\n"
-        "psrlw             $2, %%xmm2\n"    // rotate right 2
-        "psubusb       %%xmm1, %%xmm2\n"    // subtract
-        "psrlw             $2, %%xmm3\n"
-        "paddsb        %%xmm1, %%xmm3\n"    // add
-        "packuswb      %%xmm2, %%xmm1\n"    // pack back to bytes
-        "movq          %%xmm1, (%0)  \n"    // load to dest
+        "psrlw             $2, %%xmm2\n" // get 1/4 of it
+        "psubusb       %%xmm2, %%xmm1\n"
+        "paddusb       %%xmm3, %%xmm1\n"
+        "packuswb      %%xmm1, %%xmm1\n" // pack back to bytes
+        "movq          %%xmm1, (%0)  \n"
         :
-        :"r" (dst), "r"(src), "r"(i_intensity_spread)
+        :"r" (dst), "r"(src), "r"(i_intensity_shifted_pair)
         :"memory", "xmm1", "xmm2", "xmm3");
 }
 
@@ -230,11 +228,9 @@ static void PlanarI420SepiaSSE( picture_t *p_pic, picture_t *p_outpic,
     const uint8_t filling_const_8u = 128 - i_intensity / 6;
     const uint8_t filling_const_8v = 128 + i_intensity / 14;
     /* prepared value for faster broadcasting in xmm register */
-    int i_intensity_spread = 0x10001 * (uint8_t) i_intensity;
+    int i_intensity_shifted_pair = 0x10001 * (((uint8_t) i_intensity) >> 2);
 
-    __asm__ volatile(
-        "pxor      %%xmm7, %%xmm7\n"
-        ::: "xmm7");
+    __asm__ volatile("pxor %%xmm7, %%xmm7\n" ::: "xmm7");
 
     /* iterate for every two visible line in the frame */
     for (int y = 0; y < p_pic->p[Y_PLANE].i_visible_lines - 1; y += 2)
@@ -250,16 +246,16 @@ static void PlanarI420SepiaSSE( picture_t *p_pic, picture_t *p_outpic,
             /* Compute yellow channel values with asm function */
             Sepia8ySSE2(&p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x],
                         &p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x],
-                        i_intensity_spread );
+                        i_intensity_shifted_pair );
             Sepia8ySSE2(&p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x],
                         &p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x],
-                        i_intensity_spread );
+                        i_intensity_shifted_pair );
             Sepia8ySSE2(&p_outpic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 8],
                         &p_pic->p[Y_PLANE].p_pixels[i_dy_line1_start + x + 8],
-                        i_intensity_spread );
+                        i_intensity_shifted_pair );
             Sepia8ySSE2(&p_outpic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 8],
                         &p_pic->p[Y_PLANE].p_pixels[i_dy_line2_start + x + 8],
-                        i_intensity_spread );
+                        i_intensity_shifted_pair );
             /* Copy precomputed values to destination memory location */
             memset(&p_outpic->p[U_PLANE].p_pixels[i_du_line_start + (x / 2)],
                    filling_const_8u, 8 );
@@ -363,7 +359,7 @@ static void PlanarI420Sepia( picture_t *p_pic, picture_t *p_outpic,
 /*****************************************************************************
  * PackedYUVSepia: Applies sepia to one frame of the packed YUV video
  *****************************************************************************
- * This function applies sepia effext to one frame of the video by iterating
+ * This function applies sepia effect to one frame of the video by iterating
  * through video lines. In every pass, we calculate new values for pixels
  * (UYVY, VYUY, YUYV and YVYU formats are supported)
  *****************************************************************************/