[vlc-devel] [PATCH 2/2] Added assembly (SSE2, SSE4.1) processing functions to the adjust filter

Martin Briza Gamajun at seznam.cz
Fri Aug 19 15:04:14 CEST 2011


From: Martin Briza <xbriza00 at stud.fit.vutbr.cz>

---
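For reviewers: both the SSE2 and the SSE4.1 paths implement the same
fixed-point math as the existing C routines. A rough scalar sketch of the
per-pixel computation (names as in adjust_sat_hue.c; i_sin and i_cos are
scaled by 256, and the _clip variants additionally saturate to 0..255):

    int i_u = *p_in;      /* U chroma sample */
    int i_v = *p_in_v;    /* V chroma sample */
    *p_out   = (((( i_u * i_cos + i_v * i_sin - i_x ) >> 8) * i_sat ) >> 8)
               + 128;
    *p_out_v = (((( i_v * i_cos - i_u * i_sin - i_y ) >> 8) * i_sat ) >> 8)
               + 128;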
 modules/video_filter/adjust.c         |   37 ++-
 modules/video_filter/adjust_sat_hue.c |  845 +++++++++++++++++++++++++++++++++
 modules/video_filter/adjust_sat_hue.h |   34 ++
 3 files changed, 912 insertions(+), 4 deletions(-)

diff --git a/modules/video_filter/adjust.c b/modules/video_filter/adjust.c
index bc3e944..034ef22 100644
--- a/modules/video_filter/adjust.c
+++ b/modules/video_filter/adjust.c
@@ -167,15 +167,44 @@ static int Create( vlc_object_t *p_this )
         CASE_PLANAR_YUV
             /* Planar YUV */
             p_filter->pf_video_filter = FilterPlanar;
-            p_sys->pf_process_sat_hue_clip = planar_sat_hue_clip_C;
-            p_sys->pf_process_sat_hue = planar_sat_hue_C;
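+            /* Runtime dispatch: use the fastest variant this CPU supports. */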
+#ifdef CAN_COMPILE_SSE4_1
+            if (vlc_CPU() & CPU_CAPABILITY_SSE4_1)
+            {
+                p_sys->pf_process_sat_hue_clip = planar_sat_hue_clip_SSE41;
+                p_sys->pf_process_sat_hue = planar_sat_hue_SSE2;
+            }
+            else
+#endif
+#ifdef CAN_COMPILE_SSE2
+            if (vlc_CPU() & CPU_CAPABILITY_SSE2)
+            {
+                p_sys->pf_process_sat_hue_clip = planar_sat_hue_clip_C;
+                p_sys->pf_process_sat_hue = planar_sat_hue_SSE2;
+            }
+            else
+#endif
+            {
+                p_sys->pf_process_sat_hue_clip = planar_sat_hue_clip_C;
+                p_sys->pf_process_sat_hue = planar_sat_hue_C;
+            }
             break;
 
         CASE_PACKED_YUV_422
             /* Packed YUV 4:2:2 */
             p_filter->pf_video_filter = FilterPacked;
-            p_sys->pf_process_sat_hue_clip = packed_sat_hue_clip_C;
-            p_sys->pf_process_sat_hue = packed_sat_hue_C;
+#ifdef CAN_COMPILE_SSE4_1
+            if (vlc_CPU() & CPU_CAPABILITY_SSE4_1)
+            {
+                p_sys->pf_process_sat_hue_clip = packed_sat_hue_clip_SSE41;
+                p_sys->pf_process_sat_hue = packed_sat_hue_SSE41;
+            }
+            else
+#endif
+            {
+                p_sys->pf_process_sat_hue_clip = packed_sat_hue_clip_C;
+                p_sys->pf_process_sat_hue = packed_sat_hue_C;
+            }
             break;
 
         default:
diff --git a/modules/video_filter/adjust_sat_hue.c b/modules/video_filter/adjust_sat_hue.c
index 9a9458c..687854c 100644
--- a/modules/video_filter/adjust_sat_hue.c
+++ b/modules/video_filter/adjust_sat_hue.c
@@ -70,6 +70,851 @@
  * Hue and saturation adjusting routines
  *****************************************************************************/
 
+#ifdef CAN_COMPILE_SSE4_1
+int planar_sat_hue_clip_SSE41( picture_t * p_pic, picture_t * p_outpic,
+                               int i_sin, int i_cos, int i_sat, int i_x,
+                               int i_y )
+{
+    uint8_t *p_in, *p_in_v, *p_in_end, *p_line_end;
+    uint8_t *p_out, *p_out_v;
+
+    p_in = p_pic->p[U_PLANE].p_pixels;
+    p_in_v = p_pic->p[V_PLANE].p_pixels;
+    p_in_end = p_in + p_pic->p[U_PLANE].i_visible_lines
+                      * p_pic->p[U_PLANE].i_pitch - 8;
+
+    p_out = p_outpic->p[U_PLANE].p_pixels;
+    p_out_v = p_outpic->p[V_PLANE].p_pixels;
+
+    uint8_t i_u, i_v;
+
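+    /* Broadcast the filter constants into XMM registers once; the per-pixel
+     * loop below relies on them surviving between separate asm statements. */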
+#if defined(__x86_64__)
+    __asm__ volatile(
+        "pxor               %%xmm7,     %%xmm7\n"
+        "movd               %[cos],     %%xmm8\n"
+        "movd               %[sin],     %%xmm9\n"
+        "movd                 %[x],    %%xmm10\n"
+        "movd                 %[y],    %%xmm11\n"
+        "movd               %[sat],    %%xmm12\n"
+        "pshufd     $0,     %%xmm8,     %%xmm8\n"
+        "pshufd     $0,     %%xmm9,     %%xmm9\n"
+        "pshufd     $0,    %%xmm10,    %%xmm10\n"
+        "pshufd     $0,    %%xmm11,    %%xmm11\n"
+        "pshufd     $0,    %%xmm12,    %%xmm12\n"
+    :
+    : [x]     "r" (i_x),
+      [y]     "r" (i_y),
+      [cos]   "r" (i_cos),
+      [sin]   "r" (i_sin),
+      [sat]   "r" (i_sat)
+    : "eax", "memory" );
+#elif defined (__i386__)
+    __asm__ volatile(
+        "pxor               %%xmm7,     %%xmm7\n"
+        "movd               %[cos],     %%xmm4\n"
+        "movd               %[sin],     %%xmm5\n"
+        "movd               %[sat],     %%xmm6\n"
+        "pshufd     $0,     %%xmm4,     %%xmm4\n"
+        "pshufd     $0,     %%xmm5,     %%xmm5\n"
+        "pshufd     $0,     %%xmm6,     %%xmm6\n"
+    :
+    : [cos]   "r" (i_cos),
+      [sin]   "r" (i_sin),
+      [sat]   "r" (i_sat)
+    : "eax", "memory" );
+#endif
+
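+    /* Per pixel, in fixed point (scale 256):
+     *   u' = ((((u * cos + v * sin - x) >> 8) * sat) >> 8) + 128
+     *   v' = ((((v * cos - u * sin - y) >> 8) * sat) >> 8) + 128 */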
+    for( ; p_in < p_in_end ; )
+    {
+        p_line_end = p_in + p_pic->p[U_PLANE].i_visible_pitch - 8;
+
+        for( ; p_in < p_line_end ; )
+        {
+            /* Do 8 pixels at a time */
+            ADJUST_2_TIMES(
+                __asm__ __volatile__(
+                    "movd              (%[in]),     %%xmm0\n"
+                    "movd            (%[in_v]),     %%xmm1\n"
+                    "punpcklbw          %%xmm7,     %%xmm0\n"
+                    "punpcklwd          %%xmm7,     %%xmm0\n"
+                    "punpcklbw          %%xmm7,     %%xmm1\n"
+                    "punpcklwd          %%xmm7,     %%xmm1\n"
+#if defined(__x86_64__)
+                    "movdqu             %%xmm0,     %%xmm2\n"
+                    "movdqu             %%xmm1,     %%xmm3\n"
+
+                    "pmulld             %%xmm8,     %%xmm0\n"
+                    "pmulld             %%xmm9,     %%xmm1\n"
+                    "paddd              %%xmm1,     %%xmm0\n"
+                    "psubd             %%xmm10,     %%xmm0\n"
+                    "psrad                  $8,     %%xmm0\n"
+                    "pmulld            %%xmm12,     %%xmm0\n"
+                    "psrad                  $8,     %%xmm0\n"
+
+                    "pcmpeqd            %%xmm1,     %%xmm1\n"
+                    "psrld                 $31,     %%xmm1\n"
+                    "pslld                  $7,     %%xmm1\n"
+                    "paddd              %%xmm1,     %%xmm0\n"
+                    /* out_u stays in xmm0 */
+
+                    "pmulld             %%xmm8,     %%xmm3\n"
+                    "pmulld             %%xmm9,     %%xmm2\n"
+                    "psubd              %%xmm2,     %%xmm3\n"
+                    "psubd             %%xmm11,     %%xmm3\n"
+                    "psrad                  $8,     %%xmm3\n"
+                    "pmulld            %%xmm12,     %%xmm3\n"
+                    "psrad                  $8,     %%xmm3\n"
+
+                    "paddd              %%xmm1,     %%xmm3\n"
+
+                    /* out_v stays in xmm3 */
+#elif defined (__i386__)
+                    "movdqu             %%xmm0,     %%xmm2\n"
+                    "movdqu             %%xmm1,     %%xmm3\n"
+
+                    "movd                 %[x],     %%xmm7\n"
+                    "pshufd      $0,    %%xmm7,     %%xmm7\n"
+
+                    "pmulld             %%xmm4,     %%xmm0\n"
+                    "pmulld             %%xmm5,     %%xmm1\n"
+                    "paddd              %%xmm1,     %%xmm0\n"
+                    "psubd              %%xmm7,     %%xmm0\n"
+                    "psrad                  $8,     %%xmm0\n"
+                    "pmulld             %%xmm6,     %%xmm0\n"
+                    "psrad                  $8,     %%xmm0\n"
+
+                    "pcmpeqd            %%xmm1,     %%xmm1\n"
+                    "psrld                 $31,     %%xmm1\n"
+                    "pslld                  $7,     %%xmm1\n"
+                    "paddd              %%xmm1,     %%xmm0\n"
+                    /* out_u stays in xmm0 */
+
+
+                    "movd                 %[y],     %%xmm7\n"
+                    "pshufd      $0,    %%xmm7,     %%xmm7\n"
+
+                    "pmulld             %%xmm4,     %%xmm3\n"
+                    "pmulld             %%xmm5,     %%xmm2\n"
+                    "psubd              %%xmm2,     %%xmm3\n"
+                    "psubd              %%xmm7,     %%xmm3\n"
+                    "psrad                  $8,     %%xmm3\n"
+                    "pmulld             %%xmm6,     %%xmm3\n"
+                    "psrad                  $8,     %%xmm3\n"
+
+                    "paddd              %%xmm1,     %%xmm3\n"
+
+                    /* out_v stays in xmm3 */
+                    "pxor               %%xmm7,     %%xmm7\n"
+#endif
+                    /* pack and saturate (if there is something to saturate) and
+                     * store in destination */
+                    "packusdw           %%xmm7,     %%xmm0\n"
+                    "packuswb           %%xmm7,     %%xmm0\n"
+                    "movd               %%xmm0,   (%[out])\n"
+                    "packusdw           %%xmm7,     %%xmm3\n"
+                    "packuswb           %%xmm7,     %%xmm3\n"
+                    "movd               %%xmm3, (%[out_v])\n"
+                :
+                : [in]    "r" (p_in),
+                  [in_v]  "r" (p_in_v),
+                  [x]     "r" (i_x),
+                  [y]     "r" (i_y),
+                  [cos]   "r" (i_cos),
+                  [sin]   "r" (i_sin),
+                  [sat]   "r" (i_sat),
+                  [out]   "r" (p_out),
+                  [out_v] "r" (p_out_v)
+                : "eax", "memory" );
+                p_in += 4;
+                p_in_v += 4;
+                p_out += 4;
+                p_out_v += 4;
+            );
+        }
+
+        p_line_end += 8;
+
+        for( ; p_in < p_line_end ; )
+        {
+            PLANAR_WRITE_UV_CLIP();
+        }
+
+        p_in += p_pic->p[U_PLANE].i_pitch
+                - p_pic->p[U_PLANE].i_visible_pitch;
+        p_in_v += p_pic->p[V_PLANE].i_pitch
+                - p_pic->p[V_PLANE].i_visible_pitch;
+        p_out += p_outpic->p[U_PLANE].i_pitch
+                - p_outpic->p[U_PLANE].i_visible_pitch;
+        p_out_v += p_outpic->p[V_PLANE].i_pitch
+                    - p_outpic->p[V_PLANE].i_visible_pitch;
+    }
+
+    return VLC_SUCCESS;
+}
+#endif
+
+#ifdef CAN_COMPILE_SSE4_1
+int packed_sat_hue_clip_SSE41( picture_t * p_pic, picture_t * p_outpic,
+                               int i_sin, int i_cos, int i_sat, int i_x, int i_y )
+{
+    uint8_t *p_in, *p_in_v, *p_in_end, *p_line_end;
+    uint8_t *p_out, *p_out_v;
+
+    int i_y_offset, i_u_offset, i_v_offset;
+    int i_visible_lines, i_pitch, i_visible_pitch;
+
+    if ( GetPackedYuvOffsets( p_pic->format.i_chroma, &i_y_offset,
+                              &i_u_offset, &i_v_offset ) != VLC_SUCCESS )
+        return VLC_EGENERIC;
+
+    i_visible_lines = p_pic->p->i_visible_lines;
+    i_pitch = p_pic->p->i_pitch;
+    i_visible_pitch = p_pic->p->i_visible_pitch;
+
+    p_in = p_pic->p->p_pixels + i_u_offset;
+    p_in_v = p_pic->p->p_pixels + i_v_offset;
+    p_in_end = p_in + i_visible_lines * i_pitch - 8 * 4;
+
+    p_out = p_outpic->p->p_pixels + i_u_offset;
+    p_out_v = p_outpic->p->p_pixels + i_v_offset;
+
+    uint8_t i_u, i_v;
+
+#if defined(__x86_64__)
+    __asm__ volatile(
+        "pxor               %%xmm7,     %%xmm7\n"
+        "movd               %[cos],     %%xmm8\n"
+        "movd               %[sin],     %%xmm9\n"
+        "movd                 %[x],    %%xmm10\n"
+        "movd                 %[y],    %%xmm11\n"
+        "movd               %[sat],    %%xmm12\n"
+        "pshufd     $0,     %%xmm8,     %%xmm8\n"
+        "pshufd     $0,     %%xmm9,     %%xmm9\n"
+        "pshufd     $0,    %%xmm10,    %%xmm10\n"
+        "pshufd     $0,    %%xmm11,    %%xmm11\n"
+        "pshufd     $0,    %%xmm12,    %%xmm12\n"
+    :
+    : [x]     "r" (i_x),
+      [y]     "r" (i_y),
+      [cos]   "r" (i_cos),
+      [sin]   "r" (i_sin),
+      [sat]   "r" (i_sat)
+    : "eax", "memory" );
+#elif defined (__i386__)
+    __asm__ volatile(
+        "pxor               %%xmm7,     %%xmm7\n"
+        "movd               %[cos],     %%xmm4\n"
+        "movd               %[sin],     %%xmm5\n"
+        "movd               %[sat],     %%xmm6\n"
+        "pshufd     $0,     %%xmm4,     %%xmm4\n"
+        "pshufd     $0,     %%xmm5,     %%xmm5\n"
+        "pshufd     $0,     %%xmm6,     %%xmm6\n"
+    :
+    : [cos]   "r" (i_cos),
+      [sin]   "r" (i_sin),
+      [sat]   "r" (i_sat)
+    : "eax", "memory" );
+#endif
+
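+    /* In packed 4:2:2 only every fourth byte of a 16-byte load is a U
+     * (resp. V) sample, so the samples are masked to one dword each. */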
+    for( ; p_in < p_in_end ; )
+    {
+        p_line_end = p_in + i_visible_pitch - 8 * 4;
+
+        for( ; p_in < p_line_end ; )
+        {
+            ADJUST_2_TIMES(
+            /* Do 8 pixels at a time */
+                __asm__ __volatile__ (
+                    "movdqu            (%[in]),     %%xmm0\n"
+                    "movdqu          (%[in_v]),     %%xmm1\n"
+                    "pcmpeqd            %%xmm2,     %%xmm2\n" // 0xFFFFFFFF
+                    "psrld                 $24,     %%xmm2\n" // 0x000000FF
+                    "pand               %%xmm2,     %%xmm0\n" // mask out unnecessary data
+                    "pand               %%xmm2,     %%xmm1\n"
+#if defined(__x86_64__)
+                    "movdqu             %%xmm0,     %%xmm2\n"
+                    "movdqu             %%xmm1,     %%xmm3\n"
+
+                    "pmulld             %%xmm8,     %%xmm0\n"
+                    "pmulld             %%xmm9,     %%xmm1\n"
+                    "paddd              %%xmm1,     %%xmm0\n"
+                    "psubd             %%xmm10,     %%xmm0\n"
+                    "psrad                  $8,     %%xmm0\n"
+                    "pmulld            %%xmm12,     %%xmm0\n"
+                    "psrad                  $8,     %%xmm0\n"
+
+                    "pcmpeqd            %%xmm1,     %%xmm1\n"
+                    "psrld                 $31,     %%xmm1\n"
+                    "pslld                  $7,     %%xmm1\n"
+                    "paddd              %%xmm1,     %%xmm0\n"
+                    // out_u stays in xmm0
+
+                    "pmulld             %%xmm8,     %%xmm3\n"
+                    "pmulld             %%xmm9,     %%xmm2\n"
+                    "psubd              %%xmm2,     %%xmm3\n"
+                    "psubd             %%xmm11,     %%xmm3\n"
+                    "psrad                  $8,     %%xmm3\n"
+                    "pmulld            %%xmm12,     %%xmm3\n"
+                    "psrad                  $8,     %%xmm3\n"
+
+                    "paddd              %%xmm1,     %%xmm3\n"
+
+                    // out_v stays in xmm3
+#elif defined (__i386__)
+                    "movdqu             %%xmm0,     %%xmm2\n"
+                    "movdqu             %%xmm1,     %%xmm3\n"
+
+                    "movd                 %[x],     %%xmm7\n"
+                    "pshufd      $0,    %%xmm7,     %%xmm7\n"
+
+                    "pmulld             %%xmm4,     %%xmm0\n"
+                    "pmulld             %%xmm5,     %%xmm1\n"
+                    "paddd              %%xmm1,     %%xmm0\n"
+                    "psubd              %%xmm7,     %%xmm0\n"
+                    "psrad                  $8,     %%xmm0\n"
+                    "pmulld             %%xmm6,     %%xmm0\n"
+                    "psrad                  $8,     %%xmm0\n"
+
+                    "pcmpeqd            %%xmm1,     %%xmm1\n"
+                    "psrld                 $31,     %%xmm1\n"
+                    "pslld                  $7,     %%xmm1\n"
+                    "paddd              %%xmm1,     %%xmm0\n"
+                    // out_u stays in xmm0
+
+
+                    "movd                 %[y],     %%xmm7\n"
+                    "pshufd      $0,    %%xmm7,     %%xmm7\n"
+
+                    "pmulld             %%xmm4,     %%xmm3\n"
+                    "pmulld             %%xmm5,     %%xmm2\n"
+                    "psubd              %%xmm2,     %%xmm3\n"
+                    "psubd              %%xmm7,     %%xmm3\n"
+                    "psrad                  $8,     %%xmm3\n"
+                    "pmulld             %%xmm6,     %%xmm3\n"
+                    "psrad                  $8,     %%xmm3\n"
+
+                    "paddd              %%xmm1,     %%xmm3\n"
+
+                    // out_v stays in xmm3
+                    "pxor               %%xmm7,     %%xmm7\n"
+#endif
+                    "packusdw           %%xmm7,     %%xmm0\n"
+                    "packuswb           %%xmm7,     %%xmm0\n"
+                    "punpcklbw          %%xmm7,     %%xmm0\n"
+                    "punpcklwd          %%xmm7,     %%xmm0\n"
+                    "packusdw           %%xmm7,     %%xmm3\n"
+                    "packuswb           %%xmm7,     %%xmm3\n"
+                    "punpcklbw          %%xmm7,     %%xmm3\n"
+                    "punpcklwd          %%xmm7,     %%xmm3\n"
+
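+                    /* Read-modify-write: keep the Y and other bytes of each
+                     * destination dword and merge in the new chroma byte. */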
+                    "movdqu           (%[out]),     %%xmm1\n"
+                    "pcmpeqd            %%xmm2,     %%xmm2\n"
+                    "pslld                  $8,     %%xmm2\n"
+                    "pand               %%xmm1,     %%xmm2\n"
+                    "por                %%xmm0,     %%xmm2\n"
+                    "movdqu             %%xmm2,   (%[out])\n"
+
+                    "movdqu         (%[out_v]),     %%xmm1\n"
+                    "pcmpeqd            %%xmm2,     %%xmm2\n"
+                    "pslld                  $8,     %%xmm2\n"
+                    "pand               %%xmm1,     %%xmm2\n"
+                    "por                %%xmm3,     %%xmm2\n"
+                    "movdqu             %%xmm2, (%[out_v])\n"
+                :
+                : [in]    "r" (p_in),
+                  [in_v]  "r" (p_in_v),
+                  [x]     "r" (i_x),
+                  [y]     "r" (i_y),
+                  [cos]   "r" (i_cos),
+                  [sin]   "r" (i_sin),
+                  [sat]   "r" (i_sat),
+                  [out]   "r" (p_out),
+                  [out_v] "r" (p_out_v)
+                : "eax", "memory" );
+                p_in += 4;
+                p_in_v += 4;
+                p_out += 4;
+                p_out_v += 4;
+            );
+        }
+
+        p_line_end += 8 * 4;
+
+        for( ; p_in < p_line_end ; )
+        {
+            PACKED_WRITE_UV_CLIP();
+        }
+
+        p_in += i_pitch - i_visible_pitch;
+        p_in_v += i_pitch - i_visible_pitch;
+        p_out += i_pitch - i_visible_pitch;
+        p_out_v += i_pitch - i_visible_pitch;
+    }
+
+    return VLC_SUCCESS;
+}
+#endif
+
+#ifdef CAN_COMPILE_SSE4_1
+int packed_sat_hue_SSE41( picture_t * p_pic, picture_t * p_outpic, int i_sin,
+                          int i_cos, int i_sat, int i_x, int i_y )
+{
+    uint8_t *p_in, *p_in_v, *p_in_end, *p_line_end;
+    uint8_t *p_out, *p_out_v;
+
+    int i_y_offset, i_u_offset, i_v_offset;
+    int i_visible_lines, i_pitch, i_visible_pitch;
+
+    if ( GetPackedYuvOffsets( p_pic->format.i_chroma, &i_y_offset,
+                              &i_u_offset, &i_v_offset ) != VLC_SUCCESS )
+        return VLC_EGENERIC;
+
+    i_visible_lines = p_pic->p->i_visible_lines;
+    i_pitch = p_pic->p->i_pitch;
+    i_visible_pitch = p_pic->p->i_visible_pitch;
+
+    p_in = p_pic->p->p_pixels + i_u_offset;
+    p_in_v = p_pic->p->p_pixels + i_v_offset;
+    p_in_end = p_in + i_visible_lines * i_pitch - 8 * 4;
+
+    p_out = p_outpic->p->p_pixels + i_u_offset;
+    p_out_v = p_outpic->p->p_pixels + i_v_offset;
+
+    uint8_t i_u, i_v;
+
+#if defined(__x86_64__)
+    __asm__ volatile(
+        "pxor               %%xmm7,     %%xmm7\n"
+        "movd               %[cos],     %%xmm8\n"
+        "movd               %[sin],     %%xmm9\n"
+        "movd                 %[x],    %%xmm10\n"
+        "movd                 %[y],    %%xmm11\n"
+        "movd               %[sat],    %%xmm12\n"
+        "pshufd     $0,     %%xmm8,     %%xmm8\n"
+        "pshufd     $0,     %%xmm9,     %%xmm9\n"
+        "pshufd     $0,    %%xmm10,    %%xmm10\n"
+        "pshufd     $0,    %%xmm11,    %%xmm11\n"
+        "pshufd     $0,    %%xmm12,    %%xmm12\n"
+    :
+    : [x]     "r" (i_x),
+      [y]     "r" (i_y),
+      [cos]   "r" (i_cos),
+      [sin]   "r" (i_sin),
+      [sat]   "r" (i_sat)
+    : "eax", "memory" );
+#elif defined (__i386__)
+    __asm__ volatile(
+        "pxor               %%xmm7,     %%xmm7\n"
+        "movd               %[cos],     %%xmm4\n"
+        "movd               %[sin],     %%xmm5\n"
+        "movd               %[sat],     %%xmm6\n"
+        "pshufd     $0,     %%xmm4,     %%xmm4\n"
+        "pshufd     $0,     %%xmm5,     %%xmm5\n"
+        "pshufd     $0,     %%xmm6,     %%xmm6\n"
+    :
+    : [cos]   "r" (i_cos),
+      [sin]   "r" (i_sin),
+      [sat]   "r" (i_sat)
+    : "eax", "memory" );
+#endif
+
+    for( ; p_in < p_in_end ; )
+    {
+        p_line_end = p_in + i_visible_pitch - 8 * 4;
+
+        for( ; p_in < p_line_end ; )
+        {
+            ADJUST_2_TIMES(
+            /* Do 8 pixels at a time */
+                __asm__ volatile(
+                    "movdqu            (%[in]),     %%xmm0\n"
+                    "movdqu          (%[in_v]),     %%xmm1\n"
+                    "pcmpeqd            %%xmm2,     %%xmm2\n" // 0xFFFFFFFF
+                    "psrld                 $24,     %%xmm2\n" // 0x000000FF
+                    "pand               %%xmm2,     %%xmm0\n" // mask out unnecessary data
+                    "pand               %%xmm2,     %%xmm1\n"
+#if defined(__x86_64__)
+                    "movdqu             %%xmm0,     %%xmm2\n"
+                    "movdqu             %%xmm1,     %%xmm3\n"
+
+                    "pmulld             %%xmm8,     %%xmm0\n"
+                    "pmulld             %%xmm9,     %%xmm1\n"
+                    "paddd              %%xmm1,     %%xmm0\n"
+                    "psubd             %%xmm10,     %%xmm0\n"
+                    "psrad                  $8,     %%xmm0\n"
+                    "pmulld            %%xmm12,     %%xmm0\n"
+                    "psrad                  $8,     %%xmm0\n"
+
+                    "pcmpeqd            %%xmm1,     %%xmm1\n"
+                    "psrld                 $31,     %%xmm1\n"
+                    "pslld                  $7,     %%xmm1\n"
+                    "paddd              %%xmm1,     %%xmm0\n"
+                    // out_u stays in xmm0
+
+                    "pmulld             %%xmm8,     %%xmm3\n"
+                    "pmulld             %%xmm9,     %%xmm2\n"
+                    "psubd              %%xmm2,     %%xmm3\n"
+                    "psubd             %%xmm11,     %%xmm3\n"
+                    "psrad                  $8,     %%xmm3\n"
+                    "pmulld            %%xmm12,     %%xmm3\n"
+                    "psrad                  $8,     %%xmm3\n"
+
+                    "paddd              %%xmm1,     %%xmm3\n"
+
+                    // out_v stays in xmm3
+#elif defined (__i386__)
+                    "movdqu             %%xmm0,     %%xmm2\n"
+                    "movdqu             %%xmm1,     %%xmm3\n"
+
+                    "movd                 %[x],     %%xmm7\n"
+                    "pshufd      $0,    %%xmm7,     %%xmm7\n"
+
+                    "pmulld             %%xmm4,     %%xmm0\n"
+                    "pmulld             %%xmm5,     %%xmm1\n"
+                    "paddd              %%xmm1,     %%xmm0\n"
+                    "psubd              %%xmm7,     %%xmm0\n"
+                    "psrad                  $8,     %%xmm0\n"
+                    "pmulld             %%xmm6,     %%xmm0\n"
+                    "psrad                  $8,     %%xmm0\n"
+
+                    "pcmpeqd            %%xmm1,     %%xmm1\n"
+                    "psrld                 $31,     %%xmm1\n"
+                    "pslld                  $7,     %%xmm1\n"
+                    "paddd              %%xmm1,     %%xmm0\n"
+                    // out_u stays in xmm0
+
+
+                    "movd                 %[y],     %%xmm7\n"
+                    "pshufd      $0,    %%xmm7,     %%xmm7\n"
+
+                    "pmulld             %%xmm4,     %%xmm3\n"
+                    "pmulld             %%xmm5,     %%xmm2\n"
+                    "psubd              %%xmm2,     %%xmm3\n"
+                    "psubd              %%xmm7,     %%xmm3\n"
+                    "psrad                  $8,     %%xmm3\n"
+                    "pmulld             %%xmm6,     %%xmm3\n"
+                    "psrad                  $8,     %%xmm3\n"
+
+                    "paddd              %%xmm1,     %%xmm3\n"
+
+                    // out_v stays in xmm3
+                    "pxor               %%xmm7,     %%xmm7\n"
+#endif
+
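+                    /* This variant does not saturate (i_sat <= 256 keeps the
+                     * results in range), so just mask each dword to a byte. */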
+                    "pcmpeqd            %%xmm2,     %%xmm2\n" // 0xFFFFFFFF
+                    "psrld                 $24,     %%xmm2\n" // 0x000000FF
+                    "pand               %%xmm2,     %%xmm0\n" // mask out unnecessary data
+                    "pand               %%xmm2,     %%xmm3\n"
+
+                    "movdqu           (%[out]),     %%xmm1\n"
+                    "pcmpeqd            %%xmm2,     %%xmm2\n"
+                    "pslld                  $8,     %%xmm2\n"
+                    "pand               %%xmm1,     %%xmm2\n"
+                    "por                %%xmm0,     %%xmm2\n"
+                    "movdqu             %%xmm2,   (%[out])\n"
+
+                    "movdqu         (%[out_v]),     %%xmm1\n"
+                    "pcmpeqd            %%xmm2,     %%xmm2\n"
+                    "pslld                  $8,     %%xmm2\n"
+                    "pand               %%xmm1,     %%xmm2\n"
+                    "por                %%xmm3,     %%xmm2\n"
+                    "movdqu             %%xmm2, (%[out_v])\n"
+                :
+                : [in]    "r" (p_in),
+                  [in_v]  "r" (p_in_v),
+                  [x]     "r" (i_x),
+                  [y]     "r" (i_y),
+                  [cos]   "r" (i_cos),
+                  [sin]   "r" (i_sin),
+                  [sat]   "r" (i_sat),
+                  [out]   "r" (p_out),
+                  [out_v] "r" (p_out_v)
+                : "eax", "memory" );
+                p_in += 4;
+                p_in_v += 4;
+                p_out += 4;
+                p_out_v += 4;
+            );
+        }
+
+        p_line_end += 8 * 4;
+
+        for( ; p_in < p_line_end ; )
+        {
+            PACKED_WRITE_UV();
+        }
+
+        p_in += i_pitch - i_visible_pitch;
+        p_in_v += i_pitch - i_visible_pitch;
+        p_out += i_pitch - i_visible_pitch;
+        p_out_v += i_pitch - i_visible_pitch;
+    }
+
+    return VLC_SUCCESS;
+}
+#endif
+
+#ifdef CAN_COMPILE_SSE2
+int planar_sat_hue_SSE2( picture_t * p_pic, picture_t * p_outpic,
+                         int i_sin, int i_cos, int i_sat, int i_x, int i_y )
+{
+    uint8_t *p_in, *p_in_v, *p_in_end, *p_line_end;
+    uint8_t *p_out, *p_out_v;
+
+    p_in = p_pic->p[U_PLANE].p_pixels;
+    p_in_v = p_pic->p[V_PLANE].p_pixels;
+    p_in_end = p_in + p_pic->p[U_PLANE].i_visible_lines
+                      * p_pic->p[U_PLANE].i_pitch - 8;
+
+    p_out = p_outpic->p[U_PLANE].p_pixels;
+    p_out_v = p_outpic->p[V_PLANE].p_pixels;
+
+    uint8_t i_u, i_v;
+
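+    /* The cosine is packed into the low and the sine into the high 16-bit
+     * half of each dword, so pmaddwd on interleaved (u, v) words computes
+     * u * cos + v * sin per dword in a single instruction. */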
+    __asm__ volatile(
+#if !defined(__x86_64__)
+        "pxor               %%xmm0,     %%xmm0\n"
+        "movd               %[cos],     %%xmm3\n"
+        "movd               %[sin],     %%xmm4\n"
+        "pslld                 $16,     %%xmm4\n"
+        "pslld                 $16,     %%xmm3\n"
+        "psrld                 $16,     %%xmm3\n"
+        "pshufd      $0,    %%xmm3,     %%xmm3\n"
+        "pshufd      $0,    %%xmm4,     %%xmm4\n"
+        "por                %%xmm4,     %%xmm3\n"
+        "movd               %[sat],     %%xmm4\n"
+        "pshufd      $0,    %%xmm4,     %%xmm4\n"
+        "pcmpeqb            %%xmm6,     %%xmm6\n"
+        "psrlw                 $15,     %%xmm6\n"
+        "psllw                  $7,     %%xmm6\n"
+#else
+        "pxor               %%xmm0,     %%xmm0\n"
+        "movd               %[cos],     %%xmm6\n"
+        "movd               %[sin],     %%xmm7\n"
+        "pslld                 $16,     %%xmm7\n"
+        "pslld                 $16,     %%xmm6\n"
+        "psrld                 $16,     %%xmm6\n"
+        "pshufd      $0,    %%xmm6,     %%xmm6\n"
+        "pshufd      $0,    %%xmm7,     %%xmm7\n"
+        "por                %%xmm7,     %%xmm6\n"
+        "movd               %[sat],     %%xmm7\n"
+        "movd                 %[x],    %%xmm10\n"
+        "movd                 %[y],    %%xmm11\n"
+        "pshufd      $0,   %%xmm10,    %%xmm10\n"
+        "pshufd      $0,   %%xmm11,    %%xmm11\n"
+        "pshufd      $0,    %%xmm7,     %%xmm7\n"
+
+        "pcmpeqb           %%xmm12,    %%xmm12\n"
+        "pcmpeqb            %%xmm9,     %%xmm9\n"
+        "pcmpeqb           %%xmm13,    %%xmm13\n"
+        "psrlw                 $15,     %%xmm9\n"
+        "psllw                  $7,     %%xmm9\n"
+        "psrlw                 $15,    %%xmm13\n"
+        "mov               $0x8000,      %%eax\n"
+        "movd                %%eax,    %%xmm14\n"
+        "pshufd      $0,   %%xmm14,    %%xmm14\n"
+        "mov           $0x80008000,      %%eax\n"
+        "movd                %%eax,    %%xmm15\n"
+        "pshufd      $0,   %%xmm15,    %%xmm15\n"
+#endif
+    :
+    :
+      [x]      "r" ( i_x ),
+      [y]      "r" ( i_y ),
+      [sat]    "r" ( i_sat * 0x10001 ),
+      [sin]    "r" ( i_sin ),
+      [cos]    "r" ( i_cos )
+    : "eax", "memory" );
+    for( ; p_in < p_in_end ; )
+    {
+        p_line_end = p_in + p_pic->p[U_PLANE].i_visible_pitch - 8;
+
+        for( ; p_in < p_line_end ; )
+        {
+            /* Do 8 pixels at a time */
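+            /* Interleave U and V words so every dword feeds pmaddwd as a
+             * (u, v) pair against the packed (cos, sin) multipliers. */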
+            __asm__ __volatile__ (
+                "movd              (%[in]),     %%xmm1\n"
+                "movd            (%[in_v]),     %%xmm2\n"
+                "punpcklbw          %%xmm0,     %%xmm1\n"
+                "punpcklbw          %%xmm0,     %%xmm2\n"
+                "punpcklwd          %%xmm2,     %%xmm1\n"
+                /////////////////////////////////////////
+                "movd             4(%[in]),     %%xmm2\n"
+                "movd           4(%[in_v]),     %%xmm5\n"
+                "punpcklbw          %%xmm0,     %%xmm2\n"
+                "punpcklbw          %%xmm0,     %%xmm5\n"
+                "punpcklwd          %%xmm5,     %%xmm2\n"
+                /////////////////////////////////////////
+#ifdef __i386__
+                "pmaddwd            %%xmm3,     %%xmm1\n"
+                "pmaddwd            %%xmm3,     %%xmm2\n"
+                "movd                 %[x],     %%xmm5\n"
+                "pshufd      $0,    %%xmm5,     %%xmm5\n"
+                "psubd              %%xmm5,     %%xmm1\n"
+                "psubd              %%xmm5,     %%xmm2\n"
+                "pslld                  $8,     %%xmm1\n"
+                "psrld                 $16,     %%xmm1\n"
+                "pslld                  $8,     %%xmm2\n"
+                "psrld                 $16,     %%xmm2\n"
+                "mov               $0x8000,      %%eax\n"
+                "movd                %%eax,     %%xmm5\n"
+                "pshufd      $0,    %%xmm5,     %%xmm5\n"
+                "psubd              %%xmm5,     %%xmm2\n"
+                "psubd              %%xmm5,     %%xmm1\n"
+                "packssdw           %%xmm2,     %%xmm1\n"
+                "pshuflw     $0,    %%xmm5,     %%xmm5\n"
+                "pshufhw     $0,    %%xmm5,     %%xmm5\n"
+                "paddw              %%xmm5,     %%xmm1\n"
+                "pmullw             %%xmm4,     %%xmm1\n"
+                "psraw                  $8,     %%xmm1\n"
+                "paddw              %%xmm6,     %%xmm1\n"
+                "packuswb           %%xmm0,     %%xmm1\n"
+                "movq               %%xmm1,   (%[out])\n"
+                "pcmpeqb            %%xmm6,     %%xmm6\n"
+                "movq              (%[in]),     %%xmm5\n"
+                "movd            (%[in_v]),     %%xmm1\n"
+                "movd           4(%[in_v]),     %%xmm2\n"
+                "punpcklbw          %%xmm0,     %%xmm5\n"
+                "pandn              %%xmm6,     %%xmm5\n"
+                "punpcklbw          %%xmm0,     %%xmm1\n"
+                "punpcklbw          %%xmm0,     %%xmm2\n"
+                "paddw              %%xmm6,     %%xmm5\n"
+                "psrlw                 $15,     %%xmm6\n"
+                "punpcklwd          %%xmm5,     %%xmm1\n"
+                "punpckhqdq         %%xmm5,     %%xmm5\n"
+                "punpcklwd          %%xmm5,     %%xmm2\n"
+
+                "psllw                  $7,     %%xmm6\n"
+                "pmaddwd            %%xmm3,     %%xmm1\n"
+                "pmaddwd            %%xmm3,     %%xmm2\n"
+                "movd                 %[y],     %%xmm5\n"
+                "pshufd      $0,    %%xmm5,     %%xmm5\n"
+                "psubd              %%xmm5,     %%xmm1\n"
+                "psubd              %%xmm5,     %%xmm2\n"
+                "pslld                  $8,     %%xmm1\n"
+                "psrld                 $16,     %%xmm1\n"
+                "pslld                  $8,     %%xmm2\n"
+                "psrld                 $16,     %%xmm2\n"
+
+                "mov               $0x8000,      %%eax\n"
+                "movd                %%eax,     %%xmm5\n"
+                "pshufd      $0,    %%xmm5,     %%xmm5\n"
+                "psubd              %%xmm5,     %%xmm2\n"
+                "psubd              %%xmm5,     %%xmm1\n"
+                "packssdw           %%xmm2,     %%xmm1\n"
+                "pshuflw     $0,    %%xmm5,     %%xmm5\n"
+                "pshufhw     $0,    %%xmm5,     %%xmm5\n"
+                "paddw              %%xmm5,     %%xmm1\n"
+                "pmullw             %%xmm4,     %%xmm1\n"
+                "psraw                  $8,     %%xmm1\n"
+                "paddw              %%xmm6,     %%xmm1\n"
+                "packuswb           %%xmm0,     %%xmm1\n"
+                "movq               %%xmm1,  (%[out_v])\n"
+#elif defined(__x86_64__)
+                /////////////////////////////////////////
+                "movq              (%[in]),     %%xmm5\n"
+                "movd            (%[in_v]),     %%xmm3\n"
+                "movd           4(%[in_v]),     %%xmm4\n"
+                "punpcklbw          %%xmm0,     %%xmm5\n"
+                "pandn             %%xmm12,     %%xmm5\n" // invert U (to be subtracted)
+                "punpcklbw          %%xmm0,     %%xmm3\n"
+                "paddw             %%xmm13,     %%xmm5\n" // add 1
+                "punpcklbw          %%xmm0,     %%xmm4\n"
+                "punpcklwd          %%xmm5,     %%xmm3\n"
+                "punpckhqdq         %%xmm5,     %%xmm5\n"
+                "punpcklwd          %%xmm5,     %%xmm4\n"
+                /////////////////////////////////////////
+                "pmaddwd            %%xmm6,     %%xmm1\n"
+                "pmaddwd            %%xmm6,     %%xmm2\n"
+                "pmaddwd            %%xmm6,     %%xmm3\n"
+                "pmaddwd            %%xmm6,     %%xmm4\n"
+                "psubd             %%xmm10,     %%xmm1\n"
+                "psubd             %%xmm10,     %%xmm2\n"
+                "psubd             %%xmm11,     %%xmm3\n"
+                "psubd             %%xmm11,     %%xmm4\n"
+                "pslld                  $8,     %%xmm1\n"
+                "pslld                  $8,     %%xmm2\n"
+                "pslld                  $8,     %%xmm3\n"
+                "pslld                  $8,     %%xmm4\n"
+                "psrld                 $16,     %%xmm1\n"
+                "psrld                 $16,     %%xmm2\n"
+                "psrld                 $16,     %%xmm3\n"
+                "psrld                 $16,     %%xmm4\n"
+                "psubd             %%xmm14,     %%xmm1\n"
+                "psubd             %%xmm14,     %%xmm2\n"
+                "psubd             %%xmm14,     %%xmm3\n"
+                "psubd             %%xmm14,     %%xmm4\n"
+                "packssdw           %%xmm2,     %%xmm1\n"
+                "packssdw           %%xmm4,     %%xmm3\n"
+                "paddw             %%xmm15,     %%xmm1\n"
+                "paddw             %%xmm15,     %%xmm3\n"
+                "pmullw             %%xmm7,     %%xmm1\n"
+                "pmullw             %%xmm7,     %%xmm3\n"
+                "psraw                  $8,     %%xmm1\n"
+                "psraw                  $8,     %%xmm3\n"
+                "paddw              %%xmm9,     %%xmm1\n"
+                "paddw              %%xmm9,     %%xmm3\n"
+                "packuswb           %%xmm0,     %%xmm1\n"
+                "packuswb           %%xmm0,     %%xmm3\n"
+                "movq               %%xmm1,   (%[out])\n"
+                "movq               %%xmm3, (%[out_v])\n"
+#endif
+            :
+            : [in]    "r" (p_in),
+              [in_v]  "r" (p_in_v),
+              [x]     "r" (i_x),
+              [y]     "r" (i_y),
+              [cos]   "r" (i_cos),
+              [sin]   "r" (i_sin),
+              [sat]   "r" (i_sat),
+              [out]   "r" (p_out),
+              [out_v] "r" (p_out_v)
+            : "eax", "memory" );
+            p_in += 8;
+            p_in_v += 8;
+            p_out += 8;
+            p_out_v += 8;
+        }
+
+        p_line_end += 8;
+
+        for( ; p_in < p_line_end ; )
+        {
+            PLANAR_WRITE_UV();
+        }
+
+        p_in += p_pic->p[U_PLANE].i_pitch
+                - p_pic->p[U_PLANE].i_visible_pitch;
+        p_in_v += p_pic->p[V_PLANE].i_pitch
+                - p_pic->p[V_PLANE].i_visible_pitch;
+        p_out += p_outpic->p[U_PLANE].i_pitch
+                - p_outpic->p[U_PLANE].i_visible_pitch;
+        p_out_v += p_outpic->p[V_PLANE].i_pitch
+                    - p_outpic->p[V_PLANE].i_visible_pitch;
+    }
+
+    return VLC_SUCCESS;
+}
+#endif
+
 int planar_sat_hue_clip_C( picture_t * p_pic, picture_t * p_outpic, int i_sin, int i_cos,
                          int i_sat, int i_x, int i_y )
 {
diff --git a/modules/video_filter/adjust_sat_hue.h b/modules/video_filter/adjust_sat_hue.h
index 1e177fe..1ac51c2 100644
--- a/modules/video_filter/adjust_sat_hue.h
+++ b/modules/video_filter/adjust_sat_hue.h
@@ -39,6 +39,40 @@
  */
 
 /**
+ * SSE4.1 version for planar formats, i_sat > 256
+ */
+#ifdef CAN_COMPILE_SSE4_1
+int planar_sat_hue_clip_SSE41( picture_t * p_pic, picture_t * p_outpic,
+                               int i_sin, int i_cos, int i_sat, int i_x,
+                               int i_y );
+#endif
+
+/**
+ * SSE4.1 version for packed formats, i_sat > 256
+ */
+#ifdef CAN_COMPILE_SSE4_1
+int packed_sat_hue_clip_SSE41( picture_t * p_pic, picture_t * p_outpic,
+                               int i_sin, int i_cos, int i_sat, int i_x,
+                               int i_y );
+#endif
+
+/**
+ * SSE4.1 version for packed formats, i_sat <= 256
+ */
+#ifdef CAN_COMPILE_SSE4_1
+int packed_sat_hue_SSE41( picture_t * p_pic, picture_t * p_outpic, int i_sin,
+                          int i_cos, int i_sat, int i_x, int i_y );
+#endif
+
+/**
+ * SSE2 version for planar formats, i_sat <= 256
+ */
+#ifdef CAN_COMPILE_SSE2
+int planar_sat_hue_SSE2( picture_t * p_pic, picture_t * p_outpic, int i_sin,
+                         int i_cos, int i_sat, int i_x, int i_y );
+#endif
+
+/**
  * Basic C compiler generated function for planar format, i_sat > 256
  */
 int planar_sat_hue_clip_C( picture_t * p_pic, picture_t * p_outpic,
-- 
1.7.1