[vlc-devel] [PATCH-fixed] Added assembly (SSE2, SSE4.1) processing functions to adjust filter

Martin Briza Gamajun at seznam.cz
Thu Aug 25 00:11:59 CEST 2011


From: Martin Briza <xbriza00 at stud.fit.vutbr.cz>

---
  modules/video_filter/adjust.c         |   35 ++-
  modules/video_filter/adjust_sat_hue.c |  926 +++++++++++++++++++++++++++++++++
  modules/video_filter/adjust_sat_hue.h |   34 ++
  3 files changed, 991 insertions(+), 4 deletions(-)
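
The new SSE2/SSE4.1 routines vectorize the same per-pixel U/V math as the
existing C reference (planar_sat_hue_C / planar_sat_hue_clip_C and the packed
equivalents). For review, here is a rough scalar sketch of one U/V pair,
matching what the assembly below computes; the helper name adjust_uv is only
illustrative and is not part of the patch:

    #include <stdint.h>

    /* Illustrative scalar equivalent of one U/V pair (not in the patch):
     * rotate the (U,V) vector by the hue angle (i_cos/i_sin in 8.8 fixed
     * point), recenter with the precomputed i_x/i_y offsets, scale by i_sat
     * (also 8.8) and re-bias around 128. The *_clip_* variants additionally
     * saturate to [0,255], which the SSE4.1 code gets from packusdw/packuswb. */
    static inline void adjust_uv( uint8_t *u, uint8_t *v, int i_sin, int i_cos,
                                  int i_sat, int i_x, int i_y )
    {
        int t_u = *u, t_v = *v;
        int out_u = ( ( ( t_u * i_cos + t_v * i_sin - i_x ) >> 8 ) * i_sat >> 8 ) + 128;
        int out_v = ( ( ( t_v * i_cos - t_u * i_sin - i_y ) >> 8 ) * i_sat >> 8 ) + 128;
        *u = out_u < 0 ? 0 : out_u > 255 ? 255 : out_u;
        *v = out_v < 0 ? 0 : out_v > 255 ? 255 : out_v;
    }

The SSE4.1 paths do this on four 32-bit lanes at a time with pmulld; the SSE2
planar path packs cos/sin into words and uses pmaddwd instead.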

diff --git a/modules/video_filter/adjust.c b/modules/video_filter/adjust.c
index c2f6649..a741d24 100644
--- a/modules/video_filter/adjust.c
+++ b/modules/video_filter/adjust.c
@@ -166,15 +166,42 @@ static int Create( vlc_object_t *p_this )
          CASE_PLANAR_YUV
              /* Planar YUV */
              p_filter->pf_video_filter = FilterPlanar;
-            p_sys->pf_process_sat_hue_clip = planar_sat_hue_clip_C;
-            p_sys->pf_process_sat_hue = planar_sat_hue_C;
+#ifdef CAN_COMPILE_SSE4_1
+            if (vlc_CPU() & CPU_CAPABILITY_SSE4_1)
+            {
+                p_sys->pf_process_sat_hue_clip = planar_sat_hue_clip_SSE41;
+                p_sys->pf_process_sat_hue = planar_sat_hue_SSE2;
+            }
+            else
+#elif defined( CAN_COMPILE_SSE2 )
+            if (vlc_CPU() & CPU_CAPABILITY_SSE2)
+            {
+                p_sys->pf_process_sat_hue_clip = planar_sat_hue_clip_C;
+                p_sys->pf_process_sat_hue = planar_sat_hue_SSE2;
+            }
+            else
+#endif
+            {
+                p_sys->pf_process_sat_hue_clip = planar_sat_hue_clip_C;
+                p_sys->pf_process_sat_hue = planar_sat_hue_C;
+            }
              break;

          CASE_PACKED_YUV_422
              /* Packed YUV 4:2:2 */
              p_filter->pf_video_filter = FilterPacked;
-            p_sys->pf_process_sat_hue_clip = packed_sat_hue_clip_C;
-            p_sys->pf_process_sat_hue = packed_sat_hue_C;
+#ifdef CAN_COMPILE_SSE4_1
+            if (vlc_CPU() & CPU_CAPABILITY_SSE4_1)
+            {
+                p_sys->pf_process_sat_hue_clip = packed_sat_hue_clip_SSE41;
+                p_sys->pf_process_sat_hue = packed_sat_hue_SSE41;
+            }
+            else
+#endif
+            {
+                p_sys->pf_process_sat_hue_clip = packed_sat_hue_clip_C;
+                p_sys->pf_process_sat_hue = packed_sat_hue_C;
+            }
              break;

          default:
diff --git a/modules/video_filter/adjust_sat_hue.c b/modules/video_filter/adjust_sat_hue.c
index cbc6f13..75c783f 100644
--- a/modules/video_filter/adjust_sat_hue.c
+++ b/modules/video_filter/adjust_sat_hue.c
@@ -66,10 +66,936 @@
  #define ADJUST_4_TIMES(x) x; x; x; x
  #define ADJUST_8_TIMES(x) x; x; x; x; x; x; x; x

+#ifdef _WIN64
+#define STORE_XMM_REGISTERS \
+    static uint64_t xmm_temporary_storage[32] __attribute__((aligned(16))); \
+    __asm__ volatile( \
+        "movdqa              %%xmm0,    (%[x])\n" \
+        "movdqa              %%xmm1,  16(%[x])\n" \
+        "movdqa              %%xmm2,  32(%[x])\n" \
+        "movdqa              %%xmm3,  48(%[x])\n" \
+        "movdqa              %%xmm4,  64(%[x])\n" \
+        "movdqa              %%xmm5,  80(%[x])\n" \
+        "movdqa              %%xmm6,  96(%[x])\n" \
+        "movdqa              %%xmm7, 112(%[x])\n" \
+        "movdqa              %%xmm8, 128(%[x])\n" \
+        "movdqa              %%xmm9, 144(%[x])\n" \
+        "movdqa             %%xmm10, 160(%[x])\n" \
+        "movdqa             %%xmm11, 176(%[x])\n" \
+        "movdqa             %%xmm12, 192(%[x])\n" \
+        "movdqa             %%xmm13, 208(%[x])\n" \
+        "movdqa             %%xmm14, 224(%[x])\n" \
+        "movdqa             %%xmm15, 240(%[x])\n" \
+    : \
+    : [x] "r" (xmm_temporary_storage) \
+    : "memory" \
+    )
+#else
+#define STORE_XMM_REGISTERS
+#endif
+
+#ifdef _WIN64
+#define RESTORE_XMM_REGISTERS \
+    __asm__ volatile( \
+        "movdqa              (%[x]),    %%xmm0\n" \
+        "movdqa            16(%[x]),    %%xmm1\n" \
+        "movdqa            32(%[x]),    %%xmm2\n" \
+        "movdqa            48(%[x]),    %%xmm3\n" \
+        "movdqa            64(%[x]),    %%xmm4\n" \
+        "movdqa            80(%[x]),    %%xmm5\n" \
+        "movdqa            96(%[x]),    %%xmm6\n" \
+        "movdqa           112(%[x]),    %%xmm7\n" \
+        "movdqa           128(%[x]),    %%xmm8\n" \
+        "movdqa           144(%[x]),    %%xmm9\n" \
+        "movdqa           160(%[x]),   %%xmm10\n" \
+        "movdqa           176(%[x]),   %%xmm11\n" \
+        "movdqa           192(%[x]),   %%xmm12\n" \
+        "movdqa           208(%[x]),   %%xmm13\n" \
+        "movdqa           224(%[x]),   %%xmm14\n" \
+        "movdqa           240(%[x]),   %%xmm15\n" \
+    : \
+    : [x] "r" (xmm_temporary_storage) \
+    : "memory" \
+    )
+#else
+#define RESTORE_XMM_REGISTERS
+#endif
+
  /*****************************************************************************
   * Hue and saturation adjusting routines
   *****************************************************************************/

+#ifdef CAN_COMPILE_SSE4_1
+int planar_sat_hue_clip_SSE41( picture_t * p_pic, picture_t * p_outpic,
+                                int i_sin, int i_cos, int i_sat, int i_x,
+                                int i_y )
+{
+    uint8_t *p_in, *p_in_v, *p_in_end, *p_line_end;
+    uint8_t *p_out, *p_out_v;
+
+    p_in = p_pic->p[U_PLANE].p_pixels;
+    p_in_v = p_pic->p[V_PLANE].p_pixels;
+    p_in_end = p_in + p_pic->p[U_PLANE].i_visible_lines
+                      * p_pic->p[U_PLANE].i_pitch - 8;
+
+    p_out = p_outpic->p[U_PLANE].p_pixels;
+    p_out_v = p_outpic->p[V_PLANE].p_pixels;
+
+    uint8_t i_u, i_v;
+
+    STORE_XMM_REGISTERS;
+
+#if defined(__x86_64__)
+    __asm__ volatile(
+        "pxor               %%xmm7,     %%xmm7\n"
+        "movd               %[cos],     %%xmm8\n"
+        "movd               %[sin],     %%xmm9\n"
+        "movd                 %[x],    %%xmm10\n"
+        "movd                 %[y],    %%xmm11\n"
+        "movd               %[sat],    %%xmm12\n"
+        "pshufd     $0,     %%xmm8,     %%xmm8\n"
+        "pshufd     $0,     %%xmm9,     %%xmm9\n"
+        "pshufd     $0,    %%xmm10,    %%xmm10\n"
+        "pshufd     $0,    %%xmm11,    %%xmm11\n"
+        "pshufd     $0,    %%xmm12,    %%xmm12\n"
+    :
+    : [x]     "r" (i_x),
+      [y]     "r" (i_y),
+      [cos]   "r" (i_cos),
+      [sin]   "r" (i_sin),
+      [sat]   "r" (i_sat)
+    : "eax", "memory" );
+#elif defined (__i386__)
+    __asm__ volatile(
+        "pxor               %%xmm7,     %%xmm7\n"
+        "movd               %[cos],     %%xmm4\n"
+        "movd               %[sin],     %%xmm5\n"
+        "movd               %[sat],     %%xmm6\n"
+        "pshufd     $0,     %%xmm4,     %%xmm4\n"
+        "pshufd     $0,     %%xmm5,     %%xmm5\n"
+        "pshufd     $0,     %%xmm6,     %%xmm6\n"
+    :
+    : [cos]   "r" (i_cos),
+      [sin]   "r" (i_sin),
+      [sat]   "r" (i_sat)
+    : "eax", "memory" );
+#endif
+
+    for( ; p_in < p_in_end ; )
+    {
+        p_line_end = p_in + p_pic->p[U_PLANE].i_visible_pitch - 8;
+
+        for( ; p_in < p_line_end ; )
+        {
+            /* Do 8 pixels at a time */
+            ADJUST_2_TIMES(
+                __asm__ __volatile__(
+                    "movd              (%[in]),     %%xmm0\n"
+                    "movd            (%[in_v]),     %%xmm1\n"
+                    "punpcklbw          %%xmm7,     %%xmm0\n"
+                    "punpcklwd          %%xmm7,     %%xmm0\n"
+                    "punpcklbw          %%xmm7,     %%xmm1\n"
+                    "punpcklwd          %%xmm7,     %%xmm1\n"
+                :
+                : [in]    "r" (p_in),
+                  [in_v]  "r" (p_in_v)
+                : "memory" );
+#if defined(__x86_64__)
+                __asm__ __volatile__(
+                    "movdqu             %%xmm0,     %%xmm2\n"
+                    "movdqu             %%xmm1,     %%xmm3\n"
+
+                    "pmulld             %%xmm8,     %%xmm0\n"
+                    "pmulld             %%xmm9,     %%xmm1\n"
+                    "paddd              %%xmm1,     %%xmm0\n"
+                    "psubd             %%xmm10,     %%xmm0\n"
+                    "psrad                  $8,     %%xmm0\n"
+                    "pmulld            %%xmm12,     %%xmm0\n"
+                    "psrad                  $8,     %%xmm0\n"
+
+                    "pcmpeqd            %%xmm1,     %%xmm1\n"
+                    "psrld                 $31,     %%xmm1\n"
+                    "pslld                  $7,     %%xmm1\n"
+                    "paddd              %%xmm1,     %%xmm0\n"
+                    /* out_u stays in xmm0 */
+
+                    "pmulld             %%xmm8,     %%xmm3\n"
+                    "pmulld             %%xmm9,     %%xmm2\n"
+                    "psubd              %%xmm2,     %%xmm3\n"
+                    "psubd             %%xmm11,     %%xmm3\n"
+                    "psrad                  $8,     %%xmm3\n"
+                    "pmulld            %%xmm12,     %%xmm3\n"
+                    "psrad                  $8,     %%xmm3\n"
+
+                    "paddd              %%xmm1,     %%xmm3\n"
+                    /* out_v stays in xmm3 */
+                :
+                : );
+#elif defined (__i386__)
+                __asm__ __volatile__(
+                    "movdqu             %%xmm0,     %%xmm2\n"
+                    "movdqu             %%xmm1,     %%xmm3\n"
+
+                    "movd                 %[x],     %%xmm7\n"
+                    "pshufd      $0,    %%xmm7,     %%xmm7\n"
+
+                    "pmulld             %%xmm4,     %%xmm0\n"
+                    "pmulld             %%xmm5,     %%xmm1\n"
+                    "paddd              %%xmm1,     %%xmm0\n"
+                    "psubd              %%xmm7,     %%xmm0\n"
+                    "psrad                  $8,     %%xmm0\n"
+                    "pmulld             %%xmm6,     %%xmm0\n"
+                    "psrad                  $8,     %%xmm0\n"
+
+                    "pcmpeqd            %%xmm1,     %%xmm1\n"
+                    "psrld                 $31,     %%xmm1\n"
+                    "pslld                  $7,     %%xmm1\n"
+                    "paddd              %%xmm1,     %%xmm0\n"
+                    /* out_u stays in xmm0 */
+
+                    "movd                 %[y],     %%xmm7\n"
+                    "pshufd      $0,    %%xmm7,     %%xmm7\n"
+
+                    "pmulld             %%xmm4,     %%xmm3\n"
+                    "pmulld             %%xmm5,     %%xmm2\n"
+                    "psubd              %%xmm2,     %%xmm3\n"
+                    "psubd              %%xmm7,     %%xmm3\n"
+                    "psrad                  $8,     %%xmm3\n"
+                    "pmulld             %%xmm6,     %%xmm3\n"
+                    "psrad                  $8,     %%xmm3\n"
+
+                    "paddd              %%xmm1,     %%xmm3\n"
+
+                    /* out_v stays in xmm3 */
+                    "pxor               %%xmm7,     %%xmm7\n"
+                :
+                : [x]     "r" (i_x),
+                  [y]     "r" (i_y)
+                : "memory" );
+#endif
+                __asm__ __volatile__(
+                    /* pack and saturate (if there is something to saturate) and
+                    * store in destination */
+                    "packusdw           %%xmm7,     %%xmm0\n"
+                    "packuswb           %%xmm7,     %%xmm0\n"
+                    "movd               %%xmm0,   (%[out])\n"
+                    "packusdw           %%xmm7,     %%xmm3\n"
+                    "packuswb           %%xmm7,     %%xmm3\n"
+                    "movd               %%xmm3, (%[out_v])\n"
+                :
+                : [out]   "r" (p_out),
+                  [out_v] "r" (p_out_v)
+                : "eax", "memory" );
+                p_in += 4;
+                p_in_v += 4;
+                p_out += 4;
+                p_out_v += 4;
+            );
+        }
+
+        p_line_end += 8;
+
+        for( ; p_in < p_line_end ; )
+        {
+            PLANAR_WRITE_UV_CLIP();
+        }
+
+        p_in += p_pic->p[U_PLANE].i_pitch
+                - p_pic->p[U_PLANE].i_visible_pitch;
+        p_in_v += p_pic->p[V_PLANE].i_pitch
+                - p_pic->p[V_PLANE].i_visible_pitch;
+        p_out += p_outpic->p[U_PLANE].i_pitch
+                - p_outpic->p[U_PLANE].i_visible_pitch;
+        p_out_v += p_outpic->p[V_PLANE].i_pitch
+                    - p_outpic->p[V_PLANE].i_visible_pitch;
+    }
+
+    RESTORE_XMM_REGISTERS;
+
+    return VLC_SUCCESS;
+}
+#endif
+
+#ifdef CAN_COMPILE_SSE4_1
+int packed_sat_hue_clip_SSE41( picture_t * p_pic, picture_t * p_outpic, int i_sin, int i_cos,
+                         int i_sat, int i_x, int i_y )
+{
+    uint8_t *p_in, *p_in_v, *p_in_end, *p_line_end;
+    uint8_t *p_out, *p_out_v;
+
+    int i_y_offset, i_u_offset, i_v_offset;
+    int i_visible_lines, i_pitch, i_visible_pitch;
+
+
+    if ( GetPackedYuvOffsets( p_pic->format.i_chroma, &i_y_offset,
+                              &i_u_offset, &i_v_offset ) != VLC_SUCCESS )
+        return VLC_EGENERIC;
+
+    i_visible_lines = p_pic->p->i_visible_lines;
+    i_pitch = p_pic->p->i_pitch;
+    i_visible_pitch = p_pic->p->i_visible_pitch;
+
+    p_in = p_pic->p->p_pixels + i_u_offset;
+    p_in_v = p_pic->p->p_pixels + i_v_offset;
+    p_in_end = p_in + i_visible_lines * i_pitch - 8 * 4;
+
+    p_out = p_outpic->p->p_pixels + i_u_offset;
+    p_out_v = p_outpic->p->p_pixels + i_v_offset;
+
+    uint8_t i_u, i_v;
+
+    STORE_XMM_REGISTERS;
+
+#if defined(__x86_64__)
+    __asm__ volatile(
+        "pxor               %%xmm7,     %%xmm7\n"
+        "movd               %[cos],     %%xmm8\n"
+        "movd               %[sin],     %%xmm9\n"
+        "movd                 %[x],    %%xmm10\n"
+        "movd                 %[y],    %%xmm11\n"
+        "movd               %[sat],    %%xmm12\n"
+        "pshufd     $0,     %%xmm8,     %%xmm8\n"
+        "pshufd     $0,     %%xmm9,     %%xmm9\n"
+        "pshufd     $0,    %%xmm10,    %%xmm10\n"
+        "pshufd     $0,    %%xmm11,    %%xmm11\n"
+        "pshufd     $0,    %%xmm12,    %%xmm12\n"
+    :
+    : [x]     "r" (i_x),
+      [y]     "r" (i_y),
+      [cos]   "r" (i_cos),
+      [sin]   "r" (i_sin),
+      [sat]   "r" (i_sat)
+    : "eax", "memory" );
+#elif defined (__i386__)
+    __asm__ volatile(
+        "pxor               %%xmm7,     %%xmm7\n"
+        "movd               %[cos],     %%xmm4\n"
+        "movd               %[sin],     %%xmm5\n"
+        "movd               %[sat],     %%xmm6\n"
+        "pshufd     $0,     %%xmm4,     %%xmm4\n"
+        "pshufd     $0,     %%xmm5,     %%xmm5\n"
+        "pshufd     $0,     %%xmm6,     %%xmm6\n"
+    :
+    : [cos]   "r" (i_cos),
+      [sin]   "r" (i_sin),
+      [sat]   "r" (i_sat)
+    : "eax", "memory" );
+#endif
+
+    for( ; p_in < p_in_end ; )
+    {
+        p_line_end = p_in + p_pic->p->i_visible_pitch - 8 * 4;
+
+        for( ; p_in < p_line_end ; )
+        {
+            ADJUST_2_TIMES(
+            /* Do 8 pixels at a time */
+                __asm__ __volatile__ (
+                    "movdqu            (%[in]),     %%xmm0\n"
+                    "movdqu          (%[in_v]),     %%xmm1\n"
+                    "pcmpeqd            %%xmm2,     %%xmm2\n" // 0xFFFFFFFF
+                    "psrld                 $24,     %%xmm2\n" // 0x000000FF
+                    "pand               %%xmm2,     %%xmm0\n" // mask out unnecessary data
+                    "pand               %%xmm2,     %%xmm1\n"
+                :
+                : [in]    "r" (p_in),
+                  [in_v]  "r" (p_in_v)
+                : "memory" );
+
+#if defined(__x86_64__)
+                __asm__ __volatile__ (
+                    "movdqu             %%xmm0,     %%xmm2\n"
+                    "movdqu             %%xmm1,     %%xmm3\n"
+
+                    "pmulld             %%xmm8,     %%xmm0\n"
+                    "pmulld             %%xmm9,     %%xmm1\n"
+                    "paddd              %%xmm1,     %%xmm0\n"
+                    "psubd             %%xmm10,     %%xmm0\n"
+                    "psrad                  $8,     %%xmm0\n"
+                    "pmulld            %%xmm12,     %%xmm0\n"
+                    "psrad                  $8,     %%xmm0\n"
+
+                    "pcmpeqd            %%xmm1,     %%xmm1\n"
+                    "psrld                 $31,     %%xmm1\n"
+                    "pslld                  $7,     %%xmm1\n"
+                    "paddd              %%xmm1,     %%xmm0\n"
+                    // out_u stays in xmm0
+
+                    "pmulld             %%xmm8,     %%xmm3\n"
+                    "pmulld             %%xmm9,     %%xmm2\n"
+                    "psubd              %%xmm2,     %%xmm3\n"
+                    "psubd             %%xmm11,     %%xmm3\n"
+                    "psrad                  $8,     %%xmm3\n"
+                    "pmulld            %%xmm12,     %%xmm3\n"
+                    "psrad                  $8,     %%xmm3\n"
+
+                    "paddd              %%xmm1,     %%xmm3\n"
+                    // out_v stays in xmm3
+                :
+                : );
+#elif defined (__i386__)
+                __asm__ __volatile__ (
+                    "movdqu             %%xmm0,     %%xmm2\n"
+                    "movdqu             %%xmm1,     %%xmm3\n"
+
+                    "movd                 %[x],     %%xmm7\n"
+                    "pshufd      $0,    %%xmm7,     %%xmm7\n"
+
+                    "pmulld             %%xmm4,     %%xmm0\n"
+                    "pmulld             %%xmm5,     %%xmm1\n"
+                    "paddd              %%xmm1,     %%xmm0\n"
+                    "psubd              %%xmm7,     %%xmm0\n"
+                    "psrad                  $8,     %%xmm0\n"
+                    "pmulld             %%xmm6,     %%xmm0\n"
+                    "psrad                  $8,     %%xmm0\n"
+
+                    "pcmpeqd            %%xmm1,     %%xmm1\n"
+                    "psrld                 $31,     %%xmm1\n"
+                    "pslld                  $7,     %%xmm1\n"
+                    "paddd              %%xmm1,     %%xmm0\n"
+                    // out_u stays in xmm0
+
+
+                    "movd                 %[y],     %%xmm7\n"
+                    "pshufd      $0,    %%xmm7,     %%xmm7\n"
+
+                    "pmulld             %%xmm4,     %%xmm3\n"
+                    "pmulld             %%xmm5,     %%xmm2\n"
+                    "psubd              %%xmm2,     %%xmm3\n"
+                    "psubd              %%xmm7,     %%xmm3\n"
+                    "psrad                  $8,     %%xmm3\n"
+                    "pmulld             %%xmm6,     %%xmm3\n"
+                    "psrad                  $8,     %%xmm3\n"
+
+                    "paddd              %%xmm1,     %%xmm3\n"
+
+                    // out_v stays in xmm3
+                    "pxor               %%xmm7,     %%xmm7\n"
+                :
+                : [x]     "r" (i_x),
+                  [y]     "r" (i_y)
+                : "memory");
+#endif
+                __asm__ __volatile__ (
+                    "packusdw           %%xmm7,     %%xmm0\n"
+                    "packuswb           %%xmm7,     %%xmm0\n"
+                    "punpcklbw          %%xmm7,     %%xmm0\n"
+                    "punpcklwd          %%xmm7,     %%xmm0\n"
+                    "packusdw           %%xmm7,     %%xmm3\n"
+                    "packuswb           %%xmm7,     %%xmm3\n"
+                    "punpcklbw          %%xmm7,     %%xmm3\n"
+                    "punpcklwd          %%xmm7,     %%xmm3\n"
+
+                    "movdqu           (%[out]),     %%xmm1\n"
+                    "pcmpeqd            %%xmm2,     %%xmm2\n"
+                    "pslld                  $8,     %%xmm2\n"
+                    "pand               %%xmm1,     %%xmm2\n"
+                    "por                %%xmm0,     %%xmm2\n"
+                    "movdqu             %%xmm2,   (%[out])\n"
+
+                    "movdqu         (%[out_v]),     %%xmm1\n"
+                    "pcmpeqd            %%xmm2,     %%xmm2\n"
+                    "pslld                  $8,     %%xmm2\n"
+                    "pand               %%xmm1,     %%xmm2\n"
+                    "por                %%xmm3,     %%xmm2\n"
+                    "movdqu             %%xmm2, (%[out_v])\n"
+                :
+                : [out]   "r" (p_out),
+                  [out_v] "r" (p_out_v)
+                : "memory" );
+                p_in += 4;
+                p_in_v += 4;
+                p_out += 4;
+                p_out_v += 4;
+            );
+        }
+
+        p_line_end += 8 * 4;
+
+        for( ; p_in < p_line_end ; )
+        {
+            PACKED_WRITE_UV_CLIP();
+        }
+
+        p_in += p_pic->p->i_pitch - p_pic->p->i_visible_pitch;
+        p_in_v += p_pic->p->i_pitch - p_pic->p->i_visible_pitch;
+        p_out += p_pic->p->i_pitch - p_pic->p->i_visible_pitch;
+        p_out_v += p_pic->p->i_pitch - p_pic->p->i_visible_pitch;
+    }
+
+    RESTORE_XMM_REGISTERS;
+
+    return VLC_SUCCESS;
+}
+#endif
+
+#ifdef CAN_COMPILE_SSE4_1
+int packed_sat_hue_SSE41( picture_t * p_pic, picture_t * p_outpic, int i_sin,
+                      int i_cos, int i_sat, int i_x, int i_y )
+{
+    uint8_t *p_in, *p_in_v, *p_in_end, *p_line_end;
+    uint8_t *p_out, *p_out_v;
+
+    int i_y_offset, i_u_offset, i_v_offset;
+    int i_visible_lines, i_pitch, i_visible_pitch;
+
+    if ( GetPackedYuvOffsets( p_pic->format.i_chroma, &i_y_offset,
+                              &i_u_offset, &i_v_offset ) != VLC_SUCCESS )
+        return VLC_EGENERIC;
+
+    i_visible_lines = p_pic->p->i_visible_lines;
+    i_pitch = p_pic->p->i_pitch;
+    i_visible_pitch = p_pic->p->i_visible_pitch;
+
+    p_in = p_pic->p->p_pixels + i_u_offset;
+    p_in_v = p_pic->p->p_pixels + i_v_offset;
+    p_in_end = p_in + i_visible_lines * i_pitch - 8 * 4;
+
+    p_out = p_outpic->p->p_pixels + i_u_offset;
+    p_out_v = p_outpic->p->p_pixels + i_v_offset;
+
+    uint8_t i_u, i_v;
+
+    STORE_XMM_REGISTERS;
+
+#if defined(__x86_64__)
+    __asm__ volatile(
+        "pxor               %%xmm7,     %%xmm7\n"
+        "movd               %[cos],     %%xmm8\n"
+        "movd               %[sin],     %%xmm9\n"
+        "movd                 %[x],    %%xmm10\n"
+        "movd                 %[y],    %%xmm11\n"
+        "movd               %[sat],    %%xmm12\n"
+        "pshufd     $0,     %%xmm8,     %%xmm8\n"
+        "pshufd     $0,     %%xmm9,     %%xmm9\n"
+        "pshufd     $0,    %%xmm10,    %%xmm10\n"
+        "pshufd     $0,    %%xmm11,    %%xmm11\n"
+        "pshufd     $0,    %%xmm12,    %%xmm12\n"
+    :
+    : [x]     "r" (i_x),
+      [y]     "r" (i_y),
+      [cos]   "r" (i_cos),
+      [sin]   "r" (i_sin),
+      [sat]   "r" (i_sat)
+    : "eax", "memory" );
+#elif defined (__i386__)
+    __asm__ volatile(
+        "pxor               %%xmm7,     %%xmm7\n"
+        "movd               %[cos],     %%xmm4\n"
+        "movd               %[sin],     %%xmm5\n"
+        "movd               %[sat],     %%xmm6\n"
+        "pshufd     $0,     %%xmm4,     %%xmm4\n"
+        "pshufd     $0,     %%xmm5,     %%xmm5\n"
+        "pshufd     $0,     %%xmm6,     %%xmm6\n"
+    :
+    : [cos]   "r" (i_cos),
+      [sin]   "r" (i_sin),
+      [sat]   "r" (i_sat)
+    : "eax", "memory" );
+#endif
+
+    for( ; p_in < p_in_end ; )
+    {
+        p_line_end = p_in + i_visible_pitch - 8 * 4;
+
+        for( ; p_in < p_line_end ; )
+        {
+            ADJUST_2_TIMES(
+            /* Do 8 pixels at a time */
+                __asm__ __volatile__(
+                    "movdqu            (%[in]),     %%xmm0\n"
+                    "movdqu          (%[in_v]),     %%xmm1\n"
+                    "pcmpeqd            %%xmm2,     %%xmm2\n" // 0xFFFFFFFF
+                    "psrld                 $24,     %%xmm2\n" // 0x000000FF
+                    "pand               %%xmm2,     %%xmm0\n" // mask out unnecessary data
+                    "pand               %%xmm2,     %%xmm1\n"
+                :
+                : [in]    "r" (p_in),
+                  [in_v]  "r" (p_in_v)
+                : "memory" );
+#if defined(__x86_64__)
+                __asm__ __volatile__(
+                    "movdqu             %%xmm0,     %%xmm2\n"
+                    "movdqu             %%xmm1,     %%xmm3\n"
+
+                    "pmulld             %%xmm8,     %%xmm0\n"
+                    "pmulld             %%xmm9,     %%xmm1\n"
+                    "paddd              %%xmm1,     %%xmm0\n"
+                    "psubd             %%xmm10,     %%xmm0\n"
+                    "psrad                  $8,     %%xmm0\n"
+                    "pmulld            %%xmm12,     %%xmm0\n"
+                    "psrad                  $8,     %%xmm0\n"
+
+                    "pcmpeqd            %%xmm1,     %%xmm1\n"
+                    "psrld                 $31,     %%xmm1\n"
+                    "pslld                  $7,     %%xmm1\n"
+                    "paddd              %%xmm1,     %%xmm0\n"
+                    // out_u stays in xmm0
+
+                    "pmulld             %%xmm8,     %%xmm3\n"
+                    "pmulld             %%xmm9,     %%xmm2\n"
+                    "psubd              %%xmm2,     %%xmm3\n"
+                    "psubd             %%xmm11,     %%xmm3\n"
+                    "psrad                  $8,     %%xmm3\n"
+                    "pmulld            %%xmm12,     %%xmm3\n"
+                    "psrad                  $8,     %%xmm3\n"
+
+                    "paddd              %%xmm1,     %%xmm3\n"
+                    // out_v stays in xmm3
+                :
+                : );
+#elif defined (__i386__)
+                __asm__ __volatile__(
+                    "movdqu             %%xmm0,     %%xmm2\n"
+                    "movdqu             %%xmm1,     %%xmm3\n"
+
+                    "movd                 %[x],     %%xmm7\n"
+                    "pshufd      $0,    %%xmm7,     %%xmm7\n"
+
+                    "pmulld             %%xmm4,     %%xmm0\n"
+                    "pmulld             %%xmm5,     %%xmm1\n"
+                    "paddd              %%xmm1,     %%xmm0\n"
+                    "psubd              %%xmm7,     %%xmm0\n"
+                    "psrad                  $8,     %%xmm0\n"
+                    "pmulld             %%xmm6,     %%xmm0\n"
+                    "psrad                  $8,     %%xmm0\n"
+
+                    "pcmpeqd            %%xmm1,     %%xmm1\n"
+                    "psrld                 $31,     %%xmm1\n"
+                    "pslld                  $7,     %%xmm1\n"
+                    "paddd              %%xmm1,     %%xmm0\n"
+                    // out_u stays in xmm0
+
+
+                    "movd                 %[y],     %%xmm7\n"
+                    "pshufd      $0,    %%xmm7,     %%xmm7\n"
+
+                    "pmulld             %%xmm4,     %%xmm3\n"
+                    "pmulld             %%xmm5,     %%xmm2\n"
+                    "psubd              %%xmm2,     %%xmm3\n"
+                    "psubd              %%xmm7,     %%xmm3\n"
+                    "psrad                  $8,     %%xmm3\n"
+                    "pmulld             %%xmm6,     %%xmm3\n"
+                    "psrad                  $8,     %%xmm3\n"
+
+                    "paddd              %%xmm1,     %%xmm3\n"
+
+                    // out_v stays in xmm3
+                    "pxor               %%xmm7,     %%xmm7\n"
+                :
+                : [x]     "r" (i_x),
+                  [y]     "r" (i_y)
+                : "memory" );
+#endif
+                __asm__ __volatile__(
+                    "pcmpeqd            %%xmm2,     %%xmm2\n" // 0xFFFFFFFF
+                    "psrld                 $24,     %%xmm2\n" // 0x000000FF
+                    "pand               %%xmm2,     %%xmm0\n" // mask out unnecessary data
+                    "pand               %%xmm2,     %%xmm3\n"
+
+                    "movdqu           (%[out]),     %%xmm1\n"
+                    "pcmpeqd            %%xmm2,     %%xmm2\n"
+                    "pslld                  $8,     %%xmm2\n"
+                    "pand               %%xmm1,     %%xmm2\n"
+                    "por                %%xmm0,     %%xmm2\n"
+                    "movdqu             %%xmm2,   (%[out])\n"
+
+                    "movdqu         (%[out_v]),     %%xmm1\n"
+                    "pcmpeqd            %%xmm2,     %%xmm2\n"
+                    "pslld                  $8,     %%xmm2\n"
+                    "pand               %%xmm1,     %%xmm2\n"
+                    "por                %%xmm3,     %%xmm2\n"
+                    "movdqu             %%xmm2, (%[out_v])\n"
+                :
+                : [out]   "r" (p_out),
+                  [out_v] "r" (p_out_v)
+                : "memory" );
+                p_in += 4;
+                p_in_v += 4;
+                p_out += 4;
+                p_out_v += 4;
+            );
+        }
+
+        p_line_end += 8 * 4;
+
+        for( ; p_in < p_line_end ; )
+        {
+            PACKED_WRITE_UV();
+        }
+
+        p_in += i_pitch - i_visible_pitch;
+        p_in_v += i_pitch - i_visible_pitch;
+        p_out += i_pitch - i_visible_pitch;
+        p_out_v += i_pitch - i_visible_pitch;
+    }
+
+    RESTORE_XMM_REGISTERS;
+
+    return VLC_SUCCESS;
+}
+#endif
+
+#ifdef CAN_COMPILE_SSE2
+int planar_sat_hue_SSE2( picture_t * p_pic, picture_t * p_outpic, int i_sin, int i_cos,
+                         int i_sat, int i_x, int i_y )
+{
+    uint8_t *p_in, *p_in_v, *p_in_end, *p_line_end;
+    uint8_t *p_out, *p_out_v;
+
+    p_in = p_pic->p[U_PLANE].p_pixels;
+    p_in_v = p_pic->p[V_PLANE].p_pixels;
+    p_in_end = p_in + p_pic->p[U_PLANE].i_visible_lines
+                      * p_pic->p[U_PLANE].i_pitch - 8;
+
+    p_out = p_outpic->p[U_PLANE].p_pixels;
+    p_out_v = p_outpic->p[V_PLANE].p_pixels;
+
+    uint8_t i_u, i_v;
+
+    STORE_XMM_REGISTERS;
+
+    __asm__ volatile(
+#if defined(__i386__)
+        "pxor               %%xmm0,     %%xmm0\n"
+        "movd               %[cos],     %%xmm3\n"
+        "movd               %[sin],     %%xmm4\n"
+        "pslld                 $16,     %%xmm4\n"
+        "pslld                 $16,     %%xmm3\n"
+        "psrld                 $16,     %%xmm3\n"
+        "pshufd      $0,    %%xmm3,     %%xmm3\n"
+        "pshufd      $0,    %%xmm4,     %%xmm4\n"
+        "por                %%xmm4,     %%xmm3\n"
+        "movd               %[sat],     %%xmm4\n"
+        "pshufd      $0,    %%xmm4,     %%xmm4\n"
+        "pcmpeqb            %%xmm6,     %%xmm6\n"
+        "psrlw                 $15,     %%xmm6\n"
+        "psllw                  $7,     %%xmm6\n"
+#elif defined(__x86_64__)
+        "pxor               %%xmm0,     %%xmm0\n"
+        "movd               %[cos],     %%xmm6\n"
+        "movd               %[sin],     %%xmm7\n"
+        "pslld                 $16,     %%xmm7\n"
+        "pslld                 $16,     %%xmm6\n"
+        "psrld                 $16,     %%xmm6\n"
+        "pshufd      $0,    %%xmm6,     %%xmm6\n"
+        "pshufd      $0,    %%xmm7,     %%xmm7\n"
+        "por                %%xmm7,     %%xmm6\n"
+        "movd               %[sat],     %%xmm7\n"
+        "movd                 %[x],    %%xmm10\n"
+        "movd                 %[y],    %%xmm11\n"
+        "pshufd      $0,   %%xmm10,    %%xmm10\n"
+        "pshufd      $0,   %%xmm11,    %%xmm11\n"
+        "pshufd      $0,    %%xmm7,     %%xmm7\n"
+
+        "pcmpeqb           %%xmm12,    %%xmm12\n"
+        "pcmpeqb            %%xmm9,     %%xmm9\n"
+        "pcmpeqb           %%xmm13,    %%xmm13\n"
+        "psrlw                 $15,     %%xmm9\n"
+        "psllw                  $7,     %%xmm9\n"
+        "psrlw                 $15,    %%xmm13\n"
+        "mov               $0x8000,      %%eax\n"
+        "movd                %%eax,    %%xmm14\n"
+        "pshufd      $0,   %%xmm14,    %%xmm14\n"
+        "mov           $0x80008000,      %%eax\n"
+        "movd                %%eax,    %%xmm15\n"
+        "pshufd      $0,   %%xmm15,    %%xmm15\n"
+#endif
+    :
+    :
+      [x]      "r" ( i_x ),
+      [y]      "r" ( i_y ),
+      [sat]    "r" ( i_sat * 0x10001 ),
+      [sin]    "r" ( i_sin ),
+      [cos]    "r" ( i_cos )
+    : "eax", "memory" );
+    for( ; p_in < p_in_end ; )
+    {
+        p_line_end = p_in + p_pic->p[U_PLANE].i_visible_pitch - 8;
+
+        for( ; p_in < p_line_end ; )
+        {
+            /* Do 8 pixels at a time */
+            __asm__ __volatile__ (
+                "movd              (%[in]),     %%xmm1\n"
+                "movd            (%[in_v]),     %%xmm2\n"
+                "punpcklbw          %%xmm0,     %%xmm1\n"
+                "punpcklbw          %%xmm0,     %%xmm2\n"
+                "punpcklwd          %%xmm2,     %%xmm1\n"
+                /////////////////////////////////////////
+                "movd             4(%[in]),     %%xmm2\n"
+                "movd           4(%[in_v]),     %%xmm5\n"
+                "punpcklbw          %%xmm0,     %%xmm2\n"
+                "punpcklbw          %%xmm0,     %%xmm5\n"
+                "punpcklwd          %%xmm5,     %%xmm2\n"
+            :
+            : [in]    "r" (p_in),
+              [in_v]  "r" (p_in_v)
+            : "memory" );
+#ifdef __i386__
+            __asm__ __volatile__ (
+                "pmaddwd            %%xmm3,     %%xmm1\n"
+                "pmaddwd            %%xmm3,     %%xmm2\n"
+                "movd                 %[x],     %%xmm5\n"
+                "pshufd      $0,    %%xmm5,     %%xmm5\n"
+                "psubd              %%xmm5,     %%xmm1\n"
+                "psubd              %%xmm5,     %%xmm2\n"
+                "pslld                  $8,     %%xmm1\n"
+                "psrld                 $16,     %%xmm1\n"
+                "pslld                  $8,     %%xmm2\n"
+                "psrld                 $16,     %%xmm2\n"
+                "mov               $0x8000,      %%eax\n"
+                "movd                %%eax,     %%xmm5\n"
+                "pshufd      $0,    %%xmm5,     %%xmm5\n"
+                "psubd              %%xmm5,     %%xmm2\n"
+                "psubd              %%xmm5,     %%xmm1\n"
+                "packssdw           %%xmm2,     %%xmm1\n"
+                "pshuflw     $0,    %%xmm5,     %%xmm5\n"
+                "pshufhw     $0,    %%xmm5,     %%xmm5\n"
+                "paddw              %%xmm5,     %%xmm1\n"
+                "pmullw             %%xmm4,     %%xmm1\n"
+                "psraw                  $8,     %%xmm1\n"
+                "paddw              %%xmm6,     %%xmm1\n"
+                /* store U channel */
+                "packuswb           %%xmm0,     %%xmm1\n"
+                "movq               %%xmm1,   (%[out])\n"
+            :
+            : [x]     "r" (i_x),
+              [out]   "r" (p_out)
+            : "eax", "memory" );
+            __asm__ __volatile__ (
+                /////////////////////////////////////////
+                "pcmpeqb            %%xmm6,     %%xmm6\n"
+                "movq              (%[in]),     %%xmm5\n"
+                "movd            (%[in_v]),     %%xmm1\n"
+                "movd           4(%[in_v]),     %%xmm2\n"
+                "punpcklbw          %%xmm0,     %%xmm5\n"
+                "pandn              %%xmm6,     %%xmm5\n"
+                "punpcklbw          %%xmm0,     %%xmm1\n"
+                "punpcklbw          %%xmm0,     %%xmm2\n"
+                "paddw              %%xmm6,     %%xmm5\n"
+                "psrlw                 $15,     %%xmm6\n"
+                "punpcklwd          %%xmm5,     %%xmm1\n"
+                "punpckhqdq         %%xmm5,     %%xmm5\n"
+                "punpcklwd          %%xmm5,     %%xmm2\n"
+
+                "psllw                  $7,     %%xmm6\n"
+                "pmaddwd            %%xmm3,     %%xmm1\n"
+                "pmaddwd            %%xmm3,     %%xmm2\n"
+                "movd                 %[y],     %%xmm5\n"
+                "pshufd      $0,    %%xmm5,     %%xmm5\n"
+                "psubd              %%xmm5,     %%xmm1\n"
+                "psubd              %%xmm5,     %%xmm2\n"
+                "pslld                  $8,     %%xmm1\n"
+                "psrld                 $16,     %%xmm1\n"
+                "pslld                  $8,     %%xmm2\n"
+                "psrld                 $16,     %%xmm2\n"
+
+                "mov               $0x8000,      %%eax\n"
+                "movd                %%eax,     %%xmm5\n"
+                "pshufd      $0,    %%xmm5,     %%xmm5\n"
+                "psubd              %%xmm5,     %%xmm2\n"
+                "psubd              %%xmm5,     %%xmm1\n"
+                "packssdw           %%xmm2,     %%xmm1\n"
+                "pshuflw     $0,    %%xmm5,     %%xmm5\n"
+                "pshufhw     $0,    %%xmm5,     %%xmm5\n"
+                "paddw              %%xmm5,     %%xmm1\n"
+                "pmullw             %%xmm4,     %%xmm1\n"
+                "psraw                  $8,     %%xmm1\n"
+                "paddw              %%xmm6,     %%xmm1\n"
+                "packuswb           %%xmm0,     %%xmm1\n"
+                "movq               %%xmm1,  (%[out_v])\n"
+            :
+            : [in]    "r" (p_in),
+              [in_v]  "r" (p_in_v),
+              [y]     "r" (i_y),
+              [out_v] "r" (p_out_v)
+            : "eax", "memory" );
+#elif defined(__x86_64__)
+            __asm__ __volatile__ (
+                /////////////////////////////////////////
+                "movq              (%[in]),     %%xmm5\n"
+                "movd            (%[in_v]),     %%xmm3\n"
+                "movd           4(%[in_v]),     %%xmm4\n"
+                "punpcklbw          %%xmm0,     %%xmm5\n"
+                "pandn             %%xmm12,     %%xmm5\n" // invert U (to be subtracted)
+                "punpcklbw          %%xmm0,     %%xmm3\n"
+                "paddw             %%xmm13,     %%xmm5\n" // add 1
+                "punpcklbw          %%xmm0,     %%xmm4\n"
+                "punpcklwd          %%xmm5,     %%xmm3\n"
+                "punpckhqdq         %%xmm5,     %%xmm5\n"
+                "punpcklwd          %%xmm5,     %%xmm4\n"
+                /////////////////////////////////////////
+                "pmaddwd            %%xmm6,     %%xmm1\n"
+                "pmaddwd            %%xmm6,     %%xmm2\n"
+                "pmaddwd            %%xmm6,     %%xmm3\n"
+                "pmaddwd            %%xmm6,     %%xmm4\n"
+                "psubd             %%xmm10,     %%xmm1\n"
+                "psubd             %%xmm10,     %%xmm2\n"
+                "psubd             %%xmm11,     %%xmm3\n"
+                "psubd             %%xmm11,     %%xmm4\n"
+                "pslld                  $8,     %%xmm1\n"
+                "pslld                  $8,     %%xmm2\n"
+                "pslld                  $8,     %%xmm3\n"
+                "pslld                  $8,     %%xmm4\n"
+                "psrld                 $16,     %%xmm1\n"
+                "psrld                 $16,     %%xmm2\n"
+                "psrld                 $16,     %%xmm3\n"
+                "psrld                 $16,     %%xmm4\n"
+                "psubd             %%xmm14,     %%xmm1\n"
+                "psubd             %%xmm14,     %%xmm2\n"
+                "psubd             %%xmm14,     %%xmm3\n"
+                "psubd             %%xmm14,     %%xmm4\n"
+                "packssdw           %%xmm2,     %%xmm1\n"
+                "packssdw           %%xmm4,     %%xmm3\n"
+                "paddw             %%xmm15,     %%xmm1\n"
+                "paddw             %%xmm15,     %%xmm3\n"
+                "pmullw             %%xmm7,     %%xmm1\n"
+                "pmullw             %%xmm7,     %%xmm3\n"
+                "psraw                  $8,     %%xmm1\n"
+                "psraw                  $8,     %%xmm3\n"
+                "paddw              %%xmm9,     %%xmm1\n"
+                "paddw              %%xmm9,     %%xmm3\n"
+                "packuswb           %%xmm0,     %%xmm1\n"
+                "packuswb           %%xmm0,     %%xmm3\n"
+                "movq               %%xmm1,   (%[out])\n"
+                "movq               %%xmm3, (%[out_v])\n"
+            :
+            : [in]    "r" (p_in),
+              [in_v]  "r" (p_in_v),
+              [out]   "r" (p_out),
+              [out_v] "r" (p_out_v)
+            : "eax", "memory" );
+#endif
+            p_in += 8;
+            p_in_v += 8;
+            p_out += 8;
+            p_out_v += 8;
+        }
+
+        p_line_end += 8;
+
+        for( ; p_in < p_line_end ; )
+        {
+            PLANAR_WRITE_UV();
+        }
+
+        p_in += p_pic->p[U_PLANE].i_pitch
+                - p_pic->p[U_PLANE].i_visible_pitch;
+        p_in_v += p_pic->p[V_PLANE].i_pitch
+                - p_pic->p[V_PLANE].i_visible_pitch;
+        p_out += p_outpic->p[U_PLANE].i_pitch
+                - p_outpic->p[U_PLANE].i_visible_pitch;
+        p_out_v += p_outpic->p[V_PLANE].i_pitch
+                    - p_outpic->p[V_PLANE].i_visible_pitch;
+    }
+
+    RESTORE_XMM_REGISTERS;
+
+    return VLC_SUCCESS;
+}
+#endif
+
  int planar_sat_hue_clip_C( picture_t * p_pic, picture_t * p_outpic, int i_sin, int i_cos,
                           int i_sat, int i_x, int i_y )
  {
diff --git a/modules/video_filter/adjust_sat_hue.h b/modules/video_filter/adjust_sat_hue.h
index d850dec..cef13ec 100644
--- a/modules/video_filter/adjust_sat_hue.h
+++ b/modules/video_filter/adjust_sat_hue.h
@@ -39,6 +39,40 @@
   */

  /**
+ * SSE4.1 version function for planar format, i_sat > 256
+ */
+#ifdef CAN_COMPILE_SSE4_1
+int planar_sat_hue_clip_SSE41( picture_t * p_pic, picture_t * p_outpic,
+                               int i_sin, int i_cos, int i_sat, int i_x,
+                               int i_y );
+#endif
+
+/**
+ * SSE4.1 version function for packed format, i_sat > 256
+ */
+#ifdef CAN_COMPILE_SSE4_1
+int packed_sat_hue_clip_SSE41( picture_t * p_pic, picture_t * p_outpic,
+                               int i_sin, int i_cos, int i_sat, int i_x,
+                               int i_y );
+#endif
+
+/**
+ * SSE4.1 version function for packed format, i_sat <= 256
+ */
+#ifdef CAN_COMPILE_SSE4_1
+int packed_sat_hue_SSE41( picture_t * p_pic, picture_t * p_outpic, int i_sin,
+                          int i_cos, int i_sat, int i_x, int i_y );
+#endif
+
+/**
+ * SSE2 version function for planar format, i_sat <= 256
+ */
+#ifdef CAN_COMPILE_SSE2
+int planar_sat_hue_SSE2( picture_t * p_pic, picture_t * p_outpic, int i_sin,
+                          int i_cos, int i_sat, int i_x, int i_y );
+#endif
+
+/**
   * Basic C compiler generated function for planar format, i_sat > 256
   */
  int planar_sat_hue_clip_C( picture_t * p_pic, picture_t * p_outpic,
-- 
1.7.1


