[vlc-devel] [PATCH] Add SSE2 and SSE4.1 paths to the adjust filter
gamajun at seznam.cz
Fri Jul 15 21:08:18 CEST 2011
From: Martin Briza <xbriza00 at stud.fit.vutbr.cz>
Added a new header file (modules/video_filter/adjust_asm.h) with the inline assembly helpers used by the filter.
---
modules/video_filter/adjust.c | 190 ++++++++++
modules/video_filter/adjust_asm.h | 731 +++++++++++++++++++++++++++++++++++++
2 files changed, 921 insertions(+), 0 deletions(-)
create mode 100644 modules/video_filter/adjust_asm.h
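
For reviewers: both SIMD paths vectorize the same scalar hue/saturation transform
that the existing WRITE_UV_CLIP macro applies to every U/V pair. A minimal scalar
sketch of that per-sample computation (the helper name adjust_uv_scalar is
illustrative only; clip_uint8_vlc() is the helper already used by adjust.c):

    /* Scalar reference for one U/V sample pair; i_cos, i_sin, i_x, i_y and
     * i_sat have the same meaning as in FilterPlanar() / FilterPacked(). */
    static inline void adjust_uv_scalar( uint8_t *pu, uint8_t *pv,
                                         int i_cos, int i_sin,
                                         int i_x, int i_y, int i_sat )
    {
        int i_u = *pu, i_v = *pv;
        *pu = clip_uint8_vlc( (( ((i_u * i_cos + i_v * i_sin - i_x) >> 8)
                                  * i_sat) >> 8) + 128 );
        *pv = clip_uint8_vlc( (( ((i_v * i_cos - i_u * i_sin - i_y) >> 8)
                                  * i_sat) >> 8) + 128 );
    }

The SSE4.1 paths keep the clipping behaviour used when i_sat > 256; the SSE2
path handles the unclipped i_sat <= 256 case.
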
diff --git a/modules/video_filter/adjust.c b/modules/video_filter/adjust.c
index db5e2be..0aa261d 100644
--- a/modules/video_filter/adjust.c
+++ b/modules/video_filter/adjust.c
@@ -39,6 +39,9 @@
#include <vlc_filter.h>
#include "filter_picture.h"
+#include <vlc_cpu.h>
+#include "adjust_asm.h"
+
#ifndef M_PI
# define M_PI 3.14159265358979323846
#endif
@@ -335,7 +338,102 @@ static picture_t *FilterPlanar( filter_t *p_filter, picture_t *p_pic )
i_x = ( cos(f_hue) + sin(f_hue) ) * 32768;
i_y = ( cos(f_hue) - sin(f_hue) ) * 32768;
+#if defined(CAN_COMPILE_SSE4_1)
+ if ( vlc_CPU() & CPU_CAPABILITY_SSE4_1 && i_sat > 256 )
+ {
+#define WRITE_UV_CLIP() \
+ i_u = *p_in++ ; i_v = *p_in_v++ ; \
+ *p_out++ = clip_uint8_vlc( (( ((i_u * i_cos + i_v * i_sin - i_x) >> 8) \
+ * i_sat) >> 8) + 128); \
+ *p_out_v++ = clip_uint8_vlc( (( ((i_v * i_cos - i_u * i_sin - i_y) >> 8) \
+ * i_sat) >> 8) + 128)
+
+ uint8_t i_u, i_v;
+
+ WRITE_UV_CLIP_PLANAR_PREPARE;
+
+ for( ; p_in < p_in_end ; )
+ {
+ p_line_end = p_in + p_pic->p[U_PLANE].i_visible_pitch - 8;
+
+ for( ; p_in < p_line_end ; )
+ {
+ /* Do 4 pixels at a time */
+ WRITE_UV_CLIP_PLANAR_SSE4_1();
+ }
+ p_line_end += 8;
+
+ for( ; p_in < p_line_end ; )
+ {
+ WRITE_UV_CLIP();
+ }
+
+ p_in += p_pic->p[U_PLANE].i_pitch
+ - p_pic->p[U_PLANE].i_visible_pitch;
+ p_in_v += p_pic->p[V_PLANE].i_pitch
+ - p_pic->p[V_PLANE].i_visible_pitch;
+ p_out += p_outpic->p[U_PLANE].i_pitch
+ - p_outpic->p[U_PLANE].i_visible_pitch;
+ p_out_v += p_outpic->p[V_PLANE].i_pitch
+ - p_outpic->p[V_PLANE].i_visible_pitch;
+ }
+
+ WRITE_UV_SSE_FINISH;
+#undef WRITE_UV_CLIP
+ }
+ else
+#endif
+#if defined(CAN_COMPILE_SSE2)
+ if ( vlc_CPU() & CPU_CAPABILITY_SSE2 && i_sat <= 256 )
+ {
+#define WRITE_UV() \
+ i_u = *p_in++ ; i_v = *p_in_v++ ; \
+ *p_out++ = (( ((i_u * i_cos + i_v * i_sin - i_x) >> 8) \
+ * i_sat) >> 8) + 128; \
+ *p_out_v++ = (( ((i_v * i_cos - i_u * i_sin - i_y) >> 8) \
+ * i_sat) >> 8) + 128
+
+ uint8_t i_u, i_v;
+
+ int i_sat_sub = 0x10001 * i_sat; // i_sat duplicated into both 16-bit halves
+
+ WRITE_UV_PLANAR_PREPARE;
+
+ for( ; p_in < p_in_end ; )
+ {
+ p_line_end = p_in + p_pic->p[U_PLANE].i_visible_pitch - 8;
+
+ for( ; p_in < p_line_end ; )
+ {
+ /* Do 8 pixels at a time */
+ WRITE_UV_PLANAR_SSE2();
+ }
+
+ p_line_end += 8;
+
+ for( ; p_in < p_line_end ; )
+ {
+ WRITE_UV();
+ }
+
+ p_in += p_pic->p[U_PLANE].i_pitch
+ - p_pic->p[U_PLANE].i_visible_pitch;
+ p_in_v += p_pic->p[V_PLANE].i_pitch
+ - p_pic->p[V_PLANE].i_visible_pitch;
+ p_out += p_outpic->p[U_PLANE].i_pitch
+ - p_outpic->p[U_PLANE].i_visible_pitch;
+ p_out_v += p_outpic->p[V_PLANE].i_pitch
+ - p_outpic->p[V_PLANE].i_visible_pitch;
+ }
+
+ WRITE_UV_SSE_FINISH;
+#undef WRITE_UV
+ }
+ else
+#endif
if ( i_sat > 256 )
{
#define WRITE_UV_CLIP() \
@@ -574,6 +672,96 @@ static picture_t *FilterPacked( filter_t *p_filter, picture_t *p_pic )
i_x = ( cos(f_hue) + sin(f_hue) ) * 32768;
i_y = ( cos(f_hue) - sin(f_hue) ) * 32768;
+#if defined(CAN_COMPILE_SSE4_1)
+ if ( vlc_CPU() & CPU_CAPABILITY_SSE4_1 && i_sat > 256 )
+ {
+#define WRITE_UV_CLIP() \
+ i_u = *p_in; p_in += 4; i_v = *p_in_v; p_in_v += 4; \
+ *p_out = clip_uint8_vlc( (( ((i_u * i_cos + i_v * i_sin - i_x) >> 8) \
+ * i_sat) >> 8) + 128); \
+ p_out += 4; \
+ *p_out_v = clip_uint8_vlc( (( ((i_v * i_cos - i_u * i_sin - i_y) >> 8) \
+ * i_sat) >> 8) + 128); \
+ p_out_v += 4
+
+ uint8_t i_u, i_v;
+
+ WRITE_UV_CLIP_PACKED_PREPARE;
+
+ for( ; p_in < p_in_end ; )
+ {
+ p_line_end = p_in + i_visible_pitch - 8 * 4;
+
+ for( ; p_in < p_line_end ; )
+ {
+ /* Do 8 pixels at a time */
+ WRITE_UV_CLIP_PACKED_SSE4_1();
+ }
+
+ p_line_end += 8 * 4;
+
+ for( ; p_in < p_line_end ; )
+ {
+ WRITE_UV_CLIP();
+ }
+
+ p_in += i_pitch - i_visible_pitch;
+ p_in_v += i_pitch - i_visible_pitch;
+ p_out += i_pitch - i_visible_pitch;
+ p_out_v += i_pitch - i_visible_pitch;
+ }
+
+ WRITE_UV_SSE_FINISH;
+#undef WRITE_UV_CLIP
+ }
+ else
+ if ( vlc_CPU() & CPU_CAPABILITY_SSE4_1 && i_sat <= 256 )
+ {
+#define WRITE_UV() \
+ i_u = *p_in; p_in += 4; i_v = *p_in_v; p_in_v += 4; \
+ *p_out = (( ((i_u * i_cos + i_v * i_sin - i_x) >> 8) \
+ * i_sat) >> 8) + 128; \
+ p_out += 4; \
+ *p_out_v = (( ((i_v * i_cos - i_u * i_sin - i_y) >> 8) \
+ * i_sat) >> 8) + 128; \
+ p_out_v += 4
+
+ uint8_t i_u, i_v;
+
+ WRITE_UV_PACKED_PREPARE;
+
+ for( ; p_in < p_in_end ; )
+ {
+ p_line_end = p_in + i_visible_pitch - 8 * 4;
+
+ for( ; p_in < p_line_end ; )
+ {
+ /* Do 8 pixels at a time */
+ WRITE_UV_PACKED_SSE4_1();
+ WRITE_UV_PACKED_SSE4_1();
+ }
+
+ p_line_end += 8 * 4;
+
+ for( ; p_in < p_line_end ; )
+ {
+ WRITE_UV();
+ }
+
+ p_in += i_pitch - i_visible_pitch;
+ p_in_v += i_pitch - i_visible_pitch;
+ p_out += i_pitch - i_visible_pitch;
+ p_out_v += i_pitch - i_visible_pitch;
+ }
+
+ WRITE_UV_SSE_FINISH;
+#undef WRITE_UV
+ }
+ else
+#endif
if ( i_sat > 256 )
{
#define WRITE_UV_CLIP() \
diff --git a/modules/video_filter/adjust_asm.h b/modules/video_filter/adjust_asm.h
new file mode 100644
index 0000000..942592d
--- /dev/null
+++ b/modules/video_filter/adjust_asm.h
@@ -0,0 +1,731 @@
+/*****************************************************************************
+ * adjust_asm.h: SSE extension to adjust filter
+ *****************************************************************************
+ * Copyright (C) 2000-2011 the VideoLAN team
+ *
+ * Authors: Martin Briza <gamajun at seznam.cz>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ *****************************************************************************/
+
+/*****************************************************************************
+ * Macros to save the XMM registers to a static buffer and restore them later,
+ * so the inline assembly does not clobber register state in the calling
+ * function
+ * - they work as a pair, wrapped by the *_PREPARE and *_FINISH macros below
+ *****************************************************************************/
+#ifdef __x86_64__
+#define STORE_XMM_REGISTERS \
+ static uint64_t xmm_temporary_storage[32] __attribute__((aligned(16))); \
+ __asm__ volatile( \
+ "movdqa %%xmm0, (%[x])\n" \
+ "movdqa %%xmm1, 16(%[x])\n" \
+ "movdqa %%xmm2, 32(%[x])\n" \
+ "movdqa %%xmm3, 48(%[x])\n" \
+ "movdqa %%xmm4, 64(%[x])\n" \
+ "movdqa %%xmm5, 80(%[x])\n" \
+ "movdqa %%xmm6, 96(%[x])\n" \
+ "movdqa %%xmm7, 112(%[x])\n" \
+ "movdqa %%xmm8, 128(%[x])\n" \
+ "movdqa %%xmm9, 144(%[x])\n" \
+ "movdqa %%xmm10, 160(%[x])\n" \
+ "movdqa %%xmm11, 176(%[x])\n" \
+ "movdqa %%xmm12, 192(%[x])\n" \
+ "movdqa %%xmm13, 208(%[x])\n" \
+ "movdqa %%xmm14, 224(%[x])\n" \
+ "movdqa %%xmm15, 240(%[x])\n" \
+ : \
+ : [x] "r" (xmm_temporary_storage) \
+ : "memory" \
+ )
+#elif defined(__i386__)
+#define STORE_XMM_REGISTERS \
+ static uint64_t xmm_temporary_storage[16] __attribute__((aligned(16))); \
+ __asm__ volatile( \
+ "movdqa %%xmm0, (%[x])\n" \
+ "movdqa %%xmm1, 16(%[x])\n" \
+ "movdqa %%xmm2, 32(%[x])\n" \
+ "movdqa %%xmm3, 48(%[x])\n" \
+ "movdqa %%xmm4, 64(%[x])\n" \
+ "movdqa %%xmm5, 80(%[x])\n" \
+ "movdqa %%xmm6, 96(%[x])\n" \
+ "movdqa %%xmm7, 112(%[x])\n" \
+ : \
+ : [x] "r" (xmm_temporary_storage) \
+ : "memory" \
+ )
+#endif
+
+#ifdef __x86_64__
+#define RESTORE_XMM_REGISTERS \
+ __asm__ volatile( \
+ "movdqa (%[x]), %%xmm0\n" \
+ "movdqa 16(%[x]), %%xmm1\n" \
+ "movdqa 32(%[x]), %%xmm2\n" \
+ "movdqa 48(%[x]), %%xmm3\n" \
+ "movdqa 64(%[x]), %%xmm4\n" \
+ "movdqa 80(%[x]), %%xmm5\n" \
+ "movdqa 96(%[x]), %%xmm6\n" \
+ "movdqa 112(%[x]), %%xmm7\n" \
+ "movdqa 128(%[x]), %%xmm8\n" \
+ "movdqa 144(%[x]), %%xmm9\n" \
+ "movdqa 160(%[x]), %%xmm10\n" \
+ "movdqa 176(%[x]), %%xmm11\n" \
+ "movdqa 192(%[x]), %%xmm12\n" \
+ "movdqa 208(%[x]), %%xmm13\n" \
+ "movdqa 224(%[x]), %%xmm14\n" \
+ "movdqa 240(%[x]), %%xmm15\n" \
+ : \
+ : [x] "r" (xmm_temporary_storage) \
+ : "memory" \
+ )
+#elif defined(__i386__)
+#define RESTORE_XMM_REGISTERS \
+ __asm__ volatile( \
+ "movdqa (%[x]), %%xmm0\n" \
+ "movdqa 16(%[x]), %%xmm1\n" \
+ "movdqa 32(%[x]), %%xmm2\n" \
+ "movdqa 48(%[x]), %%xmm3\n" \
+ "movdqa 64(%[x]), %%xmm4\n" \
+ "movdqa 80(%[x]), %%xmm5\n" \
+ "movdqa 96(%[x]), %%xmm6\n" \
+ "movdqa 112(%[x]), %%xmm7\n" \
+ : \
+ : [x] "r" (xmm_temporary_storage) \
+ : "memory" \
+ )
+#endif
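+
+/* Illustrative usage of the pair above (they are wrapped by the *_PREPARE and
+ * *_FINISH macros at the end of this file):
+ *
+ *     STORE_XMM_REGISTERS;    // save the caller's xmm0-xmm7 (xmm0-xmm15 on x86_64)
+ *     ... inner SSE loop clobbering the XMM registers ...
+ *     RESTORE_XMM_REGISTERS;  // put the saved contents back
+ */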
+
+/*****************************************************************************
+ * Load 4 U bytes into xmm0 and 4 V bytes into xmm1 (planar layout),
+ * zero-extended to 32-bit lanes; assumes xmm7 == 0
+ *****************************************************************************/
+static inline void write_uv_load_planaryuv_sse2( uint8_t *p_in, uint8_t *p_in_v )
+{
+#if defined(CAN_COMPILE_SSE2)
+ __asm__ volatile(
+ "movd (%[in]), %%xmm0\n"
+ "movd (%[in_v]), %%xmm1\n"
+ "punpcklbw %%xmm7, %%xmm0\n"
+ "punpcklwd %%xmm7, %%xmm0\n"
+ "punpcklbw %%xmm7, %%xmm1\n"
+ "punpcklwd %%xmm7, %%xmm1\n"
+ :
+ : [in] "r" (p_in),
+ [in_v] "r" (p_in_v)
+ : "memory" );
+#endif
+}
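+
+/* After the load above (and assuming xmm7 == 0), each register holds four
+ * samples zero-extended to 32-bit lanes, e.g. for input bytes u0..u3:
+ *     xmm0 = | 0,0,0,u3 | 0,0,0,u2 | 0,0,0,u1 | 0,0,0,u0 |
+ * and likewise v0..v3 in xmm1. */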
+
+
+static inline void write_uv_load_planaryuv_first(uint8_t *p_in, uint8_t *p_in_v)
+{
+ __asm__ volatile (
+#if defined(CAN_COMPILE_SSE2)
+ "movd (%[in]), %%xmm1\n"
+ "movd (%[in_v]), %%xmm2\n"
+ "punpcklbw %%xmm0, %%xmm1\n"
+ "punpcklbw %%xmm0, %%xmm2\n"
+ "punpcklwd %%xmm2, %%xmm1\n"
+ /////////////////////////////////////////
+ "movd 4(%[in]), %%xmm2\n"
+ "movd 4(%[in_v]), %%xmm5\n"
+ "punpcklbw %%xmm0, %%xmm2\n"
+ "punpcklbw %%xmm0, %%xmm5\n"
+ "punpcklwd %%xmm5, %%xmm2\n"
+#if defined(__x86_64__)
+ /////////////////////////////////////////
+ "movq (%[in]), %%xmm5\n"
+ "movd (%[in_v]), %%xmm3\n"
+ "movd 4(%[in_v]), %%xmm4\n"
+ "punpcklbw %%xmm0, %%xmm5\n"
+ "pandn %%xmm12, %%xmm5\n" // invert U (to be subtracted)
+ "punpcklbw %%xmm0, %%xmm3\n"
+ "paddw %%xmm13, %%xmm5\n" // add 1
+ "punpcklbw %%xmm0, %%xmm4\n"
+ "punpcklwd %%xmm5, %%xmm3\n"
+ "punpckhqdq %%xmm5, %%xmm5\n"
+ "punpcklwd %%xmm5, %%xmm4\n"
+#endif
+#endif
+ :
+ : [in] "r" ( p_in ),
+ [in_v] "r" ( p_in_v )
+ : "eax", "memory" );
+}
+
+#if defined(__i386__)
+static inline void write_uv_load_planaryuv_second(uint8_t *p_in, uint8_t *p_in_v)
+{
+ __asm__ volatile (
+ "pcmpeqb %%xmm6, %%xmm6\n"
+ "movq (%[in]), %%xmm5\n"
+ "movd (%[in_v]), %%xmm1\n"
+ "movd 4(%[in_v]), %%xmm2\n"
+ "punpcklbw %%xmm0, %%xmm5\n"
+ "pandn %%xmm6, %%xmm5\n"
+ "punpcklbw %%xmm0, %%xmm1\n"
+ "punpcklbw %%xmm0, %%xmm2\n"
+ "paddw %%xmm6, %%xmm5\n"
+ "psrlw $15, %%xmm6\n"
+ "punpcklwd %%xmm5, %%xmm1\n"
+ "punpckhqdq %%xmm5, %%xmm5\n"
+ "punpcklwd %%xmm5, %%xmm2\n"
+
+ "psllw $7, %%xmm6\n"
+ :
+ : [in] "r" ( p_in ),
+ [in_v] "r" ( p_in_v )
+ : "eax", "memory" );
+}
+#endif
+
+/*****************************************************************************
+ * "Unclip" results stored in XMM0 and XMM3
+ * - keep only the low 8 bits of each dword in XMM0 and XMM3
+ * - without this, the packing instructions in the store step would saturate
+ *   the results instead of wrapping them
+ *****************************************************************************/
+static inline void write_uv_unclip_planaryuv_sse2 ()
+{
+#if defined(CAN_COMPILE_SSE2)
+ __asm__ volatile(
+ "pslld $24, %%xmm0\n" // 0xXXXXXXXX ->
+ "psrld $24, %%xmm0\n" // -> 0x000000XX
+ "pslld $24, %%xmm3\n"
+ "psrld $24, %%xmm3\n"
+ :: );
+#endif
+}
+
+/*****************************************************************************
+ * Pack and write PlanarYUV 4B content of XMM0 in p_out and XMM3 in p_out_v
+ *****************************************************************************/
+static inline void write_uv_store_planaryuv_sse2( uint8_t *p_out, uint8_t *p_out_v )
+{
+#if defined(CAN_COMPILE_SSE4_1) /* packusdw below is an SSE4.1 instruction */
+ __asm__ volatile(
+ "packusdw %%xmm7, %%xmm0\n" // pack and saturate (if there
+ "packuswb %%xmm7, %%xmm0\n" // is something to saturate)
+ "movd %%xmm0, (%[out])\n" // and store in destination
+ "packusdw %%xmm7, %%xmm3\n"
+ "packuswb %%xmm7, %%xmm3\n"
+ "movd %%xmm3, (%[out_v])\n"
+ :
+ : [out] "r" (p_out),
+ [out_v] "r" (p_out_v)
+ : "memory" );
+#endif
+}
+
+#if defined(__i386__) && defined(CAN_COMPILE_SSE2)
+static inline void write_uv_store_planaryuv_first( uint8_t *p_out )
+{
+ __asm__ volatile(
+ "packuswb %%xmm0, %%xmm1\n"
+ "movq %%xmm1, (%[out])\n"
+ :
+ : [out] "r" (p_out)
+ : "memory" );
+}
+#elif defined(__x86_64__) && defined(CAN_COMPILE_SSE2)
+static inline void write_uv_store_planaryuv_first( uint8_t *p_out,
+ uint8_t *p_out_v )
+{
+ __asm__ volatile(
+ "packuswb %%xmm0, %%xmm1\n"
+ "packuswb %%xmm0, %%xmm3\n"
+ "movq %%xmm1, (%[out])\n"
+ "movq %%xmm3, (%[out_v])\n"
+ :
+ : [out] "r" (p_out),
+ [out_v] "r" (p_out_v)
+ : "memory" );
+}
+#endif
+
+#if defined(__i386__) && defined(CAN_COMPILE_SSE2)
+static inline void write_uv_store_planaryuv_second( uint8_t *p_out_v )
+{
+ __asm__ volatile(
+ "packuswb %%xmm0, %%xmm1\n"
+ "movq %%xmm1, (%[out_v])\n"
+ :
+ : [out_v] "r" (p_out_v)
+ : "memory" );
+}
+#endif
+
+/*****************************************************************************
+ * Load packed YUV data: 4 U samples into xmm0 and 4 V samples into xmm1,
+ * taking every fourth byte of a 16-byte load
+ *****************************************************************************/
+static inline void write_uv_load_packedyuv_sse2( uint8_t *p_in, uint8_t *p_in_v )
+{
+#if defined(CAN_COMPILE_SSE2)
+ __asm__ volatile(
+ "movdqu (%[in]), %%xmm0\n"
+ "movdqu (%[in_v]), %%xmm1\n"
+ "pcmpeqd %%xmm2, %%xmm2\n" // 0xFFFFFFFF
+ "psrld $24, %%xmm2\n" // 0x000000FF
+ "pand %%xmm2, %%xmm0\n" // mask out unnecessary data
+ "pand %%xmm2, %%xmm1\n"
+ :
+ : [in] "r" (p_in),
+ [in_v] "r" (p_in_v)
+ : "memory");
+#endif
+}
+
+/*****************************************************************************
+ * Clip (i.e. saturate) the dword contents of XMM0 and XMM3 to the 0-255 range
+ *****************************************************************************/
+static inline void write_uv_clip_packedyuv_sse2()
+{
+#if defined(CAN_COMPILE_SSE4_1) /* packusdw below is an SSE4.1 instruction */
+ __asm__ volatile(
+ "packusdw %%xmm7, %%xmm0\n"
+ "packuswb %%xmm7, %%xmm0\n"
+ "punpcklbw %%xmm7, %%xmm0\n"
+ "punpcklwd %%xmm7, %%xmm0\n"
+ "packusdw %%xmm7, %%xmm3\n"
+ "packuswb %%xmm7, %%xmm3\n"
+ "punpcklbw %%xmm7, %%xmm3\n"
+ "punpcklwd %%xmm7, %%xmm3\n"
+ ::);
+#endif
+}
+
+/*****************************************************************************
+ * Clear the upper 24 bits of every dword stored in XMM0 and XMM3
+ *****************************************************************************/
+static inline void write_uv_clear_packedyuv_sse2()
+{
+#if defined(CAN_COMPILE_SSE2)
+ __asm__ volatile(
+ "pcmpeqd %%xmm2, %%xmm2\n" // 0xFFFFFFFF
+ "psrld $24, %%xmm2\n" // 0x000000FF
+ "pand %%xmm2, %%xmm0\n" // mask out unnecessary data
+ "pand %%xmm2, %%xmm3\n"
+ ::);
+#endif
+}
+
+/*****************************************************************************
+ * Store packed U from XMM0 and packed V from XMM3 registers in p_out and
+ * p_out_v respectively
+ * - write_uv_clip_packedyuv_sse2() or write_uv_clear_packedyuv_sse2() must
+ *   have been called first
+ *****************************************************************************/
+static inline void write_uv_store_packedyuv_sse2( uint8_t *p_out, uint8_t *p_out_v )
+{
+#if defined(CAN_COMPILE_SSE2)
+ __asm__ volatile(
+
+ "movdqu (%[out]), %%xmm1\n"
+ "pcmpeqd %%xmm2, %%xmm2\n"
+ "pslld $8, %%xmm2\n"
+ "pand %%xmm1, %%xmm2\n"
+ "por %%xmm0, %%xmm2\n"
+ "movdqu %%xmm2, (%[out])\n"
+
+ "movdqu (%[out_v]), %%xmm1\n"
+ "pcmpeqd %%xmm2, %%xmm2\n"
+ "pslld $8, %%xmm2\n"
+ "pand %%xmm1, %%xmm2\n"
+ "por %%xmm3, %%xmm2\n"
+ "movdqu %%xmm2, (%[out_v])\n"
+ :
+ : [out] "r" (p_out),
+ [out_v] "r" (p_out_v)
+ : "memory" );
+#endif
+}
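+
+/* Note on the store above: the 0xFFFFFF00 per-dword mask keeps the three
+ * neighbouring bytes of the packed picture untouched, and only the low byte
+ * of each dword -- the recomputed U (resp. V) sample -- is replaced. */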
+
+/*****************************************************************************
+ * Basic write_uv function, used in both packed and planar formats
+ * - expects U stored in XMM0 and V in XMM1
+ * - the result is left in XMM0 (U) and XMM3 (V) afterwards
+ * - uses ALL 8 basic (in 32bit mode) XMM registers, clearing XMM7 to be used
+ * in packing and unpacking data
+ * p_out = (( ((i_u * i_cos + i_v * i_sin - i_x) >> 8) * i_sat) >> 8) + 128
+ * p_out_v = (( ((i_v * i_cos - i_u * i_sin - i_y) >> 8) * i_sat) >> 8) + 128
+ *****************************************************************************/
+static inline void write_uv_base_sse41( int i_x, int i_y, int i_cos,
+ int i_sin, int i_sat )
+{
+#if defined(CAN_COMPILE_SSE4_1)
+#if defined(__x86_64__)
+ __asm__ volatile(
+
+ "movdqu %%xmm0, %%xmm2\n"
+ "movdqu %%xmm1, %%xmm3\n"
+
+ "pmulld %%xmm8, %%xmm0\n"
+ "pmulld %%xmm9, %%xmm1\n"
+ "paddd %%xmm1, %%xmm0\n"
+ "psubd %%xmm10, %%xmm0\n"
+ "psrad $8, %%xmm0\n"
+ "pmulld %%xmm12, %%xmm0\n"
+ "psrad $8, %%xmm0\n"
+
+ "pcmpeqd %%xmm1, %%xmm1\n"
+ "psrld $31, %%xmm1\n"
+ "pslld $7, %%xmm1\n"
+ "paddd %%xmm1, %%xmm0\n"
+ // out_u stays in xmm0
+
+
+ "pmulld %%xmm8, %%xmm3\n"
+ "pmulld %%xmm9, %%xmm2\n"
+ "psubd %%xmm2, %%xmm3\n"
+ "psubd %%xmm11, %%xmm3\n"
+ "psrad $8, %%xmm3\n"
+ "pmulld %%xmm12, %%xmm3\n"
+ "psrad $8, %%xmm3\n"
+
+ "paddd %%xmm1, %%xmm3\n"
+
+ // out_v stays in xmm3
+ :
+ : [x] "r" (i_x),
+ [y] "r" (i_y),
+ [cos] "r" (i_cos),
+ [sin] "r" (i_sin),
+ [sat] "r" (i_sat)
+ : "eax", "memory" );
+#elif defined (__i386__)
+ __asm__ volatile(
+
+ "movdqu %%xmm0, %%xmm2\n"
+ "movdqu %%xmm1, %%xmm3\n"
+
+ "movd %[x], %%xmm7\n"
+ "pshufd $0, %%xmm7, %%xmm7\n"
+
+ "pmulld %%xmm4, %%xmm0\n"
+ "pmulld %%xmm5, %%xmm1\n"
+ "paddd %%xmm1, %%xmm0\n"
+ "psubd %%xmm7, %%xmm0\n"
+ "psrad $8, %%xmm0\n"
+ "pmulld %%xmm6, %%xmm0\n"
+ "psrad $8, %%xmm0\n"
+
+ "pcmpeqd %%xmm1, %%xmm1\n"
+ "psrld $31, %%xmm1\n"
+ "pslld $7, %%xmm1\n"
+ "paddd %%xmm1, %%xmm0\n"
+ // out_u stays in xmm0
+
+
+ "movd %[y], %%xmm7\n"
+ "pshufd $0, %%xmm7, %%xmm7\n"
+
+ "pmulld %%xmm4, %%xmm3\n"
+ "pmulld %%xmm5, %%xmm2\n"
+ "psubd %%xmm2, %%xmm3\n"
+ "psubd %%xmm7, %%xmm3\n"
+ "psrad $8, %%xmm3\n"
+ "pmulld %%xmm6, %%xmm3\n"
+ "psrad $8, %%xmm3\n"
+
+ "paddd %%xmm1, %%xmm3\n"
+
+ // out_v stays in xmm3
+ "pxor %%xmm7, %%xmm7\n"
+ :
+ : [x] "r" (i_x),
+ [y] "r" (i_y),
+ [cos] "r" (i_cos),
+ [sin] "r" (i_sin),
+ [sat] "r" (i_sat)
+ : "eax", "memory" );
+#endif
+#endif
+}
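+
+/* Documentation only: per 32-bit lane, the block above is equivalent to the
+ * following intrinsics (names as in <smmintrin.h>, not used by this file):
+ *     u = _mm_srai_epi32( _mm_sub_epi32( _mm_add_epi32(
+ *             _mm_mullo_epi32( u, cos ), _mm_mullo_epi32( v, sin ) ), x ), 8 );
+ *     u = _mm_add_epi32( _mm_srai_epi32( _mm_mullo_epi32( u, sat ), 8 ),
+ *                        _mm_set1_epi32( 128 ) );
+ * and symmetrically for v with ( v*cos - u*sin - y ). */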
+
+#if defined(__i386__)
+static inline void write_uv_base_first( int i_x )
+{
+ __asm__ volatile(
+ ////////////////////////////////////////////
+ "pmaddwd %%xmm3, %%xmm1\n"
+ "pmaddwd %%xmm3, %%xmm2\n"
+ "movd %[x], %%xmm5\n"
+ "pshufd $0, %%xmm5, %%xmm5\n"
+ "psubd %%xmm5, %%xmm1\n"
+ "psubd %%xmm5, %%xmm2\n"
+ "pslld $8, %%xmm1\n"
+ "psrld $16, %%xmm1\n"
+ "pslld $8, %%xmm2\n"
+ "psrld $16, %%xmm2\n"
+ "mov $0x8000, %%eax\n"
+ "movd %%eax, %%xmm5\n"
+ "pshufd $0, %%xmm5, %%xmm5\n"
+ "psubd %%xmm5, %%xmm2\n"
+ "psubd %%xmm5, %%xmm1\n"
+ "packssdw %%xmm2, %%xmm1\n"
+ "pshuflw $0, %%xmm5, %%xmm5\n"
+ "pshufhw $0, %%xmm5, %%xmm5\n"
+ "paddw %%xmm5, %%xmm1\n"
+ "pmullw %%xmm4, %%xmm1\n"
+ "psraw $8, %%xmm1\n"
+ "paddw %%xmm6, %%xmm1\n"
+ :
+ : [x] "r" (i_x)
+ : "memory" );
+}
+#elif defined(__x86_64__)
+static inline void write_uv_base_first( void )
+{
+ __asm__ volatile(
+ "pmaddwd %%xmm6, %%xmm1\n"
+ "pmaddwd %%xmm6, %%xmm2\n"
+ "pmaddwd %%xmm6, %%xmm3\n"
+ "pmaddwd %%xmm6, %%xmm4\n"
+ "psubd %%xmm10, %%xmm1\n"
+ "psubd %%xmm10, %%xmm2\n"
+ "psubd %%xmm11, %%xmm3\n"
+ "psubd %%xmm11, %%xmm4\n"
+ "pslld $8, %%xmm1\n"
+ "pslld $8, %%xmm2\n"
+ "pslld $8, %%xmm3\n"
+ "pslld $8, %%xmm4\n"
+ "psrld $16, %%xmm1\n"
+ "psrld $16, %%xmm2\n"
+ "psrld $16, %%xmm3\n"
+ "psrld $16, %%xmm4\n"
+ "psubd %%xmm14, %%xmm1\n"
+ "psubd %%xmm14, %%xmm2\n"
+ "psubd %%xmm14, %%xmm3\n"
+ "psubd %%xmm14, %%xmm4\n"
+ "packssdw %%xmm2, %%xmm1\n"
+ "packssdw %%xmm4, %%xmm3\n"
+ "paddw %%xmm15, %%xmm1\n"
+ "paddw %%xmm15, %%xmm3\n"
+ "pmullw %%xmm7, %%xmm1\n"
+ "pmullw %%xmm7, %%xmm3\n"
+ "psraw $8, %%xmm1\n"
+ "psraw $8, %%xmm3\n"
+ "paddw %%xmm9, %%xmm1\n"
+ "paddw %%xmm9, %%xmm3\n"
+ :
+ :
+ : "memory" );
+}
+#endif
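+
+/* Explanatory note: the write_uv_base_first()/write_uv_base_second() helpers
+ * rely on the layout prepared by write_uv_prepare_planar() and the loaders
+ * above: interleaved (u, v) word pairs (and negated-U pairs for the V term)
+ * are multiplied against (cos, sin) word pairs, so a single pmaddwd yields
+ * u*cos + v*sin (resp. v*cos - u*sin) per dword; the 0x8000 bias applied
+ * around packssdw keeps the intermediate values in signed 16-bit range. */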
+
+#if defined(__i386__) && defined(CAN_COMPILE_SSE2)
+static inline void write_uv_base_second( int i_y )
+{
+ __asm__ volatile(
+ "pmaddwd %%xmm3, %%xmm1\n"
+ "pmaddwd %%xmm3, %%xmm2\n"
+ "movd %[y], %%xmm5\n"
+ "pshufd $0, %%xmm5, %%xmm5\n"
+ "psubd %%xmm5, %%xmm1\n"
+ "psubd %%xmm5, %%xmm2\n"
+ "pslld $8, %%xmm1\n"
+ "psrld $16, %%xmm1\n"
+ "pslld $8, %%xmm2\n"
+ "psrld $16, %%xmm2\n"
+
+ "mov $0x8000, %%eax\n"
+ "movd %%eax, %%xmm5\n"
+ "pshufd $0, %%xmm5, %%xmm5\n"
+ "psubd %%xmm5, %%xmm2\n"
+ "psubd %%xmm5, %%xmm1\n"
+ "packssdw %%xmm2, %%xmm1\n"
+ "pshuflw $0, %%xmm5, %%xmm5\n"
+ "pshufhw $0, %%xmm5, %%xmm5\n"
+ "paddw %%xmm5, %%xmm1\n"
+ "pmullw %%xmm4, %%xmm1\n"
+ "psraw $8, %%xmm1\n"
+ "paddw %%xmm6, %%xmm1\n"
+ :
+ : [y] "r" (i_y)
+ : "memory" );
+}
+#endif
+
+/*****************************************************************************
+ * Must be called before the helpers above are used in the adjust filter.
+ * Clears the XMM7 register and stores the values used in every computation.
+ * WARNING! On 32-bit systems it takes the XMM4, XMM5 and XMM6 registers for
+ * constants; on 64-bit systems XMM8-XMM12 are filled
+ *****************************************************************************/
+static inline void write_uv_prepare_sse2( int i_x, int i_y, int i_cos, int i_sin,
+ int i_sat )
+{
+#if defined(CAN_COMPILE_SSE2)
+#if defined(__x86_64__)
+ __asm__ volatile(
+ "pxor %%xmm7, %%xmm7\n"
+ "movd %[cos], %%xmm8\n"
+ "movd %[sin], %%xmm9\n"
+ "movd %[x], %%xmm10\n"
+ "movd %[y], %%xmm11\n"
+ "movd %[sat], %%xmm12\n"
+ "pshufd $0, %%xmm8, %%xmm8\n"
+ "pshufd $0, %%xmm9, %%xmm9\n"
+ "pshufd $0, %%xmm10, %%xmm10\n"
+ "pshufd $0, %%xmm11, %%xmm11\n"
+ "pshufd $0, %%xmm12, %%xmm12\n"
+ :
+ : [x] "r" (i_x),
+ [y] "r" (i_y),
+ [cos] "r" (i_cos),
+ [sin] "r" (i_sin),
+ [sat] "r" (i_sat)
+ : "eax", "memory" );
+#elif defined (__i386__)
+ __asm__ volatile(
+ "pxor %%xmm7, %%xmm7\n"
+ "movd %[cos], %%xmm4\n"
+ "movd %[sin], %%xmm5\n"
+ "movd %[sat], %%xmm6\n"
+ "pshufd $0, %%xmm4, %%xmm4\n"
+ "pshufd $0, %%xmm5, %%xmm5\n"
+ "pshufd $0, %%xmm6, %%xmm6\n"
+ :
+ : [cos] "r" (i_cos),
+ [sin] "r" (i_sin),
+ [sat] "r" (i_sat)
+ : "eax", "memory" );
+#endif
+#endif
+}
+
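+/*****************************************************************************
+ * Prepare constants for the word-based (SSE2) planar path. Clears xmm0 and
+ * broadcasts interleaved (cos, sin) word pairs, the duplicated saturation
+ * value and the +128 word constant into XMM registers: xmm3/xmm4/xmm6 on
+ * 32-bit systems, xmm6/xmm7/xmm9 on x86_64, where i_x, i_y, the all-ones/+1
+ * loader constants and the 0x8000 pack/unpack bias additionally go to
+ * xmm10/xmm11, xmm12/xmm13 and xmm14/xmm15.
+ *****************************************************************************/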
+static inline void write_uv_prepare_planar( int i_x, int i_y, int i_cos,
+ int i_sin, int i_sat_sub )
+{
+ __asm__ volatile(
+#if !defined(__x86_64__)
+ "pxor %%xmm0, %%xmm0\n"
+ "movd %[cos], %%xmm3\n"
+ "movd %[sin], %%xmm4\n"
+ "pslld $16, %%xmm4\n"
+ "pslld $16, %%xmm3\n"
+ "psrld $16, %%xmm3\n"
+ "pshufd $0, %%xmm3, %%xmm3\n"
+ "pshufd $0, %%xmm4, %%xmm4\n"
+ "por %%xmm4, %%xmm3\n"
+ "movd %[sat], %%xmm4\n"
+ "pshufd $0, %%xmm4, %%xmm4\n"
+ "pcmpeqb %%xmm6, %%xmm6\n"
+ "psrlw $15, %%xmm6\n"
+ "psllw $7, %%xmm6\n"
+#else
+ "pxor %%xmm0, %%xmm0\n"
+ "movd %[cos], %%xmm6\n"
+ "movd %[sin], %%xmm7\n"
+ "pslld $16, %%xmm7\n"
+ "pslld $16, %%xmm6\n"
+ "psrld $16, %%xmm6\n"
+ "pshufd $0, %%xmm6, %%xmm6\n"
+ "pshufd $0, %%xmm7, %%xmm7\n"
+ "por %%xmm7, %%xmm6\n"
+ "movd %[sat], %%xmm7\n"
+ "movd %[x], %%xmm10\n"
+ "movd %[y], %%xmm11\n"
+ "pshufd $0, %%xmm10, %%xmm10\n"
+ "pshufd $0, %%xmm11, %%xmm11\n"
+ "pshufd $0, %%xmm7, %%xmm7\n"
+
+ "pcmpeqb %%xmm12, %%xmm12\n"
+ "pcmpeqb %%xmm9, %%xmm9\n"
+ "pcmpeqb %%xmm13, %%xmm13\n"
+ "psrlw $15, %%xmm9\n"
+ "psllw $7, %%xmm9\n"
+ "psrlw $15, %%xmm13\n"
+ "mov $0x8000, %%eax\n"
+ "movd %%eax, %%xmm14\n"
+ "pshufd $0, %%xmm14, %%xmm14\n"
+ "mov $0x80008000, %%eax\n"
+ "movd %%eax, %%xmm15\n"
+ "pshufd $0, %%xmm15, %%xmm15\n"
+#endif
+ :
+ :
+ [x] "r" ( i_x ),
+ [y] "r" ( i_y ),
+ [sat] "r" ( i_sat_sub ),
+ [sin] "r" ( i_sin ),
+ [cos] "r" ( i_cos )
+ : "eax", "memory" );
+}
+
+/*****************************************************************************
+ * WRITE_UV preparation and finishing macros
+ * - they set up the XMM registers with the constants the adjust filter needs
+ *****************************************************************************/
+#define WRITE_UV_CLIP_PLANAR_PREPARE \
+ STORE_XMM_REGISTERS; \
+ write_uv_prepare_sse2( i_x, i_y, i_cos, i_sin, i_sat );
+
+#define WRITE_UV_PLANAR_PREPARE \
+ STORE_XMM_REGISTERS; \
+ write_uv_prepare_planar( i_x, i_y, i_cos, i_sin, i_sat_sub );
+
+#define WRITE_UV_CLIP_PACKED_PREPARE \
+ STORE_XMM_REGISTERS; \
+ write_uv_prepare_sse2( i_x, i_y, i_cos, i_sin, i_sat );
+
+#define WRITE_UV_PACKED_PREPARE \
+ STORE_XMM_REGISTERS; \
+ write_uv_prepare_sse2( i_x, i_y, i_cos, i_sin, i_sat );
+
+#define WRITE_UV_SSE_FINISH \
+ RESTORE_XMM_REGISTERS;
+
+/*****************************************************************************
+ * WRITE_UV macros declaration
+ * - they behave like WRITE_UV() and WRITE_UV_CLIP() and rely on the same
+ *   variable names in the calling function
+ *****************************************************************************/
+#define WRITE_UV_CLIP_PLANAR_SSE4_1() \
+ write_uv_load_planaryuv_sse2( p_in, p_in_v ); \
+ write_uv_base_sse41( i_x, i_y, i_cos, i_sin, i_sat ); \
+ write_uv_store_planaryuv_sse2( p_out, p_out_v ); \
+ p_out += 4; p_out_v += 4; p_in += 4; p_in_v += 4
+
+#define WRITE_UV_CLIP_PACKED_SSE4_1() \
+ write_uv_load_packedyuv_sse2( p_in, p_in_v ); \
+ write_uv_base_sse41( i_x, i_y, i_cos, i_sin, i_sat ); \
+ write_uv_clip_packedyuv_sse2(); \
+ write_uv_store_packedyuv_sse2( p_out, p_out_v ); \
+ p_out += 4; p_out_v += 4; p_in += 4; p_in_v += 4
+
+#define WRITE_UV_PACKED_SSE4_1() \
+ write_uv_load_packedyuv_sse2( p_in, p_in_v ); \
+ write_uv_base_sse41( i_x, i_y, i_cos, i_sin, i_sat ); \
+ write_uv_clear_packedyuv_sse2(); \
+ write_uv_store_packedyuv_sse2( p_out, p_out_v ); \
+ p_out += 4; p_out_v += 4; p_in += 4; p_in_v += 4
+
+#ifdef __x86_64__
+#define WRITE_UV_PLANAR_SSE2() \
+ write_uv_load_planaryuv_first( p_in, p_in_v); \
+ write_uv_base_first(); \
+ write_uv_store_planaryuv_first( p_out, p_out_v ); \
+ p_in += 8; p_in_v += 8; p_out += 8; p_out_v += 8
+#endif
+
+#ifdef __i386__
+#define WRITE_UV_PLANAR_SSE2() \
+ write_uv_load_planaryuv_first( p_in, p_in_v); \
+ write_uv_base_first( i_x ); \
+ write_uv_store_planaryuv_first( p_out ); \
+ write_uv_load_planaryuv_second( p_in, p_in_v); \
+ write_uv_base_second( i_y ); \
+ write_uv_store_planaryuv_second( p_out_v ); \
+ p_in += 8; p_in_v += 8; p_out += 8; p_out_v += 8
+#endif
--
1.7.1