[vlc-devel] [PATCH] MediaCodec: use SSE copy function from the avcodec module if the CPU supports it.

Fri Oct 25 15:03:54 CEST 2013

Move files copy.{c,h} to modules/video_chroma/.
---
 modules/codec/Makefile.am                |  13 +-
 modules/codec/avcodec/copy.c             | 416 -------------------------------
 modules/codec/avcodec/copy.h             |  45 ----
 modules/codec/avcodec/dxva2.c            |   2 +-
 modules/codec/avcodec/vaapi.c            |   2 +-
 modules/codec/avcodec/vda.c              |   2 +-
 modules/codec/omxil/android_mediacodec.c |  10 +-
 modules/codec/omxil/omxil.c              |   2 +-
 modules/codec/omxil/omxil_utils.h        |  15 +-
 modules/codec/omxil/utils.c              |  52 +++-
 modules/video_chroma/copy.c              | 416 +++++++++++++++++++++++++++++++
 modules/video_chroma/copy.h              |  45 ++++
 12 files changed, 546 insertions(+), 474 deletions(-)
 delete mode 100644 modules/codec/avcodec/copy.c
 delete mode 100644 modules/codec/avcodec/copy.h
 create mode 100644 modules/video_chroma/copy.c
 create mode 100644 modules/video_chroma/copy.h

diff --git a/modules/codec/Makefile.am b/modules/codec/Makefile.am
index e65945a..ef89f64 100644
--- a/modules/codec/Makefile.am
+++ b/modules/codec/Makefile.am
@@ -263,7 +263,7 @@ endif
 ### avcodec hardware acceleration ###
 
 libvaapi_plugin_la_SOURCES = \
-	codec/avcodec/copy.c codec/avcodec/copy.h \
+	video_chroma/copy.c video_chroma/copy.h \
 	codec/avcodec/vaapi.c
 libvaapi_plugin_la_CFLAGS = $(AM_CFLAGS) \
 	$(LIBVA_CFLAGS) $(X_CFLAGS) $(AVCODEC_CFLAGS)
@@ -273,7 +273,7 @@ codec_LTLIBRARIES += libvaapi_plugin.la
 endif
 
 libdxva2_plugin_la_SOURCES = \
-	codec/avcodec/copy.c codec/avcodec/copy.h \
+	video_chroma/copy.c video_chroma/copy.h \
 	codec/avcodec/dxva2.c
 libdxva2_plugin_la_LIBADD = -lole32 -lshlwapi -luuid
 if HAVE_AVCODEC_DXVA2
@@ -281,7 +281,7 @@ codec_LTLIBRARIES += libdxva2_plugin.la
 endif
 
 libvda_plugin_la_SOURCES = \
-	codec/avcodec/copy.c codec/avcodec/copy.h \
+	video_chroma/copy.c video_chroma/copy.h \
 	codec/avcodec/vda.c
 libvda_plugin_la_CFLAGS = $(AM_CFLAGS) $(AVCODEC_CFLAGS)
 libvda_plugin_la_LDFLAGS = -Wl,-framework,CoreFoundation,-framework,VideoDecodeAcceleration,-framework,QuartzCore
@@ -317,12 +317,13 @@ libomxil_plugin_la_SOURCES = \
 	codec/omxil/utils.c codec/omxil/omxil_utils.h \
 	codec/h264_nal.h \
 	codec/omxil/qcom.c codec/omxil/qcom.h \
-	codec/omxil/omxil.c codec/omxil/omxil.h codec/omxil/omxil_core.c codec/omxil/omxil_core.h
+	codec/omxil/omxil.c codec/omxil/omxil.h codec/omxil/omxil_core.c codec/omxil/omxil_core.h \
+	video_chroma/copy.c
 libomxil_plugin_la_CPPFLAGS = $(AM_CPPFLAGS) -I$(srcdir)/codec/omxil $(CFLAGS_omxil)
 libomxil_plugin_la_LIBADD = $(LIBDL)
 libomxil_plugin_la_LDFLAGS = $(AM_LDFLAGS) -rpath '$(codecdir)'
 
-libomxil_vout_plugin_la_SOURCES = codec/omxil/vout.c codec/omxil/omxil_core.c codec/omxil/utils.c codec/omxil/qcom.c
+libomxil_vout_plugin_la_SOURCES = codec/omxil/vout.c codec/omxil/omxil_core.c codec/omxil/utils.c codec/omxil/qcom.c video_chroma/copy.c
 libomxil_vout_plugin_la_CPPFLAGS = $(AM_CPPFLAGS) -I$(srcdir)/codec/omxil $(CFLAGS_omxil_vout)
 libomxil_vout_plugin_la_LIBADD = $(LIBDL)
 libomxil_vout_plugin_la_LDFLAGS = $(AM_LDFLAGS) -rpath '$(codecdir)'
@@ -332,7 +333,7 @@ libiomx_plugin_la_CPPFLAGS = $(libomxil_plugin_la_CPPFLAGS) -DUSE_IOMX
 libiomx_plugin_la_LIBADD = $(libomxil_plugin_la_LIBADD)
 
 libmediacodec_plugin_la_CPPFLAGS = $(AM_CPPFLAGS) -I$(srcdir)/codec/omxil
-libmediacodec_plugin_la_SOURCES = codec/omxil/android_mediacodec.c codec/omxil/utils.c
+libmediacodec_plugin_la_SOURCES = codec/omxil/android_mediacodec.c codec/omxil/utils.c video_chroma/copy.c
 
 codec_LTLIBRARIES += $(LTLIBomxil) $(LTLIBomxil_vout)
 EXTRA_LTLIBRARIES += libomxil_plugin.la libomxil_vout_plugin.la
diff --git a/modules/codec/avcodec/copy.c b/modules/codec/avcodec/copy.c
deleted file mode 100644
index e0823aa..0000000
--- a/modules/codec/avcodec/copy.c
+++ /dev/null
@@ -1,416 +0,0 @@
-/*****************************************************************************
- * copy.c: Fast YV12/NV12 copy
- *****************************************************************************
- * Copyright (C) 2010 Laurent Aimar
- * $Id$
- *
- * Authors: Laurent Aimar <fenrir _AT_ videolan _DOT_ org>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this program; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
- *****************************************************************************/
-
-#ifdef HAVE_CONFIG_H
-# include "config.h"
-#endif
-
-#include <vlc_common.h>
-#include <vlc_picture.h>
-#include <vlc_cpu.h>
-#include <assert.h>
-
-#include "copy.h"
-
-int CopyInitCache(copy_cache_t *cache, unsigned width)
-{
-#ifdef CAN_COMPILE_SSE2
-    cache->size = __MAX((width + 0x0f) & ~ 0x0f, 4096);
-    cache->buffer = vlc_memalign(16, cache->size);
-    if (!cache->buffer)
-        return VLC_EGENERIC;
-#else
-    (void) cache; (void) width;
-#endif
-    return VLC_SUCCESS;
-}
-
-void CopyCleanCache(copy_cache_t *cache)
-{
-#ifdef CAN_COMPILE_SSE2
-    vlc_free(cache->buffer);
-    cache->buffer = NULL;
-    cache->size   = 0;
-#else
-    (void) cache;
-#endif
-}
-
-#ifdef CAN_COMPILE_SSE2
-/* Copy 64 bytes from srcp to dstp loading data with the SSE>=2 instruction
- * load and storing data with the SSE>=2 instruction store.
- */
-#define COPY64(dstp, srcp, load, store) \
-    asm volatile (                      \
-        load "  0(%[src]), %%xmm1\n"    \
-        load " 16(%[src]), %%xmm2\n"    \
-        load " 32(%[src]), %%xmm3\n"    \
-        load " 48(%[src]), %%xmm4\n"    \
-        store " %%xmm1,    0(%[dst])\n" \
-        store " %%xmm2,   16(%[dst])\n" \
-        store " %%xmm3,   32(%[dst])\n" \
-        store " %%xmm4,   48(%[dst])\n" \
-        : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1", "xmm2", "xmm3", "xmm4")
-
-#ifndef __SSE4_1__
-# undef vlc_CPU_SSE4_1
-# define vlc_CPU_SSE4_1() ((cpu & VLC_CPU_SSE4_1) != 0)
-#endif
-
-#ifndef __SSSE3__
-# undef vlc_CPU_SSSE3
-# define vlc_CPU_SSSE3() ((cpu & VLC_CPU_SSSE3) != 0)
-#endif
-
-#ifndef __SSE2__
-# undef vlc_CPU_SSE2
-# define vlc_CPU_SSE2() ((cpu & VLC_CPU_SSE2) != 0)
-#endif
-
-/* Optimized copy from "Uncacheable Speculative Write Combining" memory
- * as used by some video surface.
- * XXX It is really efficient only when SSE4.1 is available.
- */
-VLC_SSE
-static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
-                         const uint8_t *src, size_t src_pitch,
-                         unsigned width, unsigned height,
-                         unsigned cpu)
-{
-    assert(((intptr_t)dst & 0x0f) == 0 && (dst_pitch & 0x0f) == 0);
-
-    asm volatile ("mfence");
-
-    for (unsigned y = 0; y < height; y++) {
-        const unsigned unaligned = (-(uintptr_t)src) & 0x0f;
-        unsigned x = 0;
-
-        for (; x < unaligned; x++)
-            dst[x] = src[x];
-
-#ifdef CAN_COMPILE_SSE4_1
-        if (vlc_CPU_SSE4_1()) {
-            if (!unaligned) {
-                for (; x+63 < width; x += 64)
-                    COPY64(&dst[x], &src[x], "movntdqa", "movdqa");
-            } else {
-                for (; x+63 < width; x += 64)
-                    COPY64(&dst[x], &src[x], "movntdqa", "movdqu");
-            }
-        } else
-#endif
-        {
-            if (!unaligned) {
-                for (; x+63 < width; x += 64)
-                    COPY64(&dst[x], &src[x], "movdqa", "movdqa");
-            } else {
-                for (; x+63 < width; x += 64)
-                    COPY64(&dst[x], &src[x], "movdqa", "movdqu");
-            }
-        }
-
-        for (; x < width; x++)
-            dst[x] = src[x];
-
-        src += src_pitch;
-        dst += dst_pitch;
-    }
-}
-
-VLC_SSE
-static void Copy2d(uint8_t *dst, size_t dst_pitch,
-                   const uint8_t *src, size_t src_pitch,
-                   unsigned width, unsigned height)
-{
-    assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);
-
-    asm volatile ("mfence");
-
-    for (unsigned y = 0; y < height; y++) {
-        unsigned x = 0;
-
-        bool unaligned = ((intptr_t)dst & 0x0f) != 0;
-        if (!unaligned) {
-            for (; x+63 < width; x += 64)
-                COPY64(&dst[x], &src[x], "movdqa", "movntdq");
-        } else {
-            for (; x+63 < width; x += 64)
-                COPY64(&dst[x], &src[x], "movdqa", "movdqu");
-        }
-
-        for (; x < width; x++)
-            dst[x] = src[x];
-
-        src += src_pitch;
-        dst += dst_pitch;
-    }
-}
-
-VLC_SSE
-static void SSE_SplitUV(uint8_t *dstu, size_t dstu_pitch,
-                        uint8_t *dstv, size_t dstv_pitch,
-                        const uint8_t *src, size_t src_pitch,
-                        unsigned width, unsigned height, unsigned cpu)
-{
-    const uint8_t shuffle[] = { 0, 2, 4, 6, 8, 10, 12, 14,
-                                1, 3, 5, 7, 9, 11, 13, 15 };
-    const uint8_t mask[] = { 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00,
-                             0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00 };
-
-    assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);
-
-    asm volatile ("mfence");
-
-    for (unsigned y = 0; y < height; y++) {
-        unsigned x = 0;
-
-#define LOAD64 \
-    "movdqa  0(%[src]), %%xmm0\n" \
-    "movdqa 16(%[src]), %%xmm1\n" \
-    "movdqa 32(%[src]), %%xmm2\n" \
-    "movdqa 48(%[src]), %%xmm3\n"
-
-#define STORE2X32 \
-    "movq   %%xmm0,   0(%[dst1])\n" \
-    "movq   %%xmm1,   8(%[dst1])\n" \
-    "movhpd %%xmm0,   0(%[dst2])\n" \
-    "movhpd %%xmm1,   8(%[dst2])\n" \
-    "movq   %%xmm2,  16(%[dst1])\n" \
-    "movq   %%xmm3,  24(%[dst1])\n" \
-    "movhpd %%xmm2,  16(%[dst2])\n" \
-    "movhpd %%xmm3,  24(%[dst2])\n"
-
-#ifdef CAN_COMPILE_SSSE3
-        if (vlc_CPU_SSSE3())
-        {
-            for (x = 0; x < (width & ~31); x += 32) {
-                asm volatile (
-                    "movdqu (%[shuffle]), %%xmm7\n"
-                    LOAD64
-                    "pshufb  %%xmm7, %%xmm0\n"
-                    "pshufb  %%xmm7, %%xmm1\n"
-                    "pshufb  %%xmm7, %%xmm2\n"
-                    "pshufb  %%xmm7, %%xmm3\n"
-                    STORE2X32
-                    : : [dst1]"r"(&dstu[x]), [dst2]"r"(&dstv[x]), [src]"r"(&src[2*x]), [shuffle]"r"(shuffle) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7");
-            }
-        } else
-#endif
-        {
-            for (x = 0; x < (width & ~31); x += 32) {
-                asm volatile (
-                    "movdqu (%[mask]), %%xmm7\n"
-                    LOAD64
-                    "movdqa   %%xmm0, %%xmm4\n"
-                    "movdqa   %%xmm1, %%xmm5\n"
-                    "movdqa   %%xmm2, %%xmm6\n"
-                    "psrlw    $8,     %%xmm0\n"
-                    "psrlw    $8,     %%xmm1\n"
-                    "pand     %%xmm7, %%xmm4\n"
-                    "pand     %%xmm7, %%xmm5\n"
-                    "pand     %%xmm7, %%xmm6\n"
-                    "packuswb %%xmm4, %%xmm0\n"
-                    "packuswb %%xmm5, %%xmm1\n"
-                    "pand     %%xmm3, %%xmm7\n"
-                    "psrlw    $8,     %%xmm2\n"
-                    "psrlw    $8,     %%xmm3\n"
-                    "packuswb %%xmm6, %%xmm2\n"
-                    "packuswb %%xmm7, %%xmm3\n"
-                    STORE2X32
-                    : : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]), [src]"r"(&src[2*x]), [mask]"r"(mask) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
-            }
-        }
-#undef STORE2X32
-#undef LOAD64
-
-        for (; x < width; x++) {
-            dstu[x] = src[2*x+0];
-            dstv[x] = src[2*x+1];
-        }
-        src  += src_pitch;
-        dstu += dstu_pitch;
-        dstv += dstv_pitch;
-    }
-}
-
-static void SSE_CopyPlane(uint8_t *dst, size_t dst_pitch,
-                          const uint8_t *src, size_t src_pitch,
-                          uint8_t *cache, size_t cache_size,
-                          unsigned width, unsigned height, unsigned cpu)
-{
-    const unsigned w16 = (width+15) & ~15;
-    const unsigned hstep = cache_size / w16;
-    assert(hstep > 0);
-
-    for (unsigned y = 0; y < height; y += hstep) {
-        const unsigned hblock =  __MIN(hstep, height - y);
-
-        /* Copy a bunch of line into our cache */
-        CopyFromUswc(cache, w16,
-                     src, src_pitch,
-                     width, hblock, cpu);
-
-        /* Copy from our cache to the destination */
-        Copy2d(dst, dst_pitch,
-               cache, w16,
-               width, hblock);
-
-        /* */
-        src += src_pitch * hblock;
-        dst += dst_pitch * hblock;
-    }
-    asm volatile ("mfence");
-}
-
-static void SSE_SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
-                            uint8_t *dstv, size_t dstv_pitch,
-                            const uint8_t *src, size_t src_pitch,
-                            uint8_t *cache, size_t cache_size,
-                            unsigned width, unsigned height, unsigned cpu)
-{
-    const unsigned w2_16 = (2*width+15) & ~15;
-    const unsigned hstep = cache_size / w2_16;
-    assert(hstep > 0);
-
-    for (unsigned y = 0; y < height; y += hstep) {
-        const unsigned hblock =  __MIN(hstep, height - y);
-
-        /* Copy a bunch of line into our cache */
-        CopyFromUswc(cache, w2_16, src, src_pitch,
-                     2*width, hblock, cpu);
-
-        /* Copy from our cache to the destination */
-        SSE_SplitUV(dstu, dstu_pitch, dstv, dstv_pitch,
-                    cache, w2_16, width, hblock, cpu);
-
-        /* */
-        src  += src_pitch  * hblock;
-        dstu += dstu_pitch * hblock;
-        dstv += dstv_pitch * hblock;
-    }
-    asm volatile ("mfence");
-}
-
-static void SSE_CopyFromNv12(picture_t *dst,
-                             uint8_t *src[2], size_t src_pitch[2],
-                             unsigned width, unsigned height,
-                             copy_cache_t *cache, unsigned cpu)
-{
-    SSE_CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
-                  src[0], src_pitch[0],
-                  cache->buffer, cache->size,
-                  width, height, cpu);
-    SSE_SplitPlanes(dst->p[2].p_pixels, dst->p[2].i_pitch,
-                    dst->p[1].p_pixels, dst->p[1].i_pitch,
-                    src[1], src_pitch[1],
-                    cache->buffer, cache->size,
-                    width/2, height/2, cpu);
-    asm volatile ("emms");
-}
-
-static void SSE_CopyFromYv12(picture_t *dst,
-                             uint8_t *src[3], size_t src_pitch[3],
-                             unsigned width, unsigned height,
-                             copy_cache_t *cache, unsigned cpu)
-{
-    for (unsigned n = 0; n < 3; n++) {
-        const unsigned d = n > 0 ? 2 : 1;
-        SSE_CopyPlane(dst->p[n].p_pixels, dst->p[n].i_pitch,
-                      src[n], src_pitch[n],
-                      cache->buffer, cache->size,
-                      width/d, height/d, cpu);
-    }
-    asm volatile ("emms");
-}
-#undef COPY64
-#endif /* CAN_COMPILE_SSE2 */
-
-static void CopyPlane(uint8_t *dst, size_t dst_pitch,
-                      const uint8_t *src, size_t src_pitch,
-                      unsigned width, unsigned height)
-{
-    for (unsigned y = 0; y < height; y++) {
-        memcpy(dst, src, width);
-        src += src_pitch;
-        dst += dst_pitch;
-    }
-}
-
-static void SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
-                        uint8_t *dstv, size_t dstv_pitch,
-                        const uint8_t *src, size_t src_pitch,
-                        unsigned width, unsigned height)
-{
-    for (unsigned y = 0; y < height; y++) {
-        for (unsigned x = 0; x < width; x++) {
-            dstu[x] = src[2*x+0];
-            dstv[x] = src[2*x+1];
-        }
-        src  += src_pitch;
-        dstu += dstu_pitch;
-        dstv += dstv_pitch;
-    }
-}
-
-void CopyFromNv12(picture_t *dst, uint8_t *src[2], size_t src_pitch[2],
-                  unsigned width, unsigned height,
-                  copy_cache_t *cache)
-{
-#ifdef CAN_COMPILE_SSE2
-    unsigned cpu = vlc_CPU();
-    if (vlc_CPU_SSE2())
-        return SSE_CopyFromNv12(dst, src, src_pitch, width, height,
-                                cache, cpu);
-#else
-    (void) cache;
-#endif
-
-    CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
-              src[0], src_pitch[0],
-              width, height);
-    SplitPlanes(dst->p[2].p_pixels, dst->p[2].i_pitch,
-                dst->p[1].p_pixels, dst->p[1].i_pitch,
-                src[1], src_pitch[1],
-                width/2, height/2);
-}
-
-void CopyFromYv12(picture_t *dst, uint8_t *src[3], size_t src_pitch[3],
-                  unsigned width, unsigned height,
-                  copy_cache_t *cache)
-{
-#ifdef CAN_COMPILE_SSE2
-    unsigned cpu = vlc_CPU();
-    if (vlc_CPU_SSE2())
-        return SSE_CopyFromYv12(dst, src, src_pitch, width, height,
-                                cache, cpu);
-#else
-    (void) cache;
-#endif
-
-     CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
-               src[0], src_pitch[0], width, height);
-     CopyPlane(dst->p[1].p_pixels, dst->p[1].i_pitch,
-               src[1], src_pitch[1], width / 2, height / 2);
-     CopyPlane(dst->p[2].p_pixels, dst->p[2].i_pitch,
-               src[1], src_pitch[2], width / 2, height / 2);
-}
diff --git a/modules/codec/avcodec/copy.h b/modules/codec/avcodec/copy.h
deleted file mode 100644
index 242dd51..0000000
--- a/modules/codec/avcodec/copy.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/*****************************************************************************
- * copy.h: Fast YV12/NV12 copy
- *****************************************************************************
- * Copyright (C) 2009 Laurent Aimar
- * $Id$
- *
- * Authors: Laurent Aimar <fenrir_AT_ videolan _DOT_ org>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this program; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
- *****************************************************************************/
-
-#ifndef _VLC_AVCODEC_COPY_H
-#define _VLC_AVCODEC_COPY_H 1
-
-typedef struct {
-# ifdef CAN_COMPILE_SSE2
-    uint8_t *buffer;
-    size_t  size;
-# endif
-} copy_cache_t;
-
-int  CopyInitCache(copy_cache_t *cache, unsigned width);
-void CopyCleanCache(copy_cache_t *cache);
-
-void CopyFromNv12(picture_t *dst, uint8_t *src[2], size_t src_pitch[2],
-                  unsigned width, unsigned height,
-                  copy_cache_t *cache);
-void CopyFromYv12(picture_t *dst, uint8_t *src[3], size_t src_pitch[3],
-                  unsigned width, unsigned height,
-                  copy_cache_t *cache);
-
-#endif
-
diff --git a/modules/codec/avcodec/dxva2.c b/modules/codec/avcodec/dxva2.c
index 2ef69ec..330daf0 100644
--- a/modules/codec/avcodec/dxva2.c
+++ b/modules/codec/avcodec/dxva2.c
@@ -49,7 +49,7 @@
 
 #include "avcodec.h"
 #include "va.h"
-#include "copy.h"
+#include "../../video_chroma/copy.h"
 
 static int Open(vlc_va_t *, int, const es_format_t *);
 static void Close(vlc_va_t *);
diff --git a/modules/codec/avcodec/vaapi.c b/modules/codec/avcodec/vaapi.c
index 5a1ddda..1a7b227 100644
--- a/modules/codec/avcodec/vaapi.c
+++ b/modules/codec/avcodec/vaapi.c
@@ -39,7 +39,7 @@
 
 #include "avcodec.h"
 #include "va.h"
-#include "copy.h"
+#include "../../video_chroma/copy.h"
 
 #ifndef VA_SURFACE_ATTRIB_SETTABLE
 #define vaCreateSurfaces(d, f, w, h, s, ns, a, na) \
diff --git a/modules/codec/avcodec/vda.c b/modules/codec/avcodec/vda.c
index 0bea754..7103491 100644
--- a/modules/codec/avcodec/vda.c
+++ b/modules/codec/avcodec/vda.c
@@ -34,7 +34,7 @@
 
 #include "avcodec.h"
 #include "va.h"
-#include "copy.h"
+#include "../../video_chroma/copy.h"
 
 #include <libavcodec/vda.h>
 #include <VideoDecodeAcceleration/VDADecoder.h>
diff --git a/modules/codec/omxil/android_mediacodec.c b/modules/codec/omxil/android_mediacodec.c
index d8cf09f..fb435b4 100644
--- a/modules/codec/omxil/android_mediacodec.c
+++ b/modules/codec/omxil/android_mediacodec.c
@@ -75,6 +75,8 @@ struct decoder_sys_t
 
     int started;
     int decoded;
+
+    ArchitectureSpecificCopyData architecture_specific_data;
 };
 
 enum Types
@@ -389,6 +391,7 @@ static void CloseDecoder(vlc_object_t *p_this)
     (*myVm)->DetachCurrentThread(myVm);
 
     free(p_sys->name);
+    ArchitectureSpecificCopyHooksDestroy(p_sys->pixel_format, &p_sys->architecture_specific_data);
     free(p_sys);
 }
 
@@ -422,7 +425,7 @@ static void GetOutput(decoder_t *p_dec, JNIEnv *env, picture_t **pp_pic, int loo
                 GetVlcChromaSizes(p_dec->fmt_out.i_codec, p_dec->fmt_out.video.i_width,
                                   p_dec->fmt_out.video.i_height, NULL, NULL, &chroma_div);
                 CopyOmxPicture(p_sys->pixel_format, p_pic, p_sys->slice_height, p_sys->stride,
-                               ptr, chroma_div);
+                               ptr, chroma_div, &p_sys->architecture_specific_data);
             }
             (*env)->CallVoidMethod(env, p_sys->codec, p_sys->release_output_buffer, index, false);
             jthrowable exception = (*env)->ExceptionOccurred(env);
@@ -452,6 +455,8 @@ static void GetOutput(decoder_t *p_dec, JNIEnv *env, picture_t **pp_pic, int loo
             msg_Dbg(p_dec, "output format changed: %.*s", format_len, format_ptr);
             (*env)->ReleaseStringUTFChars(env, format_string, format_ptr);
 
+            ArchitectureSpecificCopyHooksDestroy(p_sys->pixel_format, &p_sys->architecture_specific_data);
+
             int width           = GET_INTEGER(format, "width");
             int height          = GET_INTEGER(format, "height");
             p_sys->stride       = GET_INTEGER(format, "stride");
@@ -476,6 +481,9 @@ static void GetOutput(decoder_t *p_dec, JNIEnv *env, picture_t **pp_pic, int loo
                 p_sys->slice_height = height;
             if ((*env)->ExceptionOccurred(env))
                 (*env)->ExceptionClear(env);
+
+            ArchitectureSpecificCopyHooks(p_dec, p_sys->pixel_format, p_sys->slice_height,
+                                          p_sys->stride, &p_sys->architecture_specific_data);
             if (p_sys->pixel_format == OMX_TI_COLOR_FormatYUV420PackedSemiPlanar) {
                 p_sys->slice_height -= p_sys->crop_top/2;
                 /* Reset crop top/left here, since the offset parameter already includes this.
diff --git a/modules/codec/omxil/omxil.c b/modules/codec/omxil/omxil.c
index da86e23..3e4c104 100644
--- a/modules/codec/omxil/omxil.c
+++ b/modules/codec/omxil/omxil.c
@@ -1271,7 +1271,7 @@ static picture_t *DecodeVideo( decoder_t *p_dec, block_t **pp_block )
                                    p_pic, p_sys->out.definition.format.video.nSliceHeight,
                                    p_sys->out.i_frame_stride,
                                    p_header->pBuffer + p_header->nOffset,
-                                   p_sys->out.i_frame_stride_chroma_div);
+                                   p_sys->out.i_frame_stride_chroma_div, NULL);
             }
 
             if (p_pic)
diff --git a/modules/codec/omxil/omxil_utils.h b/modules/codec/omxil/omxil_utils.h
index 1a99dfc..6aa9e12 100644
--- a/modules/codec/omxil/omxil_utils.h
+++ b/modules/codec/omxil/omxil_utils.h
@@ -180,9 +180,22 @@ void PrintOmxEvent(vlc_object_t *p_this, OMX_EVENTTYPE event, OMX_U32 data_1,
 /*****************************************************************************
  * Picture utility functions
  *****************************************************************************/
+typedef struct ArchitectureSpecificCopyData
+{
+    void *data;
+} ArchitectureSpecificCopyData;
+
+void ArchitectureSpecificCopyHooks( decoder_t *p_dec, int i_color_format,
+                                    int i_slice_height, int i_src_stride,
+                                    ArchitectureSpecificCopyData *p_architecture_specific );
+
+void ArchitectureSpecificCopyHooksDestroy( int i_color_format,
+                                           ArchitectureSpecificCopyData *p_architecture_specific );
+
 void CopyOmxPicture( int i_color_format, picture_t *p_pic,
                      int i_slice_height,
-                     int i_src_stride, uint8_t *p_src, int i_chroma_div );
+                     int i_src_stride, uint8_t *p_src, int i_chroma_div,
+                     ArchitectureSpecificCopyData *p_architecture_specific );
 
 void CopyVlcPicture( decoder_t *, OMX_BUFFERHEADERTYPE *, picture_t * );
 
diff --git a/modules/codec/omxil/utils.c b/modules/codec/omxil/utils.c
index e4665c5..7913318 100644
--- a/modules/codec/omxil/utils.c
+++ b/modules/codec/omxil/utils.c
@@ -36,6 +36,7 @@
 
 #include "omxil.h"
 #include "qcom.h"
+#include "../../video_chroma/copy.h"
 
 /*****************************************************************************
  * Events utility functions
@@ -163,9 +164,47 @@ void PrintOmxEvent(vlc_object_t *p_this, OMX_EVENTTYPE event, OMX_U32 data_1,
 /*****************************************************************************
  * Picture utility functions
  *****************************************************************************/
+void ArchitectureSpecificCopyHooks( decoder_t *p_dec, int i_color_format,
+                                    int i_slice_height, int i_src_stride,
+                                    ArchitectureSpecificCopyData *p_architecture_specific )
+{
+    (void)i_slice_height;
+
+#ifdef CAN_COMPILE_SSE2
+    if( i_color_format == OMX_COLOR_FormatYUV420SemiPlanar && vlc_CPU_SSE2() )
+    {
+        copy_cache_t *p_surface_cache = malloc( sizeof(copy_cache_t) );
+        if( !p_surface_cache || CopyInitCache( p_surface_cache, i_src_stride ) )
+        {
+            free( p_surface_cache );
+            return;
+        }
+        p_architecture_specific->data = p_surface_cache;
+        p_dec->fmt_out.i_codec = VLC_CODEC_YV12;
+    }
+#endif
+}
+
+void ArchitectureSpecificCopyHooksDestroy( int i_color_format,
+                                           ArchitectureSpecificCopyData *p_architecture_specific )
+{
+    if (!p_architecture_specific->data)
+        return;
+#ifdef CAN_COMPILE_SSE2
+    if( i_color_format == OMX_COLOR_FormatYUV420SemiPlanar && vlc_CPU_SSE2() )
+    {
+        copy_cache_t *p_surface_cache = (copy_cache_t*)p_architecture_specific->data;
+        CopyCleanCache(p_surface_cache);
+    }
+#endif
+    free(p_architecture_specific->data);
+    p_architecture_specific->data = NULL;
+}
+
 void CopyOmxPicture( int i_color_format, picture_t *p_pic,
                      int i_slice_height,
-                     int i_src_stride, uint8_t *p_src, int i_chroma_div )
+                     int i_src_stride, uint8_t *p_src, int i_chroma_div,
+                     ArchitectureSpecificCopyData *p_architecture_specific )
 {
     uint8_t *p_dst;
     int i_dst_stride;
@@ -175,6 +214,17 @@ void CopyOmxPicture( int i_color_format, picture_t *p_pic,
         qcom_convert(p_src, p_pic);
         return;
     }
+#ifdef CAN_COMPILE_SSE2
+    if( i_color_format == OMX_COLOR_FormatYUV420SemiPlanar
+        && vlc_CPU_SSE2() && p_architecture_specific && p_architecture_specific->data )
+    {   /* SSE2 NV12 fast path; without a copy cache, fall through to the generic copy below */
+        copy_cache_t *p_surface_cache = (copy_cache_t*)p_architecture_specific->data;
+        uint8_t *ppi_src_pointers[2] = { p_src, p_src + i_src_stride * i_slice_height };
+        size_t pi_src_strides[2] = { i_src_stride, i_src_stride };
+        CopyFromNv12( p_pic, ppi_src_pointers, pi_src_strides, i_src_stride, i_slice_height, p_surface_cache );
+        return;
+    }
+#endif
 
     for( i_plane = 0; i_plane < p_pic->i_planes; i_plane++ )
     {
diff --git a/modules/video_chroma/copy.c b/modules/video_chroma/copy.c
new file mode 100644
index 0000000..e0823aa
--- /dev/null
+++ b/modules/video_chroma/copy.c
@@ -0,0 +1,416 @@
+/*****************************************************************************
+ * copy.c: Fast YV12/NV12 copy
+ *****************************************************************************
+ * Copyright (C) 2010 Laurent Aimar
+ * $Id$
+ *
+ * Authors: Laurent Aimar <fenrir _AT_ videolan _DOT_ org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ *****************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <vlc_common.h>
+#include <vlc_picture.h>
+#include <vlc_cpu.h>
+#include <assert.h>
+
+#include "copy.h"
+
+int CopyInitCache(copy_cache_t *cache, unsigned width)
+{   /* Allocates the 16-byte aligned bounce buffer used by the SSE2 paths. */
+#ifdef CAN_COMPILE_SSE2
+    const unsigned padded_width = (width + 15u) & ~15u; /* round up to 16 */
+    cache->size = __MAX(padded_width, 4096);            /* at least 4096 bytes */
+    cache->buffer = vlc_memalign(16, cache->size);
+    return cache->buffer != NULL ? VLC_SUCCESS : VLC_EGENERIC;
+#else
+    (void) cache; (void) width; /* nothing to allocate without SSE2 support */
+    return VLC_SUCCESS;
+#endif
+}
+
+void CopyCleanCache(copy_cache_t *cache)
+{   /* Releases the bounce buffer and leaves the cache empty. */
+#ifndef CAN_COMPILE_SSE2
+    (void) cache; /* nothing was allocated in this configuration */
+#else
+    vlc_free(cache->buffer);
+    cache->size   = 0;
+    cache->buffer = NULL;
+#endif
+}
+
+#ifdef CAN_COMPILE_SSE2
+/* Copy 64 bytes from srcp to dstp as four 16-byte moves through xmm1-xmm4.
+ * load and store are string literals naming the SSE>=2 instructions used
+ * to read from srcp and to write to dstp respectively. */
+#define COPY64(dstp, srcp, load, store) \
+    asm volatile (                      \
+        load "  0(%[src]), %%xmm1\n"    \
+        load " 16(%[src]), %%xmm2\n"    \
+        load " 32(%[src]), %%xmm3\n"    \
+        load " 48(%[src]), %%xmm4\n"    \
+        store " %%xmm1,    0(%[dst])\n" \
+        store " %%xmm2,   16(%[dst])\n" \
+        store " %%xmm3,   32(%[dst])\n" \
+        store " %%xmm4,   48(%[dst])\n" \
+        : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1", "xmm2", "xmm3", "xmm4")
+
+#ifndef __SSE4_1__ /* make vlc_CPU_SSE4_1() test the local 'cpu' flags */
+# undef vlc_CPU_SSE4_1
+# define vlc_CPU_SSE4_1() ((cpu & VLC_CPU_SSE4_1) != 0) /* needs 'cpu' in scope */
+#endif
+
+#ifndef __SSSE3__ /* same override for SSSE3 */
+# undef vlc_CPU_SSSE3
+# define vlc_CPU_SSSE3() ((cpu & VLC_CPU_SSSE3) != 0) /* needs 'cpu' in scope */
+#endif
+
+#ifndef __SSE2__ /* same override for SSE2 */
+# undef vlc_CPU_SSE2
+# define vlc_CPU_SSE2() ((cpu & VLC_CPU_SSE2) != 0) /* needs 'cpu' in scope */
+#endif
+
+/* Optimized copy from "Uncacheable Speculative Write Combining" memory
+ * as used by some video surface. dst and dst_pitch must be multiples of 16;
+ * cpu is a mask of VLC_CPU_* capability flags.
+ * XXX It is really efficient only when SSE4.1 is available. */
+VLC_SSE
+static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
+                         const uint8_t *src, size_t src_pitch,
+                         unsigned width, unsigned height,
+                         unsigned cpu)
+{
+    assert(((intptr_t)dst & 0x0f) == 0 && (dst_pitch & 0x0f) == 0);
+
+    asm volatile ("mfence");
+
+    for (unsigned y = 0; y < height; y++) {
+        const unsigned unaligned = (-(uintptr_t)src) & 0x0f; /* bytes up to the next 16-byte boundary of src */
+        unsigned x = 0;
+
+        for (; x < unaligned; x++) /* bytewise head; NOTE(review): not bounded by width */
+            dst[x] = src[x];
+
+#ifdef CAN_COMPILE_SSE4_1
+        if (vlc_CPU_SSE4_1()) { /* SSE4.1: load with movntdqa */
+            if (!unaligned) { /* no head bytes, &dst[x] stays 16-byte aligned: movdqa stores */
+                for (; x+63 < width; x += 64)
+                    COPY64(&dst[x], &src[x], "movntdqa", "movdqa");
+            } else { /* &dst[x] is offset by the head: movdqu stores */
+                for (; x+63 < width; x += 64)
+                    COPY64(&dst[x], &src[x], "movntdqa", "movdqu");
+            }
+        } else
+#endif
+        {
+            if (!unaligned) { /* same split as above, loading with movdqa */
+                for (; x+63 < width; x += 64)
+                    COPY64(&dst[x], &src[x], "movdqa", "movdqa");
+            } else {
+                for (; x+63 < width; x += 64)
+                    COPY64(&dst[x], &src[x], "movdqa", "movdqu");
+            }
+        }
+
+        for (; x < width; x++) /* bytewise tail left over by the 64-byte loops */
+            dst[x] = src[x];
+
+        src += src_pitch;
+        dst += dst_pitch;
+    }
+}
+
+VLC_SSE
+static void Copy2d(uint8_t *dst, size_t dst_pitch,       /* destination, any alignment */
+                   const uint8_t *src, size_t src_pitch, /* source, both multiples of 16 */
+                   unsigned width, unsigned height)      /* bytes per row, number of rows */
+{   /* Copies height rows of width bytes from an aligned source using SSE2 moves. */
+    assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);
+
+    asm volatile ("mfence");
+
+    for (unsigned y = 0; y < height; y++) {
+        unsigned x = 0;
+
+        bool unaligned = ((intptr_t)dst & 0x0f) != 0; /* checked again on every row */
+        if (!unaligned) { /* aligned dst: movntdq stores */
+            for (; x+63 < width; x += 64)
+                COPY64(&dst[x], &src[x], "movdqa", "movntdq");
+        } else { /* unaligned dst: movdqu stores */
+            for (; x+63 < width; x += 64)
+                COPY64(&dst[x], &src[x], "movdqa", "movdqu");
+        }
+
+        for (; x < width; x++) /* bytewise tail left over by the 64-byte loop */
+            dst[x] = src[x];
+
+        src += src_pitch;
+        dst += dst_pitch;
+    }
+}
+
+VLC_SSE
+static void SSE_SplitUV(uint8_t *dstu, size_t dstu_pitch,       /* receives src[2*x] */
+                        uint8_t *dstv, size_t dstv_pitch,       /* receives src[2*x+1] */
+                        const uint8_t *src, size_t src_pitch,   /* byte pairs, both multiples of 16 */
+                        unsigned width, unsigned height, unsigned cpu) /* width in pairs; cpu: VLC_CPU_* flags */
+{   /* De-interleaves height rows of width byte pairs from src into dstu and dstv. */
+    const uint8_t shuffle[] = { 0, 2, 4, 6, 8, 10, 12, 14,
+                                1, 3, 5, 7, 9, 11, 13, 15 }; /* pshufb control: even bytes first, then odd */
+    const uint8_t mask[] = { 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00,
+                             0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00 }; /* low byte of each 16-bit word */
+
+    assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);
+
+    asm volatile ("mfence");
+
+    for (unsigned y = 0; y < height; y++) {
+        unsigned x = 0;
+
+#define LOAD64 \
+    "movdqa  0(%[src]), %%xmm0\n" \
+    "movdqa 16(%[src]), %%xmm1\n" \
+    "movdqa 32(%[src]), %%xmm2\n" \
+    "movdqa 48(%[src]), %%xmm3\n"
+
+#define STORE2X32 \
+    "movq   %%xmm0,   0(%[dst1])\n" \
+    "movq   %%xmm1,   8(%[dst1])\n" \
+    "movhpd %%xmm0,   0(%[dst2])\n" \
+    "movhpd %%xmm1,   8(%[dst2])\n" \
+    "movq   %%xmm2,  16(%[dst1])\n" \
+    "movq   %%xmm3,  24(%[dst1])\n" \
+    "movhpd %%xmm2,  16(%[dst2])\n" \
+    "movhpd %%xmm3,  24(%[dst2])\n"
+
+#ifdef CAN_COMPILE_SSSE3
+        if (vlc_CPU_SSSE3()) /* SSSE3: one pshufb per register does the split */
+        {
+            for (x = 0; x < (width & ~31); x += 32) { /* 32 pairs (64 source bytes) per iteration */
+                asm volatile (
+                    "movdqu (%[shuffle]), %%xmm7\n"
+                    LOAD64
+                    "pshufb  %%xmm7, %%xmm0\n"
+                    "pshufb  %%xmm7, %%xmm1\n"
+                    "pshufb  %%xmm7, %%xmm2\n"
+                    "pshufb  %%xmm7, %%xmm3\n"
+                    STORE2X32
+                    : : [dst1]"r"(&dstu[x]), [dst2]"r"(&dstv[x]), [src]"r"(&src[2*x]), [shuffle]"r"(shuffle) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7");
+            }
+        } else
+#endif
+        {   /* SSE2: shift/mask/pack; odd bytes land in the low halves, hence dst1/dst2 are swapped */
+            for (x = 0; x < (width & ~31); x += 32) { /* 32 pairs (64 source bytes) per iteration */
+                asm volatile (
+                    "movdqu (%[mask]), %%xmm7\n"
+                    LOAD64
+                    "movdqa   %%xmm0, %%xmm4\n"
+                    "movdqa   %%xmm1, %%xmm5\n"
+                    "movdqa   %%xmm2, %%xmm6\n"
+                    "psrlw    $8,     %%xmm0\n"
+                    "psrlw    $8,     %%xmm1\n"
+                    "pand     %%xmm7, %%xmm4\n"
+                    "pand     %%xmm7, %%xmm5\n"
+                    "pand     %%xmm7, %%xmm6\n"
+                    "packuswb %%xmm4, %%xmm0\n"
+                    "packuswb %%xmm5, %%xmm1\n"
+                    "pand     %%xmm3, %%xmm7\n"
+                    "psrlw    $8,     %%xmm2\n"
+                    "psrlw    $8,     %%xmm3\n"
+                    "packuswb %%xmm6, %%xmm2\n"
+                    "packuswb %%xmm7, %%xmm3\n"
+                    STORE2X32
+                    : : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]), [src]"r"(&src[2*x]), [mask]"r"(mask) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
+            }
+        }
+#undef STORE2X32
+#undef LOAD64
+
+        for (; x < width; x++) { /* scalar tail: pairs left over by the 32-pair loops */
+            dstu[x] = src[2*x+0];
+            dstv[x] = src[2*x+1];
+        }
+        src  += src_pitch;
+        dstu += dstu_pitch;
+        dstv += dstv_pitch;
+    }
+}
+
+static void SSE_CopyPlane(uint8_t *dst, size_t dst_pitch,
+                          const uint8_t *src, size_t src_pitch,
+                          uint8_t *cache, size_t cache_size,
+                          unsigned width, unsigned height, unsigned cpu)
+{
+    /* Copies a width x height plane from src to dst in bands of rows, each
+     * band going through cache (rows stored at a pitch rounded up to 16). */
+    const unsigned cache_pitch = (width+15) & ~15;
+    const unsigned hstep = cache_size / cache_pitch; /* rows per band */
+    assert(hstep > 0);
+
+    unsigned rows_left = height;
+    while (rows_left > 0) {
+        const unsigned band = __MIN(hstep, rows_left);
+
+        /* Source -> cache */
+        CopyFromUswc(cache, cache_pitch, src, src_pitch,
+                     width, band, cpu);
+
+        /* Cache -> destination */
+        Copy2d(dst, dst_pitch, cache, cache_pitch, width, band);
+
+        src += src_pitch * band;
+        dst += dst_pitch * band;
+        rows_left -= band;
+    }
+    asm volatile ("mfence");
+}
+
+static void SSE_SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
+                            uint8_t *dstv, size_t dstv_pitch,
+                            const uint8_t *src, size_t src_pitch,
+                            uint8_t *cache, size_t cache_size,
+                            unsigned width, unsigned height, unsigned cpu)
+{
+    /* Splits rows of width byte pairs into dstu/dstv, one band of rows at a time. */
+    const unsigned cache_pitch = (2*width+15) & ~15;
+    const unsigned hstep = cache_size / cache_pitch; /* rows per band */
+    assert(hstep > 0);
+
+    unsigned rows_left = height;
+    while (rows_left > 0) {
+        const unsigned band = __MIN(hstep, rows_left);
+        /* Interleaved source -> cache (2*width bytes per row) */
+        CopyFromUswc(cache, cache_pitch, src, src_pitch,
+                     2*width, band, cpu);
+        /* Cache -> the two separate destinations */
+        SSE_SplitUV(dstu, dstu_pitch, dstv, dstv_pitch,
+                    cache, cache_pitch, width, band, cpu);
+
+        src  += src_pitch  * band;
+        dstu += dstu_pitch * band;
+        dstv += dstv_pitch * band;
+        rows_left -= band;
+    }
+    asm volatile ("mfence");
+}
+
+static void SSE_CopyFromNv12(picture_t *dst,
+                             uint8_t *src[2], size_t src_pitch[2],
+                             unsigned width, unsigned height,
+                             copy_cache_t *cache, unsigned cpu)
+{
+    uint8_t *const bounce = cache->buffer;
+    const size_t bounce_size = cache->size;
+    /* src[0] goes to plane 0 unchanged */
+    SSE_CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch, src[0], src_pitch[0],
+                  bounce, bounce_size, width, height, cpu);
+    /* src[1] is split at half size: even bytes to plane 2, odd bytes to plane 1 */
+    SSE_SplitPlanes(dst->p[2].p_pixels, dst->p[2].i_pitch,
+                    dst->p[1].p_pixels, dst->p[1].i_pitch, src[1], src_pitch[1],
+                    bounce, bounce_size, width/2, height/2, cpu);
+    asm volatile ("emms");
+}
+
+static void SSE_CopyFromYv12(picture_t *dst,
+                             uint8_t *src[3], size_t src_pitch[3],
+                             unsigned width, unsigned height,
+                             copy_cache_t *cache, unsigned cpu)
+{
+    /* Copies src[n] into plane n of dst for n = 0..2 through the cache;
+     * plane 0 is width x height, planes 1 and 2 are half that size. */
+    static const unsigned divisor[3] = { 1, 2, 2 };
+    for (unsigned plane = 0; plane < 3; plane++)
+        SSE_CopyPlane(dst->p[plane].p_pixels, dst->p[plane].i_pitch,
+                      src[plane], src_pitch[plane], cache->buffer, cache->size,
+                      width / divisor[plane], height / divisor[plane], cpu);
+    asm volatile ("emms");
+}
+#undef COPY64
+#endif /* CAN_COMPILE_SSE2 */
+
+static void CopyPlane(uint8_t *dst, size_t dst_pitch,
+                      const uint8_t *src, size_t src_pitch,
+                      unsigned width, unsigned height)
+{
+    /* Plain memcpy-based copy of height rows of width bytes: row number
+     * 'row' is read at src + row * src_pitch, written at dst + row * dst_pitch. */
+    for (unsigned row = 0; row < height; row++) {
+        memcpy(dst + row * dst_pitch, src + row * src_pitch, width);
+    }
+}
+
+static void SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
+                        uint8_t *dstv, size_t dstv_pitch,
+                        const uint8_t *src, size_t src_pitch,
+                        unsigned width, unsigned height)
+{
+    /* Per row, sends the first byte of each of width pairs to dstu, the second to dstv. */
+    for (unsigned row = 0; row < height; row++) {
+        const uint8_t *in = src + row * src_pitch;
+        uint8_t *u = dstu + row * dstu_pitch, *v = dstv + row * dstv_pitch;
+        for (unsigned n = width; n > 0; n--) {
+            *u++ = *in++;
+            *v++ = *in++;
+        }
+    }
+}
+
+void CopyFromNv12(picture_t *dst, uint8_t *src[2], size_t src_pitch[2],
+                  unsigned width, unsigned height,
+                  copy_cache_t *cache)
+{
+    /* SSE2 implementation when the CPU has it, plain C (no cache) otherwise. */
+#ifndef CAN_COMPILE_SSE2
+    (void) cache;
+#else
+    unsigned cpu = vlc_CPU(); /* must be named cpu: vlc_CPU_SSE2() may expand to it */
+    if (vlc_CPU_SSE2()) {
+        SSE_CopyFromNv12(dst, src, src_pitch, width, height, cache, cpu);
+        return;
+    }
+#endif
+    CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch, src[0], src_pitch[0],
+              width, height);
+    /* Half-size split of src[1]: even bytes to plane 2, odd bytes to plane 1 */
+    SplitPlanes(dst->p[2].p_pixels, dst->p[2].i_pitch,
+                dst->p[1].p_pixels, dst->p[1].i_pitch,
+                src[1], src_pitch[1], width/2, height/2);
+}
+
+void CopyFromYv12(picture_t *dst, uint8_t *src[3], size_t src_pitch[3],
+                  unsigned width, unsigned height,
+                  copy_cache_t *cache)
+{
+    /* Copies src[n] into plane n of dst for n = 0..2 (plane 0 at full size,
+     * planes 1 and 2 at half size). cache is only used by the SSE2 path. */
+#ifdef CAN_COMPILE_SSE2
+    unsigned cpu = vlc_CPU();
+    if (vlc_CPU_SSE2())
+        return SSE_CopyFromYv12(dst, src, src_pitch, width, height,
+                                cache, cpu);
+#else
+    (void) cache;
+#endif
+    for (unsigned n = 0; n < 3; n++) { /* plane n always pairs src[n] with src_pitch[n] */
+        const unsigned d = n > 0 ? 2 : 1; /* planes 1 and 2 are half size */
+        CopyPlane(dst->p[n].p_pixels, dst->p[n].i_pitch,
+                  src[n], src_pitch[n], width / d, height / d);
+    }
+}
diff --git a/modules/video_chroma/copy.h b/modules/video_chroma/copy.h
new file mode 100644
index 0000000..242dd51
--- /dev/null
+++ b/modules/video_chroma/copy.h
@@ -0,0 +1,45 @@
+/*****************************************************************************
+ * copy.h: Fast YV12/NV12 copy
+ *****************************************************************************
+ * Copyright (C) 2009 Laurent Aimar
+ * $Id$
+ *
+ * Authors: Laurent Aimar <fenrir _AT_ videolan _DOT_ org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ *****************************************************************************/
+
+#ifndef _VLC_AVCODEC_COPY_H
+#define _VLC_AVCODEC_COPY_H 1
+
+typedef struct {
+# ifdef CAN_COMPILE_SSE2
+    uint8_t *buffer; /* 16-byte aligned buffer allocated by CopyInitCache() */
+    size_t  size;    /* size of buffer in bytes */
+# endif
+} copy_cache_t;
+
+int  CopyInitCache(copy_cache_t *cache, unsigned width);  /* VLC_SUCCESS or VLC_EGENERIC */
+void CopyCleanCache(copy_cache_t *cache);                 /* releases the cache buffer */
+
+void CopyFromNv12(picture_t *dst, uint8_t *src[2], size_t src_pitch[2],
+                  unsigned width, unsigned height,  /* size of plane 0 */
+                  copy_cache_t *cache);             /* set up with CopyInitCache() */
+void CopyFromYv12(picture_t *dst, uint8_t *src[3], size_t src_pitch[3],
+                  unsigned width, unsigned height,  /* size of plane 0 */
+                  copy_cache_t *cache);             /* set up with CopyInitCache() */
+
+#endif
+
-- 
1.8.3.2