[vlc-devel] [PATCH] Move copy.{c,h} from module avcodec to module video_chroma.
Felix Abecassis
felix.abecassis at gmail.com
Fri Oct 25 16:42:34 CEST 2013
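
copy.{c,h} implement the SSE2/SSSE3/SSE4.1-accelerated YV12/NV12 plane
copies shared by the hardware acceleration plugins. Moving them from
modules/codec/avcodec/ to modules/video_chroma/ makes them usable outside
the avcodec module. The vaapi, dxva2 and vda plugin sources and their
#include directives are updated accordingly; copy.c moves verbatim (same
blob e0823aa on both sides) and copy.h only gets its include guard renamed
and a trailing blank line dropped.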
---
modules/codec/Makefile.am | 6 +-
modules/codec/avcodec/copy.c | 416 ------------------------------------------
modules/codec/avcodec/copy.h | 45 -----
modules/codec/avcodec/dxva2.c | 2 +-
modules/codec/avcodec/vaapi.c | 2 +-
modules/codec/avcodec/vda.c | 2 +-
modules/video_chroma/copy.c | 416 ++++++++++++++++++++++++++++++++++++++++++
modules/video_chroma/copy.h | 44 +++++
8 files changed, 466 insertions(+), 467 deletions(-)
delete mode 100644 modules/codec/avcodec/copy.c
delete mode 100644 modules/codec/avcodec/copy.h
create mode 100644 modules/video_chroma/copy.c
create mode 100644 modules/video_chroma/copy.h
diff --git a/modules/codec/Makefile.am b/modules/codec/Makefile.am
index e65945a..6e21d58 100644
--- a/modules/codec/Makefile.am
+++ b/modules/codec/Makefile.am
@@ -263,7 +263,7 @@ endif
### avcodec hardware acceleration ###
libvaapi_plugin_la_SOURCES = \
- codec/avcodec/copy.c codec/avcodec/copy.h \
+ video_chroma/copy.c video_chroma/copy.h \
codec/avcodec/vaapi.c
libvaapi_plugin_la_CFLAGS = $(AM_CFLAGS) \
$(LIBVA_CFLAGS) $(X_CFLAGS) $(AVCODEC_CFLAGS)
@@ -273,7 +273,7 @@ codec_LTLIBRARIES += libvaapi_plugin.la
endif
libdxva2_plugin_la_SOURCES = \
- codec/avcodec/copy.c codec/avcodec/copy.h \
+ video_chroma/copy.c video_chroma/copy.h \
codec/avcodec/dxva2.c
libdxva2_plugin_la_LIBADD = -lole32 -lshlwapi -luuid
if HAVE_AVCODEC_DXVA2
@@ -281,7 +281,7 @@ codec_LTLIBRARIES += libdxva2_plugin.la
endif
libvda_plugin_la_SOURCES = \
- codec/avcodec/copy.c codec/avcodec/copy.h \
+ video_chroma/copy.c video_chroma/copy.h \
codec/avcodec/vda.c
libvda_plugin_la_CFLAGS = $(AM_CFLAGS) $(AVCODEC_CFLAGS)
libvda_plugin_la_LDFLAGS = -Wl,-framework,CoreFoundation,-framework,VideoDecodeAcceleration,-framework,QuartzCore
diff --git a/modules/codec/avcodec/copy.c b/modules/codec/avcodec/copy.c
deleted file mode 100644
index e0823aa..0000000
--- a/modules/codec/avcodec/copy.c
+++ /dev/null
@@ -1,416 +0,0 @@
-/*****************************************************************************
- * copy.c: Fast YV12/NV12 copy
- *****************************************************************************
- * Copyright (C) 2010 Laurent Aimar
- * $Id$
- *
- * Authors: Laurent Aimar <fenrir _AT_ videolan _DOT_ org>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this program; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
- *****************************************************************************/
-
-#ifdef HAVE_CONFIG_H
-# include "config.h"
-#endif
-
-#include <vlc_common.h>
-#include <vlc_picture.h>
-#include <vlc_cpu.h>
-#include <assert.h>
-
-#include "copy.h"
-
-int CopyInitCache(copy_cache_t *cache, unsigned width)
-{
-#ifdef CAN_COMPILE_SSE2
- cache->size = __MAX((width + 0x0f) & ~ 0x0f, 4096);
- cache->buffer = vlc_memalign(16, cache->size);
- if (!cache->buffer)
- return VLC_EGENERIC;
-#else
- (void) cache; (void) width;
-#endif
- return VLC_SUCCESS;
-}
-
-void CopyCleanCache(copy_cache_t *cache)
-{
-#ifdef CAN_COMPILE_SSE2
- vlc_free(cache->buffer);
- cache->buffer = NULL;
- cache->size = 0;
-#else
- (void) cache;
-#endif
-}
-
-#ifdef CAN_COMPILE_SSE2
-/* Copy 64 bytes from srcp to dstp loading data with the SSE>=2 instruction
- * load and storing data with the SSE>=2 instruction store.
- */
-#define COPY64(dstp, srcp, load, store) \
- asm volatile ( \
- load " 0(%[src]), %%xmm1\n" \
- load " 16(%[src]), %%xmm2\n" \
- load " 32(%[src]), %%xmm3\n" \
- load " 48(%[src]), %%xmm4\n" \
- store " %%xmm1, 0(%[dst])\n" \
- store " %%xmm2, 16(%[dst])\n" \
- store " %%xmm3, 32(%[dst])\n" \
- store " %%xmm4, 48(%[dst])\n" \
- : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1", "xmm2", "xmm3", "xmm4")
-
-#ifndef __SSE4_1__
-# undef vlc_CPU_SSE4_1
-# define vlc_CPU_SSE4_1() ((cpu & VLC_CPU_SSE4_1) != 0)
-#endif
-
-#ifndef __SSSE3__
-# undef vlc_CPU_SSSE3
-# define vlc_CPU_SSSE3() ((cpu & VLC_CPU_SSSE3) != 0)
-#endif
-
-#ifndef __SSE2__
-# undef vlc_CPU_SSE2
-# define vlc_CPU_SSE2() ((cpu & VLC_CPU_SSE2) != 0)
-#endif
-
-/* Optimized copy from "Uncacheable Speculative Write Combining" memory
- * as used by some video surface.
- * XXX It is really efficient only when SSE4.1 is available.
- */
-VLC_SSE
-static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
- const uint8_t *src, size_t src_pitch,
- unsigned width, unsigned height,
- unsigned cpu)
-{
- assert(((intptr_t)dst & 0x0f) == 0 && (dst_pitch & 0x0f) == 0);
-
- asm volatile ("mfence");
-
- for (unsigned y = 0; y < height; y++) {
- const unsigned unaligned = (-(uintptr_t)src) & 0x0f;
- unsigned x = 0;
-
- for (; x < unaligned; x++)
- dst[x] = src[x];
-
-#ifdef CAN_COMPILE_SSE4_1
- if (vlc_CPU_SSE4_1()) {
- if (!unaligned) {
- for (; x+63 < width; x += 64)
- COPY64(&dst[x], &src[x], "movntdqa", "movdqa");
- } else {
- for (; x+63 < width; x += 64)
- COPY64(&dst[x], &src[x], "movntdqa", "movdqu");
- }
- } else
-#endif
- {
- if (!unaligned) {
- for (; x+63 < width; x += 64)
- COPY64(&dst[x], &src[x], "movdqa", "movdqa");
- } else {
- for (; x+63 < width; x += 64)
- COPY64(&dst[x], &src[x], "movdqa", "movdqu");
- }
- }
-
- for (; x < width; x++)
- dst[x] = src[x];
-
- src += src_pitch;
- dst += dst_pitch;
- }
-}
-
-VLC_SSE
-static void Copy2d(uint8_t *dst, size_t dst_pitch,
- const uint8_t *src, size_t src_pitch,
- unsigned width, unsigned height)
-{
- assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);
-
- asm volatile ("mfence");
-
- for (unsigned y = 0; y < height; y++) {
- unsigned x = 0;
-
- bool unaligned = ((intptr_t)dst & 0x0f) != 0;
- if (!unaligned) {
- for (; x+63 < width; x += 64)
- COPY64(&dst[x], &src[x], "movdqa", "movntdq");
- } else {
- for (; x+63 < width; x += 64)
- COPY64(&dst[x], &src[x], "movdqa", "movdqu");
- }
-
- for (; x < width; x++)
- dst[x] = src[x];
-
- src += src_pitch;
- dst += dst_pitch;
- }
-}
-
-VLC_SSE
-static void SSE_SplitUV(uint8_t *dstu, size_t dstu_pitch,
- uint8_t *dstv, size_t dstv_pitch,
- const uint8_t *src, size_t src_pitch,
- unsigned width, unsigned height, unsigned cpu)
-{
- const uint8_t shuffle[] = { 0, 2, 4, 6, 8, 10, 12, 14,
- 1, 3, 5, 7, 9, 11, 13, 15 };
- const uint8_t mask[] = { 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00,
- 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00 };
-
- assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);
-
- asm volatile ("mfence");
-
- for (unsigned y = 0; y < height; y++) {
- unsigned x = 0;
-
-#define LOAD64 \
- "movdqa 0(%[src]), %%xmm0\n" \
- "movdqa 16(%[src]), %%xmm1\n" \
- "movdqa 32(%[src]), %%xmm2\n" \
- "movdqa 48(%[src]), %%xmm3\n"
-
-#define STORE2X32 \
- "movq %%xmm0, 0(%[dst1])\n" \
- "movq %%xmm1, 8(%[dst1])\n" \
- "movhpd %%xmm0, 0(%[dst2])\n" \
- "movhpd %%xmm1, 8(%[dst2])\n" \
- "movq %%xmm2, 16(%[dst1])\n" \
- "movq %%xmm3, 24(%[dst1])\n" \
- "movhpd %%xmm2, 16(%[dst2])\n" \
- "movhpd %%xmm3, 24(%[dst2])\n"
-
-#ifdef CAN_COMPILE_SSSE3
- if (vlc_CPU_SSSE3())
- {
- for (x = 0; x < (width & ~31); x += 32) {
- asm volatile (
- "movdqu (%[shuffle]), %%xmm7\n"
- LOAD64
- "pshufb %%xmm7, %%xmm0\n"
- "pshufb %%xmm7, %%xmm1\n"
- "pshufb %%xmm7, %%xmm2\n"
- "pshufb %%xmm7, %%xmm3\n"
- STORE2X32
- : : [dst1]"r"(&dstu[x]), [dst2]"r"(&dstv[x]), [src]"r"(&src[2*x]), [shuffle]"r"(shuffle) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7");
- }
- } else
-#endif
- {
- for (x = 0; x < (width & ~31); x += 32) {
- asm volatile (
- "movdqu (%[mask]), %%xmm7\n"
- LOAD64
- "movdqa %%xmm0, %%xmm4\n"
- "movdqa %%xmm1, %%xmm5\n"
- "movdqa %%xmm2, %%xmm6\n"
- "psrlw $8, %%xmm0\n"
- "psrlw $8, %%xmm1\n"
- "pand %%xmm7, %%xmm4\n"
- "pand %%xmm7, %%xmm5\n"
- "pand %%xmm7, %%xmm6\n"
- "packuswb %%xmm4, %%xmm0\n"
- "packuswb %%xmm5, %%xmm1\n"
- "pand %%xmm3, %%xmm7\n"
- "psrlw $8, %%xmm2\n"
- "psrlw $8, %%xmm3\n"
- "packuswb %%xmm6, %%xmm2\n"
- "packuswb %%xmm7, %%xmm3\n"
- STORE2X32
- : : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]), [src]"r"(&src[2*x]), [mask]"r"(mask) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
- }
- }
-#undef STORE2X32
-#undef LOAD64
-
- for (; x < width; x++) {
- dstu[x] = src[2*x+0];
- dstv[x] = src[2*x+1];
- }
- src += src_pitch;
- dstu += dstu_pitch;
- dstv += dstv_pitch;
- }
-}
-
-static void SSE_CopyPlane(uint8_t *dst, size_t dst_pitch,
- const uint8_t *src, size_t src_pitch,
- uint8_t *cache, size_t cache_size,
- unsigned width, unsigned height, unsigned cpu)
-{
- const unsigned w16 = (width+15) & ~15;
- const unsigned hstep = cache_size / w16;
- assert(hstep > 0);
-
- for (unsigned y = 0; y < height; y += hstep) {
- const unsigned hblock = __MIN(hstep, height - y);
-
- /* Copy a bunch of line into our cache */
- CopyFromUswc(cache, w16,
- src, src_pitch,
- width, hblock, cpu);
-
- /* Copy from our cache to the destination */
- Copy2d(dst, dst_pitch,
- cache, w16,
- width, hblock);
-
- /* */
- src += src_pitch * hblock;
- dst += dst_pitch * hblock;
- }
- asm volatile ("mfence");
-}
-
-static void SSE_SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
- uint8_t *dstv, size_t dstv_pitch,
- const uint8_t *src, size_t src_pitch,
- uint8_t *cache, size_t cache_size,
- unsigned width, unsigned height, unsigned cpu)
-{
- const unsigned w2_16 = (2*width+15) & ~15;
- const unsigned hstep = cache_size / w2_16;
- assert(hstep > 0);
-
- for (unsigned y = 0; y < height; y += hstep) {
- const unsigned hblock = __MIN(hstep, height - y);
-
- /* Copy a bunch of line into our cache */
- CopyFromUswc(cache, w2_16, src, src_pitch,
- 2*width, hblock, cpu);
-
- /* Copy from our cache to the destination */
- SSE_SplitUV(dstu, dstu_pitch, dstv, dstv_pitch,
- cache, w2_16, width, hblock, cpu);
-
- /* */
- src += src_pitch * hblock;
- dstu += dstu_pitch * hblock;
- dstv += dstv_pitch * hblock;
- }
- asm volatile ("mfence");
-}
-
-static void SSE_CopyFromNv12(picture_t *dst,
- uint8_t *src[2], size_t src_pitch[2],
- unsigned width, unsigned height,
- copy_cache_t *cache, unsigned cpu)
-{
- SSE_CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
- src[0], src_pitch[0],
- cache->buffer, cache->size,
- width, height, cpu);
- SSE_SplitPlanes(dst->p[2].p_pixels, dst->p[2].i_pitch,
- dst->p[1].p_pixels, dst->p[1].i_pitch,
- src[1], src_pitch[1],
- cache->buffer, cache->size,
- width/2, height/2, cpu);
- asm volatile ("emms");
-}
-
-static void SSE_CopyFromYv12(picture_t *dst,
- uint8_t *src[3], size_t src_pitch[3],
- unsigned width, unsigned height,
- copy_cache_t *cache, unsigned cpu)
-{
- for (unsigned n = 0; n < 3; n++) {
- const unsigned d = n > 0 ? 2 : 1;
- SSE_CopyPlane(dst->p[n].p_pixels, dst->p[n].i_pitch,
- src[n], src_pitch[n],
- cache->buffer, cache->size,
- width/d, height/d, cpu);
- }
- asm volatile ("emms");
-}
-#undef COPY64
-#endif /* CAN_COMPILE_SSE2 */
-
-static void CopyPlane(uint8_t *dst, size_t dst_pitch,
- const uint8_t *src, size_t src_pitch,
- unsigned width, unsigned height)
-{
- for (unsigned y = 0; y < height; y++) {
- memcpy(dst, src, width);
- src += src_pitch;
- dst += dst_pitch;
- }
-}
-
-static void SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
- uint8_t *dstv, size_t dstv_pitch,
- const uint8_t *src, size_t src_pitch,
- unsigned width, unsigned height)
-{
- for (unsigned y = 0; y < height; y++) {
- for (unsigned x = 0; x < width; x++) {
- dstu[x] = src[2*x+0];
- dstv[x] = src[2*x+1];
- }
- src += src_pitch;
- dstu += dstu_pitch;
- dstv += dstv_pitch;
- }
-}
-
-void CopyFromNv12(picture_t *dst, uint8_t *src[2], size_t src_pitch[2],
- unsigned width, unsigned height,
- copy_cache_t *cache)
-{
-#ifdef CAN_COMPILE_SSE2
- unsigned cpu = vlc_CPU();
- if (vlc_CPU_SSE2())
- return SSE_CopyFromNv12(dst, src, src_pitch, width, height,
- cache, cpu);
-#else
- (void) cache;
-#endif
-
- CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
- src[0], src_pitch[0],
- width, height);
- SplitPlanes(dst->p[2].p_pixels, dst->p[2].i_pitch,
- dst->p[1].p_pixels, dst->p[1].i_pitch,
- src[1], src_pitch[1],
- width/2, height/2);
-}
-
-void CopyFromYv12(picture_t *dst, uint8_t *src[3], size_t src_pitch[3],
- unsigned width, unsigned height,
- copy_cache_t *cache)
-{
-#ifdef CAN_COMPILE_SSE2
- unsigned cpu = vlc_CPU();
- if (vlc_CPU_SSE2())
- return SSE_CopyFromYv12(dst, src, src_pitch, width, height,
- cache, cpu);
-#else
- (void) cache;
-#endif
-
- CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
- src[0], src_pitch[0], width, height);
- CopyPlane(dst->p[1].p_pixels, dst->p[1].i_pitch,
- src[1], src_pitch[1], width / 2, height / 2);
- CopyPlane(dst->p[2].p_pixels, dst->p[2].i_pitch,
- src[2], src_pitch[2], width / 2, height / 2);
-}
diff --git a/modules/codec/avcodec/copy.h b/modules/codec/avcodec/copy.h
deleted file mode 100644
index 242dd51..0000000
--- a/modules/codec/avcodec/copy.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/*****************************************************************************
- * copy.h: Fast YV12/NV12 copy
- *****************************************************************************
- * Copyright (C) 2009 Laurent Aimar
- * $Id$
- *
- * Authors: Laurent Aimar <fenrir _AT_ videolan _DOT_ org>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this program; if not, write to the Free Software Foundation,
- * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
- *****************************************************************************/
-
-#ifndef _VLC_AVCODEC_COPY_H
-#define _VLC_AVCODEC_COPY_H 1
-
-typedef struct {
-# ifdef CAN_COMPILE_SSE2
- uint8_t *buffer;
- size_t size;
-# endif
-} copy_cache_t;
-
-int CopyInitCache(copy_cache_t *cache, unsigned width);
-void CopyCleanCache(copy_cache_t *cache);
-
-void CopyFromNv12(picture_t *dst, uint8_t *src[2], size_t src_pitch[2],
- unsigned width, unsigned height,
- copy_cache_t *cache);
-void CopyFromYv12(picture_t *dst, uint8_t *src[3], size_t src_pitch[3],
- unsigned width, unsigned height,
- copy_cache_t *cache);
-
-#endif
-
diff --git a/modules/codec/avcodec/dxva2.c b/modules/codec/avcodec/dxva2.c
index 2ef69ec..330daf0 100644
--- a/modules/codec/avcodec/dxva2.c
+++ b/modules/codec/avcodec/dxva2.c
@@ -49,7 +49,7 @@
#include "avcodec.h"
#include "va.h"
-#include "copy.h"
+#include "../../video_chroma/copy.h"
static int Open(vlc_va_t *, int, const es_format_t *);
static void Close(vlc_va_t *);
diff --git a/modules/codec/avcodec/vaapi.c b/modules/codec/avcodec/vaapi.c
index 5a1ddda..1a7b227 100644
--- a/modules/codec/avcodec/vaapi.c
+++ b/modules/codec/avcodec/vaapi.c
@@ -39,7 +39,7 @@
#include "avcodec.h"
#include "va.h"
-#include "copy.h"
+#include "../../video_chroma/copy.h"
#ifndef VA_SURFACE_ATTRIB_SETTABLE
#define vaCreateSurfaces(d, f, w, h, s, ns, a, na) \
diff --git a/modules/codec/avcodec/vda.c b/modules/codec/avcodec/vda.c
index 0bea754..7103491 100644
--- a/modules/codec/avcodec/vda.c
+++ b/modules/codec/avcodec/vda.c
@@ -34,7 +34,7 @@
#include "avcodec.h"
#include "va.h"
-#include "copy.h"
+#include "../../video_chroma/copy.h"
#include <libavcodec/vda.h>
#include <VideoDecodeAcceleration/VDADecoder.h>
diff --git a/modules/video_chroma/copy.c b/modules/video_chroma/copy.c
new file mode 100644
index 0000000..e0823aa
--- /dev/null
+++ b/modules/video_chroma/copy.c
@@ -0,0 +1,416 @@
+/*****************************************************************************
+ * copy.c: Fast YV12/NV12 copy
+ *****************************************************************************
+ * Copyright (C) 2010 Laurent Aimar
+ * $Id$
+ *
+ * Authors: Laurent Aimar <fenrir _AT_ videolan _DOT_ org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ *****************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <vlc_common.h>
+#include <vlc_picture.h>
+#include <vlc_cpu.h>
+#include <assert.h>
+
+#include "copy.h"
+
+int CopyInitCache(copy_cache_t *cache, unsigned width)
+{
+#ifdef CAN_COMPILE_SSE2
+ cache->size = __MAX((width + 0x0f) & ~ 0x0f, 4096);
+ cache->buffer = vlc_memalign(16, cache->size);
+ if (!cache->buffer)
+ return VLC_EGENERIC;
+#else
+ (void) cache; (void) width;
+#endif
+ return VLC_SUCCESS;
+}
+
+void CopyCleanCache(copy_cache_t *cache)
+{
+#ifdef CAN_COMPILE_SSE2
+ vlc_free(cache->buffer);
+ cache->buffer = NULL;
+ cache->size = 0;
+#else
+ (void) cache;
+#endif
+}
+
+#ifdef CAN_COMPILE_SSE2
+/* Copy 64 bytes from srcp to dstp loading data with the SSE>=2 instruction
+ * load and storing data with the SSE>=2 instruction store.
+ */
+#define COPY64(dstp, srcp, load, store) \
+ asm volatile ( \
+ load " 0(%[src]), %%xmm1\n" \
+ load " 16(%[src]), %%xmm2\n" \
+ load " 32(%[src]), %%xmm3\n" \
+ load " 48(%[src]), %%xmm4\n" \
+ store " %%xmm1, 0(%[dst])\n" \
+ store " %%xmm2, 16(%[dst])\n" \
+ store " %%xmm3, 32(%[dst])\n" \
+ store " %%xmm4, 48(%[dst])\n" \
+ : : [dst]"r"(dstp), [src]"r"(srcp) : "memory", "xmm1", "xmm2", "xmm3", "xmm4")
+
+#ifndef __SSE4_1__
+# undef vlc_CPU_SSE4_1
+# define vlc_CPU_SSE4_1() ((cpu & VLC_CPU_SSE4_1) != 0)
+#endif
+
+#ifndef __SSSE3__
+# undef vlc_CPU_SSSE3
+# define vlc_CPU_SSSE3() ((cpu & VLC_CPU_SSSE3) != 0)
+#endif
+
+#ifndef __SSE2__
+# undef vlc_CPU_SSE2
+# define vlc_CPU_SSE2() ((cpu & VLC_CPU_SSE2) != 0)
+#endif
+
+/* Optimized copy from "Uncacheable Speculative Write Combining" memory
+ * as used by some video surface.
+ * XXX It is really efficient only when SSE4.1 is available.
+ */
+VLC_SSE
+static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
+ const uint8_t *src, size_t src_pitch,
+ unsigned width, unsigned height,
+ unsigned cpu)
+{
+ assert(((intptr_t)dst & 0x0f) == 0 && (dst_pitch & 0x0f) == 0);
+
+ asm volatile ("mfence");
+
+ for (unsigned y = 0; y < height; y++) {
+ const unsigned unaligned = (-(uintptr_t)src) & 0x0f;
+ unsigned x = 0;
+
+ for (; x < unaligned; x++)
+ dst[x] = src[x];
+
+#ifdef CAN_COMPILE_SSE4_1
+ if (vlc_CPU_SSE4_1()) {
+ if (!unaligned) {
+ for (; x+63 < width; x += 64)
+ COPY64(&dst[x], &src[x], "movntdqa", "movdqa");
+ } else {
+ for (; x+63 < width; x += 64)
+ COPY64(&dst[x], &src[x], "movntdqa", "movdqu");
+ }
+ } else
+#endif
+ {
+ if (!unaligned) {
+ for (; x+63 < width; x += 64)
+ COPY64(&dst[x], &src[x], "movdqa", "movdqa");
+ } else {
+ for (; x+63 < width; x += 64)
+ COPY64(&dst[x], &src[x], "movdqa", "movdqu");
+ }
+ }
+
+ for (; x < width; x++)
+ dst[x] = src[x];
+
+ src += src_pitch;
+ dst += dst_pitch;
+ }
+}
+
+VLC_SSE
+static void Copy2d(uint8_t *dst, size_t dst_pitch,
+ const uint8_t *src, size_t src_pitch,
+ unsigned width, unsigned height)
+{
+ assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);
+
+ asm volatile ("mfence");
+
+ for (unsigned y = 0; y < height; y++) {
+ unsigned x = 0;
+
+ bool unaligned = ((intptr_t)dst & 0x0f) != 0;
+ if (!unaligned) {
+ for (; x+63 < width; x += 64)
+ COPY64(&dst[x], &src[x], "movdqa", "movntdq");
+ } else {
+ for (; x+63 < width; x += 64)
+ COPY64(&dst[x], &src[x], "movdqa", "movdqu");
+ }
+
+ for (; x < width; x++)
+ dst[x] = src[x];
+
+ src += src_pitch;
+ dst += dst_pitch;
+ }
+}
+
+VLC_SSE
+static void SSE_SplitUV(uint8_t *dstu, size_t dstu_pitch,
+ uint8_t *dstv, size_t dstv_pitch,
+ const uint8_t *src, size_t src_pitch,
+ unsigned width, unsigned height, unsigned cpu)
+{
+ const uint8_t shuffle[] = { 0, 2, 4, 6, 8, 10, 12, 14,
+ 1, 3, 5, 7, 9, 11, 13, 15 };
+ const uint8_t mask[] = { 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00,
+ 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00 };
+
+ assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);
+
+ asm volatile ("mfence");
+
+ for (unsigned y = 0; y < height; y++) {
+ unsigned x = 0;
+
+#define LOAD64 \
+ "movdqa 0(%[src]), %%xmm0\n" \
+ "movdqa 16(%[src]), %%xmm1\n" \
+ "movdqa 32(%[src]), %%xmm2\n" \
+ "movdqa 48(%[src]), %%xmm3\n"
+
+#define STORE2X32 \
+ "movq %%xmm0, 0(%[dst1])\n" \
+ "movq %%xmm1, 8(%[dst1])\n" \
+ "movhpd %%xmm0, 0(%[dst2])\n" \
+ "movhpd %%xmm1, 8(%[dst2])\n" \
+ "movq %%xmm2, 16(%[dst1])\n" \
+ "movq %%xmm3, 24(%[dst1])\n" \
+ "movhpd %%xmm2, 16(%[dst2])\n" \
+ "movhpd %%xmm3, 24(%[dst2])\n"
+
+#ifdef CAN_COMPILE_SSSE3
+ if (vlc_CPU_SSSE3())
+ {
+ for (x = 0; x < (width & ~31); x += 32) {
+ asm volatile (
+ "movdqu (%[shuffle]), %%xmm7\n"
+ LOAD64
+ "pshufb %%xmm7, %%xmm0\n"
+ "pshufb %%xmm7, %%xmm1\n"
+ "pshufb %%xmm7, %%xmm2\n"
+ "pshufb %%xmm7, %%xmm3\n"
+ STORE2X32
+ : : [dst1]"r"(&dstu[x]), [dst2]"r"(&dstv[x]), [src]"r"(&src[2*x]), [shuffle]"r"(shuffle) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7");
+ }
+ } else
+#endif
+ {
+ for (x = 0; x < (width & ~31); x += 32) {
+ asm volatile (
+ "movdqu (%[mask]), %%xmm7\n"
+ LOAD64
+ "movdqa %%xmm0, %%xmm4\n"
+ "movdqa %%xmm1, %%xmm5\n"
+ "movdqa %%xmm2, %%xmm6\n"
+ "psrlw $8, %%xmm0\n"
+ "psrlw $8, %%xmm1\n"
+ "pand %%xmm7, %%xmm4\n"
+ "pand %%xmm7, %%xmm5\n"
+ "pand %%xmm7, %%xmm6\n"
+ "packuswb %%xmm4, %%xmm0\n"
+ "packuswb %%xmm5, %%xmm1\n"
+ "pand %%xmm3, %%xmm7\n"
+ "psrlw $8, %%xmm2\n"
+ "psrlw $8, %%xmm3\n"
+ "packuswb %%xmm6, %%xmm2\n"
+ "packuswb %%xmm7, %%xmm3\n"
+ STORE2X32
+ : : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]), [src]"r"(&src[2*x]), [mask]"r"(mask) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
+ }
+ }
+#undef STORE2X32
+#undef LOAD64
+
+ for (; x < width; x++) {
+ dstu[x] = src[2*x+0];
+ dstv[x] = src[2*x+1];
+ }
+ src += src_pitch;
+ dstu += dstu_pitch;
+ dstv += dstv_pitch;
+ }
+}
+
+static void SSE_CopyPlane(uint8_t *dst, size_t dst_pitch,
+ const uint8_t *src, size_t src_pitch,
+ uint8_t *cache, size_t cache_size,
+ unsigned width, unsigned height, unsigned cpu)
+{
+ const unsigned w16 = (width+15) & ~15;
+ const unsigned hstep = cache_size / w16;
+ assert(hstep > 0);
+
+ for (unsigned y = 0; y < height; y += hstep) {
+ const unsigned hblock = __MIN(hstep, height - y);
+
+ /* Copy a bunch of line into our cache */
+ CopyFromUswc(cache, w16,
+ src, src_pitch,
+ width, hblock, cpu);
+
+ /* Copy from our cache to the destination */
+ Copy2d(dst, dst_pitch,
+ cache, w16,
+ width, hblock);
+
+ /* */
+ src += src_pitch * hblock;
+ dst += dst_pitch * hblock;
+ }
+ asm volatile ("mfence");
+}
+
+static void SSE_SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
+ uint8_t *dstv, size_t dstv_pitch,
+ const uint8_t *src, size_t src_pitch,
+ uint8_t *cache, size_t cache_size,
+ unsigned width, unsigned height, unsigned cpu)
+{
+ const unsigned w2_16 = (2*width+15) & ~15;
+ const unsigned hstep = cache_size / w2_16;
+ assert(hstep > 0);
+
+ for (unsigned y = 0; y < height; y += hstep) {
+ const unsigned hblock = __MIN(hstep, height - y);
+
+ /* Copy a bunch of line into our cache */
+ CopyFromUswc(cache, w2_16, src, src_pitch,
+ 2*width, hblock, cpu);
+
+ /* Copy from our cache to the destination */
+ SSE_SplitUV(dstu, dstu_pitch, dstv, dstv_pitch,
+ cache, w2_16, width, hblock, cpu);
+
+ /* */
+ src += src_pitch * hblock;
+ dstu += dstu_pitch * hblock;
+ dstv += dstv_pitch * hblock;
+ }
+ asm volatile ("mfence");
+}
+
+static void SSE_CopyFromNv12(picture_t *dst,
+ uint8_t *src[2], size_t src_pitch[2],
+ unsigned width, unsigned height,
+ copy_cache_t *cache, unsigned cpu)
+{
+ SSE_CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
+ src[0], src_pitch[0],
+ cache->buffer, cache->size,
+ width, height, cpu);
+ SSE_SplitPlanes(dst->p[2].p_pixels, dst->p[2].i_pitch,
+ dst->p[1].p_pixels, dst->p[1].i_pitch,
+ src[1], src_pitch[1],
+ cache->buffer, cache->size,
+ width/2, height/2, cpu);
+ asm volatile ("emms");
+}
+
+static void SSE_CopyFromYv12(picture_t *dst,
+ uint8_t *src[3], size_t src_pitch[3],
+ unsigned width, unsigned height,
+ copy_cache_t *cache, unsigned cpu)
+{
+ for (unsigned n = 0; n < 3; n++) {
+ const unsigned d = n > 0 ? 2 : 1;
+ SSE_CopyPlane(dst->p[n].p_pixels, dst->p[n].i_pitch,
+ src[n], src_pitch[n],
+ cache->buffer, cache->size,
+ width/d, height/d, cpu);
+ }
+ asm volatile ("emms");
+}
+#undef COPY64
+#endif /* CAN_COMPILE_SSE2 */
+
+static void CopyPlane(uint8_t *dst, size_t dst_pitch,
+ const uint8_t *src, size_t src_pitch,
+ unsigned width, unsigned height)
+{
+ for (unsigned y = 0; y < height; y++) {
+ memcpy(dst, src, width);
+ src += src_pitch;
+ dst += dst_pitch;
+ }
+}
+
+static void SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
+ uint8_t *dstv, size_t dstv_pitch,
+ const uint8_t *src, size_t src_pitch,
+ unsigned width, unsigned height)
+{
+ for (unsigned y = 0; y < height; y++) {
+ for (unsigned x = 0; x < width; x++) {
+ dstu[x] = src[2*x+0];
+ dstv[x] = src[2*x+1];
+ }
+ src += src_pitch;
+ dstu += dstu_pitch;
+ dstv += dstv_pitch;
+ }
+}
+
+void CopyFromNv12(picture_t *dst, uint8_t *src[2], size_t src_pitch[2],
+ unsigned width, unsigned height,
+ copy_cache_t *cache)
+{
+#ifdef CAN_COMPILE_SSE2
+ unsigned cpu = vlc_CPU();
+ if (vlc_CPU_SSE2())
+ return SSE_CopyFromNv12(dst, src, src_pitch, width, height,
+ cache, cpu);
+#else
+ (void) cache;
+#endif
+
+ CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
+ src[0], src_pitch[0],
+ width, height);
+ SplitPlanes(dst->p[2].p_pixels, dst->p[2].i_pitch,
+ dst->p[1].p_pixels, dst->p[1].i_pitch,
+ src[1], src_pitch[1],
+ width/2, height/2);
+}
+
+void CopyFromYv12(picture_t *dst, uint8_t *src[3], size_t src_pitch[3],
+ unsigned width, unsigned height,
+ copy_cache_t *cache)
+{
+#ifdef CAN_COMPILE_SSE2
+ unsigned cpu = vlc_CPU();
+ if (vlc_CPU_SSE2())
+ return SSE_CopyFromYv12(dst, src, src_pitch, width, height,
+ cache, cpu);
+#else
+ (void) cache;
+#endif
+
+ CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
+ src[0], src_pitch[0], width, height);
+ CopyPlane(dst->p[1].p_pixels, dst->p[1].i_pitch,
+ src[1], src_pitch[1], width / 2, height / 2);
+ CopyPlane(dst->p[2].p_pixels, dst->p[2].i_pitch,
+ src[2], src_pitch[2], width / 2, height / 2);
+}
diff --git a/modules/video_chroma/copy.h b/modules/video_chroma/copy.h
new file mode 100644
index 0000000..39dbf1e
--- /dev/null
+++ b/modules/video_chroma/copy.h
@@ -0,0 +1,44 @@
+/*****************************************************************************
+ * copy.h: Fast YV12/NV12 copy
+ *****************************************************************************
+ * Copyright (C) 2009 Laurent Aimar
+ * $Id$
+ *
+ * Authors: Laurent Aimar <fenrir _AT_ videolan _DOT_ org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ *****************************************************************************/
+
+#ifndef _VLC_VIDEOCHROMA_COPY_H
+#define _VLC_VIDEOCHROMA_COPY_H 1
+
+typedef struct {
+# ifdef CAN_COMPILE_SSE2
+ uint8_t *buffer;
+ size_t size;
+# endif
+} copy_cache_t;
+
+int CopyInitCache(copy_cache_t *cache, unsigned width);
+void CopyCleanCache(copy_cache_t *cache);
+
+void CopyFromNv12(picture_t *dst, uint8_t *src[2], size_t src_pitch[2],
+ unsigned width, unsigned height,
+ copy_cache_t *cache);
+void CopyFromYv12(picture_t *dst, uint8_t *src[3], size_t src_pitch[3],
+ unsigned width, unsigned height,
+ copy_cache_t *cache);
+
+#endif
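
For reviewers unfamiliar with these helpers, here is a rough sketch of how a
decoder uses this API after the move; it mirrors the surface-extraction paths
in dxva2.c/vaapi.c, but the flat NV12 layout (one base pointer plus a uniform
pitch) and every name below are illustrative assumptions, not code from this
patch:

/* Hypothetical caller: copy a mapped NV12 surface into a YV12 picture_t. */
#include <vlc_common.h>
#include <vlc_picture.h>
#include "../../video_chroma/copy.h" /* path as seen from codec/avcodec/ */

static int CopySurfaceExample(picture_t *pic, uint8_t *base, size_t pitch,
                              unsigned width, unsigned height)
{
    copy_cache_t cache;
    if (CopyInitCache(&cache, width)) /* VLC_SUCCESS (0) on success */
        return VLC_EGENERIC;

    /* NV12: plane 0 is luma, plane 1 is interleaved UV at half height
     * (the offset below is an assumption; real surfaces hand out their
     * own plane pointers and pitches). */
    uint8_t *planes[2] = { base, base + pitch * height };
    size_t pitches[2]  = { pitch, pitch };

    /* Copies Y into pic->p[0] and deinterleaves UV into pic->p[2]/p[1],
     * matching VLC's YV12 plane order. */
    CopyFromNv12(pic, planes, pitches, width, height, &cache);

    CopyCleanCache(&cache);
    return VLC_SUCCESS;
}

On SSE2-capable CPUs this routes through SSE_CopyFromNv12(), bouncing each
block of lines through the 16-byte-aligned cache buffer (with movntdqa
streaming loads when SSE4.1 is available, per the USWC comment in copy.c);
otherwise it falls back to the plain memcpy()/SplitPlanes() versions.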
--
1.8.3.2