[vlc-devel] [PATCH 3/6] copy: remove need for cache memory in SSE routines
Jean-Yves Avenard
jyavenard at gmail.com
Fri Jun 13 14:02:38 CEST 2014
From: Jean-Yves Avenard <jyavenard at mythtv.org>
The SSE code used a 16-byte-aligned intermediate buffer to perform its memory operations.
Rewrite the code to work directly on the source and destination frames, regardless of memory alignment.
This speeds up NV12->YV12 conversion by over 100%, as it halves the number of memory reads and writes.
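For readers who want the gist before the diff, here is a rough sketch of the approach using SSE2 intrinsics rather than the patch's inline asm (the function name copy_plane_direct is made up for illustration; the actual patch copies 64 bytes per iteration via COPY64 and, when SSE4.1 is available, uses movntdqa streaming loads from USWC memory). The point is that each row is copied straight from source to destination, choosing aligned or unaligned 16-byte accesses per row, instead of first staging the data in a separate aligned cache buffer:

#include <stdint.h>
#include <stddef.h>
#include <emmintrin.h>          /* SSE2 */

static void copy_plane_direct(uint8_t *dst, size_t dst_pitch,
                              const uint8_t *src, size_t src_pitch,
                              unsigned width, unsigned height)
{
    for (unsigned y = 0; y < height; y++) {
        /* Bytes needed to reach the next 16-byte boundary of the source. */
        unsigned head = (-(uintptr_t)src) & 0x0f;
        if (head > width)
            head = width;

        unsigned x = 0;
        for (; x < head; x++)           /* unaligned head, byte by byte */
            dst[x] = src[x];

        /* Once the source is aligned, check whether the destination is too,
         * so non-temporal stores can be used and the CPU cache bypassed. */
        int dst_aligned = (((uintptr_t)&dst[x]) & 0x0f) == 0;

        if (dst_aligned) {
            for (; x + 15 < width; x += 16) {
                __m128i v = _mm_load_si128((const __m128i *)&src[x]);
                _mm_stream_si128((__m128i *)&dst[x], v);   /* streaming store */
            }
        } else {
            for (; x + 15 < width; x += 16) {
                __m128i v = _mm_load_si128((const __m128i *)&src[x]);
                _mm_storeu_si128((__m128i *)&dst[x], v);   /* unaligned store */
            }
        }

        for (; x < width; x++)          /* tail */
            dst[x] = src[x];

        src += src_pitch;
        dst += dst_pitch;
    }
    _mm_sfence();   /* make the streaming stores globally visible */
}

Compared with the previous CopyFromUswc + Copy2d pair, every byte is read and written once instead of twice, which is where the measured speed-up comes from.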
---
modules/video_chroma/copy.c | 255 ++++++++++++++++++++------------------------
1 file changed, 114 insertions(+), 141 deletions(-)
diff --git a/modules/video_chroma/copy.c b/modules/video_chroma/copy.c
index d29843c..3c907a4 100644
--- a/modules/video_chroma/copy.c
+++ b/modules/video_chroma/copy.c
@@ -2,9 +2,11 @@
* copy.c: Fast YV12/NV12 copy
*****************************************************************************
* Copyright (C) 2010 Laurent Aimar
+ * Copyright (C) 2014 Jean-Yves Avenard
* $Id$
*
* Authors: Laurent Aimar <fenrir _AT_ videolan _DOT_ org>
+ * Authors: Jean-Yves Avenard <jyavenard _AT_ mythtv _DOT_ org>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by
@@ -28,7 +30,6 @@
#include <vlc_common.h>
#include <vlc_picture.h>
#include <vlc_cpu.h>
-#include <assert.h>
#include "copy.h"
@@ -92,17 +93,19 @@ void CopyCleanCache(copy_cache_t *cache)
* XXX It is really efficient only when SSE4.1 is available.
*/
VLC_SSE
-static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
- const uint8_t *src, size_t src_pitch,
- unsigned width, unsigned height,
- unsigned cpu)
+static void SSE_CopyPlane(uint8_t *dst, size_t dst_pitch,
+ const uint8_t *src, size_t src_pitch,
+ uint8_t *cache, size_t cache_size,
+ unsigned width, unsigned height, unsigned cpu)
{
- assert(((intptr_t)dst & 0x0f) == 0 && (dst_pitch & 0x0f) == 0);
+ VLC_UNUSED(cache);
+ VLC_UNUSED(cache_size);
asm volatile ("mfence");
for (unsigned y = 0; y < height; y++) {
const unsigned unaligned = (-(uintptr_t)src) & 0x0f;
+ bool dstaligned = ((intptr_t)&dst[unaligned] & 0x0f) == 0;
unsigned x = 0;
for (; x < unaligned; x++)
@@ -110,7 +113,7 @@ static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
#ifdef CAN_COMPILE_SSE4_1
if (vlc_CPU_SSE4_1()) {
- if (!unaligned) {
+ if (dstaligned) {
for (; x+63 < width; x += 64)
COPY64(&dst[x], &src[x], "movntdqa", "movdqa");
} else {
@@ -120,9 +123,9 @@ static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
} else
#endif
{
- if (!unaligned) {
+ if (dstaligned) {
for (; x+63 < width; x += 64)
- COPY64(&dst[x], &src[x], "movdqa", "movdqa");
+ COPY64(&dst[x], &src[x], "movdqa", "movntdq");
} else {
for (; x+63 < width; x += 64)
COPY64(&dst[x], &src[x], "movdqa", "movdqu");
@@ -135,62 +138,38 @@ static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
src += src_pitch;
dst += dst_pitch;
}
-}
-
-VLC_SSE
-static void Copy2d(uint8_t *dst, size_t dst_pitch,
- const uint8_t *src, size_t src_pitch,
- unsigned width, unsigned height)
-{
- assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);
asm volatile ("mfence");
-
- for (unsigned y = 0; y < height; y++) {
- unsigned x = 0;
-
- bool unaligned = ((intptr_t)dst & 0x0f) != 0;
- if (!unaligned) {
- for (; x+63 < width; x += 64)
- COPY64(&dst[x], &src[x], "movdqa", "movntdq");
- } else {
- for (; x+63 < width; x += 64)
- COPY64(&dst[x], &src[x], "movdqa", "movdqu");
- }
-
- for (; x < width; x++)
- dst[x] = src[x];
-
- src += src_pitch;
- dst += dst_pitch;
- }
}
-VLC_SSE
-static void SSE_SplitUV(uint8_t *dstu, size_t dstu_pitch,
- uint8_t *dstv, size_t dstv_pitch,
- const uint8_t *src, size_t src_pitch,
- unsigned width, unsigned height, unsigned cpu)
+static void SSE_SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
+ uint8_t *dstv, size_t dstv_pitch,
+ const uint8_t *src, size_t src_pitch,
+ uint8_t *cache, size_t cache_size,
+ unsigned width, unsigned height, unsigned cpu)
{
- VLC_UNUSED(cpu);
+ VLC_UNUSED(cache);
+ VLC_UNUSED(cache_size);
+
const uint8_t shuffle[] = { 0, 2, 4, 6, 8, 10, 12, 14,
1, 3, 5, 7, 9, 11, 13, 15 };
const uint8_t mask[] = { 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00,
0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00 };
- assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);
-
asm volatile ("mfence");
- for (unsigned y = 0; y < height; y++) {
- unsigned x = 0;
-
-#define LOAD64 \
+#define LOAD64A \
"movdqa 0(%[src]), %%xmm0\n" \
"movdqa 16(%[src]), %%xmm1\n" \
"movdqa 32(%[src]), %%xmm2\n" \
"movdqa 48(%[src]), %%xmm3\n"
+#define LOAD64U \
+ "movdqu 0(%[src]), %%xmm0\n" \
+ "movdqu 16(%[src]), %%xmm1\n" \
+ "movdqu 32(%[src]), %%xmm2\n" \
+ "movdqu 48(%[src]), %%xmm3\n"
+
#define STORE2X32 \
"movq %%xmm0, 0(%[dst1])\n" \
"movq %%xmm1, 8(%[dst1])\n" \
@@ -201,48 +180,95 @@ static void SSE_SplitUV(uint8_t *dstu, size_t dstu_pitch,
"movhpd %%xmm2, 16(%[dst2])\n" \
"movhpd %%xmm3, 24(%[dst2])\n"
-#ifdef CAN_COMPILE_SSSE3
- if (vlc_CPU_SSSE3())
+ for (unsigned y = 0; y < height; y++)
+ {
+ bool aligned = ((uintptr_t)src & 0xf) == 0;
+ unsigned x = 0;
+
+ if (aligned)
{
- for (x = 0; x < (width & ~31); x += 32) {
- asm volatile (
- "movdqu (%[shuffle]), %%xmm7\n"
- LOAD64
- "pshufb %%xmm7, %%xmm0\n"
- "pshufb %%xmm7, %%xmm1\n"
- "pshufb %%xmm7, %%xmm2\n"
- "pshufb %%xmm7, %%xmm3\n"
- STORE2X32
- : : [dst1]"r"(&dstu[x]), [dst2]"r"(&dstv[x]), [src]"r"(&src[2*x]), [shuffle]"r"(shuffle) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7");
- }
- } else
+#ifdef CAN_COMPILE_SSSE3
+ if (vlc_CPU_SSSE3()) {
+ for (x = 0; x < (width & ~31); x += 32) {
+ asm volatile (
+ "movdqu (%[shuffle]), %%xmm7\n"
+ LOAD64A
+ "pshufb %%xmm7, %%xmm0\n"
+ "pshufb %%xmm7, %%xmm1\n"
+ "pshufb %%xmm7, %%xmm2\n"
+ "pshufb %%xmm7, %%xmm3\n"
+ STORE2X32
+ : : [dst1]"r"(&dstu[x]), [dst2]"r"(&dstv[x]), [src]"r"(&src[2*x]), [shuffle]"r"(shuffle) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7");
+ }
+ } else
#endif
+ {
+ for (x = 0; x < (width & ~31); x += 32) {
+ asm volatile (
+ "movdqu (%[mask]), %%xmm7\n"
+ LOAD64A
+ "movdqa %%xmm0, %%xmm4\n"
+ "movdqa %%xmm1, %%xmm5\n"
+ "movdqa %%xmm2, %%xmm6\n"
+ "psrlw $8, %%xmm0\n"
+ "psrlw $8, %%xmm1\n"
+ "pand %%xmm7, %%xmm4\n"
+ "pand %%xmm7, %%xmm5\n"
+ "pand %%xmm7, %%xmm6\n"
+ "packuswb %%xmm4, %%xmm0\n"
+ "packuswb %%xmm5, %%xmm1\n"
+ "pand %%xmm3, %%xmm7\n"
+ "psrlw $8, %%xmm2\n"
+ "psrlw $8, %%xmm3\n"
+ "packuswb %%xmm6, %%xmm2\n"
+ "packuswb %%xmm7, %%xmm3\n"
+ STORE2X32
+ : : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]), [src]"r"(&src[2*x]), [mask]"r"(mask) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
+ }
+ }
+ }
+ else
{
- for (x = 0; x < (width & ~31); x += 32) {
- asm volatile (
- "movdqu (%[mask]), %%xmm7\n"
- LOAD64
- "movdqa %%xmm0, %%xmm4\n"
- "movdqa %%xmm1, %%xmm5\n"
- "movdqa %%xmm2, %%xmm6\n"
- "psrlw $8, %%xmm0\n"
- "psrlw $8, %%xmm1\n"
- "pand %%xmm7, %%xmm4\n"
- "pand %%xmm7, %%xmm5\n"
- "pand %%xmm7, %%xmm6\n"
- "packuswb %%xmm4, %%xmm0\n"
- "packuswb %%xmm5, %%xmm1\n"
- "pand %%xmm3, %%xmm7\n"
- "psrlw $8, %%xmm2\n"
- "psrlw $8, %%xmm3\n"
- "packuswb %%xmm6, %%xmm2\n"
- "packuswb %%xmm7, %%xmm3\n"
- STORE2X32
- : : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]), [src]"r"(&src[2*x]), [mask]"r"(mask) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
+#ifdef CAN_COMPILE_SSSE3
+ if (vlc_CPU_SSSE3()) {
+ for (x = 0; x < (width & ~31); x += 32) {
+ asm volatile (
+ "movdqu (%[shuffle]), %%xmm7\n"
+ LOAD64U
+ "pshufb %%xmm7, %%xmm0\n"
+ "pshufb %%xmm7, %%xmm1\n"
+ "pshufb %%xmm7, %%xmm2\n"
+ "pshufb %%xmm7, %%xmm3\n"
+ STORE2X32
+ : : [dst1]"r"(&dstu[x]), [dst2]"r"(&dstv[x]), [src]"r"(&src[2*x]), [shuffle]"r"(shuffle) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7");
+ }
+ } else
+#endif
+ {
+ for (x = 0; x < (width & ~31); x += 32) {
+ asm volatile (
+ "movdqu (%[mask]), %%xmm7\n"
+ LOAD64U
+ "movdqu %%xmm0, %%xmm4\n"
+ "movdqu %%xmm1, %%xmm5\n"
+ "movdqu %%xmm2, %%xmm6\n"
+ "psrlw $8, %%xmm0\n"
+ "psrlw $8, %%xmm1\n"
+ "pand %%xmm7, %%xmm4\n"
+ "pand %%xmm7, %%xmm5\n"
+ "pand %%xmm7, %%xmm6\n"
+ "packuswb %%xmm4, %%xmm0\n"
+ "packuswb %%xmm5, %%xmm1\n"
+ "pand %%xmm3, %%xmm7\n"
+ "psrlw $8, %%xmm2\n"
+ "psrlw $8, %%xmm3\n"
+ "packuswb %%xmm6, %%xmm2\n"
+ "packuswb %%xmm7, %%xmm3\n"
+ STORE2X32
+ : : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]), [src]"r"(&src[2*x]), [mask]"r"(mask) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
+ }
}
}
-#undef STORE2X32
-#undef LOAD64
for (; x < width; x++) {
dstu[x] = src[2*x+0];
@@ -252,64 +278,11 @@ static void SSE_SplitUV(uint8_t *dstu, size_t dstu_pitch,
dstu += dstu_pitch;
dstv += dstv_pitch;
}
-}
-
-static void SSE_CopyPlane(uint8_t *dst, size_t dst_pitch,
- const uint8_t *src, size_t src_pitch,
- uint8_t *cache, size_t cache_size,
- unsigned width, unsigned height, unsigned cpu)
-{
- const unsigned w16 = (width+15) & ~15;
- const unsigned hstep = cache_size / w16;
- assert(hstep > 0);
-
- for (unsigned y = 0; y < height; y += hstep) {
- const unsigned hblock = __MIN(hstep, height - y);
-
- /* Copy a bunch of line into our cache */
- CopyFromUswc(cache, w16,
- src, src_pitch,
- width, hblock, cpu);
-
- /* Copy from our cache to the destination */
- Copy2d(dst, dst_pitch,
- cache, w16,
- width, hblock);
-
- /* */
- src += src_pitch * hblock;
- dst += dst_pitch * hblock;
- }
asm volatile ("mfence");
-}
-
-static void SSE_SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
- uint8_t *dstv, size_t dstv_pitch,
- const uint8_t *src, size_t src_pitch,
- uint8_t *cache, size_t cache_size,
- unsigned width, unsigned height, unsigned cpu)
-{
- const unsigned w2_16 = (2*width+15) & ~15;
- const unsigned hstep = cache_size / w2_16;
- assert(hstep > 0);
-
- for (unsigned y = 0; y < height; y += hstep) {
- const unsigned hblock = __MIN(hstep, height - y);
-
- /* Copy a bunch of line into our cache */
- CopyFromUswc(cache, w2_16, src, src_pitch,
- 2*width, hblock, cpu);
- /* Copy from our cache to the destination */
- SSE_SplitUV(dstu, dstu_pitch, dstv, dstv_pitch,
- cache, w2_16, width, hblock, cpu);
-
- /* */
- src += src_pitch * hblock;
- dstu += dstu_pitch * hblock;
- dstv += dstv_pitch * hblock;
- }
- asm volatile ("mfence");
+#undef STORE2X32
+#undef LOAD64U
+#undef LOAD64A
}
static void SSE_CopyFromNv12(picture_t *dst,
--
1.9.1