[vlc-commits] chroma: copy: add SP<=>P 16bits support
Thomas Guillem
git at videolan.org
Fri Nov 17 10:23:28 CET 2017
vlc | branch: master | Thomas Guillem <thomas at gllm.fr> | Thu Nov 16 18:56:09 2017 +0100| [b1ffa4467eded1af7da751f8738c412525f6c476] | committer: Thomas Guillem
chroma: copy: add SP<=>P 16bits support
Implemented both as a plain C version (without optimisation) and as an SSE3-optimised version.
> http://git.videolan.org/gitweb.cgi/vlc.git/?a=commit;h=b1ffa4467eded1af7da751f8738c412525f6c476
---
modules/video_chroma/copy.c | 253 +++++++++++++++++++++++++++++++-------------
modules/video_chroma/copy.h | 7 ++
2 files changed, 187 insertions(+), 73 deletions(-)
diff --git a/modules/video_chroma/copy.c b/modules/video_chroma/copy.c
index d1c2693831..81387f2e61 100644
--- a/modules/video_chroma/copy.c
+++ b/modules/video_chroma/copy.c
@@ -191,7 +191,7 @@ static void
SSE_InterleaveUV(uint8_t *dst, size_t dst_pitch,
uint8_t *srcu, size_t srcu_pitch,
uint8_t *srcv, size_t srcv_pitch,
- unsigned int width, unsigned int height,
+ unsigned int width, unsigned int height, uint8_t pixel_size,
unsigned int cpu)
{
assert(!((intptr_t)srcu & 0xf) && !(srcu_pitch & 0x0f) &&
@@ -201,14 +201,19 @@ SSE_InterleaveUV(uint8_t *dst, size_t dst_pitch,
VLC_UNUSED(cpu);
#endif
- uint8_t const shuffle[] = { 0, 8,
- 1, 9,
- 2, 10,
- 3, 11,
- 4, 12,
- 5, 13,
- 6, 14,
- 7, 15 };
+ static const uint8_t shuffle_8[] = { 0, 8,
+ 1, 9,
+ 2, 10,
+ 3, 11,
+ 4, 12,
+ 5, 13,
+ 6, 14,
+ 7, 15 };
+ static const uint8_t shuffle_16[] = { 0, 1, 8, 9,
+ 2, 3, 10, 11,
+ 4, 5, 12, 13,
+ 6, 7, 14, 15 };
+ const uint8_t *shuffle = pixel_size == 1 ? shuffle_8 : shuffle_16;
for (unsigned int y = 0; y < height; ++y)
{
@@ -254,6 +259,7 @@ SSE_InterleaveUV(uint8_t *dst, size_t dst_pitch,
#endif
{
+ assert(pixel_size == 1);
for (x = 0; x < (width & ~31); x += 32)
asm volatile
(
@@ -279,10 +285,21 @@ SSE_InterleaveUV(uint8_t *dst, size_t dst_pitch,
#undef LOAD2X32
#undef STORE64
- for (; x < width; ++x)
+ if (pixel_size == 1)
{
- dst[2*x+0] = srcu[x];
- dst[2*x+1] = srcv[x];
+ for (; x < width; x++) {
+ dst[2*x+0] = srcu[x];
+ dst[2*x+1] = srcv[x];
+ }
+ }
+ else
+ {
+ for (; x < width; x+= 2) {
+ dst[2*x+0] = srcu[x];
+ dst[2*x+1] = srcu[x + 1];
+ dst[2*x+2] = srcv[x];
+ dst[2*x+3] = srcv[x + 1];
+ }
}
srcu += srcu_pitch;
srcv += srcv_pitch;
@@ -294,21 +311,15 @@ VLC_SSE
static void SSE_SplitUV(uint8_t *dstu, size_t dstu_pitch,
uint8_t *dstv, size_t dstv_pitch,
const uint8_t *src, size_t src_pitch,
- unsigned width, unsigned height, unsigned cpu)
+ unsigned width, unsigned height, uint8_t pixel_size,
+ unsigned cpu)
{
#if defined(__SSSE3__) || !defined (CAN_COMPILE_SSSE3)
VLC_UNUSED(cpu);
#endif
- const uint8_t shuffle[] = { 0, 2, 4, 6, 8, 10, 12, 14,
- 1, 3, 5, 7, 9, 11, 13, 15 };
- const uint8_t mask[] = { 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00,
- 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00 };
-
+ assert(pixel_size == 1 || pixel_size == 2);
assert(((intptr_t)src & 0xf) == 0 && (src_pitch & 0x0f) == 0);
- for (unsigned y = 0; y < height; y++) {
- unsigned x = 0;
-
#define LOAD64 \
"movdqa 0(%[src]), %%xmm0\n" \
"movdqa 16(%[src]), %%xmm1\n" \
@@ -326,9 +337,16 @@ static void SSE_SplitUV(uint8_t *dstu, size_t dstu_pitch,
"movhpd %%xmm3, 24(%[dst2])\n"
#ifdef CAN_COMPILE_SSSE3
- if (vlc_CPU_SSSE3())
- {
- for (x = 0; x < (width & ~31); x += 32) {
+ if (vlc_CPU_SSSE3())
+ {
+ static const uint8_t shuffle_8[] = { 0, 2, 4, 6, 8, 10, 12, 14,
+ 1, 3, 5, 7, 9, 11, 13, 15 };
+ static const uint8_t shuffle_16[] = { 0, 1, 4, 5, 8, 9, 12, 13,
+ 2, 3, 6, 7, 10, 11, 14, 15 };
+ const uint8_t *shuffle = pixel_size == 1 ? shuffle_8 : shuffle_16;
+ for (unsigned y = 0; y < height; y++) {
+ unsigned x = 0;
+ for (; x < (width & ~31); x += 32) {
asm volatile (
"movdqu (%[shuffle]), %%xmm7\n"
LOAD64
@@ -339,10 +357,37 @@ static void SSE_SplitUV(uint8_t *dstu, size_t dstu_pitch,
STORE2X32
: : [dst1]"r"(&dstu[x]), [dst2]"r"(&dstv[x]), [src]"r"(&src[2*x]), [shuffle]"r"(shuffle) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7");
}
- } else
+ if (pixel_size == 1)
+ {
+ for (; x < width; x++) {
+ dstu[x] = src[2*x+0];
+ dstv[x] = src[2*x+1];
+ }
+ }
+ else
+ {
+ for (; x < width; x+= 2) {
+ dstu[x] = src[2*x+0];
+ dstu[x+1] = src[2*x+1];
+ dstv[x] = src[2*x+2];
+ dstv[x+1] = src[2*x+3];
+ }
+ }
+ src += src_pitch;
+ dstu += dstu_pitch;
+ dstv += dstv_pitch;
+ }
+ } else
#endif
+ {
+ assert(pixel_size == 1);
+ static const uint8_t mask[] = { 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00,
+ 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00 };
+
+ for (unsigned y = 0; y < height; y++)
{
- for (x = 0; x < (width & ~31); x += 32) {
+ unsigned x = 0;
+ for (; x < (width & ~31); x += 32) {
asm volatile (
"movdqu (%[mask]), %%xmm7\n"
LOAD64
@@ -364,18 +409,17 @@ static void SSE_SplitUV(uint8_t *dstu, size_t dstu_pitch,
STORE2X32
: : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]), [src]"r"(&src[2*x]), [mask]"r"(mask) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
}
+ for (; x < width; x++) {
+ dstu[x] = src[2*x+0];
+ dstv[x] = src[2*x+1];
+ }
+ src += src_pitch;
+ dstu += dstu_pitch;
+ dstv += dstv_pitch;
}
+ }
#undef STORE2X32
#undef LOAD64
-
- for (; x < width; x++) {
- dstu[x] = src[2*x+0];
- dstv[x] = src[2*x+1];
- }
- src += src_pitch;
- dstu += dstu_pitch;
- dstv += dstv_pitch;
- }
}
static void SSE_CopyPlane(uint8_t *dst, size_t dst_pitch,
@@ -414,8 +458,7 @@ SSE_InterleavePlanes(uint8_t *dst, size_t dst_pitch,
const uint8_t *srcu, size_t srcu_pitch,
const uint8_t *srcv, size_t srcv_pitch,
uint8_t *cache, size_t cache_size,
- unsigned int height,
- unsigned int cpu)
+ unsigned int height, uint8_t pixel_size, unsigned int cpu)
{
assert(srcu_pitch == srcv_pitch);
unsigned int const w16 = (srcu_pitch+15) & ~15;
@@ -434,7 +477,8 @@ SSE_InterleavePlanes(uint8_t *dst, size_t dst_pitch,
/* Copy from our cache to the destination */
SSE_InterleaveUV(dst, dst_pitch, cache, w16,
- cache+w16*hblock, w16, srcu_pitch, hblock, cpu);
+ cache+w16*hblock, w16, srcu_pitch, hblock, pixel_size,
+ cpu);
/* */
srcu += hblock * srcu_pitch;
@@ -447,7 +491,7 @@ static void SSE_SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
uint8_t *dstv, size_t dstv_pitch,
const uint8_t *src, size_t src_pitch,
uint8_t *cache, size_t cache_size,
- unsigned height, unsigned cpu)
+ unsigned height, uint8_t pixel_size, unsigned cpu)
{
const unsigned w16 = (src_pitch+15) & ~15;
const unsigned hstep = cache_size / w16;
@@ -462,7 +506,7 @@ static void SSE_SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
/* Copy from our cache to the destination */
SSE_SplitUV(dstu, dstu_pitch, dstv, dstv_pitch,
- cache, w16, src_pitch / 2, hblock, cpu);
+ cache, w16, src_pitch / 2, hblock, pixel_size, cpu);
/* */
src += src_pitch * hblock;
@@ -504,7 +548,8 @@ static void SSE_Copy420_SP_to_SP(picture_t *dst, const uint8_t *src[static 2],
static void
SSE_Copy420_SP_to_P(picture_t *dest, const uint8_t *src[static 2],
const size_t src_pitch[static 2], unsigned int height,
- const copy_cache_t *cache, unsigned int cpu)
+ const copy_cache_t *cache, uint8_t pixel_size,
+ unsigned int cpu)
{
SSE_CopyPlane(dest->p[0].p_pixels, dest->p[0].i_pitch,
src[0], src_pitch[0], cache->buffer, cache->size,
@@ -512,14 +557,14 @@ SSE_Copy420_SP_to_P(picture_t *dest, const uint8_t *src[static 2],
SSE_SplitPlanes(dest->p[1].p_pixels, dest->p[1].i_pitch,
dest->p[2].p_pixels, dest->p[2].i_pitch,
src[1], src_pitch[1], cache->buffer, cache->size,
- height / 2, cpu);
+ height / 2, pixel_size, cpu);
asm volatile ("emms");
}
static void SSE_Copy420_P_to_SP(picture_t *dst, const uint8_t *src[static 3],
const size_t src_pitch[static 3],
unsigned height, const copy_cache_t *cache,
- unsigned cpu)
+ uint8_t pixel_size, unsigned cpu)
{
SSE_CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
src[0], src_pitch[0],
@@ -528,7 +573,7 @@ static void SSE_Copy420_P_to_SP(picture_t *dst, const uint8_t *src[static 3],
SSE_InterleavePlanes(dst->p[1].p_pixels, dst->p[1].i_pitch,
src[U_PLANE], src_pitch[U_PLANE],
src[V_PLANE], src_pitch[V_PLANE],
- cache->buffer, cache->size, height / 2, cpu);
+ cache->buffer, cache->size, height / 2, pixel_size, cpu);
asm volatile ("emms");
}
#undef COPY64
@@ -548,22 +593,6 @@ static void CopyPlane(uint8_t *dst, size_t dst_pitch,
}
}
-static void SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
- uint8_t *dstv, size_t dstv_pitch,
- const uint8_t *src, size_t src_pitch,
- unsigned height)
-{
- for (unsigned y = 0; y < height; y++) {
- for (unsigned x = 0; x < src_pitch / 2; x++) {
- dstu[x] = src[2*x+0];
- dstv[x] = src[2*x+1];
- }
- src += src_pitch;
- dstu += dstu_pitch;
- dstv += dstv_pitch;
- }
-}
-
void Copy420_SP_to_SP(picture_t *dst, const uint8_t *src[static 2],
const size_t src_pitch[static 2], unsigned height,
const copy_cache_t *cache)
@@ -584,6 +613,32 @@ void Copy420_SP_to_SP(picture_t *dst, const uint8_t *src[static 2],
src[1], src_pitch[1], height/2);
}
+#define SPLIT_PLANES(type, pitch_den) do { \
+ for (unsigned y = 0; y < height; y++) { \
+ for (unsigned x = 0; x < src_pitch / pitch_den; x++) { \
+ ((type *) dstu)[x] = ((const type *) src)[2*x+0]; \
+ ((type *) dstv)[x] = ((const type *) src)[2*x+1]; \
+ } \
+ src += src_pitch; \
+ dstu += dstu_pitch; \
+ dstv += dstv_pitch; \
+ } \
+} while(0)
+
+static void SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
+ uint8_t *dstv, size_t dstv_pitch,
+ const uint8_t *src, size_t src_pitch, unsigned height)
+{
+ SPLIT_PLANES(uint8_t, 2);
+}
+
+static void SplitPlanes16(uint8_t *dstu, size_t dstu_pitch,
+ uint8_t *dstv, size_t dstv_pitch,
+ const uint8_t *src, size_t src_pitch, unsigned height)
+{
+ SPLIT_PLANES(uint16_t, 4);
+}
+
void Copy420_SP_to_P(picture_t *dst, const uint8_t *src[static 2],
const size_t src_pitch[static 2], unsigned height,
const copy_cache_t *cache)
@@ -593,7 +648,7 @@ void Copy420_SP_to_P(picture_t *dst, const uint8_t *src[static 2],
unsigned cpu = vlc_CPU();
if (vlc_CPU_SSE2())
- return SSE_Copy420_SP_to_P(dst, src, src_pitch, height, cache, cpu);
+ return SSE_Copy420_SP_to_P(dst, src, src_pitch, height, cache, 1, cpu);
#else
VLC_UNUSED(cache);
#endif
@@ -605,6 +660,39 @@ void Copy420_SP_to_P(picture_t *dst, const uint8_t *src[static 2],
src[1], src_pitch[1], height/2);
}
+void Copy420_16_SP_to_P(picture_t *dst, const uint8_t *src[static 2],
+ const size_t src_pitch[static 2], unsigned height,
+ const copy_cache_t *cache)
+{
+ ASSERT_2PLANES;
+#ifdef CAN_COMPILE_SSE3
+ unsigned cpu = vlc_CPU();
+
+ if (vlc_CPU_SSE3())
+ return SSE_Copy420_SP_to_P(dst, src, src_pitch, height, cache, 2, cpu);
+#else
+ VLC_UNUSED(cache);
+#endif
+
+ CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
+ src[0], src_pitch[0], height);
+ SplitPlanes16(dst->p[1].p_pixels, dst->p[1].i_pitch,
+ dst->p[2].p_pixels, dst->p[2].i_pitch,
+ src[1], src_pitch[1], height/2);
+}
+
+#define INTERLEAVE_UV() do { \
+ for ( unsigned int line = 0; line < copy_lines; line++ ) { \
+ for ( unsigned int col = 0; col < copy_pitch; col++ ) { \
+ *dstUV++ = *srcU++; \
+ *dstUV++ = *srcV++; \
+ } \
+ dstUV += i_extra_pitch_uv; \
+ srcU += i_extra_pitch_u; \
+ srcV += i_extra_pitch_v; \
+ } \
+}while(0)
+
void Copy420_P_to_SP(picture_t *dst, const uint8_t *src[static 3],
const size_t src_pitch[static 3], unsigned height,
const copy_cache_t *cache)
@@ -613,7 +701,7 @@ void Copy420_P_to_SP(picture_t *dst, const uint8_t *src[static 3],
#ifdef CAN_COMPILE_SSE2
unsigned cpu = vlc_CPU();
if (vlc_CPU_SSE2())
- return SSE_Copy420_P_to_SP(dst, src, src_pitch, height, cache, cpu);
+ return SSE_Copy420_P_to_SP(dst, src, src_pitch, height, cache, 1, cpu);
#else
(void) cache;
#endif
@@ -631,17 +719,36 @@ void Copy420_P_to_SP(picture_t *dst, const uint8_t *src[static 3],
uint8_t *dstUV = dst->p[1].p_pixels;
const uint8_t *srcU = src[U_PLANE];
const uint8_t *srcV = src[V_PLANE];
- for ( unsigned int line = 0; line < copy_lines; line++ )
- {
- for ( unsigned int col = 0; col < copy_pitch; col++ )
- {
- *dstUV++ = *srcU++;
- *dstUV++ = *srcV++;
- }
- dstUV += i_extra_pitch_uv;
- srcU += i_extra_pitch_u;
- srcV += i_extra_pitch_v;
- }
+ INTERLEAVE_UV();
+}
+
+void Copy420_16_P_to_SP(picture_t *dst, const uint8_t *src[static 3],
+ const size_t src_pitch[static 3], unsigned height,
+ const copy_cache_t *cache)
+{
+ ASSERT_3PLANES;
+#ifdef CAN_COMPILE_SSE2
+ unsigned cpu = vlc_CPU();
+ if (vlc_CPU_SSE3())
+ return SSE_Copy420_P_to_SP(dst, src, src_pitch, height, cache, 2, cpu);
+#else
+ (void) cache;
+#endif
+
+ CopyPlane(dst->p[0].p_pixels, dst->p[0].i_pitch,
+ src[0], src_pitch[0], height);
+
+ const unsigned copy_lines = height / 2;
+ const unsigned copy_pitch = src_pitch[1] / 2;
+
+ const int i_extra_pitch_uv = dst->p[1].i_pitch / 2 - 2 * copy_pitch;
+ const int i_extra_pitch_u = src_pitch[U_PLANE] / 2 - copy_pitch;
+ const int i_extra_pitch_v = src_pitch[V_PLANE] / 2 - copy_pitch;
+
+ uint16_t *dstUV = (void*) dst->p[1].p_pixels;
+ const uint16_t *srcU = (const uint16_t *) src[U_PLANE];
+ const uint16_t *srcV = (const uint16_t *) src[V_PLANE];
+ INTERLEAVE_UV();
}
void CopyFromI420_10ToP010(picture_t *dst, const uint8_t *src[static 3],
diff --git a/modules/video_chroma/copy.h b/modules/video_chroma/copy.h
index 77278615d5..f332f6d429 100644
--- a/modules/video_chroma/copy.h
+++ b/modules/video_chroma/copy.h
@@ -56,6 +56,13 @@ void Copy420_SP_to_P(picture_t *dst, const uint8_t *src[static 2],
const size_t src_pitch[static 2], unsigned height,
const copy_cache_t *cache);
+void Copy420_16_P_to_SP(picture_t *dst, const uint8_t *src[static 3],
+ const size_t src_pitch[static 3], unsigned height,
+ const copy_cache_t *cache);
+
+void Copy420_16_SP_to_P(picture_t *dst, const uint8_t *src[static 2],
+ const size_t src_pitch[static 2], unsigned height,
+ const copy_cache_t *cache);
/* XXX: Not optimized copy (no SEE) */
void CopyFromI420_10ToP010(picture_t *dst, const uint8_t *src[static 3],
More information about the vlc-commits mailing list