[vlc-devel] [PATCH 3/6] copy: remove need for cache memory in SSE routines
Jean-Yves Avenard
jyavenard at gmail.com
Fri Jun 13 14:02:38 CEST 2014
From: Jean-Yves Avenard <jyavenard at mythtv.org>
The SSE code used a 16-byte-aligned intermediate buffer to perform its memory operations.
Rewrite the code to work directly on the source and destination frames, regardless of memory alignment.
This speeds up NV12->YV12 conversion by over 100%, as it halves the number of memory reads and writes.
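For readers who want the gist before the diff, here is a rough sketch of the approach using SSE2 intrinsics rather than the patch's inline asm (the function name copy_plane_direct is made up for illustration; the actual patch copies 64 bytes per iteration via COPY64 and, when SSE4.1 is available, uses movntdqa streaming loads from USWC memory). The point is that each row is copied straight from source to destination, choosing aligned or unaligned 16-byte accesses per row, instead of first staging the data in a separate aligned cache buffer:

#include <stdint.h>
#include <stddef.h>
#include <emmintrin.h>          /* SSE2 */

static void copy_plane_direct(uint8_t *dst, size_t dst_pitch,
                              const uint8_t *src, size_t src_pitch,
                              unsigned width, unsigned height)
{
    for (unsigned y = 0; y < height; y++) {
        /* Bytes needed to reach the next 16-byte boundary of the source. */
        unsigned head = (-(uintptr_t)src) & 0x0f;
        if (head > width)
            head = width;

        unsigned x = 0;
        for (; x < head; x++)           /* unaligned head, byte by byte */
            dst[x] = src[x];

        /* Once the source is aligned, check whether the destination is too,
         * so non-temporal stores can be used and the CPU cache bypassed. */
        int dst_aligned = (((uintptr_t)&dst[x]) & 0x0f) == 0;

        if (dst_aligned) {
            for (; x + 15 < width; x += 16) {
                __m128i v = _mm_load_si128((const __m128i *)&src[x]);
                _mm_stream_si128((__m128i *)&dst[x], v);   /* streaming store */
            }
        } else {
            for (; x + 15 < width; x += 16) {
                __m128i v = _mm_load_si128((const __m128i *)&src[x]);
                _mm_storeu_si128((__m128i *)&dst[x], v);   /* unaligned store */
            }
        }

        for (; x < width; x++)          /* tail */
            dst[x] = src[x];

        src += src_pitch;
        dst += dst_pitch;
    }
    _mm_sfence();   /* make the streaming stores globally visible */
}

Compared with the previous CopyFromUswc + Copy2d pair, every byte is read and written once instead of twice, which is where the measured speed-up comes from.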
---
modules/video_chroma/copy.c | 255 ++++++++++++++++++++------------------------
1 file changed, 114 insertions(+), 141 deletions(-)
diff --git a/modules/video_chroma/copy.c b/modules/video_chroma/copy.c
index d29843c..3c907a4 100644
--- a/modules/video_chroma/copy.c
+++ b/modules/video_chroma/copy.c
@@ -2,9 +2,11 @@
* copy.c: Fast YV12/NV12 copy
*****************************************************************************
* Copyright (C) 2010 Laurent Aimar
+ * Copyright (C) 2014 Jean-Yves Avenard
* $Id$
*
* Authors: Laurent Aimar <fenrir _AT_ videolan _DOT_ org>
+ * Authors: Jean-Yves Avenard <jyavenard _AT_ mythtv _DOT_ org>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by
@@ -28,7 +30,6 @@
#include <vlc_common.h>
#include <vlc_picture.h>
#include <vlc_cpu.h>
-#include <assert.h>
#include "copy.h"
@@ -92,17 +93,19 @@ void CopyCleanCache(copy_cache_t *cache)
* XXX It is really efficient only when SSE4.1 is available.
*/
VLC_SSE
-static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
- const uint8_t *src, size_t src_pitch,
- unsigned width, unsigned height,
- unsigned cpu)
+static void SSE_CopyPlane(uint8_t *dst, size_t dst_pitch,
+ const uint8_t *src, size_t src_pitch,
+ uint8_t *cache, size_t cache_size,
+ unsigned width, unsigned height, unsigned cpu)
{
- assert(((intptr_t)dst & 0x0f) == 0 && (dst_pitch & 0x0f) == 0);
+ VLC_UNUSED(cache);
+ VLC_UNUSED(cache_size);
asm volatile ("mfence");
for (unsigned y = 0; y < height; y++) {
const unsigned unaligned = (-(uintptr_t)src) & 0x0f;
+ bool dstaligned = ((intptr_t)&dst[unaligned] & 0x0f) == 0;
unsigned x = 0;
for (; x < unaligned; x++)
@@ -110,7 +113,7 @@ static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
#ifdef CAN_COMPILE_SSE4_1
if (vlc_CPU_SSE4_1()) {
- if (!unaligned) {
+ if (dstaligned) {
for (; x+63 < width; x += 64)
COPY64(&dst[x], &src[x], "movntdqa", "movdqa");
} else {
@@ -120,9 +123,9 @@ static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
} else
#endif
{
- if (!unaligned) {
+ if (dstaligned) {
for (; x+63 < width; x += 64)
- COPY64(&dst[x], &src[x], "movdqa", "movdqa");
+ COPY64(&dst[x], &src[x], "movdqa", "movntdq");
} else {
for (; x+63 < width; x += 64)
COPY64(&dst[x], &src[x], "movdqa", "movdqu");
@@ -135,62 +138,38 @@ static void CopyFromUswc(uint8_t *dst, size_t dst_pitch,
src += src_pitch;
dst += dst_pitch;
}
-}
-
-VLC_SSE
-static void Copy2d(uint8_t *dst, size_t dst_pitch,
- const uint8_t *src, size_t src_pitch,
- unsigned width, unsigned height)
-{
- assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);
asm volatile ("mfence");
-
- for (unsigned y = 0; y < height; y++) {
- unsigned x = 0;
-
- bool unaligned = ((intptr_t)dst & 0x0f) != 0;
- if (!unaligned) {
- for (; x+63 < width; x += 64)
- COPY64(&dst[x], &src[x], "movdqa", "movntdq");
- } else {
- for (; x+63 < width; x += 64)
- COPY64(&dst[x], &src[x], "movdqa", "movdqu");
- }
-
- for (; x < width; x++)
- dst[x] = src[x];
-
- src += src_pitch;
- dst += dst_pitch;
- }
}
-VLC_SSE
-static void SSE_SplitUV(uint8_t *dstu, size_t dstu_pitch,
- uint8_t *dstv, size_t dstv_pitch,
- const uint8_t *src, size_t src_pitch,
- unsigned width, unsigned height, unsigned cpu)
+static void SSE_SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
+ uint8_t *dstv, size_t dstv_pitch,
+ const uint8_t *src, size_t src_pitch,
+ uint8_t *cache, size_t cache_size,
+ unsigned width, unsigned height, unsigned cpu)
{
- VLC_UNUSED(cpu);
+ VLC_UNUSED(cache);
+ VLC_UNUSED(cache_size);
+
const uint8_t shuffle[] = { 0, 2, 4, 6, 8, 10, 12, 14,
1, 3, 5, 7, 9, 11, 13, 15 };
const uint8_t mask[] = { 0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00,
0xff, 0x00, 0xff, 0x00, 0xff, 0x00, 0xff, 0x00 };
- assert(((intptr_t)src & 0x0f) == 0 && (src_pitch & 0x0f) == 0);
-
asm volatile ("mfence");
- for (unsigned y = 0; y < height; y++) {
- unsigned x = 0;
-
-#define LOAD64 \
+#define LOAD64A \
"movdqa 0(%[src]), %%xmm0\n" \
"movdqa 16(%[src]), %%xmm1\n" \
"movdqa 32(%[src]), %%xmm2\n" \
"movdqa 48(%[src]), %%xmm3\n"
+#define LOAD64U \
+ "movdqu 0(%[src]), %%xmm0\n" \
+ "movdqu 16(%[src]), %%xmm1\n" \
+ "movdqu 32(%[src]), %%xmm2\n" \
+ "movdqu 48(%[src]), %%xmm3\n"
+
#define STORE2X32 \
"movq %%xmm0, 0(%[dst1])\n" \
"movq %%xmm1, 8(%[dst1])\n" \
@@ -201,48 +180,95 @@ static void SSE_SplitUV(uint8_t *dstu, size_t dstu_pitch,
"movhpd %%xmm2, 16(%[dst2])\n" \
"movhpd %%xmm3, 24(%[dst2])\n"
-#ifdef CAN_COMPILE_SSSE3
- if (vlc_CPU_SSSE3())
+ for (unsigned y = 0; y < height; y++)
+ {
+ bool aligned = ((uintptr_t)src & 0xf) == 0;
+ unsigned x = 0;
+
+ if (aligned)
{
- for (x = 0; x < (width & ~31); x += 32) {
- asm volatile (
- "movdqu (%[shuffle]), %%xmm7\n"
- LOAD64
- "pshufb %%xmm7, %%xmm0\n"
- "pshufb %%xmm7, %%xmm1\n"
- "pshufb %%xmm7, %%xmm2\n"
- "pshufb %%xmm7, %%xmm3\n"
- STORE2X32
- : : [dst1]"r"(&dstu[x]), [dst2]"r"(&dstv[x]), [src]"r"(&src[2*x]), [shuffle]"r"(shuffle) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7");
- }
- } else
+#ifdef CAN_COMPILE_SSSE3
+ if (vlc_CPU_SSSE3()) {
+ for (x = 0; x < (width & ~31); x += 32) {
+ asm volatile (
+ "movdqu (%[shuffle]), %%xmm7\n"
+ LOAD64A
+ "pshufb %%xmm7, %%xmm0\n"
+ "pshufb %%xmm7, %%xmm1\n"
+ "pshufb %%xmm7, %%xmm2\n"
+ "pshufb %%xmm7, %%xmm3\n"
+ STORE2X32
+ : : [dst1]"r"(&dstu[x]), [dst2]"r"(&dstv[x]), [src]"r"(&src[2*x]), [shuffle]"r"(shuffle) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7");
+ }
+ } else
#endif
+ {
+ for (x = 0; x < (width & ~31); x += 32) {
+ asm volatile (
+ "movdqu (%[mask]), %%xmm7\n"
+ LOAD64A
+ "movdqa %%xmm0, %%xmm4\n"
+ "movdqa %%xmm1, %%xmm5\n"
+ "movdqa %%xmm2, %%xmm6\n"
+ "psrlw $8, %%xmm0\n"
+ "psrlw $8, %%xmm1\n"
+ "pand %%xmm7, %%xmm4\n"
+ "pand %%xmm7, %%xmm5\n"
+ "pand %%xmm7, %%xmm6\n"
+ "packuswb %%xmm4, %%xmm0\n"
+ "packuswb %%xmm5, %%xmm1\n"
+ "pand %%xmm3, %%xmm7\n"
+ "psrlw $8, %%xmm2\n"
+ "psrlw $8, %%xmm3\n"
+ "packuswb %%xmm6, %%xmm2\n"
+ "packuswb %%xmm7, %%xmm3\n"
+ STORE2X32
+ : : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]), [src]"r"(&src[2*x]), [mask]"r"(mask) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
+ }
+ }
+ }
+ else
{
- for (x = 0; x < (width & ~31); x += 32) {
- asm volatile (
- "movdqu (%[mask]), %%xmm7\n"
- LOAD64
- "movdqa %%xmm0, %%xmm4\n"
- "movdqa %%xmm1, %%xmm5\n"
- "movdqa %%xmm2, %%xmm6\n"
- "psrlw $8, %%xmm0\n"
- "psrlw $8, %%xmm1\n"
- "pand %%xmm7, %%xmm4\n"
- "pand %%xmm7, %%xmm5\n"
- "pand %%xmm7, %%xmm6\n"
- "packuswb %%xmm4, %%xmm0\n"
- "packuswb %%xmm5, %%xmm1\n"
- "pand %%xmm3, %%xmm7\n"
- "psrlw $8, %%xmm2\n"
- "psrlw $8, %%xmm3\n"
- "packuswb %%xmm6, %%xmm2\n"
- "packuswb %%xmm7, %%xmm3\n"
- STORE2X32
- : : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]), [src]"r"(&src[2*x]), [mask]"r"(mask) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
+#ifdef CAN_COMPILE_SSSE3
+ if (vlc_CPU_SSSE3()) {
+ for (x = 0; x < (width & ~31); x += 32) {
+ asm volatile (
+ "movdqu (%[shuffle]), %%xmm7\n"
+ LOAD64U
+ "pshufb %%xmm7, %%xmm0\n"
+ "pshufb %%xmm7, %%xmm1\n"
+ "pshufb %%xmm7, %%xmm2\n"
+ "pshufb %%xmm7, %%xmm3\n"
+ STORE2X32
+ : : [dst1]"r"(&dstu[x]), [dst2]"r"(&dstv[x]), [src]"r"(&src[2*x]), [shuffle]"r"(shuffle) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm7");
+ }
+ } else
+#endif
+ {
+ for (x = 0; x < (width & ~31); x += 32) {
+ asm volatile (
+ "movdqu (%[mask]), %%xmm7\n"
+ LOAD64U
+ "movdqu %%xmm0, %%xmm4\n"
+ "movdqu %%xmm1, %%xmm5\n"
+ "movdqu %%xmm2, %%xmm6\n"
+ "psrlw $8, %%xmm0\n"
+ "psrlw $8, %%xmm1\n"
+ "pand %%xmm7, %%xmm4\n"
+ "pand %%xmm7, %%xmm5\n"
+ "pand %%xmm7, %%xmm6\n"
+ "packuswb %%xmm4, %%xmm0\n"
+ "packuswb %%xmm5, %%xmm1\n"
+ "pand %%xmm3, %%xmm7\n"
+ "psrlw $8, %%xmm2\n"
+ "psrlw $8, %%xmm3\n"
+ "packuswb %%xmm6, %%xmm2\n"
+ "packuswb %%xmm7, %%xmm3\n"
+ STORE2X32
+ : : [dst2]"r"(&dstu[x]), [dst1]"r"(&dstv[x]), [src]"r"(&src[2*x]), [mask]"r"(mask) : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
+ }
}
}
-#undef STORE2X32
-#undef LOAD64
for (; x < width; x++) {
dstu[x] = src[2*x+0];
@@ -252,64 +278,11 @@ static void SSE_SplitUV(uint8_t *dstu, size_t dstu_pitch,
dstu += dstu_pitch;
dstv += dstv_pitch;
}
-}
-
-static void SSE_CopyPlane(uint8_t *dst, size_t dst_pitch,
- const uint8_t *src, size_t src_pitch,
- uint8_t *cache, size_t cache_size,
- unsigned width, unsigned height, unsigned cpu)
-{
- const unsigned w16 = (width+15) & ~15;
- const unsigned hstep = cache_size / w16;
- assert(hstep > 0);
-
- for (unsigned y = 0; y < height; y += hstep) {
- const unsigned hblock = __MIN(hstep, height - y);
-
- /* Copy a bunch of line into our cache */
- CopyFromUswc(cache, w16,
- src, src_pitch,
- width, hblock, cpu);
-
- /* Copy from our cache to the destination */
- Copy2d(dst, dst_pitch,
- cache, w16,
- width, hblock);
-
- /* */
- src += src_pitch * hblock;
- dst += dst_pitch * hblock;
- }
asm volatile ("mfence");
-}
-
-static void SSE_SplitPlanes(uint8_t *dstu, size_t dstu_pitch,
- uint8_t *dstv, size_t dstv_pitch,
- const uint8_t *src, size_t src_pitch,
- uint8_t *cache, size_t cache_size,
- unsigned width, unsigned height, unsigned cpu)
-{
- const unsigned w2_16 = (2*width+15) & ~15;
- const unsigned hstep = cache_size / w2_16;
- assert(hstep > 0);
-
- for (unsigned y = 0; y < height; y += hstep) {
- const unsigned hblock = __MIN(hstep, height - y);
-
- /* Copy a bunch of line into our cache */
- CopyFromUswc(cache, w2_16, src, src_pitch,
- 2*width, hblock, cpu);
- /* Copy from our cache to the destination */
- SSE_SplitUV(dstu, dstu_pitch, dstv, dstv_pitch,
- cache, w2_16, width, hblock, cpu);
-
- /* */
- src += src_pitch * hblock;
- dstu += dstu_pitch * hblock;
- dstv += dstv_pitch * hblock;
- }
- asm volatile ("mfence");
+#undef STORE2X32
+#undef LOAD64U
+#undef LOAD64A
}
static void SSE_CopyFromNv12(picture_t *dst,
--
1.9.1