[vlc-commits] nvdec: output a custom NVDEC opaque format

Wed Sep 18 09:05:24 CEST 2019

vlc | branch: master | Steve Lhomme <robux4 at ycbcr.xyz> | Mon Sep  9 08:16:46 2019 +0200| [f05a6755d8e78103cbed5202a560771c7e914b4f] | committer: Steve Lhomme

nvdec: output a custom NVDEC opaque format

> http://git.videolan.org/gitweb.cgi/vlc.git/?a=commit;h=f05a6755d8e78103cbed5202a560771c7e914b4f
---

 modules/hw/nvdec/Makefile.am |   3 +-
 modules/hw/nvdec/nvdec.c     | 191 ++++++++++++++++++++++++++++++++++++++++++-
 modules/hw/nvdec/nvdec_fmt.h |  44 ++++++++++
 3 files changed, 235 insertions(+), 3 deletions(-)

diff --git a/modules/hw/nvdec/Makefile.am b/modules/hw/nvdec/Makefile.am
index b2f68e2562..621f876669 100644
--- a/modules/hw/nvdec/Makefile.am
+++ b/modules/hw/nvdec/Makefile.am
@@ -4,7 +4,8 @@ libnvdec_plugin_la_SOURCES = \
 	hw/nvdec/nvdec.c codec/hxxx_helper.c codec/hxxx_helper.h \
 	packetizer/hxxx_nal.h packetizer/hxxx_nal.c \
 	packetizer/h264_nal.c packetizer/h264_nal.h \
-	packetizer/hevc_nal.c packetizer/hevc_nal.h
+	packetizer/hevc_nal.c packetizer/hevc_nal.h \
+	hw/nvdec/nvdec_fmt.h
 libnvdec_plugin_la_LIBADD = $(LIBDL)
 if HAVE_NVDEC
 codec_LTLIBRARIES += libnvdec_plugin.la
diff --git a/modules/hw/nvdec/nvdec.c b/modules/hw/nvdec/nvdec.c
index 3c398b4814..cf36bf539c 100644
--- a/modules/hw/nvdec/nvdec.c
+++ b/modules/hw/nvdec/nvdec.c
@@ -28,12 +28,14 @@
 #include <vlc_plugin.h>
 #include <vlc_codec.h>
 #include <vlc_messages.h>
+#include <vlc_picture_pool.h>
 
 #define FFNV_LOG_FUNC(logctx, msg, ...)        msg_Err((vlc_object_t*)logctx, msg, __VA_ARGS__)
 #define FFNV_DEBUG_LOG_FUNC(logctx, msg, ...)  msg_Dbg((vlc_object_t*)logctx, msg, __VA_ARGS__)
 
 #include <ffnvcodec/dynlink_loader.h>
 #include "../../codec/hxxx_helper.h"
+#include "nvdec_fmt.h"
 
 static int OpenDecoder(vlc_object_t *);
 static void CloseDecoder(vlc_object_t *);
@@ -67,6 +69,7 @@ vlc_module_end ()
 /* */
 #define MAX_HXXX_SURFACES (16 + 1)
 #define NVDEC_DISPLAY_SURFACES 1
+#define MAX_POOL_SIZE     4 // number of in-flight buffers, if more are needed the decoder waits
 
 #define OUTPUT_WIDTH_ALIGN   16
 
@@ -89,6 +92,10 @@ typedef struct nvdec_ctx {
     // NVDEC doesn't stop even if HandleVideoSequence fails
     bool                        b_nvparser_success;
     size_t                      decoderHeight;
+
+    CUdeviceptr                 outputDevicePtr[MAX_POOL_SIZE];
+    unsigned int                outputPitch;
+    picture_pool_t              *out_pool;
 } nvdec_ctx_t;
 
 static inline int CudaCall(decoder_t *p_dec, CUresult result, const char *psz_func)
@@ -128,9 +135,12 @@ static vlc_fourcc_t MapSurfaceChroma(cudaVideoChromaFormat chroma, unsigned bitD
 static cudaVideoSurfaceFormat MapSurfaceFmt(int i_vlc_fourcc)
 {
     switch (i_vlc_fourcc) {
+        case VLC_CODEC_NVDEC_OPAQUE_10B:
+        case VLC_CODEC_NVDEC_OPAQUE_16B:
         case VLC_CODEC_P010:
         case VLC_CODEC_P016:
             return cudaVideoSurfaceFormat_P016;
+        case VLC_CODEC_NVDEC_OPAQUE:
         case VLC_CODEC_NV12:
             return cudaVideoSurfaceFormat_NV12;
         // case VLC_CODEC_I444:
@@ -146,7 +156,10 @@ static int CUtoFMT(video_format_t *fmt, const CUVIDEOFORMAT *p_format)
     // bit depth and chroma
     unsigned int i_bpp = p_format->bit_depth_luma_minus8 + 8;
     vlc_fourcc_t i_chroma;
-    i_chroma = MapSurfaceChroma(p_format->chroma_format, i_bpp);
+    if (is_nvdec_opaque(fmt->i_chroma))
+        i_chroma = fmt->i_chroma;
+    else
+        i_chroma = MapSurfaceChroma(p_format->chroma_format, i_bpp);
     if (i_chroma == 0)
         return VLC_EGENERIC;
 
@@ -170,6 +183,21 @@ static int CUDAAPI HandleVideoSequence(void *p_opaque, CUVIDEOFORMAT *p_format)
     nvdec_ctx_t *p_sys = p_dec->p_sys;
     int ret;
 
+    if ( is_nvdec_opaque(p_dec->fmt_out.video.i_chroma) )
+    {
+        for (size_t i=0; i < ARRAY_SIZE(p_sys->outputDevicePtr); i++)
+        {
+            CALL_CUDA_DEC(cuMemFree, p_sys->outputDevicePtr[i]);
+            p_sys->outputDevicePtr[i] = 0;
+        }
+
+        if (p_sys->out_pool)
+        {
+            picture_pool_Release(p_sys->out_pool);
+            p_sys->out_pool = NULL;
+        }
+    }
+
     // update vlc's output format using NVDEC parser's output
     ret = CUtoFMT(&p_dec->fmt_out.video, p_format);
     if (ret != VLC_SUCCESS)
@@ -207,6 +235,63 @@ static int CUDAAPI HandleVideoSequence(void *p_opaque, CUVIDEOFORMAT *p_format)
         goto error;
 
     // ensure the output surfaces have the same pitch so copies can work properly
+    if ( is_nvdec_opaque(p_dec->fmt_out.video.i_chroma) )
+    {
+        // get the real decoder pitch
+        CUdeviceptr frameDevicePtr = 0;
+        CUVIDPROCPARAMS params = {
+            .progressive_frame = 1,
+            .top_field_first = 1,
+        };
+        ret = CALL_CUVID( cuvidMapVideoFrame, p_sys->cudecoder, 0, &frameDevicePtr, &p_sys->outputPitch, &params );
+        if (ret != VLC_SUCCESS)
+            goto error;
+        CALL_CUVID(cuvidUnmapVideoFrame, p_sys->cudecoder, frameDevicePtr);
+
+        unsigned int ByteWidth = p_sys->outputPitch;
+        unsigned int Height = p_dec->fmt_out.video.i_height;
+        Height += Height / 2;
+
+        picture_t *pics[ARRAY_SIZE(p_sys->outputDevicePtr)];
+        for (size_t i=0; i < ARRAY_SIZE(p_sys->outputDevicePtr); i++)
+        {
+            ret = CALL_CUDA_DEC(cuMemAlloc, &p_sys->outputDevicePtr[i], ByteWidth * Height);
+            if (ret != VLC_SUCCESS || p_sys->outputDevicePtr[i] == 0)
+                goto clean_pics;
+            picture_resource_t res = {
+                .p_sys = (void*)(uintptr_t)i,
+            };
+            pics[i] = picture_NewFromResource( &p_dec->fmt_out.video, &res );
+            if (unlikely(pics[i] == NULL))
+            {
+                msg_Dbg(p_dec, "failed to get a picture for the buffer");
+                ret = VLC_ENOMEM;
+                goto clean_pics;
+            }
+            continue;
+clean_pics:
+            if (p_sys->outputDevicePtr[i])
+            {
+                CALL_CUDA_DEC(cuMemFree, p_sys->outputDevicePtr[i]);
+                p_sys->outputDevicePtr[i] = 0;
+            }
+            if (i > 0)
+            {
+                while (i--)
+                {
+                    picture_Release(pics[i]);
+                    CALL_CUDA_DEC(cuMemFree, p_sys->outputDevicePtr[i]);
+                    p_sys->outputDevicePtr[i] = 0;
+                }
+            }
+            break;
+        }
+        if (ret != VLC_SUCCESS)
+            goto error;
+
+        p_sys->out_pool = picture_pool_New( ARRAY_SIZE(p_sys->outputDevicePtr), pics );
+    }
+
     p_sys->decoderHeight = p_format->coded_height;
 
     CALL_CUDA_DEC(cuCtxPopCurrent, NULL);
@@ -234,6 +319,28 @@ static int CUDAAPI HandlePictureDecode(void *p_opaque, CUVIDPICPARAMS *p_picpara
     return (ret == VLC_SUCCESS);
 }
 
+static void NVDecCtxDestroy(struct picture_context_t *picctx)
+{
+    /* picctx is the ctx member embedded in a pic_context_nvdec_t: free that wrapper */
+    free(container_of(picctx, pic_context_nvdec_t, ctx));
+}
+
+static struct picture_context_t *NVDecCtxClone(struct picture_context_t *srcctx)
+{
+    /* shallow clone: the new context refers to the same devidePtr as srcctx */
+    pic_context_nvdec_t *p_src = container_of(srcctx, pic_context_nvdec_t, ctx);
+    pic_context_nvdec_t *p_copy = malloc(sizeof(*p_copy));
+    if (unlikely(p_copy == NULL))
+        return NULL;
+    p_copy->ctx.destroy  = NVDecCtxDestroy;
+    p_copy->ctx.copy     = NVDecCtxClone;
+    p_copy->bufferHeight = p_src->bufferHeight;
+    p_copy->bufferPitch  = p_src->bufferPitch;
+    p_copy->devidePtr    = p_src->devidePtr;
+    return &p_copy->ctx;
+}
+
+
 static int CUDAAPI HandlePictureDisplay(void *p_opaque, CUVIDPARSERDISPINFO *p_dispinfo)
 {
     decoder_t *p_dec = (decoder_t *) p_opaque;
@@ -248,6 +355,68 @@ static int CUDAAPI HandlePictureDisplay(void *p_opaque, CUVIDPARSERDISPINFO *p_d
     };
     int result;
 
+    if ( is_nvdec_opaque(p_dec->fmt_out.video.i_chroma) )
+    {
+        p_pic = picture_pool_Wait(p_sys->out_pool);
+        if (unlikely(p_pic == NULL))
+            return 0;
+
+        result = CALL_CUDA_DEC(cuCtxPushCurrent, p_sys->cuCtx);
+        if (unlikely(result != VLC_SUCCESS))
+        {
+            picture_Release(p_pic);
+            return 0;
+        }
+
+        unsigned int i_pitch;
+
+        // Map decoded frame to a device pointer
+        result = CALL_CUVID( cuvidMapVideoFrame, p_sys->cudecoder, p_dispinfo->picture_index,
+                            &frameDevicePtr, &i_pitch, &params );
+        if (result != VLC_SUCCESS)
+            goto error;
+
+        // put a new context in the output picture
+        pic_context_nvdec_t *picctx = malloc(sizeof(*picctx));
+        if (unlikely(picctx == NULL))
+            goto error;
+        picctx->ctx.destroy = NVDecCtxDestroy;
+        picctx->ctx.copy = NVDecCtxClone;
+        uintptr_t pool_idx = (uintptr_t)p_pic->p_sys;
+        picctx->devidePtr = p_sys->outputDevicePtr[pool_idx];
+        picctx->bufferPitch = p_sys->outputPitch;
+        picctx->bufferHeight = p_sys->decoderHeight;
+
+        size_t srcY = 0;
+        size_t dstY = 0;
+        for (int i_plane = 0; i_plane < 2; i_plane++) {
+            CUDA_MEMCPY2D cu_cpy = {
+                .srcMemoryType  = CU_MEMORYTYPE_DEVICE,
+                .srcDevice      = frameDevicePtr,
+                .srcY           = srcY,
+                .srcPitch       = i_pitch,
+                .dstMemoryType  = CU_MEMORYTYPE_DEVICE,
+                .dstDevice      = picctx->devidePtr,
+                .dstPitch       = picctx->bufferPitch,
+                .dstY           = dstY,
+                .WidthInBytes   = i_pitch,
+                .Height         = __MIN(picctx->bufferHeight, p_dec->fmt_out.video.i_y_offset + p_dec->fmt_out.video.i_visible_height),
+            };
+            if (i_plane == 1)
+                cu_cpy.Height >>= 1;
+            result = CALL_CUDA_DEC(cuMemcpy2DAsync, &cu_cpy, 0);
+            if (unlikely(result != VLC_SUCCESS))
+            {
+                free(picctx);
+                goto error;
+            }
+            srcY += picctx->bufferHeight;
+            dstY += p_sys->decoderHeight;
+        }
+        p_pic->context = &picctx->ctx;
+    }
+    else
+    {
     p_pic = decoder_NewPicture(p_dec);
     if (unlikely(p_pic == NULL))
         return 0;
@@ -287,6 +456,7 @@ static int CUDAAPI HandlePictureDisplay(void *p_opaque, CUVIDPARSERDISPINFO *p_d
             goto error;
          srcY += p_sys->decoderHeight;
     }
+    }
 
     // Release surface on GPU
     result = CALL_CUVID(cuvidUnmapVideoFrame, p_sys->cudecoder, frameDevicePtr);
@@ -313,6 +483,8 @@ static int CUDAAPI HandlePictureDisplay(void *p_opaque, CUVIDPARSERDISPINFO *p_d
     return 1;
 
 error:
+    if (frameDevicePtr)
+        CALL_CUVID(cuvidUnmapVideoFrame, p_sys->cudecoder, frameDevicePtr);
     CALL_CUDA_DEC(cuCtxPopCurrent, NULL);
     if (p_pic)
         picture_Release(p_pic);
@@ -688,8 +860,18 @@ static int OpenDecoder(vlc_object_t *p_this)
         goto error;
     }
 
-    vlc_fourcc_t output_chromas[2];
+    vlc_fourcc_t output_chromas[3];
     size_t chroma_idx = 0;
+    if (cudaChroma == cudaVideoChromaFormat_420)
+    {
+        if (i_depth_luma >= 16)
+            output_chromas[chroma_idx++] = VLC_CODEC_NVDEC_OPAQUE_16B;
+        else if (i_depth_luma >= 10)
+            output_chromas[chroma_idx++] = VLC_CODEC_NVDEC_OPAQUE_10B;
+        else
+            output_chromas[chroma_idx++] = VLC_CODEC_NVDEC_OPAQUE;
+    }
+
     output_chromas[chroma_idx++] = MapSurfaceChroma(cudaChroma, i_depth_luma);
     output_chromas[chroma_idx++] = 0;
 
@@ -733,6 +915,11 @@ static void CloseDecoder(vlc_object_t *p_this)
     nvdec_ctx_t *p_sys = p_dec->p_sys;
     CALL_CUDA_DEC(cuCtxPushCurrent, p_sys->cuCtx);
     CALL_CUDA_DEC(cuCtxPopCurrent, NULL);
+
+    for (size_t i=0; i < ARRAY_SIZE(p_sys->outputDevicePtr); i++)
+        CALL_CUDA_DEC(cuMemFree, p_sys->outputDevicePtr[i]);
+    if (p_sys->out_pool)
+        picture_pool_Release(p_sys->out_pool);
     if (p_sys->cudecoder)
         CALL_CUVID(cuvidDestroyDecoder, p_sys->cudecoder);
     if (p_sys->cuparser)
diff --git a/modules/hw/nvdec/nvdec_fmt.h b/modules/hw/nvdec/nvdec_fmt.h
new file mode 100644
index 0000000000..bc0a2f6b88
--- /dev/null
+++ b/modules/hw/nvdec/nvdec_fmt.h
@@ -0,0 +1,44 @@
+/*****************************************************************************
+ * nvdec_fmt.h : NVDEC common code
+ *****************************************************************************
+ * Copyright © 2019 VLC authors, VideoLAN and VideoLabs
+ *
+ * Authors: Steve Lhomme <robux4 at videolabs.io>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
+ *****************************************************************************/
+
+#ifndef VLC_VIDEOCHROMA_NVDEC_FMT_H_
+#define VLC_VIDEOCHROMA_NVDEC_FMT_H_
+
+#include <ffnvcodec/dynlink_loader.h>
+
+static inline bool is_nvdec_opaque(vlc_fourcc_t fourcc)
+{
+    /* the plain opaque chroma and its 10B/16B variants all count as opaque */
+    const bool b_deep = fourcc == VLC_CODEC_NVDEC_OPAQUE_10B || fourcc == VLC_CODEC_NVDEC_OPAQUE_16B;
+    return b_deep || fourcc == VLC_CODEC_NVDEC_OPAQUE;
+}
+
+/* picture context attached to VLC_CODEC_NVDEC_OPAQUE, _OPAQUE_10B and _OPAQUE_16B pictures */
+typedef struct
+{
+    picture_context_t ctx;      /* embedded base context; container_of() maps it back to this struct */
+    CUdeviceptr  devidePtr;     /* device buffer the decoded planes were copied into */
+    unsigned int bufferPitch;   /* pitch of that buffer; the decoder fills it from its outputPitch */
+    unsigned int bufferHeight;  /* luma plane height in rows; the decoder fills it from the coded height */
+} pic_context_nvdec_t;
+
+#endif /* include-guard */