[vlc-devel] [PATCH] nvdec: removed CPU chroma fallback

quentin.chateau at deepskycorp.com quentin.chateau at deepskycorp.com
Tue Mar 3 11:46:20 CET 2020


From: Quentin Chateau <quentin.chateau at deepskycorp.com>

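A GPU-to-CPU video filter is now available, so the decoder no longer needs
its own CPU chroma fallback: it always outputs NVDEC opaque chromas and the
device-to-host copy path is removed.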
---
 modules/hw/nvdec/nvdec.c     | 379 +++++++++++++----------------------
 modules/hw/nvdec/nvdec_fmt.h |   9 -
 modules/hw/nvdec/nvdec_gl.c  |   3 -
 3 files changed, 144 insertions(+), 247 deletions(-)

diff --git a/modules/hw/nvdec/nvdec.c b/modules/hw/nvdec/nvdec.c
index e5cda56ef3..72875c5ac7 100644
--- a/modules/hw/nvdec/nvdec.c
+++ b/modules/hw/nvdec/nvdec.c
@@ -113,14 +113,14 @@ static vlc_fourcc_t MapSurfaceChroma(cudaVideoChromaFormat chroma, unsigned bitD
     switch (chroma) {
         case cudaVideoChromaFormat_420:
             if (bitDepth <= 8)
-                return VLC_CODEC_NV12;
+                return VLC_CODEC_NVDEC_OPAQUE;
             if (bitDepth <= 10)
-                return VLC_CODEC_P010;
-            return VLC_CODEC_P016;
+                return VLC_CODEC_NVDEC_OPAQUE_10B;
+            return VLC_CODEC_NVDEC_OPAQUE_16B;
         case cudaVideoChromaFormat_444:
             if (bitDepth <= 8)
-                return VLC_CODEC_I444;
-            return VLC_CODEC_I444_16L;
+                return VLC_CODEC_NVDEC_OPAQUE_444;
+            return VLC_CODEC_NVDEC_OPAQUE_444_16B;
         default:
             return 0;
     }
@@ -131,19 +131,15 @@ static cudaVideoSurfaceFormat MapSurfaceFmt(int i_vlc_fourcc)
     switch (i_vlc_fourcc) {
         case VLC_CODEC_NVDEC_OPAQUE_10B:
         case VLC_CODEC_NVDEC_OPAQUE_16B:
-        case VLC_CODEC_P010:
-        case VLC_CODEC_P016:
             return cudaVideoSurfaceFormat_P016;
         case VLC_CODEC_NVDEC_OPAQUE:
-        case VLC_CODEC_NV12:
             return cudaVideoSurfaceFormat_NV12;
         case VLC_CODEC_NVDEC_OPAQUE_444:
-        case VLC_CODEC_I444:
             return cudaVideoSurfaceFormat_YUV444;
         case VLC_CODEC_NVDEC_OPAQUE_444_16B:
-        case VLC_CODEC_I444_16L:
-             return cudaVideoSurfaceFormat_YUV444_16Bit;
-        default:             vlc_assert_unreachable();
+            return cudaVideoSurfaceFormat_YUV444_16Bit;
+        default:
+            vlc_assert_unreachable();
     }
 }
 
@@ -151,15 +147,6 @@ static int CUtoFMT(video_format_t *fmt, const CUVIDEOFORMAT *p_format)
 {
     // bit depth and chroma
     unsigned int i_bpp = p_format->bit_depth_luma_minus8 + 8;
-    vlc_fourcc_t i_chroma;
-    if (is_nvdec_opaque(fmt->i_chroma))
-        i_chroma = fmt->i_chroma;
-    else
-        i_chroma = MapSurfaceChroma(p_format->chroma_format, i_bpp);
-    if (i_chroma == 0)
-        return VLC_EGENERIC;
-
-    fmt->i_chroma = i_chroma;
     // use the real padded size when we know it fmt->i_width = p_format->coded_width;
     fmt->i_height = p_format->coded_height;
     fmt->i_x_offset = p_format->display_area.left;
@@ -179,19 +166,16 @@ static int CUDAAPI HandleVideoSequence(void *p_opaque, CUVIDEOFORMAT *p_format)
     nvdec_ctx_t *p_sys = p_dec->p_sys;
     int ret;
 
-    if ( is_nvdec_opaque(p_dec->fmt_out.video.i_chroma) )
+    for (size_t i=0; i < ARRAY_SIZE(p_sys->outputDevicePtr); i++)
     {
-        for (size_t i=0; i < ARRAY_SIZE(p_sys->outputDevicePtr); i++)
-        {
-            CALL_CUDA_DEC(cuMemFree, p_sys->outputDevicePtr[i]);
-            p_sys->outputDevicePtr[i] = 0;
-        }
+        CALL_CUDA_DEC(cuMemFree, p_sys->outputDevicePtr[i]);
+        p_sys->outputDevicePtr[i] = 0;
+    }
 
-        if (p_sys->out_pool)
-        {
-            picture_pool_Release(p_sys->out_pool);
-            p_sys->out_pool = NULL;
-        }
+    if (p_sys->out_pool)
+    {
+        picture_pool_Release(p_sys->out_pool);
+        p_sys->out_pool = NULL;
     }
 
     // update vlc's output format using NVDEC parser's output
@@ -231,75 +215,71 @@ static int CUDAAPI HandleVideoSequence(void *p_opaque, CUVIDEOFORMAT *p_format)
         goto error;
 
     // ensure the output surfaces have the same pitch so copies can work properly
-    if ( is_nvdec_opaque(p_dec->fmt_out.video.i_chroma) )
+    // get the real decoder pitch
+    CUdeviceptr frameDevicePtr = 0;
+    CUVIDPROCPARAMS params = {
+        .progressive_frame = 1,
+        .top_field_first = 1,
+    };
+    ret = CALL_CUVID( cuvidMapVideoFrame, p_sys->cudecoder, 0, &frameDevicePtr, &p_sys->outputPitch, &params );
+    if (ret != VLC_SUCCESS)
+        goto error;
+    CALL_CUVID(cuvidUnmapVideoFrame, p_sys->cudecoder, frameDevicePtr);
+
+    unsigned int ByteWidth = p_sys->outputPitch;
+    unsigned int Height = p_dec->fmt_out.video.i_height;
+    switch (dparams.OutputFormat)
     {
-        // get the real decoder pitch
-        CUdeviceptr frameDevicePtr = 0;
-        CUVIDPROCPARAMS params = {
-            .progressive_frame = 1,
-            .top_field_first = 1,
-        };
-        ret = CALL_CUVID( cuvidMapVideoFrame, p_sys->cudecoder, 0, &frameDevicePtr, &p_sys->outputPitch, &params );
-        if (ret != VLC_SUCCESS)
-            goto error;
-        CALL_CUVID(cuvidUnmapVideoFrame, p_sys->cudecoder, frameDevicePtr);
+        case cudaVideoSurfaceFormat_YUV444:
+        case cudaVideoSurfaceFormat_YUV444_16Bit:
+            Height += 2 * Height; // 3 planes
+            break;
+        case cudaVideoSurfaceFormat_NV12:
+        case cudaVideoSurfaceFormat_P016:
+            Height += Height / 2; // U and V at quarter resolution
+            break;
+        default:
+            vlc_assert_unreachable();
+    }
 
-        unsigned int ByteWidth = p_sys->outputPitch;
-        unsigned int Height = p_dec->fmt_out.video.i_height;
-        switch (dparams.OutputFormat)
+    picture_t *pics[ARRAY_SIZE(p_sys->outputDevicePtr)];
+    for (size_t i=0; i < ARRAY_SIZE(p_sys->outputDevicePtr); i++)
+    {
+        ret = CALL_CUDA_DEC(cuMemAlloc, &p_sys->outputDevicePtr[i], ByteWidth * Height);
+        if (ret != VLC_SUCCESS || p_sys->outputDevicePtr[i] == 0)
+            goto clean_pics;
+        picture_resource_t res = {
+            .p_sys = (void*)(uintptr_t)i,
+        };
+        pics[i] = picture_NewFromResource( &p_dec->fmt_out.video, &res );
+        if (unlikely(pics[i] == NULL))
         {
-            case cudaVideoSurfaceFormat_YUV444:
-            case cudaVideoSurfaceFormat_YUV444_16Bit:
-                Height += 2 * Height; // 3 planes
-                break;
-            case cudaVideoSurfaceFormat_NV12:
-            case cudaVideoSurfaceFormat_P016:
-                Height += Height / 2; // U and V at quarter resolution
-                break;
-            default:
-                vlc_assert_unreachable();
+            msg_Dbg(p_dec, "failed to get a picture for the buffer");
+            ret = VLC_ENOMEM;
+            goto clean_pics;
         }
-
-        picture_t *pics[ARRAY_SIZE(p_sys->outputDevicePtr)];
-        for (size_t i=0; i < ARRAY_SIZE(p_sys->outputDevicePtr); i++)
-        {
-            ret = CALL_CUDA_DEC(cuMemAlloc, &p_sys->outputDevicePtr[i], ByteWidth * Height);
-            if (ret != VLC_SUCCESS || p_sys->outputDevicePtr[i] == 0)
-                goto clean_pics;
-            picture_resource_t res = {
-                .p_sys = (void*)(uintptr_t)i,
-            };
-            pics[i] = picture_NewFromResource( &p_dec->fmt_out.video, &res );
-            if (unlikely(pics[i] == NULL))
-            {
-                msg_Dbg(p_dec, "failed to get a picture for the buffer");
-                ret = VLC_ENOMEM;
-                goto clean_pics;
-            }
-            continue;
+        continue;
 clean_pics:
-            if (p_sys->outputDevicePtr[i])
+        if (p_sys->outputDevicePtr[i])
+        {
+            CALL_CUDA_DEC(cuMemFree, p_sys->outputDevicePtr[i]);
+            p_sys->outputDevicePtr[i] = 0;
+        }
+        if (i > 0)
+        {
+            while (i--)
             {
+                picture_Release(pics[i]);
                 CALL_CUDA_DEC(cuMemFree, p_sys->outputDevicePtr[i]);
                 p_sys->outputDevicePtr[i] = 0;
             }
-            if (i > 0)
-            {
-                while (i--)
-                {
-                    picture_Release(pics[i]);
-                    CALL_CUDA_DEC(cuMemFree, p_sys->outputDevicePtr[i]);
-                    p_sys->outputDevicePtr[i] = 0;
-                }
-            }
-            break;
         }
-        if (ret != VLC_SUCCESS)
-            goto error;
-
-        p_sys->out_pool = picture_pool_New( ARRAY_SIZE(p_sys->outputDevicePtr), pics );
+        break;
     }
+    if (ret != VLC_SUCCESS)
+        goto error;
 
+    p_sys->out_pool = picture_pool_New( ARRAY_SIZE(p_sys->outputDevicePtr), pics );
     p_sys->decoderHeight = p_format->coded_height;
 
     CALL_CUDA_DEC(cuCtxPopCurrent, NULL);
@@ -360,140 +340,96 @@ static int CUDAAPI HandlePictureDisplay(void *p_opaque, CUVIDPARSERDISPINFO *p_d
     };
     int result;
 
-    if ( is_nvdec_opaque(p_dec->fmt_out.video.i_chroma) )
-    {
-        p_pic = picture_pool_Wait(p_sys->out_pool);
-        if (unlikely(p_pic == NULL))
-            return 0;
+    p_pic = picture_pool_Wait(p_sys->out_pool);
+    if (unlikely(p_pic == NULL))
+        return 0;
 
-        result = CALL_CUDA_DEC(cuCtxPushCurrent, p_sys->cuCtx);
-        if (unlikely(result != VLC_SUCCESS))
-        {
-            picture_Release(p_pic);
-            return 0;
-        }
+    result = CALL_CUDA_DEC(cuCtxPushCurrent, p_sys->cuCtx);
+    if (unlikely(result != VLC_SUCCESS))
+    {
+        picture_Release(p_pic);
+        return 0;
+    }
 
-        unsigned int i_pitch;
+    unsigned int i_pitch;
 
-        // Map decoded frame to a device pointer
-        result = CALL_CUVID( cuvidMapVideoFrame, p_sys->cudecoder, p_dispinfo->picture_index,
-                            &frameDevicePtr, &i_pitch, &params );
-        if (result != VLC_SUCCESS)
-            goto error;
+    // Map decoded frame to a device pointer
+    result = CALL_CUVID( cuvidMapVideoFrame, p_sys->cudecoder, p_dispinfo->picture_index,
+                        &frameDevicePtr, &i_pitch, &params );
+    if (result != VLC_SUCCESS)
+        goto error;
 
-        // put a new context in the output picture
-        pic_context_nvdec_t *picctx = malloc(sizeof(*picctx));
-        if (unlikely(picctx == NULL))
-            goto error;
-        picctx->ctx = (picture_context_t) {
-            NVDecCtxDestroy, NVDecCtxClone,
-            p_sys->vctx_out,
-        };
-        uintptr_t pool_idx = (uintptr_t)p_pic->p_sys;
-        picctx->devicePtr = p_sys->outputDevicePtr[pool_idx];
-        picctx->bufferPitch = p_sys->outputPitch;
-        picctx->bufferHeight = p_sys->decoderHeight;
-
-        size_t srcY = 0;
-        size_t dstY = 0;
-        if (p_pic->format.i_chroma == VLC_CODEC_NVDEC_OPAQUE_444 || p_pic->format.i_chroma == VLC_CODEC_NVDEC_OPAQUE_444_16B)
-        {
-            for (int i_plane = 0; i_plane < 3; i_plane++) {
-                CUDA_MEMCPY2D cu_cpy = {
-                    .srcMemoryType  = CU_MEMORYTYPE_DEVICE,
-                    .srcDevice      = frameDevicePtr,
-                    .srcY           = srcY,
-                    .srcPitch       = i_pitch,
-                    .dstMemoryType  = CU_MEMORYTYPE_DEVICE,
-                    .dstDevice      = picctx->devicePtr,
-                    .dstPitch       = picctx->bufferPitch,
-                    .dstY           = dstY,
-                    .WidthInBytes   = i_pitch,
-                    .Height         = __MIN(picctx->bufferHeight, p_dec->fmt_out.video.i_y_offset + p_dec->fmt_out.video.i_visible_height),
-                };
-                result = CALL_CUDA_DEC(cuMemcpy2DAsync, &cu_cpy, 0);
-                if (unlikely(result != VLC_SUCCESS))
-                {
-                    free(picctx);
-                    goto error;
-                }
-                srcY += picctx->bufferHeight;
-                dstY += p_sys->decoderHeight;
-            }
-        }
-        else
-        {
-            for (int i_plane = 0; i_plane < 2; i_plane++) {
-                CUDA_MEMCPY2D cu_cpy = {
-                    .srcMemoryType  = CU_MEMORYTYPE_DEVICE,
-                    .srcDevice      = frameDevicePtr,
-                    .srcY           = srcY,
-                    .srcPitch       = i_pitch,
-                    .dstMemoryType  = CU_MEMORYTYPE_DEVICE,
-                    .dstDevice      = picctx->devicePtr,
-                    .dstPitch       = picctx->bufferPitch,
-                    .dstY           = dstY,
-                    .WidthInBytes   = i_pitch,
-                    .Height         = __MIN(picctx->bufferHeight, p_dec->fmt_out.video.i_y_offset + p_dec->fmt_out.video.i_visible_height),
-                };
-                if (i_plane == 1)
-                    cu_cpy.Height >>= 1;
-                result = CALL_CUDA_DEC(cuMemcpy2DAsync, &cu_cpy, 0);
-                if (unlikely(result != VLC_SUCCESS))
-                {
-                    free(picctx);
-                    goto error;
-                }
-                srcY += picctx->bufferHeight;
-                dstY += p_sys->decoderHeight;
+    // put a new context in the output picture
+    pic_context_nvdec_t *picctx = malloc(sizeof(*picctx));
+    if (unlikely(picctx == NULL))
+        goto error;
+    picctx->ctx = (picture_context_t) {
+        NVDecCtxDestroy, NVDecCtxClone,
+        p_sys->vctx_out,
+    };
+    uintptr_t pool_idx = (uintptr_t)p_pic->p_sys;
+    picctx->devicePtr = p_sys->outputDevicePtr[pool_idx];
+    picctx->bufferPitch = p_sys->outputPitch;
+    picctx->bufferHeight = p_sys->decoderHeight;
+
+    size_t srcY = 0;
+    size_t dstY = 0;
+    if (p_pic->format.i_chroma == VLC_CODEC_NVDEC_OPAQUE_444 || p_pic->format.i_chroma == VLC_CODEC_NVDEC_OPAQUE_444_16B)
+    {
+        for (int i_plane = 0; i_plane < 3; i_plane++) {
+            CUDA_MEMCPY2D cu_cpy = {
+                .srcMemoryType  = CU_MEMORYTYPE_DEVICE,
+                .srcDevice      = frameDevicePtr,
+                .srcY           = srcY,
+                .srcPitch       = i_pitch,
+                .dstMemoryType  = CU_MEMORYTYPE_DEVICE,
+                .dstDevice      = picctx->devicePtr,
+                .dstPitch       = picctx->bufferPitch,
+                .dstY           = dstY,
+                .WidthInBytes   = i_pitch,
+                .Height         = __MIN(picctx->bufferHeight, p_dec->fmt_out.video.i_y_offset + p_dec->fmt_out.video.i_visible_height),
+            };
+            result = CALL_CUDA_DEC(cuMemcpy2DAsync, &cu_cpy, 0);
+            if (unlikely(result != VLC_SUCCESS))
+            {
+                free(picctx);
+                goto error;
             }
+            srcY += picctx->bufferHeight;
+            dstY += p_sys->decoderHeight;
         }
-        p_pic->context = &picctx->ctx;
-        vlc_video_context_Hold(picctx->ctx.vctx);
     }
     else
     {
-        p_pic = decoder_NewPicture(p_dec);
-        if (unlikely(p_pic == NULL))
-            return 0;
-
-        result = CALL_CUDA_DEC(cuCtxPushCurrent, p_sys->cuCtx);
-        if (unlikely(result != VLC_SUCCESS))
-        {
-            picture_Release(p_pic);
-            return 0;
-        }
-
-        unsigned int i_pitch;
-
-        // Map decoded frame to a device pointer
-        result = CALL_CUVID( cuvidMapVideoFrame, p_sys->cudecoder, p_dispinfo->picture_index,
-                            &frameDevicePtr, &i_pitch, &params );
-        if (result != VLC_SUCCESS)
-            goto error;
-
-        // Copy decoded frame into a new VLC picture
-        size_t srcY = 0;
-        for (int i_plane = 0; i_plane < p_pic->i_planes; i_plane++) {
-            plane_t plane = p_pic->p[i_plane];
+        for (int i_plane = 0; i_plane < 2; i_plane++) {
             CUDA_MEMCPY2D cu_cpy = {
                 .srcMemoryType  = CU_MEMORYTYPE_DEVICE,
                 .srcDevice      = frameDevicePtr,
                 .srcY           = srcY,
                 .srcPitch       = i_pitch,
-                .dstMemoryType  = CU_MEMORYTYPE_HOST,
-                .dstHost        = plane.p_pixels,
-                .dstPitch       = plane.i_pitch,
+                .dstMemoryType  = CU_MEMORYTYPE_DEVICE,
+                .dstDevice      = picctx->devicePtr,
+                .dstPitch       = picctx->bufferPitch,
+                .dstY           = dstY,
                 .WidthInBytes   = i_pitch,
-                .Height         = plane.i_visible_lines,
+                .Height         = __MIN(picctx->bufferHeight, p_dec->fmt_out.video.i_y_offset + p_dec->fmt_out.video.i_visible_height),
             };
-            result = CALL_CUDA_DEC(cuMemcpy2D, &cu_cpy);
-            if (result != VLC_SUCCESS)
+            if (i_plane == 1)
+                cu_cpy.Height >>= 1;
+            result = CALL_CUDA_DEC(cuMemcpy2DAsync, &cu_cpy, 0);
+            if (unlikely(result != VLC_SUCCESS))
+            {
+                free(picctx);
                 goto error;
-            srcY += p_sys->decoderHeight;
+            }
+            srcY += picctx->bufferHeight;
+            dstY += p_sys->decoderHeight;
         }
     }
 
+    p_pic->context = &picctx->ctx;
+    vlc_video_context_Hold(picctx->ctx.vctx);
+
     // Release surface on GPU
     result = CALL_CUVID(cuvidUnmapVideoFrame, p_sys->cudecoder, frameDevicePtr);
     if (unlikely(result != VLC_SUCCESS))
@@ -926,41 +862,14 @@ static int OpenDecoder(vlc_object_t *p_this)
         goto error;
     }
 
-    vlc_fourcc_t output_chromas[3];
-    size_t chroma_idx = 0;
-    if (cudaChroma == cudaVideoChromaFormat_420)
-    {
-        if (i_depth_luma >= 16)
-            output_chromas[chroma_idx++] = VLC_CODEC_NVDEC_OPAQUE_16B;
-        else if (i_depth_luma > 8)
-            output_chromas[chroma_idx++] = VLC_CODEC_NVDEC_OPAQUE_10B;
-        else
-            output_chromas[chroma_idx++] = VLC_CODEC_NVDEC_OPAQUE;
-    }
-    else if (cudaChroma == cudaVideoChromaFormat_444)
-    {
-        if (i_depth_luma > 8)
-            output_chromas[chroma_idx++] = VLC_CODEC_NVDEC_OPAQUE_444_16B;
-        else
-            output_chromas[chroma_idx++] = VLC_CODEC_NVDEC_OPAQUE_444;
-    }
-
-    output_chromas[chroma_idx++] = MapSurfaceChroma(cudaChroma, i_depth_luma);
-    output_chromas[chroma_idx++] = 0;
-
-    for (chroma_idx = 0; output_chromas[chroma_idx] != 0; chroma_idx++)
+    p_dec->fmt_out.i_codec = p_dec->fmt_out.video.i_chroma =
+        MapSurfaceChroma(cudaChroma, i_depth_luma);
+    result = decoder_UpdateVideoOutput(p_dec, p_sys->vctx_out);
+    if (result != VLC_SUCCESS)
     {
-        p_dec->fmt_out.i_codec = p_dec->fmt_out.video.i_chroma = output_chromas[chroma_idx];
-        result = decoder_UpdateVideoOutput(p_dec, p_sys->vctx_out);
-        if (result == VLC_SUCCESS)
-        {
-            msg_Dbg(p_dec, "using chroma %4.4s", (char*)&p_dec->fmt_out.video.i_chroma);
-            break;
-        }
         msg_Warn(p_dec, "Failed to use output chroma %4.4s", (char*)&p_dec->fmt_out.video.i_chroma);
-    }
-    if (result != VLC_SUCCESS)
         goto error;
+    }
 
     int deinterlace_mode    = var_InheritInteger(p_dec, "nvdec-deint");
     if (deinterlace_mode <= 0)
diff --git a/modules/hw/nvdec/nvdec_fmt.h b/modules/hw/nvdec/nvdec_fmt.h
index 25784cca6d..d84672d8fb 100644
--- a/modules/hw/nvdec/nvdec_fmt.h
+++ b/modules/hw/nvdec/nvdec_fmt.h
@@ -51,15 +51,6 @@ static inline int CudaCheckErr(vlc_object_t *obj, CudaFunctions *cudaFunctions,
     return VLC_SUCCESS;
 }
 
-static inline bool is_nvdec_opaque(vlc_fourcc_t fourcc)
-{
-    return fourcc == VLC_CODEC_NVDEC_OPAQUE ||
-           fourcc == VLC_CODEC_NVDEC_OPAQUE_10B ||
-           fourcc == VLC_CODEC_NVDEC_OPAQUE_16B ||
-           fourcc == VLC_CODEC_NVDEC_OPAQUE_444 ||
-           fourcc == VLC_CODEC_NVDEC_OPAQUE_444_16B;
-}
-
 /* for VLC_CODEC_NVDEC_OPAQUE / VLC_CODEC_NVDEC_OPAQUE_16B */
 typedef struct
 {
diff --git a/modules/hw/nvdec/nvdec_gl.c b/modules/hw/nvdec/nvdec_gl.c
index 137e730028..62a503a8e8 100644
--- a/modules/hw/nvdec/nvdec_gl.c
+++ b/modules/hw/nvdec/nvdec_gl.c
@@ -156,9 +156,6 @@ static void Close(vlc_object_t *obj)
 static int Open(vlc_object_t *obj)
 {
     struct vlc_gl_interop *interop = (void *) obj;
-    if (!is_nvdec_opaque(interop->fmt.i_chroma))
-        return VLC_EGENERIC;
-
     vlc_decoder_device *device = vlc_video_context_HoldDevice(interop->vctx);
     if (device == NULL || device->type != VLC_DECODER_DEVICE_NVDEC)
         return VLC_EGENERIC;
-- 
2.17.1



More information about the vlc-devel mailing list