[vlc-devel] [PATCH v2 4/5] contrib: dav1d: add DXVA support

Fri Sep 11 11:46:08 CEST 2020

Based on (unmerged) code from Matthew Wozniak
https://code.videolan.org/mwozniak/dav1d/-/tree/dxva

If the AV1 structures are found in dxva.h, dav1d is compiled with DXVA support.
The code is only enabled on Windows builds.

The host app is responsible for feeding the DXVA API(s). This DXVA-only layer
bypasses the software rendering in CPU buffers and fills the DXVA structures
instead.

When the host app provides DXVA callbacks, only one frame thread is used, since
DXVA doesn't like to be fed from multiple threads.
---
 contrib/src/dav1d/0001-Add-DXVA-support.patch | 888 ++++++++++++++++++
 contrib/src/dav1d/rules.mak                   |   5 +
 2 files changed, 893 insertions(+)
 create mode 100644 contrib/src/dav1d/0001-Add-DXVA-support.patch

diff --git a/contrib/src/dav1d/0001-Add-DXVA-support.patch b/contrib/src/dav1d/0001-Add-DXVA-support.patch
new file mode 100644
index 00000000000..78bd311010c
--- /dev/null
+++ b/contrib/src/dav1d/0001-Add-DXVA-support.patch
@@ -0,0 +1,888 @@
+From e0adaa85c877b1e580605d72ba96b5bf37429ba5 Mon Sep 17 00:00:00 2001
+From: Matt Wozniak <mwozniak at microsoft.com>
+Date: Tue, 16 Jun 2020 20:39:12 +0200
+Subject: [PATCH] Add DXVA support
+
+A new set of callbacks has been added which allows an application to have dAV1d generate AV1 DXVA structures.
+The application can then take these generated structures and use them to call DXVA APIs (in DX9, DX11, or DX12).  When using DXVA
+to decode, dAV1d software-based decoding is disabled and replaced with the DXVA callback functions.
+
+There are 6 new callbacks:
+ * decode_callback - called to perform DXVA decode (using SubmitBuffers)
+ * alloc_callback / release_callback - for allocating and releasing AV1 DXVA picture structs
+ * alloc_bitstream_callback / release_bitstream_callback - used for allocating and releasing
+     GPU backed memory for bitstream data.
+ * check_new_sequence_header - to fallback to software decoding if DXVA can't handle the new
+     data.
+
+ These callbacks are designed to be compatible with all 3 variants of DXVA decoding: DX9, DX11 and DX12.
+ Currently when DXVA decode is enabled (by setting up the DXVA callback structure), the following decoder behaviours are changed:
+ * decoder always operates in single threaded mode (multi-threaded operation only works for DX12)
+ * decoder will skip software decode (dav1d_decode_frame)
+
+ Included is also an update to dav1d.exe cli application to include support for DXVA decode on Windows platforms.
+ This is an example of how to implement a complete DXVA decoding solution using the new callbacks.
+
+ This change has been validated to work with AV1 DXVA hardware.
+
+Co-Authored-By: Steve Lhomme <robux4 at videolabs.io>
+---
+ include/dav1d/dav1d.h   | 100 ++++++++++
+ include/dav1d/picture.h |   8 +
+ meson.build             |  10 +
+ src/decode.c            |  29 ++-
+ src/dxva.c              | 419 ++++++++++++++++++++++++++++++++++++++++
+ src/internal.h          |   9 +-
+ src/lib.c               |  22 ++-
+ src/meson.build         |   6 +
+ src/obu.c               |  17 ++
+ src/picture.h           |   8 +
+ 10 files changed, 619 insertions(+), 9 deletions(-)
+ create mode 100644 src/dxva.c
+
+diff --git a/include/dav1d/dav1d.h b/include/dav1d/dav1d.h
+index 32fe8c3..421b7cc 100644
+--- a/include/dav1d/dav1d.h
++++ b/include/dav1d/dav1d.h
+@@ -43,9 +43,33 @@ extern "C" {
+ typedef struct Dav1dContext Dav1dContext;
+ typedef struct Dav1dRef Dav1dRef;
+ 
++typedef struct _DXVA_PicParams_AV1 DXVA_PicParams_AV1;
++typedef struct _DXVA_Tile_AV1 DXVA_Tile_AV1;
++
+ #define DAV1D_MAX_FRAME_THREADS 256
+ #define DAV1D_MAX_TILE_THREADS 64
+ 
++typedef struct Dav1dTileGroup {
++
++    /**
++    * Bitstream data representing the tiles in this group.
++    */
++    Dav1dData data;
++
++    /**
++    * First and last tile indexes (both inclusive) covered by this tile group.
++    */
++    int start, end;
++
++    /**
++    * Byte offset of this tile group from the original bitstream data
++    * passed to dav1d_send_data. Note that this cannot be used as a pointer
++    * offset with the data value in this structure, since the data pointed
++    * to is not necessarily the same buffer.
++    */
++    uint32_t offset;
++} Dav1dTileGroup;
++
+ typedef struct Dav1dLogger {
+     void *cookie; ///< Custom data to pass to the callback.
+     /**
+@@ -58,6 +82,81 @@ typedef struct Dav1dLogger {
+     void (*callback)(void *cookie, const char *format, va_list ap);
+ } Dav1dLogger;
+ 
++typedef struct Dav1dDXVA {
++    void *cookie; ///< Custom data to pass to the callback
++    /**
++    * DXVA callback for checking if decode is possible given the current
++    *  sequence header.  When successful, DXVA decode will be used.
++    *  On DAV1D_ERR(EAGAIN) the decoder falls back to software decode.
++    *
++    * @param          cookie Custom pointer passed to all calls.
++    * @param sequence_header The sequence header for the new stream which will be decoded.
++    *
++    * @return 0 on success, or < 0 (a negative DAV1D_ERR code) on error.
++    *         Returning DAV1D_ERR(EAGAIN) will continue decoding using software,
++    *         all other errors will fail decode.
++    */
++    int(*check_new_sequence_header)(void *cookie, const Dav1dSequenceHeader *sequence_header);
++
++    /**
++    * DXVA Decoding callback.  Called to generate a picture using DXVA.
++    *
++    * @param             cookie Custom pointer passed to all calls.
++    * @param picture_parameters Filled DXVA picture parameters struct.
++    * @param              tiles Array of tile information.
++    * @param            n_tiles Number of tiles in array.
++    * @param     output_picture Picture to decode into.
++    * @param        tile_groups Array of bitstream data
++    * @param      n_tile_groups Count of bitstream data entries
++    *
++    * @note  By setting this callback the decoder will no longer perform
++    *        any software decoding processes, and will operate in single
++    *        threaded mode, no matter the setting of n_frame_threads.
++    *
++    *        Tiles data structure is filled assuming bitstream data is copied into
++    *        one contiguous buffer in-order from the tile_groups array.
++    *
++    *        When using DX11 this function will need to release picture_parameters
++    *        and tiles, both decode_callback and release_callback should be written
++    *        to be able to release these buffers correctly.
++    *
++    * @return 0 on success, or < 0 (a negative DAV1D_ERR code) on error.
++    */
++    int(*decode_callback)(void *cookie, DXVA_PicParams_AV1 **picture_parameters, DXVA_Tile_AV1 **tiles, const int n_tiles, Dav1dPicture *output_picture, Dav1dTileGroup *tile_groups, int n_tile_groups);
++
++    /**
++     * Allocate DXVA picture parameters buffer, and tile array.
++     *
++     * @param    cookie Custom pointer passed to all calls.
++     * @param   picture Picture to allocate DXVA buffers for.
++     * @param picparams Pointer to store the allocated buffer in.
++     * @param     tiles Pointer to store tile array in.
++     * @param   n_tiles Size of tile array to allocate.
++     *
++     * @note   When using DX11 the driver should allocate this
++     *         buffer and this field should be set.  When using
++     *         DX12 it is safe to leave this as the default.
++     *
++     * @return 0 on success. A negative DAV1D_ERR value on error.
++     */
++    int(*alloc_callback)(void *cookie, Dav1dPicture* picture, DXVA_PicParams_AV1 **picparams, DXVA_Tile_AV1 **tiles, int n_tiles);
++
++    /**
++     * Release the DXVA picture parameters buffer and the tile array.
++     *
++     * @param    cookie Custom pointer passed to all calls.
++     * @param picparams The picture params buffer that was
++     *                  allocated by alloc_callback().
++     * @param     tiles The tiles array that was allocated
++     *                  by alloc_callback().
++     *
++     * @note  When using DX11 this function is only necessary for
++     *        cleaning up these buffers in error conditions.
++     */
++    void(*release_callback)(void *cookie, DXVA_PicParams_AV1 **picparams, DXVA_Tile_AV1 **tiles);
++
++} Dav1dDXVA;
++
+ typedef struct Dav1dSettings {
+     int n_frame_threads;
+     int n_tile_threads;
+@@ -68,6 +167,7 @@ typedef struct Dav1dSettings {
+     uint8_t reserved[32]; ///< reserved for future use
+     Dav1dPicAllocator allocator; ///< Picture allocator callback.
+     Dav1dLogger logger; ///< Logger callback.
++    Dav1dDXVA dxva; ///< DXVA callbacks
+ } Dav1dSettings;
+ 
+ /**
+diff --git a/include/dav1d/picture.h b/include/dav1d/picture.h
+index 98e5eb5..f1db209 100644
+--- a/include/dav1d/picture.h
++++ b/include/dav1d/picture.h
+@@ -93,6 +93,14 @@ typedef struct Dav1dPicture {
+     struct Dav1dRef *ref; ///< Frame data allocation origin
+ 
+     void *allocator_data; ///< pointer managed by the allocator
++
++#define DXVA_INVALID_PICTURE_INDEX 0xFFu
++    /**
++     * Used to manage DXVA texture array index when decoding with DXVA,
++     * the allocator should put an index here that will be used to fill out
++     * DXVA_PicParams_AV1::CurrPicTextureIndex and ::RefFrameMapTextureIndex
++     */
++    uint16_t dxva_picture_index;
+ } Dav1dPicture;
+ 
+ typedef struct Dav1dPicAllocator {
+diff --git a/meson.build b/meson.build
+index d5366f9..3f9fe96 100644
+--- a/meson.build
++++ b/meson.build
+@@ -95,6 +95,7 @@ else
+     add_project_arguments('-D_POSIX_C_SOURCE=200112L', language: 'c')
+ endif
+ 
++have_dxva = false
+ if host_machine.system() == 'windows'
+     cdata.set('_WIN32_WINNT',           '0x0601')
+     cdata.set('UNICODE',                1) # Define to 1 for Unicode (Wide Chars) APIs
+@@ -112,6 +113,10 @@ if host_machine.system() == 'windows'
+         optional_arguments += '-mcmodel=small'
+     endif
+ 
++    if cc.has_type('struct _DXVA_PicParams_AV1', prefix: '#include <windows.h>\n#include <dxva.h>\n')
++        have_dxva = true # Only available on newer SDKs/mingw64/wine
++    endif
++
+     # On Windows, we use a compatibility layer to emulate pthread
+     thread_dependency = []
+     thread_compat_dep = declare_dependency(sources : files('src/win32/thread.c'))
+@@ -132,6 +137,7 @@ else
+         cdata.set('HAVE_CLOCK_GETTIME', 1)
+     endif
+ endif
++cdata.set10('HAVE_DXVA', have_dxva)
+ 
+ # check for fseeko on android. It is not always available if _FILE_OFFSET_BITS is defined to 64
+ have_fseeko = true
+@@ -156,6 +162,10 @@ if host_machine.system() == 'linux'
+     endif
+ endif
+ 
++d3d11_dependency = []
++if host_machine.system() == 'windows'
++    d3d11_dependency = cc.find_library('d3d11', required: true)
++endif
+ 
+ # Header checks
+ 
+diff --git a/src/decode.c b/src/decode.c
+index f678215..1e533cf 100644
+--- a/src/decode.c
++++ b/src/decode.c
+@@ -3414,6 +3414,9 @@ int dav1d_submit_frame(Dav1dContext *const c) {
+         }
+         f->n_tile_data_alloc = c->n_tile_data;
+     }
++    // FIXME in single-threaded mode, or if the application ensures
++    //  that each input bitstream buffer is a unique buffer (no re-use)
++    //  this copy is unnecessary
+     memcpy(f->tile, c->tile, c->n_tile_data * sizeof(*f->tile));
+     memset(c->tile, 0, c->n_tile_data * sizeof(*c->tile));
+     f->n_tile_data = c->n_tile_data;
+@@ -3441,6 +3444,15 @@ int dav1d_submit_frame(Dav1dContext *const c) {
+         f->resize_start[1] = get_upscale_x0(in_cw, out_cw, f->resize_step[1]);
+     }
+ 
++#if HAVE_DXVA
++    // allocate and fill DXVA structures if using DXVA
++    if (c->dxva_mode)
++    {
++        res = dav1d_allocate_frame_dxva(f, c);
++        if (res < 0) goto error;
++    }
++#endif // HAVE_DXVA
++
+     // move f->cur into output queue
+     if (c->n_fc == 1) {
+         if (f->frame_hdr->show_frame)
+@@ -3585,8 +3597,21 @@ int dav1d_submit_frame(Dav1dContext *const c) {
+         }
+     }
+ 
+-    if (c->n_fc == 1) {
+-        if ((res = dav1d_decode_frame(f)) < 0) {
++    // FIXME currently DXVA based decode is always single-threaded
++    //  In DX12 it is possible to schedule DXVA decode multi-threaded.
++    //  This is not relevant to DX11/DX9 as those only support single-threaded operation.
++    if (c->dxva_mode || c->n_fc == 1) {
++#if HAVE_DXVA
++        if (c->dxva_mode)
++        {
++            res = dav1d_decode_frame_dxva(f);
++        }
++        else
++#endif // HAVE_DXVA
++        {
++            res = dav1d_decode_frame(f);
++        }
++        if (res < 0) {
+             dav1d_picture_unref_internal(&c->out);
+             for (int i = 0; i < 8; i++) {
+                 if (refresh_frame_flags & (1 << i)) {
+diff --git a/src/dxva.c b/src/dxva.c
+new file mode 100644
+index 0000000..7f94c2e
+--- /dev/null
++++ b/src/dxva.c
+@@ -0,0 +1,419 @@
++/*
++ * Copyright © 2020, VideoLAN and dav1d authors
++ * All rights reserved.
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright notice, this
++ *    list of conditions and the following disclaimer.
++ *
++ * 2. Redistributions in binary form must reproduce the above copyright notice,
++ *    this list of conditions and the following disclaimer in the documentation
++ *    and/or other materials provided with the distribution.
++ *
++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
++ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ */
++
++#include "config.h"
++
++#include <errno.h>
++#include <stdint.h>
++#include <stdio.h>
++#include <stdlib.h>
++#include <string.h>
++
++#include "dav1d/headers.h"
++#include "common/intops.h"
++#include "common/mem.h"
++#include "common/validate.h"
++
++#include "src/internal.h"
++#include "src/log.h"
++#include "src/picture.h"
++#include "src/ref.h"
++#include <dxva.h>
++
++#define DXVA_INVALID_QM 0xFFu
++
++static inline int dav1d_restoration_type_to_dxva(const enum Dav1dRestorationType type) {
++    // Map dav1d's restoration enum onto the numeric codes written into the DXVA struct.
++    // DAV1D_RESTORATION_NONE and any unrecognised value map to 0.
++    if (type == DAV1D_RESTORATION_WIENER)     return 1;
++    if (type == DAV1D_RESTORATION_SGRPROJ)    return 2;
++    if (type == DAV1D_RESTORATION_SWITCHABLE) return 3;
++
++    return 0;
++}
++
++static int fill_picparams_struct(DXVA_PicParams_AV1* params, const Dav1dFrameContext *fc, Dav1dContext *const c) { // Fill *params from fc's frame/sequence headers and c->refs; returns 0 or DAV1D_ERR(EINVAL).
++    params->CurrPicTextureIndex = (unsigned char)fc->cur.dxva_picture_index; // index truncated to 8 bits
++
++    // Basics: frame dimensions, bit depth, profile and interpolation filter
++    const int bitdepthTranslation[] = { 8,10,12 }; // indexed by seq_hdr->hbd
++    params->width = fc->frame_hdr->width[0];
++    params->height = fc->frame_hdr->height;
++    params->max_width = fc->seq_hdr->max_width;
++    params->max_height = fc->seq_hdr->max_height;
++
++    params->superres_denom = fc->frame_hdr->super_res.width_scale_denominator;
++
++    if (fc->seq_hdr->hbd > 2) // guards the bitdepthTranslation lookup below
++        return DAV1D_ERR(EINVAL);
++
++    params->bitdepth = bitdepthTranslation[fc->seq_hdr->hbd];
++    params->seq_profile = fc->seq_hdr->profile;
++    params->interp_filter = (int)fc->frame_hdr->subpel_filter_mode; // enum same as dxva
++
++    // Tiles
++    params->tiles.cols = fc->frame_hdr->tiling.cols;
++    params->tiles.rows = fc->frame_hdr->tiling.rows;
++    params->tiles.context_update_id = fc->frame_hdr->tiling.update;
++
++    // AV1 DXVA defines tiles in terms of tile width / height in SBs.
++    // dAV1d specifies the start offset of each tile, so sizes are differences of consecutive offsets
++    uint16_t last = fc->frame_hdr->tiling.col_start_sb[0];
++    for (int w = 1; w <= fc->frame_hdr->tiling.cols; w++) {
++        params->tiles.widths[w-1] = fc->frame_hdr->tiling.col_start_sb[w] - last;
++        last = fc->frame_hdr->tiling.col_start_sb[w];
++    }
++    last = fc->frame_hdr->tiling.row_start_sb[0];
++    for (int h = 1; h <= fc->frame_hdr->tiling.rows; h++) {
++        params->tiles.heights[h-1] = fc->frame_hdr->tiling.row_start_sb[h] - last;
++        last = fc->frame_hdr->tiling.row_start_sb[h];
++    }
++
++    // Coding Tools: flags taken from either the sequence header or the frame header
++    params->coding.use_128x128_superblock = fc->seq_hdr->sb128;
++    params->coding.intra_edge_filter = fc->seq_hdr->intra_edge_filter;
++    params->coding.interintra_compound = fc->seq_hdr->inter_intra;
++    params->coding.masked_compound = fc->seq_hdr->masked_compound;
++    params->coding.warped_motion = fc->frame_hdr->warp_motion;
++    params->coding.dual_filter = fc->seq_hdr->dual_filter;
++    params->coding.jnt_comp = fc->seq_hdr->jnt_comp;
++    params->coding.screen_content_tools = fc->frame_hdr->allow_screen_content_tools;
++    params->coding.integer_mv = fc->frame_hdr->force_integer_mv;
++    params->coding.cdef = fc->seq_hdr->cdef;
++    params->coding.restoration = fc->seq_hdr->restoration;
++    params->coding.film_grain = fc->seq_hdr->film_grain_present;
++    params->coding.intrabc = fc->frame_hdr->allow_intrabc;
++    params->coding.high_precision_mv = fc->frame_hdr->hp;
++    params->coding.switchable_motion_mode = fc->frame_hdr->switchable_motion_mode;
++    params->coding.filter_intra = fc->seq_hdr->filter_intra;
++    params->coding.disable_frame_end_update_cdf = !fc->frame_hdr->refresh_context; // inverted sense
++    params->coding.disable_cdf_update = fc->frame_hdr->disable_cdf_update;
++    params->coding.reference_mode = fc->frame_hdr->switchable_comp_refs;
++    params->coding.skip_mode = fc->frame_hdr->skip_mode_enabled;
++    params->coding.reduced_tx_set = fc->frame_hdr->reduced_txtp_set;
++    params->coding.superres = fc->frame_hdr->super_res.enabled;
++    params->coding.tx_mode = (int)fc->frame_hdr->txfm_mode; // enum same as dxva
++    params->coding.use_ref_frame_mvs = fc->frame_hdr->use_ref_frame_mvs;
++    params->coding.enable_ref_frame_mvs = fc->seq_hdr->ref_frame_mvs;
++    params->coding.reference_frame_update = !(fc->frame_hdr->show_existing_frame == 1 && fc->frame_hdr->frame_type == DAV1D_FRAME_TYPE_KEY);
++
++    // Format & Picture Info flags
++    params->format.frame_type = (int)fc->frame_hdr->frame_type; // enum is same as dxva
++    params->format.show_frame = fc->frame_hdr->show_frame;
++    params->format.showable_frame = fc->frame_hdr->showable_frame;
++    switch (fc->cur.p.layout) { // derive chroma subsampling flags from the pixel layout
++    case DAV1D_PIXEL_LAYOUT_I400:
++    case DAV1D_PIXEL_LAYOUT_I420:
++        params->format.subsampling_x = 1;
++        params->format.subsampling_y = 1;
++        break;
++    case DAV1D_PIXEL_LAYOUT_I422:
++        params->format.subsampling_x = 1;
++        params->format.subsampling_y = 0;
++        break;
++    case DAV1D_PIXEL_LAYOUT_I444:
++        params->format.subsampling_x = 0;
++        params->format.subsampling_y = 0;
++        break;
++    default: // unknown layout
++        return DAV1D_ERR(EINVAL);
++    }
++    params->format.mono_chrome = fc->seq_hdr->monochrome;
++
++    // References
++    params->order_hint = fc->frame_hdr->frame_offset;
++    params->order_hint_bits = fc->seq_hdr->order_hint_n_bits;
++
++    params->primary_ref_frame = fc->frame_hdr->primary_ref_frame;
++
++    memset(params->RefFrameMapTextureIndex, 0xFF, sizeof(params->RefFrameMapTextureIndex)); // 0xFF bytes first; entries 0-7 are overwritten below
++    for (int i = 0; i < DAV1D_REFS_PER_FRAME; i++) {
++        const int idx = fc->frame_hdr->refidx[i];
++        if (c->refs[idx].p.p.data[0]) { // reference slot holds picture data
++            params->frame_refs[i].Index = idx;
++        } else {
++            params->frame_refs[i].Index = DXVA_INVALID_PICTURE_INDEX;
++        }
++        memcpy(params->frame_refs[i].wmmat, fc->frame_hdr->gmv[i].matrix, 6 * sizeof(int));
++        params->frame_refs[i].wmtype = (int)fc->frame_hdr->gmv[i].type; // enum same as dxva
++        params->frame_refs[i].wminvalid = fc->frame_hdr->gmv[i].type == DAV1D_WM_TYPE_IDENTITY; // NOTE(review): identity warp is flagged invalid - confirm against the DXVA AV1 spec
++        params->frame_refs[i].width = c->refs[idx].p.p.p.w;
++        params->frame_refs[i].height = c->refs[idx].p.p.p.h;
++    }
++    for (int i = 0; i < 8; i++) {
++        params->RefFrameMapTextureIndex[i] = (uint8_t)c->refs[i].p.p.dxva_picture_index;
++    }
++
++    // Loop filter parameters
++    params->loop_filter.filter_level[0] = fc->frame_hdr->loopfilter.level_y[0];
++    params->loop_filter.filter_level[1] = fc->frame_hdr->loopfilter.level_y[1];
++    params->loop_filter.filter_level_u = fc->frame_hdr->loopfilter.level_u;
++    params->loop_filter.filter_level_v = fc->frame_hdr->loopfilter.level_v;
++    params->loop_filter.sharpness_level = fc->frame_hdr->loopfilter.sharpness;
++    params->loop_filter.mode_ref_delta_enabled = fc->frame_hdr->loopfilter.mode_ref_delta_enabled;
++    params->loop_filter.mode_ref_delta_update = fc->frame_hdr->loopfilter.mode_ref_delta_update;
++    params->loop_filter.delta_lf_multi = fc->frame_hdr->delta.lf.multi;
++    params->loop_filter.delta_lf_present = fc->frame_hdr->delta.lf.present;
++    params->loop_filter.mode_deltas[0] = fc->frame_hdr->loopfilter.mode_ref_deltas.mode_delta[0];
++    params->loop_filter.mode_deltas[1] = fc->frame_hdr->loopfilter.mode_ref_deltas.mode_delta[1];
++    for (int i = 0; i < DAV1D_TOTAL_REFS_PER_FRAME; i++) {
++        params->loop_filter.ref_deltas[i] = fc->frame_hdr->loopfilter.mode_ref_deltas.ref_delta[i];
++    }
++    params->loop_filter.delta_lf_res = fc->frame_hdr->delta.lf.res_log2;
++    char haverestoration = fc->frame_hdr->restoration.type[0] || fc->frame_hdr->restoration.type[1] || fc->frame_hdr->restoration.type[2]; // non-zero if any plane has a non-zero restoration type
++    for (int i = 0; i < 3; i++) {
++        params->loop_filter.frame_restoration_type[i] = dav1d_restoration_type_to_dxva(fc->frame_hdr->restoration.type[i]);
++        params->loop_filter.log2_restoration_unit_size[i] = haverestoration ? fc->frame_hdr->restoration.unit_size[min(i, 1)] : 8; // dav1d only tracks y and uv not y,u,v
++    }
++
++    // Quantization
++    params->quantization.delta_q_present = fc->frame_hdr->delta.q.present;
++    params->quantization.delta_q_res = fc->frame_hdr->delta.q.res_log2;
++    params->quantization.base_qindex = fc->frame_hdr->quant.yac;
++    params->quantization.y_dc_delta_q = fc->frame_hdr->quant.ydc_delta;
++    params->quantization.u_dc_delta_q = fc->frame_hdr->quant.udc_delta;
++    params->quantization.v_dc_delta_q = fc->frame_hdr->quant.vdc_delta;
++    params->quantization.u_ac_delta_q = fc->frame_hdr->quant.uac_delta;
++    params->quantization.v_ac_delta_q = fc->frame_hdr->quant.vac_delta;
++    if (fc->frame_hdr->quant.qm) // quantizer matrices in use: forward the per-plane values
++    {
++        params->quantization.qm_y = fc->frame_hdr->quant.qm_y;
++        params->quantization.qm_u = fc->frame_hdr->quant.qm_u;
++        params->quantization.qm_v = fc->frame_hdr->quant.qm_v;
++    }
++    else // otherwise mark all three as DXVA_INVALID_QM (0xFF)
++    {
++        params->quantization.qm_y = DXVA_INVALID_QM;
++        params->quantization.qm_u = DXVA_INVALID_QM;
++        params->quantization.qm_v = DXVA_INVALID_QM;
++    }
++
++    // Cdef parameters
++    params->cdef.damping = fc->frame_hdr->cdef.damping - 3; // stored with a -3 bias
++    params->cdef.bits = fc->frame_hdr->cdef.n_bits;
++    for (int i = 0; i < DAV1D_MAX_CDEF_STRENGTHS; i++) {
++        params->cdef.y_strengths[i].primary = fc->frame_hdr->cdef.y_strength[i] >> 2; // bits above the low two
++        params->cdef.y_strengths[i].secondary = fc->frame_hdr->cdef.y_strength[i] & 0x3; // low two bits
++        params->cdef.uv_strengths[i].primary = fc->frame_hdr->cdef.uv_strength[i] >> 2;
++        params->cdef.uv_strengths[i].secondary = fc->frame_hdr->cdef.uv_strength[i] & 0x3;
++    }
++
++    // Segmentation: per segment, a feature's mask bit is set when its value is non-zero (ref: when not -1)
++    params->segmentation.enabled = fc->frame_hdr->segmentation.enabled;
++    params->segmentation.update_map = fc->frame_hdr->segmentation.update_map;
++    params->segmentation.update_data = fc->frame_hdr->segmentation.update_data;
++    params->segmentation.temporal_update = fc->frame_hdr->segmentation.temporal;
++    for (int i = 0; i < DAV1D_MAX_SEGMENTS; i++) {
++        params->segmentation.feature_mask[i].alt_q = fc->frame_hdr->segmentation.seg_data.d[i].delta_q != 0;
++        params->segmentation.feature_data[i][0] = fc->frame_hdr->segmentation.seg_data.d[i].delta_q;
++
++        params->segmentation.feature_mask[i].alt_lf_y_v = fc->frame_hdr->segmentation.seg_data.d[i].delta_lf_y_v != 0;
++        params->segmentation.feature_data[i][1] = fc->frame_hdr->segmentation.seg_data.d[i].delta_lf_y_v;
++
++        params->segmentation.feature_mask[i].alt_lf_y_h = fc->frame_hdr->segmentation.seg_data.d[i].delta_lf_y_h != 0;
++        params->segmentation.feature_data[i][2] = fc->frame_hdr->segmentation.seg_data.d[i].delta_lf_y_h;
++
++        params->segmentation.feature_mask[i].alt_lf_u = fc->frame_hdr->segmentation.seg_data.d[i].delta_lf_u != 0;
++        params->segmentation.feature_data[i][3] = fc->frame_hdr->segmentation.seg_data.d[i].delta_lf_u;
++
++        params->segmentation.feature_mask[i].alt_lf_v = fc->frame_hdr->segmentation.seg_data.d[i].delta_lf_v != 0;
++        params->segmentation.feature_data[i][4] = fc->frame_hdr->segmentation.seg_data.d[i].delta_lf_v;
++
++        params->segmentation.feature_mask[i].ref_frame = fc->frame_hdr->segmentation.seg_data.d[i].ref != -1;
++        params->segmentation.feature_data[i][5] = fc->frame_hdr->segmentation.seg_data.d[i].ref == -1 ? 0 : fc->frame_hdr->segmentation.seg_data.d[i].ref; // -1 is stored as 0 with the mask bit cleared
++
++        params->segmentation.feature_mask[i].skip = fc->frame_hdr->segmentation.seg_data.d[i].skip != 0;
++        params->segmentation.feature_data[i][6] = fc->frame_hdr->segmentation.seg_data.d[i].skip;
++
++        params->segmentation.feature_mask[i].globalmv = fc->frame_hdr->segmentation.seg_data.d[i].globalmv != 0;
++        params->segmentation.feature_data[i][7] = fc->frame_hdr->segmentation.seg_data.d[i].globalmv;
++    }
++
++    // Film Grain: the film_grain fields are left untouched when the frame carries no grain data
++    if (fc->frame_hdr->film_grain.present) {
++        params->film_grain.apply_grain = 1;
++        params->film_grain.scaling_shift_minus8 = fc->frame_hdr->film_grain.data.scaling_shift - 8;
++        params->film_grain.chroma_scaling_from_luma = fc->frame_hdr->film_grain.data.chroma_scaling_from_luma;
++        params->film_grain.ar_coeff_lag = fc->frame_hdr->film_grain.data.ar_coeff_lag;
++        params->film_grain.ar_coeff_shift_minus6 = (uint16_t)(fc->frame_hdr->film_grain.data.ar_coeff_shift - 6);
++        params->film_grain.grain_scale_shift = fc->frame_hdr->film_grain.data.grain_scale_shift;
++        params->film_grain.overlap_flag = fc->frame_hdr->film_grain.data.overlap_flag;
++        params->film_grain.clip_to_restricted_range = fc->frame_hdr->film_grain.data.clip_to_restricted_range;
++        params->film_grain.matrix_coeff_is_identity = fc->seq_hdr->mtrx == DAV1D_MC_IDENTITY;
++        params->film_grain.grain_seed = fc->frame_hdr->film_grain.data.seed;
++        memcpy(params->film_grain.scaling_points_y, fc->frame_hdr->film_grain.data.y_points, 14 * 2); // copies 28 bytes
++        params->film_grain.num_y_points = fc->frame_hdr->film_grain.data.num_y_points;
++        memcpy(params->film_grain.scaling_points_cb, fc->frame_hdr->film_grain.data.uv_points[0], 10 * 2); // copies 20 bytes
++        params->film_grain.num_cb_points = fc->frame_hdr->film_grain.data.num_uv_points[0];
++        memcpy(params->film_grain.scaling_points_cr, fc->frame_hdr->film_grain.data.uv_points[1], 10 * 2);
++        params->film_grain.num_cr_points = fc->frame_hdr->film_grain.data.num_uv_points[1];
++        for (int i = 0; i < 24; i++) {
++            params->film_grain.ar_coeffs_y[i] = (UCHAR)((int)fc->frame_hdr->film_grain.data.ar_coeffs_y[i] + 128); // coefficients are stored with a +128 bias
++        }
++        for (int i = 0; i < 25; i++) {
++            params->film_grain.ar_coeffs_cb[i] = fc->frame_hdr->film_grain.data.ar_coeffs_uv[0][i] + 128;
++            params->film_grain.ar_coeffs_cr[i] = fc->frame_hdr->film_grain.data.ar_coeffs_uv[1][i] + 128;
++        }
++        params->film_grain.cb_mult = fc->frame_hdr->film_grain.data.uv_mult[0] + 128;
++        params->film_grain.cb_luma_mult = fc->frame_hdr->film_grain.data.uv_luma_mult[0] + 128;
++        params->film_grain.cr_mult = fc->frame_hdr->film_grain.data.uv_mult[1] + 128;
++        params->film_grain.cr_luma_mult = fc->frame_hdr->film_grain.data.uv_luma_mult[1] + 128;
++        params->film_grain.cb_offset = fc->frame_hdr->film_grain.data.uv_offset[0] + 256; // offsets use a +256 bias
++        params->film_grain.cr_offset = fc->frame_hdr->film_grain.data.uv_offset[1] + 256;
++    }
++
++    return 0;
++}
++
++int dav1d_allocate_frame_dxva(Dav1dFrameContext *const f, Dav1dContext *const c) {
++    // Obtain DXVA picture-parameter and tile buffers from the host, fill the
++    // parameters for this frame, and attach both buffers to the frame context.
++    DXVA_PicParams_AV1 *pic_params = NULL;
++    DXVA_Tile_AV1 *tile_array = NULL;
++    const int n_tiles = (1 << f->frame_hdr->tiling.log2_cols) * (1 << f->frame_hdr->tiling.log2_rows);
++
++    int res = c->dxva.alloc_callback(c->dxva.cookie, &f->cur, &pic_params, &tile_array, n_tiles);
++    if (res < 0)
++        return res;
++
++    res = fill_picparams_struct(pic_params, f, c);
++    if (res < 0) {
++        c->dxva.release_callback(c->dxva.cookie, &pic_params, &tile_array); // hand the buffers back
++        return res;
++    }
++    f->dxva_params = pic_params;
++    f->dxva_tiles = tile_array;
++    return 0;
++}
++
++int dav1d_decode_frame_dxva(Dav1dFrameContext *const f) {
++    // Build the DXVA tile list for this frame, submit it through decode_callback, then release the frame's references.
++    int retval = DAV1D_ERR(ENOMEM); // also the value returned when the tile parsing below bails out
++    int tile_count = (1 << f->frame_hdr->tiling.log2_cols) * (1 << f->frame_hdr->tiling.log2_rows);
++
++    // Construct tile list
++    //  Tiles are copied into GPU memory as one contiguous block, with each
++    //  tile having an entry in the tile list (DXVA_Tile_AV1); specifying its offset
++    //  into the GPU buffer and its size.
++    int tile_row = 0, tile_col = 0, tile_index = 0;
++    size_t total_data_offset = 0;
++    DXVA_Tile_AV1 *current_tile = f->dxva_tiles;
++    for (int i = 0; i < f->n_tile_data; i++) {
++        const uint8_t *data = f->tile[i].data.data;
++        size_t size = f->tile[i].data.sz;
++
++        for (int j = f->tile[i].start; j <= f->tile[i].end; j++) {
++            size_t tile_sz;
++            if (j == f->tile[i].end) {
++                tile_sz = size; // last tile of the group: no size prefix, takes the remaining bytes
++            } else {
++                if (f->frame_hdr->tiling.n_bytes > size)
++                    goto error;
++                tile_sz = 0;
++                for (unsigned k = 0; k < f->frame_hdr->tiling.n_bytes; k++) { // little-endian size prefix
++                    tile_sz |= (unsigned)*data++ << (k * 8);
++                    total_data_offset++;
++                }
++                tile_sz++; // the prefix stores the size minus one
++                size -= f->frame_hdr->tiling.n_bytes;
++                if (tile_sz > size)
++                    goto error;
++            }
++
++            if (tile_index >= tile_count) // dxva_tiles holds tile_count entries: valid indexes are 0..tile_count-1
++                goto error;
++
++            current_tile->DataOffset = (uint32_t)total_data_offset;
++            current_tile->DataSize = (uint32_t)tile_sz;
++            current_tile->row = tile_row;
++            current_tile->column = tile_col++;
++            // large scale tile decoding process is not supported
++            current_tile->anchor_frame = DXVA_INVALID_PICTURE_INDEX;
++
++            if (tile_col == f->frame_hdr->tiling.cols) { // end of a tile row: wrap to the next one
++                tile_col = 0;
++                tile_row++;
++            }
++
++            total_data_offset += tile_sz;
++            size -= tile_sz;
++            data += tile_sz;
++            tile_index++;
++            current_tile++;
++        }
++    }
++
++    if ((retval = f->c->dxva.decode_callback(f->c->dxva.cookie, &f->dxva_params, &f->dxva_tiles, tile_count, &f->cur, f->tile, f->n_tile_data)) < 0)
++        goto error;
++
++    retval = 0;
++
++error: // reached on success as well: the cleanup below always runs
++    f->c->dxva.release_callback(f->c->dxva.cookie, &f->dxva_params, &f->dxva_tiles);
++
++    for (int i = 0; i < 7; i++) {
++        if (f->refp[i].p.data[0])
++            dav1d_thread_picture_unref(&f->refp[i]);
++        dav1d_ref_dec(&f->ref_mvs_ref[i]);
++    }
++
++    dav1d_picture_unref_internal(&f->cur);
++    dav1d_thread_picture_unref(&f->sr_cur);
++    dav1d_cdf_thread_unref(&f->in_cdf);
++    if (f->frame_hdr->refresh_context) {
++        dav1d_cdf_thread_signal(&f->out_cdf);
++        dav1d_cdf_thread_unref(&f->out_cdf);
++    }
++    dav1d_ref_dec(&f->cur_segmap_ref);
++    dav1d_ref_dec(&f->prev_segmap_ref);
++    dav1d_ref_dec(&f->mvs_ref);
++    dav1d_ref_dec(&f->seq_hdr_ref);
++    dav1d_ref_dec(&f->frame_hdr_ref);
++
++    for (int i = 0; i < f->n_tile_data; i++)
++        dav1d_data_unref_internal(&f->tile[i].data);
++
++    return retval;
++}
++
++int dav1d_default_dxva_alloc(void *cookie, Dav1dPicture* picture, DXVA_PicParams_AV1 **pic, DXVA_Tile_AV1 **tiles, int n_tiles) {
++    // Default allocator: zeroed CPU buffers for one DXVA_PicParams_AV1 (*pic) and n_tiles DXVA_Tile_AV1 (*tiles).
++    // Does not work with DX11 (the driver must allocate these buffers), but works with DX12. Returns 0 or ENOMEM.
++    *pic = calloc(1, sizeof(**pic));
++    *tiles = *pic ? calloc(n_tiles, sizeof(**tiles)) : NULL;
++    if (*tiles) return 0;
++    free(*pic); // on failure drop the first buffer too, so nothing leaks and both outputs are NULL
++    *pic = NULL;
++    return DAV1D_ERR(ENOMEM);
++}
++
++void dav1d_default_dxva_release(void *cookie, DXVA_PicParams_AV1 **pic, DXVA_Tile_AV1 **tiles) {
++    // Free both buffers (free(NULL) is a no-op) and clear the caller's pointers.
++    free(*pic);
++    *pic = NULL;
++    free(*tiles);
++    *tiles = NULL;
++}
+diff --git a/src/internal.h b/src/internal.h
+index 07f5676..239006b 100644
+--- a/src/internal.h
++++ b/src/internal.h
+@@ -67,11 +67,6 @@ typedef struct Dav1dDSPContext {
+     Dav1dLoopRestorationDSPContext lr;
+ } Dav1dDSPContext;
+ 
+-struct Dav1dTileGroup {
+-    Dav1dData data;
+-    int start, end;
+-};
+-
+ struct Dav1dContext {
+     Dav1dFrameContext *fc;
+     unsigned n_fc;
+@@ -133,8 +128,10 @@ struct Dav1dContext {
+     int all_layers;
+     unsigned frame_size_limit;
+     int drain;
++    int dxva_mode;
+ 
+     Dav1dLogger logger;
++    Dav1dDXVA dxva;
+ };
+ 
+ struct Dav1dFrameContext {
+@@ -157,6 +154,8 @@ struct Dav1dFrameContext {
+     struct Dav1dTileGroup *tile;
+     int n_tile_data_alloc;
+     int n_tile_data;
++    DXVA_PicParams_AV1 *dxva_params; // DXVA
++    DXVA_Tile_AV1 *dxva_tiles; // DXVA
+ 
+     // for scalable references
+     struct ScalableMotionParams {
+diff --git a/src/lib.c b/src/lib.c
+index 82af64a..619c4e0 100644
+--- a/src/lib.c
++++ b/src/lib.c
+@@ -75,6 +75,11 @@ COLD void dav1d_default_settings(Dav1dSettings *const s) {
+     s->operating_point = 0;
+     s->all_layers = 1; // just until the tests are adjusted
+     s->frame_size_limit = 0;
++    s->dxva = (Dav1dDXVA) { 0 };
++#if HAVE_DXVA
++    s->dxva.alloc_callback = dav1d_default_dxva_alloc;
++    s->dxva.release_callback = dav1d_default_dxva_release;
++#endif // HAVE_DXVA
+ }
+ 
+ static void close_internal(Dav1dContext **const c_out, int flush);
+@@ -111,6 +116,12 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
+                           DAV1D_ERR(EINVAL));
+     validate_input_or_ret(s->operating_point >= 0 &&
+                           s->operating_point <= 31, DAV1D_ERR(EINVAL));
++    validate_input_or_ret(s->dxva.decode_callback == NULL ||
++                          s->dxva.alloc_callback != NULL,
++                          DAV1D_ERR(EINVAL));
++    validate_input_or_ret(s->dxva.decode_callback == NULL ||
++                          s->dxva.release_callback != NULL,
++                          DAV1D_ERR(EINVAL));
+ 
+     pthread_attr_t thread_attr;
+     if (pthread_attr_init(&thread_attr)) return DAV1D_ERR(ENOMEM);
+@@ -124,6 +135,7 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
+ 
+     c->allocator = s->allocator;
+     c->logger = s->logger;
++    c->dxva = s->dxva;
+     c->apply_grain = s->apply_grain;
+     c->operating_point = s->operating_point;
+     c->all_layers = s->all_layers;
+@@ -140,9 +152,15 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
+                       s->frame_size_limit, c->frame_size_limit);
+     }
+ 
++#if HAVE_DXVA
++    c->dxva_mode = !!s->dxva.decode_callback;
++#else
++    c->dxva_mode = 0;
++#endif
++
+     c->frame_thread.flush = &c->frame_thread.flush_mem;
+     atomic_init(c->frame_thread.flush, 0);
+-    c->n_fc = s->n_frame_threads;
++    c->n_fc = s->dxva.decode_callback ? 1 : s->n_frame_threads;
+     c->fc = dav1d_alloc_aligned(sizeof(*c->fc) * s->n_frame_threads, 32);
+     if (!c->fc) goto error;
+     memset(c->fc, 0, sizeof(*c->fc) * s->n_frame_threads);
+@@ -280,7 +298,7 @@ static int output_image(Dav1dContext *const c, Dav1dPicture *const out,
+                     fgdata->num_uv_points[1];
+ 
+     // If there is nothing to be done, skip the allocation/copy
+-    if (!c->apply_grain || !has_grain) {
++    if (!c->apply_grain || !has_grain || c->dxva_mode) {
+         dav1d_picture_move_ref(out, in);
+         return 0;
+     }
+diff --git a/src/meson.build b/src/meson.build
+index fd8ad02..83ff4eb 100644
+--- a/src/meson.build
++++ b/src/meson.build
+@@ -50,6 +50,12 @@ libdav1d_sources = files(
+     'wedge.c',
+ )
+ 
++if have_dxva
++    libdav1d_sources += files(
++        'dxva.c',
++    )
++endif
++
+ # libdav1d bitdepth source files
+ # These files are compiled for each bitdepth with
+ # `BITDEPTH` defined to the currently built bitdepth.
+diff --git a/src/obu.c b/src/obu.c
+index ab9688c..de63b6e 100644
+--- a/src/obu.c
++++ b/src/obu.c
+@@ -1242,13 +1242,29 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
+         // the old one, this is a new video sequence and can't use any
+         // previous state. Free that state.
+         if (!c->seq_hdr)
++        {
++#if HAVE_DXVA
++            if (c->dxva.check_new_sequence_header) {
++                // decide the first DXVA mode
++                int dxva_check = c->dxva.check_new_sequence_header(c->dxva.cookie, seq_hdr);
++                c->dxva_mode = dxva_check == 0;
++            }
++#endif
+             c->frame_hdr = NULL;
++        }
+         // see 7.5, operating_parameter_info is allowed to change in
+         // sequence headers of a single sequence
+         else if (memcmp(seq_hdr, c->seq_hdr, offsetof(Dav1dSequenceHeader, operating_parameter_info))) {
+             c->frame_hdr = NULL;
+             c->mastering_display = NULL;
+             c->content_light = NULL;
++#if HAVE_DXVA
++            if (c->dxva.check_new_sequence_header) {
++                int dxva_check = c->dxva.check_new_sequence_header(c->dxva.cookie, seq_hdr);
++                c->dxva_mode = dxva_check == 0;
++                if (dxva_check != DAV1D_ERR(EAGAIN)) return dxva_check;
++            }
++#endif
+             dav1d_ref_dec(&c->mastering_display_ref);
+             dav1d_ref_dec(&c->content_light_ref);
+             for (int i = 0; i < 8; i++) {
+@@ -1346,6 +1362,7 @@ int dav1d_parse_obus(Dav1dContext *const c, Dav1dData *const in, const int globa
+         assert((bit_pos & 7) == 0);
+         assert(pkt_bytelen >= (bit_pos >> 3));
+         dav1d_data_ref(&c->tile[c->n_tile_data].data, in);
++        c->tile[c->n_tile_data].offset = bit_pos >> 3;
+         c->tile[c->n_tile_data].data.data += bit_pos >> 3;
+         c->tile[c->n_tile_data].data.sz = pkt_bytelen - (bit_pos >> 3);
+         // ensure tile groups are in order and sane, see 6.10.1
+diff --git a/src/picture.h b/src/picture.h
+index 9f82de8..4c0f9be 100644
+--- a/src/picture.h
++++ b/src/picture.h
+@@ -109,4 +109,12 @@ int dav1d_default_picture_alloc(Dav1dPicture *p, void *cookie);
+ void dav1d_default_picture_release(Dav1dPicture *p, void *cookie);
+ void dav1d_picture_unref_internal(Dav1dPicture *p);
+ 
++#if HAVE_DXVA
++int dav1d_default_dxva_alloc(void *cookie, Dav1dPicture* picture, DXVA_PicParams_AV1 **pic, DXVA_Tile_AV1 **tiles, int n_tiles);
++void dav1d_default_dxva_release(void *cookie, DXVA_PicParams_AV1 **pic, DXVA_Tile_AV1 **tiles);
++
++int dav1d_allocate_frame_dxva(Dav1dFrameContext *const f, Dav1dContext *const c);
++int dav1d_decode_frame_dxva(Dav1dFrameContext *const f);
++#endif
++
+ #endif /* DAV1D_SRC_PICTURE_H */
+-- 
+2.27.0.windows.1
+
diff --git a/contrib/src/dav1d/rules.mak b/contrib/src/dav1d/rules.mak
index fe0e222b166..c9e703f197b 100644
--- a/contrib/src/dav1d/rules.mak
+++ b/contrib/src/dav1d/rules.mak
@@ -8,6 +8,10 @@ ifeq ($(call need_pkg,"dav1d"),)
 PKGS_FOUND += dav1d
 endif
 
+ifdef HAVE_WIN32
+DEPS_dav1d += wine-headers
+endif
+
 DAV1D_CONF = -D enable_tests=false -D enable_tools=false
 
 $(TARBALLS)/dav1d-$(DAV1D_VERSION).tar.xz:
@@ -19,6 +23,7 @@ $(TARBALLS)/dav1d-$(DAV1D_VERSION).tar.xz:
 dav1d: dav1d-$(DAV1D_VERSION).tar.xz .sum-dav1d
 	$(UNPACK)
 	$(APPLY) $(SRC)/dav1d/0001-SSE2-PIC-464ca6c2.patch
+	$(APPLY) $(SRC)/dav1d/0001-Add-DXVA-support.patch
 	$(MOVE)
 
 .dav1d: dav1d crossfile.meson
-- 
2.26.2