[vlc-devel] [PATCH 1/2] contrib: dav1d: add DXVA support

Thu Sep 3 15:44:03 CEST 2020

Based on (unmerged) code from Matthew Wozniak
https://code.videolan.org/mwozniak/dav1d/-/tree/dxva
---
 contrib/src/dav1d/0001-Add-DXVA-support.patch | 1800 +++++++++++++++++
 contrib/src/dav1d/rules.mak                   |    1 +
 2 files changed, 1801 insertions(+)
 create mode 100644 contrib/src/dav1d/0001-Add-DXVA-support.patch

diff --git a/contrib/src/dav1d/0001-Add-DXVA-support.patch b/contrib/src/dav1d/0001-Add-DXVA-support.patch
new file mode 100644
index 00000000000..91b1f6b2eaf
--- /dev/null
+++ b/contrib/src/dav1d/0001-Add-DXVA-support.patch
@@ -0,0 +1,1800 @@
+From 0117cce8907432fe673ab511bf11ed81e8c81b2a Mon Sep 17 00:00:00 2001
+From: Matt Wozniak <mwozniak at microsoft.com>
+Date: Tue, 16 Jun 2020 20:39:12 +0200
+Subject: [PATCH] Add DXVA support. A new set of callbacks has been added which
+ allows an application to have dAV1d generate AV1 DXVA structures. The
+ application can then take these generated structures and use them to call
+ DXVA APIs (in DX9, DX11, or DX12).  When using DXVA to decode, dAV1d's
+ software-based decoding is disabled and replaced with the DXVA callback functions.
+
+There are 3 new callbacks:
+ * decode_callback - called to perform DXVA decode (using SubmitBuffers)
+ * alloc_callback / release_callback - for allocating and releasing AV1 DXVA picture structs
+ * bitstream data is handed to decode_callback as an array of Dav1dTileGroup
+     entries; Dav1dDXVA declares no separate bitstream alloc/release callbacks.
+
+ These callbacks are designed to be compatible with all 3 variants of DXVA decoding: DX9, DX11 and DX12.
+ Currently when DXVA decode is enabled (by setting up the DXVA callback structure), the following decoder behaviours are changed:
+ * decoder always operates in single threaded mode (multi-threaded operation only works for DX12)
+ * decoder will skip software decode (dav1d_decode_frame)
+
+ Included is also an update to dav1d.exe cli application to include support for DXVA decode on Windows platforms.
+ This is an example of how to implement a complete DXVA decoding solution using the new callbacks.
+
+ This change has been validated to work with AV1 DXVA hardware.
+---
+ include/dav1d/dav1d.h     |  71 +++++++
+ include/dav1d/dxva_av1.h  | 302 ++++++++++++++++++++++++++
+ include/dav1d/meson.build |  20 +-
+ include/dav1d/picture.h   |   7 +
+ meson.build               |   6 +-
+ src/decode.c              |  27 ++-
+ src/dxva.c                | 432 ++++++++++++++++++++++++++++++++++++++
+ src/dxva.h                |  44 ++++
+ src/internal.h            |   8 +-
+ src/lib.c                 |  19 +-
+ src/meson.build           |   6 +
+ tools/dav1d.c             |  23 ++
+ tools/dav1d_cli_dxva.c    | 403 +++++++++++++++++++++++++++++++++++
+ tools/dav1d_cli_dxva.h    |  63 ++++++
+ tools/dav1d_cli_parse.c   |  12 +-
+ tools/dav1d_cli_parse.h   |   1 +
+ tools/meson.build         |   8 +-
+ 17 files changed, 1436 insertions(+), 16 deletions(-)
+ create mode 100644 include/dav1d/dxva_av1.h
+ create mode 100644 src/dxva.c
+ create mode 100644 src/dxva.h
+ create mode 100644 tools/dav1d_cli_dxva.c
+ create mode 100644 tools/dav1d_cli_dxva.h
+
+diff --git a/include/dav1d/dav1d.h b/include/dav1d/dav1d.h
+index 32fe8c3..1143c1f 100644
+--- a/include/dav1d/dav1d.h
++++ b/include/dav1d/dav1d.h
+@@ -43,9 +43,25 @@ extern "C" {
+ typedef struct Dav1dContext Dav1dContext;
+ typedef struct Dav1dRef Dav1dRef;
+ 
++typedef struct _DXVA_PicParams_AV1 DXVA_PicParams_AV1;
++typedef struct _DXVA_Tile_AV1 DXVA_Tile_AV1;
++
+ #define DAV1D_MAX_FRAME_THREADS 256
+ #define DAV1D_MAX_TILE_THREADS 64
+ 
++typedef struct Dav1dTileGroup {
++
++    /**
++    * Bitstream data representing the tiles in this group.
++    */
++    Dav1dData data;
++
++    /**
++    * Start and end tile indexes represented by this tile group.
++    */
++    int start, end;
++} Dav1dTileGroup;
++
+ typedef struct Dav1dLogger {
+     void *cookie; ///< Custom data to pass to the callback.
+     /**
+@@ -58,6 +74,60 @@ typedef struct Dav1dLogger {
+     void (*callback)(void *cookie, const char *format, va_list ap);
+ } Dav1dLogger;
+ 
++typedef struct Dav1dDXVA {
++    void *cookie; ///< Custom data to pass to the callback
++    /**
++    * DXVA Decoding callback.  Called to generate a picture using DXVA.
++    *
++    * @param             cookie Custom pointer passed to all calls.
++    * @param picture_parameters Filled DXVA picture parameters struct.
++    * @param              tiles Array of tile information.
++    * @param            n_tiles Number of tiles in array.
++    * @param     output_picture Picture to decode into.
++    * @param        tile_groups Array of tile groups carrying the bitstream data.
++    * @param   tile_group_count Number of entries in tile_groups.
++    *
++    * @note  When this callback is set, the decoder no longer performs
++    *        any software decoding and operates in single-threaded
++    *        mode, regardless of the setting of n_frame_threads.
++    *
++    *        Tiles data structure is filled assuming bitstream data is copied into
++    *        one contiguous buffer in-order from the tile_groups array.
++    *
++    * @return 0 on success, or < 0 (a negative DAV1D_ERR code) on error.
++    */
++    int(*decode_callback)(void *cookie, DXVA_PicParams_AV1 *picture_parameters, DXVA_Tile_AV1 *tiles, const int n_tiles, Dav1dPicture *output_picture, Dav1dTileGroup *tile_groups, int tile_group_count);
++
++    /**
++     * Allocate DXVA picture parameters buffer, and tile array.
++     *
++     * @param    cookie Custom pointer passed to all calls.
++     * @param   picture Picture to allocate DXVA buffers for.
++     * @param picparams Pointer to store the allocated buffer in.
++     * @param     tiles Pointer to store tile array in.
++     * @param   n_tiles Size of tile array to allocate.
++     *
++     * @note   When using DX11 the driver should allocate this
++     *         buffer and this field should be set.  When using
++     *         DX12 it is safe to leave this as the default.
++     *
++     * @return 0 on success. A negative DAV1D_ERR value on error.
++     */
++    int(*alloc_callback)(void *cookie, Dav1dPicture* picture, DXVA_PicParams_AV1 **picparams, DXVA_Tile_AV1 **tiles, int n_tiles);
++
++    /**
++     * Release the DXVA picture parameters buffer.
++     *
++     * @param    cookie Custom pointer passed to all calls.
++     * @param picparams The picture params buffer that was
++     *                  allocated by alloc_callback().
++     * @param     tiles The tiles array that was allocated
++     *                  by alloc_callback().
++     */
++    void(*release_callback)(void *cookie, DXVA_PicParams_AV1 *picparams, DXVA_Tile_AV1 *tiles);
++
++} Dav1dDXVA;
++
+ typedef struct Dav1dSettings {
+     int n_frame_threads;
+     int n_tile_threads;
+@@ -68,6 +138,7 @@ typedef struct Dav1dSettings {
+     uint8_t reserved[32]; ///< reserved for future use
+     Dav1dPicAllocator allocator; ///< Picture allocator callback.
+     Dav1dLogger logger; ///< Logger callback.
++    Dav1dDXVA dxva; ///< DXVA callbacks
+ } Dav1dSettings;
+ 
+ /**
+diff --git a/include/dav1d/dxva_av1.h b/include/dav1d/dxva_av1.h
+new file mode 100644
+index 0000000..0a37c41
+--- /dev/null
++++ b/include/dav1d/dxva_av1.h
+@@ -0,0 +1,302 @@
++//------------------------------------------------------------------------------
++// File: DXVA_AV1.h
++//
++// Desc: DirectX Video Acceleration header file.
++// This file is a copied excerpt of the Win32 API, defining DXVA
++//  structures necessary for decode of AV1 bitstreams.
++//
++// Copyright (c) 2020, Microsoft Corporation.
++//------------------------------------------------------------------------------
++
++#ifndef _DIRECTX_AV1_VA_
++#define _DIRECTX_AV1_VA_
++
++#if defined(_WIN32)
++
++#include <windows.h>
++#include <dxva.h>
++
++#ifdef __cplusplus
++extern "C" {
++#endif
++
++/* AV1 picture entry data structure */
++typedef struct _DXVA_PicEntry_AV1 {
++
++    UINT width;
++    UINT height;
++
++    // Global motion parameters
++    INT wmmat[6];
++    union {
++        struct {
++            UCHAR wminvalid : 1;
++            UCHAR wmtype : 2;
++            UCHAR Reserved : 5;
++        };
++        UCHAR GlobalMotionFlags;
++    } DUMMYUNIONNAME;
++
++    UCHAR Index;
++    UINT16 Reserved16Bits;
++
++} DXVA_PicEntry_AV1, * LPDXVA_PicEntry_AV1;
++
++/* AV1 picture parameters structure */
++typedef struct _DXVA_PicParams_AV1 {
++    UINT width;
++    UINT height;
++
++    UINT max_width;
++    UINT max_height;
++
++    UCHAR CurrPicTextureIndex;
++    UCHAR superres_denom;
++    UCHAR bitdepth;
++    UCHAR seq_profile;
++
++    // Tiles:
++    struct {
++        UCHAR cols;
++        UCHAR rows;
++        USHORT context_update_id;
++        USHORT widths[64];
++        USHORT heights[64];
++    } tiles;
++
++    // Coding Tools
++    union {
++        struct {
++            UINT use_128x128_superblock : 1;
++            UINT intra_edge_filter : 1;
++            UINT interintra_compound : 1;
++            UINT masked_compound : 1;
++            UINT warped_motion : 1;
++            UINT dual_filter : 1;
++            UINT jnt_comp : 1;
++            UINT screen_content_tools : 1;
++            UINT integer_mv : 1;
++            UINT cdef : 1;
++            UINT restoration : 1;
++            UINT film_grain : 1;
++            UINT intrabc : 1;
++            UINT high_precision_mv : 1;
++            UINT switchable_motion_mode : 1;
++            UINT filter_intra : 1;
++            UINT disable_frame_end_update_cdf : 1;
++            UINT disable_cdf_update : 1;
++            UINT reference_mode : 1;
++            UINT skip_mode : 1;
++            UINT reduced_tx_set : 1;
++            UINT superres : 1;
++            UINT tx_mode : 2;
++            UINT use_ref_frame_mvs : 1;
++            UINT enable_ref_frame_mvs : 1;
++            UINT reference_frame_update : 1;
++            UINT Reserved : 5;
++        };
++        UINT32 CodingParamToolFlags;
++    } coding;
++
++    // Format & Picture Info flags
++    union {
++        struct {
++            UCHAR frame_type : 2;
++            UCHAR show_frame : 1;
++            UCHAR showable_frame : 1;
++            UCHAR subsampling_x : 1;
++            UCHAR subsampling_y : 1;
++            UCHAR mono_chrome : 1;
++            UCHAR Reserved : 1;
++        };
++        UCHAR FormatAndPictureInfoFlags;
++    } format;
++
++    // References
++    UCHAR primary_ref_frame;
++    UCHAR order_hint;
++    UCHAR order_hint_bits;
++
++    DXVA_PicEntry_AV1 frame_refs[7];
++    UCHAR RefFrameMapTextureIndex[8];
++
++    // Loop filter parameters
++    struct {
++        UCHAR filter_level[2];
++        UCHAR filter_level_u;
++        UCHAR filter_level_v;
++
++        UCHAR sharpness_level;
++        union {
++            struct {
++                UCHAR mode_ref_delta_enabled : 1;
++                UCHAR mode_ref_delta_update : 1;
++                UCHAR delta_lf_multi : 1;
++                UCHAR delta_lf_present : 1;
++                UCHAR Reserved : 4;
++            };
++            UCHAR ControlFlags;
++        } DUMMYUNIONNAME;
++        CHAR ref_deltas[8];
++        CHAR mode_deltas[2];
++        UCHAR delta_lf_res;
++        UCHAR frame_restoration_type[3];
++        USHORT log2_restoration_unit_size[3];
++        UINT16 Reserved16Bits;
++    } loop_filter;
++
++    // Quantization
++    struct {
++        union {
++            struct {
++                UCHAR delta_q_present : 1;
++                UCHAR delta_q_res : 2;
++                UCHAR Reserved : 5;
++            };
++            UCHAR ControlFlags;
++        } DUMMYUNIONNAME;
++
++        UCHAR base_qindex;
++        CHAR y_dc_delta_q;
++        CHAR u_dc_delta_q;
++        CHAR v_dc_delta_q;
++        CHAR u_ac_delta_q;
++        CHAR v_ac_delta_q;
++        // using_qmatrix:
++        UCHAR qm_y;
++        UCHAR qm_u;
++        UCHAR qm_v;
++        UINT16 Reserved16Bits;
++    } quantization;
++
++    // Cdef parameters
++    struct {
++        union {
++            struct {
++                UCHAR damping : 2;
++                UCHAR bits : 2;
++                UCHAR Reserved : 4;
++            };
++            UCHAR ControlFlags;
++        } DUMMYUNIONNAME;
++
++        union {
++            struct {
++                UCHAR primary : 6;
++                UCHAR secondary : 2;
++            };
++            UCHAR combined;
++        } y_strengths[8];
++
++        union {
++            struct {
++                UCHAR primary : 6;
++                UCHAR secondary : 2;
++            };
++            UCHAR combined;
++        } uv_strengths[8];
++
++    } cdef;
++
++    UCHAR interp_filter;
++
++    // Segmentation
++    struct {
++        union {
++            struct {
++                UCHAR enabled : 1;
++                UCHAR update_map : 1;
++                UCHAR update_data : 1;
++                UCHAR temporal_update : 1;
++                UCHAR Reserved : 4;
++            };
++            UCHAR ControlFlags;
++        } DUMMYUNIONNAME;
++        UCHAR Reserved24Bits[3];
++
++        union {
++            struct {
++                UCHAR alt_q : 1;
++                UCHAR alt_lf_y_v : 1;
++                UCHAR alt_lf_y_h : 1;
++                UCHAR alt_lf_u : 1;
++                UCHAR alt_lf_v : 1;
++                UCHAR ref_frame : 1;
++                UCHAR skip : 1;
++                UCHAR globalmv : 1;
++            };
++            UCHAR mask;
++        } feature_mask[8];
++
++        SHORT feature_data[8][8];
++
++    } segmentation;
++
++    struct {
++        union {
++            struct {
++                USHORT apply_grain : 1;
++                USHORT scaling_shift_minus8 : 2;
++                USHORT chroma_scaling_from_luma : 1;
++                USHORT ar_coeff_lag : 2;
++                USHORT ar_coeff_shift_minus6 : 2;
++                USHORT grain_scale_shift : 2;
++                USHORT overlap_flag : 1;
++                USHORT clip_to_restricted_range : 1;
++                USHORT matrix_coeff_is_identity : 1;
++                USHORT Reserved : 3;
++            };
++            USHORT ControlFlags;
++        } DUMMYUNIONNAME;
++
++        USHORT grain_seed;
++        UCHAR scaling_points_y[14][2];
++        UCHAR num_y_points;
++        UCHAR scaling_points_cb[10][2];
++        UCHAR num_cb_points;
++        UCHAR scaling_points_cr[10][2];
++        UCHAR num_cr_points;
++        UCHAR ar_coeffs_y[24];
++        UCHAR ar_coeffs_cb[25];
++        UCHAR ar_coeffs_cr[25];
++        UCHAR cb_mult;
++        UCHAR cb_luma_mult;
++        UCHAR cr_mult;
++        UCHAR cr_luma_mult;
++        UCHAR Reserved8Bits;
++        SHORT cb_offset;
++        SHORT cr_offset;
++    } film_grain;
++
++    UINT   Reserved32Bits;
++    UINT   StatusReportFeedbackNumber;
++} DXVA_PicParams_AV1, * LPDXVA_PicParams_AV1;
++
++/* AV1 tile structure */
++typedef struct _DXVA_Tile_AV1 {
++    UINT   DataOffset;
++    UINT   DataSize;
++    USHORT row;
++    USHORT column;
++    UINT16 Reserved16Bits;
++    UCHAR anchor_frame;
++    UCHAR Reserved8Bits;
++} DXVA_Tile_AV1, * LPDXVA_Tile_AV1;
++
++/* AV1 status reporting data structure */
++typedef struct _DXVA_Status_AV1 {
++    UINT  StatusReportFeedbackNumber;
++    DXVA_PicEntry_AV1 CurrPic;
++    UCHAR  BufType;
++    UCHAR  Status;
++    UCHAR  Reserved8Bits;
++    USHORT NumMbsAffected;
++} DXVA_Status_AV1, * LPDXVA_Status_AV1;
++
++#ifdef __cplusplus
++}
++#endif
++
++#endif // defined(_WIN32)
++
++#endif // _DIRECTX_AV1_VA_
+diff --git a/include/dav1d/meson.build b/include/dav1d/meson.build
+index b5649d3..275c8a2 100644
+--- a/include/dav1d/meson.build
++++ b/include/dav1d/meson.build
+@@ -31,11 +31,21 @@ version_h_target = configure_file(input: 'version.h.in',
+                                   output: 'version.h',
+                                   configuration: version_h_data)
+ 
++dav1d_api_headers = files(
++    'common.h',
++    'data.h',
++    'dav1d.h',
++    'headers.h',
++    'picture.h',
++    )
++
++if host_machine.system() == 'windows'
++    dav1d_api_headers += files(
++        'dxva_av1.h',
++    )
++endif
++
+ # install headers
+-install_headers('common.h',
+-                'data.h',
+-                'dav1d.h',
+-                'headers.h',
+-                'picture.h',
++install_headers(dav1d_api_headers,
+                 version_h_target,
+                 subdir : 'dav1d')
+diff --git a/include/dav1d/picture.h b/include/dav1d/picture.h
+index 98e5eb5..3e75607 100644
+--- a/include/dav1d/picture.h
++++ b/include/dav1d/picture.h
+@@ -93,6 +93,13 @@ typedef struct Dav1dPicture {
+     struct Dav1dRef *ref; ///< Frame data allocation origin
+ 
+     void *allocator_data; ///< pointer managed by the allocator
++
++    /**
++     * DXVA texture array index of this picture when decoding with DXVA.
++     * The allocator should store the index here; it is used to fill out
++     * DXVA_PicParams_AV1::CurrPicTextureIndex and ::RefFrameMapTextureIndex.
++     */
++    uint16_t dxva_picture_index;
+ } Dav1dPicture;
+ 
+ typedef struct Dav1dPicAllocator {
+diff --git a/meson.build b/meson.build
+index d5366f9..defda53 100644
+--- a/meson.build
++++ b/meson.build
+@@ -30,7 +30,7 @@ project('dav1d', ['c'],
+                       'b_ndebug=if-release'],
+     meson_version: '>= 0.47.0')
+ 
+-dav1d_soname_version       = '4.0.2'
++dav1d_soname_version       = '5.0.0'
+ dav1d_api_version_array    = dav1d_soname_version.split('.')
+ dav1d_api_version_major    = dav1d_api_version_array[0]
+ dav1d_api_version_minor    = dav1d_api_version_array[1]
+@@ -156,6 +156,10 @@ if host_machine.system() == 'linux'
+     endif
+ endif
+ 
++d3d11_dependency = []
++if host_machine.system() == 'windows'
++    d3d11_dependency = cc.find_library('d3d11', required: true)
++endif
+ 
+ # Header checks
+ 
+diff --git a/src/decode.c b/src/decode.c
+index f678215..345f23e 100644
+--- a/src/decode.c
++++ b/src/decode.c
+@@ -50,6 +50,7 @@
+ #include "src/tables.h"
+ #include "src/thread_task.h"
+ #include "src/warpmv.h"
++#include "src/dxva.h"
+ 
+ static void init_quant_tables(const Dav1dSequenceHeader *const seq_hdr,
+                               const Dav1dFrameHeader *const frame_hdr,
+@@ -3441,6 +3442,15 @@ int dav1d_submit_frame(Dav1dContext *const c) {
+         f->resize_start[1] = get_upscale_x0(in_cw, out_cw, f->resize_step[1]);
+     }
+ 
++#ifdef _WIN32
++    // allocate and fill DXVA structures if using DXVA
++    if (c->dxva.decode_callback)
++    {
++        res = dav1d_allocate_frame_dxva(f, c);
++        if (res < 0) goto error;
++    }
++#endif // _WIN32
++
+     // move f->cur into output queue
+     if (c->n_fc == 1) {
+         if (f->frame_hdr->show_frame)
+@@ -3585,8 +3595,21 @@ int dav1d_submit_frame(Dav1dContext *const c) {
+         }
+     }
+ 
+-    if (c->n_fc == 1) {
+-        if ((res = dav1d_decode_frame(f)) < 0) {
++    // FIXME currently DXVA based decode is always single-threaded
++    //  In DX12 it is possible to schedule DXVA decode multi-threaded.
++    //  This is not relevant to DX11/DX9 as those only support single-threaded operation.
++    if (c->dxva.decode_callback || c->n_fc == 1) {
++#ifdef _WIN32
++        if (c->dxva.decode_callback)
++        {
++            res = dav1d_decode_frame_dxva(f);
++        }
++        else
++#endif // _WIN32
++        {
++            res = dav1d_decode_frame(f);
++        }
++        if (res < 0) {
+             dav1d_picture_unref_internal(&c->out);
+             for (int i = 0; i < 8; i++) {
+                 if (refresh_frame_flags & (1 << i)) {
+diff --git a/src/dxva.c b/src/dxva.c
+new file mode 100644
+index 0000000..4d39fc9
+--- /dev/null
++++ b/src/dxva.c
+@@ -0,0 +1,432 @@
++/*
++ * Copyright © 2020, VideoLAN and dav1d authors
++ * All rights reserved.
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright notice, this
++ *    list of conditions and the following disclaimer.
++ *
++ * 2. Redistributions in binary form must reproduce the above copyright notice,
++ *    this list of conditions and the following disclaimer in the documentation
++ *    and/or other materials provided with the distribution.
++ *
++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
++ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ */
++
++#include "config.h"
++
++#include <errno.h>
++#include <stdint.h>
++#include <stdio.h>
++#include <stdlib.h>
++#include <string.h>
++
++#include "dav1d/headers.h"
++#include "common/intops.h"
++#include "common/mem.h"
++#include "common/validate.h"
++
++#if defined(_WIN32)
++
++#include "src/internal.h"
++#include "src/log.h"
++#include "src/picture.h"
++#include "src/ref.h"
++#include "src/dxva.h"
++#include "dav1d/dxva_av1.h"
++
++#define DXVA_INVALID_PICTURE_INDEX 0xFFu
++#define DXVA_INVALID_QM 0xFFu
++
++static int dav1d_restoration_type_to_dxva(const enum Dav1dRestorationType type) {
++    switch (type) {
++    case DAV1D_RESTORATION_NONE:       return 0;
++    case DAV1D_RESTORATION_SWITCHABLE: return 3;
++    case DAV1D_RESTORATION_WIENER:     return 1;
++    case DAV1D_RESTORATION_SGRPROJ:    return 2;
++    }
++    return 0;
++}
++
++static int fill_picparams_struct(DXVA_PicParams_AV1* params, const Dav1dFrameContext *fc, Dav1dContext *const c) {
++    params->CurrPicTextureIndex = (unsigned char)fc->cur.dxva_picture_index;
++
++    // Basics
++    const int bitdepthTranslation[] = { 8,10,12 };
++    params->width = fc->frame_hdr->width[0];
++    params->height = fc->frame_hdr->height;
++    params->max_width = fc->seq_hdr->max_width;
++    params->max_height = fc->seq_hdr->max_height;
++
++    params->superres_denom = fc->frame_hdr->super_res.width_scale_denominator;
++
++    if (fc->seq_hdr->hbd > 2)
++        return DAV1D_ERR(EINVAL);
++
++    params->bitdepth = bitdepthTranslation[fc->seq_hdr->hbd];
++    params->seq_profile = fc->seq_hdr->profile;
++    params->interp_filter = (int)fc->frame_hdr->subpel_filter_mode; // enum same as dxva
++
++    // Tiles
++    params->tiles.cols = fc->frame_hdr->tiling.cols;
++    params->tiles.rows = fc->frame_hdr->tiling.rows;
++    params->tiles.context_update_id = fc->frame_hdr->tiling.update;
++
++    // AV1 DXVA defines tiles in terms of tile width / height in SBs.
++    // dAV1d specifies the start offset of each tile
++    uint16_t last = fc->frame_hdr->tiling.col_start_sb[0];
++    for (int w = 1; w <= fc->frame_hdr->tiling.cols; w++) {
++        params->tiles.widths[w-1] = fc->frame_hdr->tiling.col_start_sb[w] - last;
++        last = fc->frame_hdr->tiling.col_start_sb[w];
++    }
++    last = fc->frame_hdr->tiling.row_start_sb[0];
++    for (int h = 1; h <= fc->frame_hdr->tiling.rows; h++) {
++        params->tiles.heights[h-1] = fc->frame_hdr->tiling.row_start_sb[h] - last;
++        last = fc->frame_hdr->tiling.row_start_sb[h];
++    }
++
++    // Coding Tools
++    params->coding.use_128x128_superblock = fc->seq_hdr->sb128;
++    params->coding.intra_edge_filter = fc->seq_hdr->intra_edge_filter;
++    params->coding.interintra_compound = fc->seq_hdr->inter_intra;
++    params->coding.masked_compound = fc->seq_hdr->masked_compound;
++    params->coding.warped_motion = fc->frame_hdr->warp_motion;
++    params->coding.dual_filter = fc->seq_hdr->dual_filter;
++    params->coding.jnt_comp = fc->seq_hdr->jnt_comp;
++    params->coding.screen_content_tools = fc->frame_hdr->allow_screen_content_tools;
++    params->coding.integer_mv = fc->frame_hdr->force_integer_mv;
++    params->coding.cdef = fc->seq_hdr->cdef;
++    params->coding.restoration = fc->seq_hdr->restoration;
++    params->coding.film_grain = fc->seq_hdr->film_grain_present;
++    params->coding.intrabc = fc->frame_hdr->allow_intrabc;
++    params->coding.high_precision_mv = fc->frame_hdr->hp;
++    params->coding.switchable_motion_mode = fc->frame_hdr->switchable_motion_mode;
++    params->coding.filter_intra = fc->seq_hdr->filter_intra;
++    params->coding.disable_frame_end_update_cdf = !fc->frame_hdr->refresh_context;
++    params->coding.disable_cdf_update = fc->frame_hdr->disable_cdf_update;
++    params->coding.reference_mode = fc->frame_hdr->switchable_comp_refs;
++    params->coding.skip_mode = fc->frame_hdr->skip_mode_enabled;
++    params->coding.reduced_tx_set = fc->frame_hdr->reduced_txtp_set;
++    params->coding.superres = fc->frame_hdr->super_res.enabled;
++    params->coding.tx_mode = (int)fc->frame_hdr->txfm_mode; // enum same as dxva
++    params->coding.use_ref_frame_mvs = fc->frame_hdr->use_ref_frame_mvs;
++    params->coding.enable_ref_frame_mvs = fc->seq_hdr->ref_frame_mvs;
++    params->coding.reference_frame_update = !(fc->frame_hdr->show_existing_frame == 1 && fc->frame_hdr->frame_type == DAV1D_FRAME_TYPE_KEY);
++
++    // Format & Picture Info flags
++    params->format.frame_type = (int)fc->frame_hdr->frame_type; // enum is same as dxva
++    params->format.show_frame = fc->frame_hdr->show_frame;
++    params->format.showable_frame = fc->frame_hdr->showable_frame;
++    switch (fc->cur.p.layout) {
++    case DAV1D_PIXEL_LAYOUT_I400:
++    case DAV1D_PIXEL_LAYOUT_I420:
++        params->format.subsampling_x = 1;
++        params->format.subsampling_y = 1;
++        break;
++    case DAV1D_PIXEL_LAYOUT_I422:
++        params->format.subsampling_x = 1;
++        params->format.subsampling_y = 0;
++        break;
++    case DAV1D_PIXEL_LAYOUT_I444:
++        params->format.subsampling_x = 0;
++        params->format.subsampling_y = 0;
++        break;
++    default:
++        return DAV1D_ERR(EINVAL);
++    }
++    params->format.mono_chrome = fc->seq_hdr->monochrome;
++
++    // References
++    params->order_hint = fc->frame_hdr->frame_offset;
++    params->order_hint_bits = fc->seq_hdr->order_hint_n_bits;
++
++    params->primary_ref_frame = fc->frame_hdr->primary_ref_frame;
++
++    memset(params->RefFrameMapTextureIndex, 0xFF, sizeof(params->RefFrameMapTextureIndex));
++    for (int i = 0; i < DAV1D_REFS_PER_FRAME; i++) {
++        const int idx = fc->frame_hdr->refidx[i];
++        if (c->refs[idx].p.p.data[0]) {
++            params->frame_refs[i].Index = idx;
++        } else {
++            params->frame_refs[i].Index = DXVA_INVALID_PICTURE_INDEX;
++        }
++        memcpy(params->frame_refs[i].wmmat, fc->frame_hdr->gmv[i].matrix, 6 * sizeof(int));
++        params->frame_refs[i].wmtype = (int)fc->frame_hdr->gmv[i].type; // enum same as dxva
++        params->frame_refs[i].wminvalid = fc->frame_hdr->gmv[i].type == DAV1D_WM_TYPE_IDENTITY;
++        params->frame_refs[i].width = c->refs[idx].p.p.p.w;
++        params->frame_refs[i].height = c->refs[idx].p.p.p.h;
++    }
++    for (int i = 0; i < 8; i++) {
++        params->RefFrameMapTextureIndex[i] = (uint8_t)c->refs[i].p.p.dxva_picture_index;
++    }
++
++    // Loop filter parameters
++    params->loop_filter.filter_level[0] = fc->frame_hdr->loopfilter.level_y[0];
++    params->loop_filter.filter_level[1] = fc->frame_hdr->loopfilter.level_y[1];
++    params->loop_filter.filter_level_u = fc->frame_hdr->loopfilter.level_u;
++    params->loop_filter.filter_level_v = fc->frame_hdr->loopfilter.level_v;
++    params->loop_filter.sharpness_level = fc->frame_hdr->loopfilter.sharpness;
++    params->loop_filter.mode_ref_delta_enabled = fc->frame_hdr->loopfilter.mode_ref_delta_enabled;
++    params->loop_filter.mode_ref_delta_update = fc->frame_hdr->loopfilter.mode_ref_delta_update;
++    params->loop_filter.delta_lf_multi = fc->frame_hdr->delta.lf.multi;
++    params->loop_filter.delta_lf_present = fc->frame_hdr->delta.lf.present;
++    params->loop_filter.mode_deltas[0] = fc->frame_hdr->loopfilter.mode_ref_deltas.mode_delta[0];
++    params->loop_filter.mode_deltas[1] = fc->frame_hdr->loopfilter.mode_ref_deltas.mode_delta[1];
++    for (int i = 0; i < DAV1D_TOTAL_REFS_PER_FRAME; i++) {
++        params->loop_filter.ref_deltas[i] = fc->frame_hdr->loopfilter.mode_ref_deltas.ref_delta[i];
++    }
++    params->loop_filter.delta_lf_res = fc->frame_hdr->delta.lf.res_log2;
++    char haverestoration = fc->frame_hdr->restoration.type[0] || fc->frame_hdr->restoration.type[1] || fc->frame_hdr->restoration.type[2];
++    for (int i = 0; i < 3; i++) {
++        params->loop_filter.frame_restoration_type[i] = dav1d_restoration_type_to_dxva(fc->frame_hdr->restoration.type[i]);
++        params->loop_filter.log2_restoration_unit_size[i] = haverestoration ? fc->frame_hdr->restoration.unit_size[min(i, 1)] : 8; // dav1d only tracks y and uv not y,u,v
++    }
++
++    // Quantization
++    params->quantization.delta_q_present = fc->frame_hdr->delta.q.present;
++    params->quantization.delta_q_res = fc->frame_hdr->delta.q.res_log2;
++    params->quantization.base_qindex = fc->frame_hdr->quant.yac;
++    params->quantization.y_dc_delta_q = fc->frame_hdr->quant.ydc_delta;
++    params->quantization.u_dc_delta_q = fc->frame_hdr->quant.udc_delta;
++    params->quantization.v_dc_delta_q = fc->frame_hdr->quant.vdc_delta;
++    params->quantization.u_ac_delta_q = fc->frame_hdr->quant.uac_delta;
++    params->quantization.v_ac_delta_q = fc->frame_hdr->quant.vac_delta;
++    if (fc->frame_hdr->quant.qm)
++    {
++        params->quantization.qm_y = fc->frame_hdr->quant.qm_y;
++        params->quantization.qm_u = fc->frame_hdr->quant.qm_u;
++        params->quantization.qm_v = fc->frame_hdr->quant.qm_v;
++    }
++    else
++    {
++        params->quantization.qm_y = DXVA_INVALID_QM;
++        params->quantization.qm_u = DXVA_INVALID_QM;
++        params->quantization.qm_v = DXVA_INVALID_QM;
++    }
++
++    // Cdef parameters
++    params->cdef.damping = fc->frame_hdr->cdef.damping - 3;
++    params->cdef.bits = fc->frame_hdr->cdef.n_bits;
++    for (int i = 0; i < DAV1D_MAX_CDEF_STRENGTHS; i++) {
++        params->cdef.y_strengths[i].primary = fc->frame_hdr->cdef.y_strength[i] >> 2;
++        params->cdef.y_strengths[i].secondary = fc->frame_hdr->cdef.y_strength[i] & 0x3;
++        params->cdef.uv_strengths[i].primary = fc->frame_hdr->cdef.uv_strength[i] >> 2;
++        params->cdef.uv_strengths[i].secondary = fc->frame_hdr->cdef.uv_strength[i] & 0x3;
++    }
++
++    // Segmentation
++    params->segmentation.enabled = fc->frame_hdr->segmentation.enabled;
++    params->segmentation.update_map = fc->frame_hdr->segmentation.update_map;
++    params->segmentation.update_data = fc->frame_hdr->segmentation.update_data;
++    params->segmentation.temporal_update = fc->frame_hdr->segmentation.temporal;
++    for (int i = 0; i < DAV1D_MAX_SEGMENTS; i++) {
++        params->segmentation.feature_mask[i].alt_q = fc->frame_hdr->segmentation.seg_data.d[i].delta_q != 0;
++        params->segmentation.feature_data[i][0] = fc->frame_hdr->segmentation.seg_data.d[i].delta_q;
++
++        params->segmentation.feature_mask[i].alt_lf_y_v = fc->frame_hdr->segmentation.seg_data.d[i].delta_lf_y_v != 0;
++        params->segmentation.feature_data[i][1] = fc->frame_hdr->segmentation.seg_data.d[i].delta_lf_y_v;
++
++        params->segmentation.feature_mask[i].alt_lf_y_h = fc->frame_hdr->segmentation.seg_data.d[i].delta_lf_y_h != 0;
++        params->segmentation.feature_data[i][2] = fc->frame_hdr->segmentation.seg_data.d[i].delta_lf_y_h;
++
++        params->segmentation.feature_mask[i].alt_lf_u = fc->frame_hdr->segmentation.seg_data.d[i].delta_lf_u != 0;
++        params->segmentation.feature_data[i][3] = fc->frame_hdr->segmentation.seg_data.d[i].delta_lf_u;
++
++        params->segmentation.feature_mask[i].alt_lf_v = fc->frame_hdr->segmentation.seg_data.d[i].delta_lf_v != 0;
++        params->segmentation.feature_data[i][4] = fc->frame_hdr->segmentation.seg_data.d[i].delta_lf_v;
++
++        params->segmentation.feature_mask[i].ref_frame = fc->frame_hdr->segmentation.seg_data.d[i].ref != -1;
++        params->segmentation.feature_data[i][5] = fc->frame_hdr->segmentation.seg_data.d[i].ref == -1 ? 0 : fc->frame_hdr->segmentation.seg_data.d[i].ref;
++
++        params->segmentation.feature_mask[i].skip = fc->frame_hdr->segmentation.seg_data.d[i].skip != 0;
++        params->segmentation.feature_data[i][6] = fc->frame_hdr->segmentation.seg_data.d[i].skip;
++
++        params->segmentation.feature_mask[i].globalmv = fc->frame_hdr->segmentation.seg_data.d[i].globalmv != 0;
++        params->segmentation.feature_data[i][7] = fc->frame_hdr->segmentation.seg_data.d[i].globalmv;
++    }
++
++    // Film Grain
++    if (fc->frame_hdr->film_grain.present) {
++        params->film_grain.apply_grain = 1;
++        params->film_grain.scaling_shift_minus8 = fc->frame_hdr->film_grain.data.scaling_shift - 8;
++        params->film_grain.chroma_scaling_from_luma = fc->frame_hdr->film_grain.data.chroma_scaling_from_luma;
++        params->film_grain.ar_coeff_lag = fc->frame_hdr->film_grain.data.ar_coeff_lag;
++        params->film_grain.ar_coeff_shift_minus6 = (uint16_t)(fc->frame_hdr->film_grain.data.ar_coeff_shift - 6);
++        params->film_grain.grain_scale_shift = fc->frame_hdr->film_grain.data.grain_scale_shift;
++        params->film_grain.overlap_flag = fc->frame_hdr->film_grain.data.overlap_flag;
++        params->film_grain.clip_to_restricted_range = fc->frame_hdr->film_grain.data.clip_to_restricted_range;
++        params->film_grain.matrix_coeff_is_identity = fc->seq_hdr->mtrx == DAV1D_MC_IDENTITY;
++        params->film_grain.grain_seed = fc->frame_hdr->film_grain.data.seed;
++        memcpy(params->film_grain.scaling_points_y, fc->frame_hdr->film_grain.data.y_points, 14 * 2);
++        params->film_grain.num_y_points = fc->frame_hdr->film_grain.data.num_y_points;
++        memcpy(params->film_grain.scaling_points_cb, fc->frame_hdr->film_grain.data.uv_points[0], 10 * 2);
++        params->film_grain.num_cb_points = fc->frame_hdr->film_grain.data.num_uv_points[0];
++        memcpy(params->film_grain.scaling_points_cr, fc->frame_hdr->film_grain.data.uv_points[1], 10 * 2);
++        params->film_grain.num_cr_points = fc->frame_hdr->film_grain.data.num_uv_points[1];
++        for (int i = 0; i < 24; i++) {
++            params->film_grain.ar_coeffs_y[i] = (UCHAR)((int)fc->frame_hdr->film_grain.data.ar_coeffs_y[i] + 128);
++        }
++        for (int i = 0; i < 25; i++) {
++            params->film_grain.ar_coeffs_cb[i] = fc->frame_hdr->film_grain.data.ar_coeffs_uv[0][i] + 128;
++            params->film_grain.ar_coeffs_cr[i] = fc->frame_hdr->film_grain.data.ar_coeffs_uv[1][i] + 128;
++        }
++        params->film_grain.cb_mult = fc->frame_hdr->film_grain.data.uv_mult[0] + 128;
++        params->film_grain.cb_luma_mult = fc->frame_hdr->film_grain.data.uv_luma_mult[0] + 128;
++        params->film_grain.cr_mult = fc->frame_hdr->film_grain.data.uv_mult[1] + 128;
++        params->film_grain.cr_luma_mult = fc->frame_hdr->film_grain.data.uv_luma_mult[1] + 128;
++        params->film_grain.cb_offset = fc->frame_hdr->film_grain.data.uv_offset[0] + 256;
++        params->film_grain.cr_offset = fc->frame_hdr->film_grain.data.uv_offset[1] + 256;
++    }
++
++    return 0;
++}
++
++int dav1d_allocate_frame_dxva(Dav1dFrameContext *const f, Dav1dContext *const c) {
++    // the tile list is sized for a 2^log2_cols by 2^log2_rows grid
++    const int n_tiles = (1 << f->frame_hdr->tiling.log2_cols) * (1 << f->frame_hdr->tiling.log2_rows);
++    DXVA_PicParams_AV1 *pic_params = NULL;
++    DXVA_Tile_AV1 *tile_list = NULL;
++
++    // a failing alloc callback is reported as-is, without calling the release callback
++    int res = c->dxva.alloc_callback(c->dxva.cookie, &f->cur, &pic_params, &tile_list, n_tiles);
++    if (res < 0)
++        return res;
++    res = fill_picparams_struct(pic_params, f, c);
++    if (res < 0) {
++        // hand both buffers back before reporting the failure
++        c->dxva.release_callback(c->dxva.cookie, pic_params, tile_list);
++        return res;
++    }
++    f->dxva_params = pic_params;
++    f->dxva_tiles = tile_list;
++    return 0;
++}
++
++int dav1d_decode_frame_dxva(Dav1dFrameContext *const f) {
++    int retval = DAV1D_ERR(ENOMEM);
++    // Every exit path runs the cleanup at "error:" (DXVA buffers, references, tile data).
++    int tile_count = (1 << f->frame_hdr->tiling.log2_cols) * (1 << f->frame_hdr->tiling.log2_rows);
++
++    // Construct tile list
++    //  Tiles are copied into GPU memory as one contiguous block, with each
++    //  tile having an entry in the tile list (DXVA_Tile_AV1); specifying its offset
++    //  into the GPU buffer and its size.
++    int tile_row = 0, tile_col = 0, tile_index = 0;
++    size_t total_data_offset = 0;
++    DXVA_Tile_AV1 *current_tile = f->dxva_tiles;
++    for (int i = 0; i < f->n_tile_data; i++) {
++        const uint8_t *data = f->tile[i].data.data;
++        size_t size = f->tile[i].data.sz;
++
++        for (int j = f->tile[i].start; j <= f->tile[i].end; j++) {
++            size_t tile_sz;
++            if (j == f->tile[i].end) {
++                tile_sz = size;
++            } else {
++                if (f->frame_hdr->tiling.n_bytes > size)
++                    goto error;
++                tile_sz = 0;
++                for (unsigned k = 0; k < f->frame_hdr->tiling.n_bytes; k++) {
++                    tile_sz |= (unsigned)*data++ << (k * 8);
++                    total_data_offset++;
++                }
++                tile_sz++;
++                size -= f->frame_hdr->tiling.n_bytes;
++                if (tile_sz > size)
++                    goto error;
++            }
++
++            if (tile_index >= tile_count) // the tile list was allocated with tile_count entries
++                goto error;
++
++            current_tile->DataOffset = (uint32_t)total_data_offset;
++            current_tile->DataSize = (uint32_t)tile_sz;
++            current_tile->row = tile_row;
++            current_tile->column = tile_col++;
++            // large scale tile decoding process is not supported
++            current_tile->anchor_frame = DXVA_INVALID_PICTURE_INDEX;
++
++            if (tile_col == f->frame_hdr->tiling.cols) {
++                tile_col = 0;
++                tile_row++;
++            }
++
++            total_data_offset += tile_sz;
++            size -= tile_sz;
++            data += tile_sz;
++            tile_index++;
++            current_tile++;
++        }
++    }
++
++    if ((retval = f->c->dxva.decode_callback(f->c->dxva.cookie, f->dxva_params, f->dxva_tiles, tile_count, &f->cur, f->tile, f->n_tile_data)) < 0)
++        goto error;
++
++    retval = 0;
++
++error:
++    f->c->dxva.release_callback(f->c->dxva.cookie, f->dxva_params, f->dxva_tiles);
++
++    for (int i = 0; i < 7; i++) {
++        if (f->refp[i].p.data[0])
++            dav1d_thread_picture_unref(&f->refp[i]);
++        dav1d_ref_dec(&f->ref_mvs_ref[i]);
++    }
++
++    dav1d_picture_unref_internal(&f->cur);
++    dav1d_thread_picture_unref(&f->sr_cur);
++    dav1d_cdf_thread_unref(&f->in_cdf);
++    if (f->frame_hdr->refresh_context) {
++        dav1d_cdf_thread_signal(&f->out_cdf);
++        dav1d_cdf_thread_unref(&f->out_cdf);
++    }
++    dav1d_ref_dec(&f->cur_segmap_ref);
++    dav1d_ref_dec(&f->prev_segmap_ref);
++    dav1d_ref_dec(&f->mvs_ref);
++    dav1d_ref_dec(&f->seq_hdr_ref);
++    dav1d_ref_dec(&f->frame_hdr_ref);
++
++    for (int i = 0; i < f->n_tile_data; i++)
++        dav1d_data_unref_internal(&f->tile[i].data);
++
++    return retval;
++}
++
++int dav1d_default_dxva_alloc(void *cookie, Dav1dPicture* picture, DXVA_PicParams_AV1 **pic, DXVA_Tile_AV1 **tiles, int n_tiles) {
++    // Default alloc callback: zeroed heap buffers returned through pic/tiles; cookie and picture are unused.
++    *pic = (DXVA_PicParams_AV1*)calloc(1, sizeof(DXVA_PicParams_AV1));
++    *tiles = *pic ? (DXVA_Tile_AV1*)calloc(n_tiles, sizeof(DXVA_Tile_AV1)) : NULL;
++    if (!*tiles) { free(*pic); *pic = NULL; } // all-or-nothing: a half-finished allocation must not leak
++    return *tiles ? 0 : DAV1D_ERR(ENOMEM);
++}
++
++void dav1d_default_dxva_release(void *cookie, DXVA_PicParams_AV1 *pic, DXVA_Tile_AV1 *tiles) {
++    free(tiles); // cookie is not used by the default callbacks
++    free(pic);
++}
++
++#else // !defined(_WIN32)
++// Stubs for the branch where _WIN32 is not defined: no DXVA work is done here.
++int dav1d_default_dxva_alloc(void *cookie, Dav1dPicture* picture, DXVA_PicParams_AV1 **pic, DXVA_Tile_AV1 **tiles, int n_tiles) {
++    return DAV1D_ERR(EINVAL); // always fails; *pic and *tiles are left untouched
++}
++
++void dav1d_default_dxva_release(void *cookie, DXVA_PicParams_AV1 *pic, DXVA_Tile_AV1 *tiles) {
++}
++
++int dav1d_decode_frame_dxva(Dav1dFrameContext *const f) {
++    return DAV1D_ERR(EINVAL); // always fails; f is not touched
++}
++
++#endif // defined(_WIN32)
+diff --git a/src/dxva.h b/src/dxva.h
+new file mode 100644
+index 0000000..b695a9e
+--- /dev/null
++++ b/src/dxva.h
+@@ -0,0 +1,44 @@
++/*
++ * Copyright © 2020, VideoLAN and dav1d authors
++ * All rights reserved.
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright notice, this
++ *    list of conditions and the following disclaimer.
++ *
++ * 2. Redistributions in binary form must reproduce the above copyright notice,
++ *    this list of conditions and the following disclaimer in the documentation
++ *    and/or other materials provided with the distribution.
++ *
++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
++ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ */
++
++#ifndef DAV1D_DXVA_H
++#define DAV1D_DXVA_H
++
++#include <stddef.h>
++#include <stdint.h>
++
++#include "common.h"
++#include "headers.h"
++// Gets the DXVA buffers from the alloc callback, fills the picture parameters and stores both on f.
++int dav1d_allocate_frame_dxva(Dav1dFrameContext *const f, Dav1dContext *const c);
++// Builds the tile list, runs the decode callback, then releases the DXVA buffers and the frame's references.
++int dav1d_decode_frame_dxva(Dav1dFrameContext *const f);
++// Default alloc callback: calloc()ed buffers in the _WIN32 branch, DAV1D_ERR(EINVAL) in the other branch.
++int dav1d_default_dxva_alloc(void *cookie, Dav1dPicture* picture, DXVA_PicParams_AV1 **pic, DXVA_Tile_AV1 **tiles, int n_tiles);
++// Default release callback: free()s both buffers in the _WIN32 branch, does nothing in the other branch.
++void dav1d_default_dxva_release(void *cookie, DXVA_PicParams_AV1 *pic, DXVA_Tile_AV1 *tiles);
++
++#endif /* DAV1D_DXVA_H */
+diff --git a/src/internal.h b/src/internal.h
+index 07f5676..043d28c 100644
+--- a/src/internal.h
++++ b/src/internal.h
+@@ -67,11 +67,6 @@ typedef struct Dav1dDSPContext {
+     Dav1dLoopRestorationDSPContext lr;
+ } Dav1dDSPContext;
+ 
+-struct Dav1dTileGroup {
+-    Dav1dData data;
+-    int start, end;
+-};
+-
+ struct Dav1dContext {
+     Dav1dFrameContext *fc;
+     unsigned n_fc;
+@@ -135,6 +130,7 @@ struct Dav1dContext {
+     int drain;
+ 
+     Dav1dLogger logger;
++    Dav1dDXVA dxva;
+ };
+ 
+ struct Dav1dFrameContext {
+@@ -157,6 +153,8 @@ struct Dav1dFrameContext {
+     struct Dav1dTileGroup *tile;
+     int n_tile_data_alloc;
+     int n_tile_data;
++    DXVA_PicParams_AV1 *dxva_params; // DXVA
++    DXVA_Tile_AV1 *dxva_tiles; // DXVA
+ 
+     // for scalable references
+     struct ScalableMotionParams {
+diff --git a/src/lib.c b/src/lib.c
+index 82af64a..88f4da2 100644
+--- a/src/lib.c
++++ b/src/lib.c
+@@ -50,6 +50,7 @@
+ #include "src/ref.h"
+ #include "src/thread_task.h"
+ #include "src/wedge.h"
++#include "src/dxva.h"
+ 
+ static COLD void init_internal(void) {
+     dav1d_init_cpu();
+@@ -75,6 +76,15 @@ COLD void dav1d_default_settings(Dav1dSettings *const s) {
+     s->operating_point = 0;
+     s->all_layers = 1; // just until the tests are adjusted
+     s->frame_size_limit = 0;
++    s->dxva.cookie = NULL;
++    s->dxva.decode_callback = NULL;
++#ifdef _WIN32
++    s->dxva.alloc_callback = dav1d_default_dxva_alloc;
++    s->dxva.release_callback = dav1d_default_dxva_release;
++#else // !_WIN32
++    s->dxva.alloc_callback = NULL;
++    s->dxva.release_callback = NULL;
++#endif // !_WIN32
+ }
+ 
+ static void close_internal(Dav1dContext **const c_out, int flush);
+@@ -111,6 +121,12 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
+                           DAV1D_ERR(EINVAL));
+     validate_input_or_ret(s->operating_point >= 0 &&
+                           s->operating_point <= 31, DAV1D_ERR(EINVAL));
++    validate_input_or_ret(s->dxva.decode_callback == NULL ||
++                          s->dxva.alloc_callback != NULL,
++                          DAV1D_ERR(EINVAL));
++    validate_input_or_ret(s->dxva.decode_callback == NULL ||
++                          s->dxva.release_callback != NULL,
++                          DAV1D_ERR(EINVAL));
+ 
+     pthread_attr_t thread_attr;
+     if (pthread_attr_init(&thread_attr)) return DAV1D_ERR(ENOMEM);
+@@ -124,6 +140,7 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
+ 
+     c->allocator = s->allocator;
+     c->logger = s->logger;
++    c->dxva = s->dxva;
+     c->apply_grain = s->apply_grain;
+     c->operating_point = s->operating_point;
+     c->all_layers = s->all_layers;
+@@ -142,7 +159,7 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
+ 
+     c->frame_thread.flush = &c->frame_thread.flush_mem;
+     atomic_init(c->frame_thread.flush, 0);
+-    c->n_fc = s->n_frame_threads;
++    c->n_fc = s->dxva.decode_callback ? 1 : s->n_frame_threads;
+     c->fc = dav1d_alloc_aligned(sizeof(*c->fc) * s->n_frame_threads, 32);
+     if (!c->fc) goto error;
+     memset(c->fc, 0, sizeof(*c->fc) * s->n_frame_threads);
+diff --git a/src/meson.build b/src/meson.build
+index fd8ad02..91cb744 100644
+--- a/src/meson.build
++++ b/src/meson.build
+@@ -50,6 +50,12 @@ libdav1d_sources = files(
+     'wedge.c',
+ )
+ 
++if host_machine.system() == 'windows'
++    libdav1d_sources += files(
++        'dxva.c',
++    )
++endif
++
+ # libdav1d bitdepth source files
+ # These files are compiled for each bitdepth with
+ # `BITDEPTH` defined to the currently built bitdepth.
+diff --git a/tools/dav1d.c b/tools/dav1d.c
+index 4b97a9f..3a121e1 100644
+--- a/tools/dav1d.c
++++ b/tools/dav1d.c
+@@ -57,6 +57,10 @@
+ 
+ #include "dav1d_cli_parse.h"
+ 
++#ifdef _WIN32
++#include "dav1d_cli_dxva.h"
++#endif
++
+ static uint64_t get_time_nanos(void) {
+ #ifdef _WIN32
+     LARGE_INTEGER frequency;
+@@ -150,6 +154,9 @@ int main(const int argc, char *const *const argv) {
+     double i_fps;
+     FILE *frametimes = NULL;
+     const char *version = dav1d_version();
++#ifdef _WIN32
++    Dav1dDXVAInfo dxva;
++#endif
+ 
+     if (strcmp(version, DAV1D_VERSION)) {
+         fprintf(stderr, "Version mismatch (library: %s, executable: %s)\n",
+@@ -197,6 +204,12 @@ int main(const int argc, char *const *const argv) {
+     if (cli_settings.limit != 0 && cli_settings.limit < total)
+         total = cli_settings.limit;
+ 
++#ifdef _WIN32
++    if (cli_settings.dxva) {
++        dxva_init(&lib_settings, &dxva);
++    }
++#endif
++
+     if ((res = dav1d_open(&c, &lib_settings)))
+         return EXIT_FAILURE;
+ 
+@@ -235,6 +248,11 @@ int main(const int argc, char *const *const argv) {
+             }
+             res = 0;
+         } else {
++#ifdef _WIN32
++            if (cli_settings.dxva) {
++                dxva_lock(&p, &dxva);
++            }
++#endif
+             if (!n_out) {
+                 if ((res = output_open(&out, cli_settings.muxer,
+                                        cli_settings.outputfile,
+@@ -272,6 +290,11 @@ int main(const int argc, char *const *const argv) {
+                 break;
+             }
+         } else {
++#ifdef _WIN32
++            if (cli_settings.dxva) {
++                dxva_lock(&p, &dxva);
++            }
++#endif
+             if (!n_out) {
+                 if ((res = output_open(&out, cli_settings.muxer,
+                                        cli_settings.outputfile,
+diff --git a/tools/dav1d_cli_dxva.c b/tools/dav1d_cli_dxva.c
+new file mode 100644
+index 0000000..7415cd6
+--- /dev/null
++++ b/tools/dav1d_cli_dxva.c
+@@ -0,0 +1,403 @@
++/*
++ * Copyright © 2018, VideoLAN and dav1d authors
++ * Copyright © 2018, Two Orioles, LLC
++ * All rights reserved.
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright notice, this
++ *    list of conditions and the following disclaimer.
++ *
++ * 2. Redistributions in binary form must reproduce the above copyright notice,
++ *    this list of conditions and the following disclaimer in the documentation
++ *    and/or other materials provided with the distribution.
++ *
++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
++ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ */
++
++#ifdef _WIN32
++#define COBJMACROS // this makes COM in C more reasonable
++#include <initguid.h>
++#include "config.h"
++#include "vcs_version.h"
++#include "cli_config.h"
++#include "dav1d_cli_dxva.h"
++#include "dav1d/dxva_av1.h"
++
++#define DXVA_INVALID_PICTURE_INDEX 0xFFu
++
++// These may not be defined depending on the Windows SDK version used
++//  We can just re-define them without issue as they will never change
++#if (WDK_NTDDI_VERSION <= NTDDI_WIN10_19H1)
++DEFINE_GUID(DXVA_ModeAV1_VLD_Profile0, 0xb8be4ccb, 0xcf53, 0x46ba, 0x8d, 0x59, 0xd6, 0xb8, 0xa6, 0xda, 0x5d, 0x2a);
++DEFINE_GUID(DXVA_ModeAV1_VLD_Profile1, 0x6936ff0f, 0x45b1, 0x4163, 0x9c, 0xc1, 0x64, 0x6e, 0xf6, 0x94, 0x61, 0x08);
++DEFINE_GUID(DXVA_ModeAV1_VLD_Profile2, 0x0c5f2aa1, 0xe541, 0x4089, 0xbb, 0x7b, 0x98, 0x11, 0x0a, 0x19, 0xd7, 0xc8);
++DEFINE_GUID(DXVA_ModeAV1_VLD_12bit_Profile2, 0x17127009, 0xa00f, 0x4ce1, 0x99, 0x4e, 0xbf, 0x40, 0x81, 0xf6, 0xf3, 0xf0);
++DEFINE_GUID(DXVA_ModeAV1_VLD_12bit_Profile2_420, 0x2d80bed6, 0x9cac, 0x4835, 0x9e, 0x91, 0x32, 0x7b, 0xbc, 0x4f, 0x9e, 0xe8);
++#endif
++
++// This is an example implementation of DXVA using the DX11 interface.  DX9 is similar to this,
++// while DX12 is very different (most seemingly unused parameters in the callbacks are for DX12).
++// To keep the sample simple, this does not implement other important features like:
++//  HW-DRM, histogram generation, device loss checks, array of textures support,
++//  downsampling, and no-recreate on DRC
++
++// NOTE: this file uses quite a lot of COM, since that is how D3D11 & DXVA are defined
++//       it is highly recommended to use C++ for this, since the code will be cleaner.
++
++int insert_freelist(int index, Dav1dDXVAInfo *dxva)
++{
++    // freeList is used as a ring: the write cursor only grows and is wrapped when indexing
++    dxva->freeList[dxva->freeListWrite++ % MAX_SAMPLE_POOL_SIZE] = index;
++    return 0; // no failure path
++}
++
++int dxva_create_decoder(Dav1dPicture *p, Dav1dDXVAInfo *dxva) {
++    // (Re)creates the decoder, texture array, staging texture and output views when the stream configuration changes.
++    D3D11_VIDEO_DECODER_DESC desc = {0}; // zeroed: it is compared as raw bytes with memcmp() below
++    D3D11_VIDEO_DECODER_CONFIG config = { .ConfigBitstreamRaw = 1 };
++    // DXVA needs to know the maximums for this sequence
++    desc.SampleWidth = p->seq_hdr->max_width;
++    desc.SampleHeight = p->seq_hdr->max_height;
++    switch(p->seq_hdr->profile) {
++        case 0:
++            desc.Guid = DXVA_ModeAV1_VLD_Profile0;
++            break;
++        case 1:
++            desc.Guid = DXVA_ModeAV1_VLD_Profile1;
++            break;
++        case 2:
++            desc.Guid = DXVA_ModeAV1_VLD_Profile2;
++            break;
++        default:
++            return DAV1D_ERR(1);
++    }
++    switch(p->seq_hdr->layout) {
++        case DAV1D_PIXEL_LAYOUT_I400:
++            desc.OutputFormat = p->seq_hdr->hbd > 0 ? DXGI_FORMAT_R16_UNORM : DXGI_FORMAT_R8_UNORM;
++            break;
++        case DAV1D_PIXEL_LAYOUT_I420:
++            desc.OutputFormat = p->seq_hdr->hbd > 0 ? DXGI_FORMAT_P016 : DXGI_FORMAT_NV12;
++            break;
++        case DAV1D_PIXEL_LAYOUT_I422:
++            desc.OutputFormat = p->seq_hdr->hbd > 0 ? DXGI_FORMAT_Y216 : DXGI_FORMAT_YUY2;
++            break;
++        case DAV1D_PIXEL_LAYOUT_I444:
++            desc.OutputFormat = p->seq_hdr->hbd > 0 ? DXGI_FORMAT_Y416 : DXGI_FORMAT_AYUV;
++            break;
++        default:
++            return DAV1D_ERR(1);
++    }
++
++    // only re-create everything if configuration changed
++    if(dxva->decoder != NULL && memcmp(&desc, &dxva->currentDecoderDesc, sizeof(desc)) == 0)
++        return 0;
++
++    if (dxva->decoder != NULL)
++        ID3D11VideoDecoder_Release(dxva->decoder);
++    if(FAILED(ID3D11VideoDevice_CreateVideoDecoder(dxva->vdevice, &desc, &config, &dxva->decoder)))
++        return DAV1D_ERR(1);
++    dxva->currentDecoderDesc = desc;
++
++    D3D11_TEXTURE2D_DESC texture = {0};
++    texture.Width = p->seq_hdr->max_width;
++    texture.Height = p->seq_hdr->max_height;
++    texture.MipLevels = 1;
++    texture.ArraySize = MAX_SAMPLE_POOL_SIZE;
++    texture.Format = desc.OutputFormat;
++    texture.SampleDesc.Count = 1;
++    texture.Usage = D3D11_USAGE_DEFAULT;
++    // In applications that display video or do further processing on the GPU
++    // you should also be setting D3D11_BIND_SHADER_RESOURCE here.
++    texture.BindFlags = D3D11_BIND_DECODER;
++
++    if (dxva->textures != NULL)
++        ID3D11Texture2D_Release(dxva->textures);
++    if(FAILED(ID3D11Device_CreateTexture2D(dxva->device, &texture, NULL, &dxva->textures)))
++        return DAV1D_ERR(ENOMEM);
++    dxva->currentTextureDesc = texture;
++
++    // This staging texture lets us readback from the GPU
++    texture.ArraySize = 1;
++    texture.Usage = D3D11_USAGE_STAGING;
++    texture.BindFlags = 0;
++    texture.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
++
++    if (dxva->stage != NULL)
++        ID3D11Texture2D_Release(dxva->stage);
++    if(FAILED(ID3D11Device_CreateTexture2D(dxva->device, &texture, NULL, &dxva->stage)))
++        return DAV1D_ERR(ENOMEM);
++
++    dxva->freeListRead = 0;
++    dxva->freeListWrite = 0;
++
++    D3D11_VIDEO_DECODER_OUTPUT_VIEW_DESC view = {0};
++    view.DecodeProfile = desc.Guid;
++    view.ViewDimension = D3D11_VDOV_DIMENSION_TEXTURE2D;
++
++    for(int i = 0; i < MAX_SAMPLE_POOL_SIZE; i++) {
++        view.Texture2D.ArraySlice = i; // view i must target slice i: dxva_lock() reads back slice dxva_picture_index
++        if (dxva->views[i] != NULL)
++            ID3D11VideoDecoderOutputView_Release(dxva->views[i]);
++        if(FAILED(ID3D11VideoDevice_CreateVideoDecoderOutputView(dxva->vdevice, (ID3D11Resource*)dxva->textures, &view, dxva->views + i)))
++            return DAV1D_ERR(ENOMEM);
++        insert_freelist(i, dxva);
++    }
++
++    return 0;
++}
++
++int av1_dxva_alloc_picture(Dav1dPicture *pic, void *cookie) {
++    Dav1dDXVAInfo *const info = (Dav1dDXVAInfo*)cookie;
++
++    // make sure a decoder matching this picture's sequence header exists
++    if (dxva_create_decoder(pic, info) != 0)
++        return DAV1D_ERR(ENOMEM);
++    if (info->freeListWrite == info->freeListRead) // no free samples!
++        return DAV1D_ERR(ENOMEM);
++
++    // peek at the oldest free slot; it is only consumed once DecoderBeginFrame succeeded
++    const uint16_t slot = info->freeList[info->freeListRead % MAX_SAMPLE_POOL_SIZE];
++    if (FAILED(ID3D11VideoContext_DecoderBeginFrame(info->vcontext, info->decoder, info->views[slot], 0, NULL)))
++        return DAV1D_ERR(1);
++    info->freeListRead++;
++    pic->dxva_picture_index = slot;
++    pic->data[0] = (void*)(info->views + slot);
++    return 0;
++}
++
++void av1_dxva_release_picture(Dav1dPicture *pic, void *cookie) {
++    Dav1dDXVAInfo *const info = (Dav1dDXVAInfo*)cookie;
++    const int had_slot = pic->dxva_picture_index != DXVA_INVALID_PICTURE_INDEX;
++    // return the picture's slot to the free list and mark the picture as slot-less
++    if (had_slot) {
++        insert_freelist(pic->dxva_picture_index, info);
++        pic->dxva_picture_index = DXVA_INVALID_PICTURE_INDEX;
++    }
++    if (pic->data[0] == NULL)
++        return;
++    // default allocator release first, then unmap the staging texture
++    info->defaultAllocator.release_picture_callback(pic, info->defaultAllocator.cookie);
++    ID3D11DeviceContext_Unmap(info->context, (ID3D11Resource*)info->stage, 0);
++    pic->data[0] = 0;
++}
++
++int av1_dxva_decode(void *cookie, DXVA_PicParams_AV1 *picture_parameters, DXVA_Tile_AV1 *tiles, const int n_tiles, Dav1dPicture *output_picture, Dav1dTileGroup* tile_groups, int tile_group_count) {
++    Dav1dDXVAInfo *dxva = (Dav1dDXVAInfo*)cookie;
++    uint8_t* bitstream_target = NULL;
++    size_t bitstream_size = 0;
++    int retval = DAV1D_ERR(ENOMEM);
++    // Uploads all tile-group payloads back to back and submits the frame; returns 0 or a negative error.
++    for (int i = 0; i < tile_group_count; i++)
++        bitstream_size += tile_groups[i].data.sz;
++
++    UINT size_allocated = 0;
++    if (FAILED(ID3D11VideoContext_GetDecoderBuffer(dxva->vcontext, dxva->decoder, D3D11_VIDEO_DECODER_BUFFER_BITSTREAM, &size_allocated, (void**)&bitstream_target)))
++        return retval;
++    if ((size_t)size_allocated < bitstream_size)
++        goto release; // the buffer is too small but already acquired, so it still has to be released
++    memset(bitstream_target, 0, size_allocated);
++    // this is a GPU bitstream upload
++    for (int i = 0; i < tile_group_count; i++) {
++        const uint8_t *data = tile_groups[i].data.data;
++        size_t size = tile_groups[i].data.sz;
++        memcpy(bitstream_target, data, size);
++        bitstream_target += size;
++    }
++
++    // Note: this API is badly non-intuitive.  DX12's version (which actually uses the pointers provided above)
++    //       is quite a bit more sane.
++    D3D11_VIDEO_DECODER_BUFFER_DESC decodeDesc[3] = {0}; // fields not set below must not be garbage
++    decodeDesc[0].BufferType = D3D11_VIDEO_DECODER_BUFFER_PICTURE_PARAMETERS;
++    decodeDesc[0].DataSize = sizeof(DXVA_PicParams_AV1);
++    decodeDesc[1].BufferType = D3D11_VIDEO_DECODER_BUFFER_SLICE_CONTROL;
++    decodeDesc[1].DataSize = sizeof(DXVA_Tile_AV1)*n_tiles;
++    decodeDesc[2].BufferType = D3D11_VIDEO_DECODER_BUFFER_BITSTREAM;
++    decodeDesc[2].DataSize = (UINT)bitstream_size;
++
++    retval = FAILED(ID3D11VideoContext_SubmitDecoderBuffers(dxva->vcontext, dxva->decoder, 3, decodeDesc)) ? DAV1D_ERR(1) : 0;
++
++release:
++    ID3D11VideoContext_ReleaseDecoderBuffer(dxva->vcontext, dxva->decoder, D3D11_VIDEO_DECODER_BUFFER_BITSTREAM);
++    return retval;
++}
++
++int av1_dxva_alloc(void *cookie, Dav1dPicture* picture, DXVA_PicParams_AV1 **pic, DXVA_Tile_AV1 **tiles, int n_tiles) {
++    // Alloc callback: acquires the picture-parameter and slice-control decoder buffers and zeroes them.
++    Dav1dDXVAInfo *dxva = (Dav1dDXVAInfo*)cookie;
++    UINT size = 0;
++
++    if (FAILED(ID3D11VideoContext_GetDecoderBuffer(dxva->vcontext, dxva->decoder, D3D11_VIDEO_DECODER_BUFFER_PICTURE_PARAMETERS, &size, (void**)pic)) || size < sizeof(DXVA_PicParams_AV1))
++        return DAV1D_ERR(ENOMEM);
++
++    memset(*pic, 0, size);
++    // NOTE(review): the failure returns in this function leave already-acquired buffers unreleased - confirm the caller copes.
++    if (FAILED(ID3D11VideoContext_GetDecoderBuffer(dxva->vcontext, dxva->decoder, D3D11_VIDEO_DECODER_BUFFER_SLICE_CONTROL, &size, (void**)tiles)) || size < sizeof(DXVA_Tile_AV1) * (size_t)n_tiles)
++        return DAV1D_ERR(ENOMEM);
++
++    memset(*tiles, 0, size);
++    return 0;
++}
++
++void av1_dxva_release(void *cookie, DXVA_PicParams_AV1 *pic, DXVA_Tile_AV1 *tiles) {
++    Dav1dDXVAInfo *const info = (Dav1dDXVAInfo*)cookie;
++    // pic and tiles are not used: the decoder buffers are released by buffer type, then the frame is ended
++    ID3D11VideoContext_ReleaseDecoderBuffer(info->vcontext, info->decoder, D3D11_VIDEO_DECODER_BUFFER_PICTURE_PARAMETERS);
++    ID3D11VideoContext_ReleaseDecoderBuffer(info->vcontext, info->decoder, D3D11_VIDEO_DECODER_BUFFER_SLICE_CONTROL);
++    ID3D11VideoContext_DecoderEndFrame(info->vcontext, info->decoder);
++}
++
++int dxva_init(Dav1dSettings *settings, Dav1dDXVAInfo *dxva)
++{
++    // Creates the D3D11 device and its video interfaces; only when all of that succeeded is
++    // settings->dxva pointed at the callbacks of this file. Returns 0 or DAV1D_ERR(1).
++    memset(dxva, 0, sizeof(*dxva)); // also resets freeListRead/freeListWrite
++    dxva->defaultAllocator = settings->allocator;
++
++    if(FAILED(D3D11CreateDevice(NULL, D3D_DRIVER_TYPE_HARDWARE, NULL, D3D11_CREATE_DEVICE_VIDEO_SUPPORT, NULL, 0, D3D11_SDK_VERSION, &dxva->device, NULL, &dxva->context)))
++        return DAV1D_ERR(1);
++    if(FAILED(ID3D11Device_QueryInterface(dxva->device, &IID_ID3D11VideoDevice, (void**)&dxva->vdevice)))
++        return DAV1D_ERR(1);
++    if(FAILED(ID3D11DeviceContext_QueryInterface(dxva->context, &IID_ID3D11VideoContext, (void**)&dxva->vcontext)))
++        return DAV1D_ERR(1);
++
++    settings->dxva.cookie = (void*)dxva;
++    settings->dxva.decode_callback = av1_dxva_decode;
++    settings->dxva.alloc_callback = av1_dxva_alloc;
++    settings->dxva.release_callback = av1_dxva_release;
++    settings->allocator.cookie = (void*)dxva;
++
++    // Note that actual decoder creation is delayed until we have the first picture
++    //  this can be inefficient because decoder creation on the hardware can be slow
++    //  but it keeps this code simple.
++    // Time to first picture can be reduced if you know the decoder Profile ahead of time
++    //  and pre-create the DXVA decoder, for example from a container like MP4 etc...
++    return 0;
++}
++
++void dxva_shutdown(Dav1dDXVAInfo *dxva)
++{
++    if (dxva->device) // NULL members are skipped throughout
++        ID3D11Device_Release(dxva->device);
++    if (dxva->context)
++        ID3D11DeviceContext_Release(dxva->context);
++    if (dxva->vdevice)
++        ID3D11VideoDevice_Release(dxva->vdevice);
++    if (dxva->vcontext)
++        ID3D11VideoContext_Release(dxva->vcontext);
++    if (dxva->decoder)
++        ID3D11VideoDecoder_Release(dxva->decoder);
++    if (dxva->stage)
++        ID3D11Texture2D_Release(dxva->stage);
++    if (dxva->textures)
++        ID3D11Texture2D_Release(dxva->textures);
++    for (int slot = 0; slot < MAX_SAMPLE_POOL_SIZE; slot++) { // one output view per pool slot
++        if (dxva->views[slot])
++            ID3D11VideoDecoderOutputView_Release(dxva->views[slot]);
++    }
++}
++
++int dxgi_texture_bitdepth(DXGI_FORMAT fmt) {
++    // 16 or 8 for the formats listed below, 0 for any other format
++    const int is16 = fmt == DXGI_FORMAT_R16_UNORM ||
++                     fmt == DXGI_FORMAT_P016 ||
++                     fmt == DXGI_FORMAT_Y216 ||
++                     fmt == DXGI_FORMAT_Y416;
++    const int is8 = fmt == DXGI_FORMAT_R8_UNORM ||
++                    fmt == DXGI_FORMAT_NV12 ||
++                    fmt == DXGI_FORMAT_YUY2 ||
++                    fmt == DXGI_FORMAT_AYUV;
++
++    if (is16)
++        return 16;
++    return is8 ? 8 : 0;
++}
++
++int dxgi_texture_is_2plane(DXGI_FORMAT fmt) {
++    return fmt == DXGI_FORMAT_NV12 || fmt == DXGI_FORMAT_P016; // the only formats treated as two-plane here
++}
++
++int layout_to_subsampleH(enum Dav1dPixelLayout layout) {
++    // Shift that dxva_lock() applies to the picture *height* for the chroma planes:
++    // only 4:2:0 halves the chroma rows, 4:2:2 keeps them at full height.
++    return layout == DAV1D_PIXEL_LAYOUT_I420;
++}
++
++int layout_to_subsampleW(enum Dav1dPixelLayout layout) {
++    // Shift that dxva_lock() applies to the picture *width* for the chroma planes:
++    // 4:2:0 and 4:2:2 both halve the chroma columns, every other layout gets 0.
++    return layout == DAV1D_PIXEL_LAYOUT_I420 ||
++           layout == DAV1D_PIXEL_LAYOUT_I422;
++}
++
++int dxva_lock(Dav1dPicture *p, Dav1dDXVAInfo *dxva) // reads the decoded surface of p back into a CPU picture
++{
++    D3D11_MAPPED_SUBRESOURCE mapped;
++    ID3D11DeviceContext_CopySubresourceRegion(dxva->context, (ID3D11Resource*)dxva->stage, 0, 0, 0, 0, (ID3D11Resource*)dxva->textures, p->dxva_picture_index, NULL); // array slice -> staging texture
++    if(FAILED(ID3D11DeviceContext_Map(dxva->context, (ID3D11Resource*)dxva->stage, 0, D3D11_MAP_READ, 0, &mapped)))
++        return DAV1D_ERR(1);
++    // NOTE(review): the early return below leaves the staging texture mapped - confirm this is intended.
++    if(dxva->defaultAllocator.alloc_picture_callback(p, dxva->defaultAllocator.cookie))
++        return DAV1D_ERR(ENOMEM);
++    // From here on p->data[] is whatever the default allocator's alloc_picture_callback set up.
++    // This is not an optimal surface format conversion, it is just as simple as possible
++    int texture_bitdepth = dxgi_texture_bitdepth(dxva->currentTextureDesc.Format);
++    uint8_t *outputY = p->data[0];
++    uint8_t *outputU = p->data[1];
++    uint8_t *outputV = p->data[2];
++    uint8_t *inputY = mapped.pData;
++    uint8_t *inputU = inputY + (dxgi_texture_is_2plane(dxva->currentTextureDesc.Format) ? dxva->currentTextureDesc.Height * mapped.RowPitch : 0); // 2-plane: chroma follows Height rows of luma
++    uint8_t *inputV = inputU; // NOTE(review): for 2-plane formats U and V are read from the same bytes - confirm
++    uint8_t bpmp_out = p->p.bpc > 8 ? 2 : 1; // byte step between output samples
++    uint8_t bpmp_in = texture_bitdepth > 8 ? 2 : 1; // byte step between input samples (adjusted for packed formats below)
++    if(dxva->currentTextureDesc.Format == DXGI_FORMAT_YUY2 || dxva->currentTextureDesc.Format == DXGI_FORMAT_Y216)
++    {
++        inputU += bpmp_in;
++        inputV += bpmp_in * 3;
++        bpmp_in = 2; // NOTE(review): the same step is later used for luma and chroma, and it is 2 even for Y216 - confirm
++    }
++    // NOTE(review): the component offsets used below for AYUV/Y416 should be checked against the DXGI byte order.
++    if(dxva->currentTextureDesc.Format == DXGI_FORMAT_AYUV || dxva->currentTextureDesc.Format == DXGI_FORMAT_Y416)
++    {
++        inputY += bpmp_in * 2;
++        inputU += bpmp_in;
++        inputV += bpmp_in * 3;
++        bpmp_in *= 4;
++    }
++    // NOTE(review): every assignment below moves a single byte, also when the byte steps are 2 - confirm 16-bit handling.
++    for(int y = 0; y < p->p.h; y++) {
++        for(int x = 0; x < p->p.w; x++) {
++            outputY[x*bpmp_out] = inputY[x*bpmp_in];
++        }
++        outputY += p->stride[0];
++        inputY += mapped.RowPitch;
++    }
++    if(outputU && outputV)
++    {
++        int subsampleH = layout_to_subsampleH(p->p.layout); // used as the shift on the height
++        int subsampleW = layout_to_subsampleW(p->p.layout); // used as the shift on the width
++        for(int y = 0; y < p->p.h >> subsampleH; y++) {
++            for(int x = 0; x < p->p.w >> subsampleW; x++) {
++                outputU[x*bpmp_out] = inputU[x*bpmp_in];
++                outputV[x*bpmp_out] = inputV[x*bpmp_in];
++            }
++            outputU += p->stride[1];
++            outputV += p->stride[1];
++            inputU += mapped.RowPitch;
++            inputV += mapped.RowPitch;
++        }
++    }
++    return 0; // the staging texture is still mapped at this point; nothing in this function unmaps it
++}
++
++#endif // _WIN32
+diff --git a/tools/dav1d_cli_dxva.h b/tools/dav1d_cli_dxva.h
+new file mode 100644
+index 0000000..a33ad5b
+--- /dev/null
++++ b/tools/dav1d_cli_dxva.h
+@@ -0,0 +1,63 @@
++/*
++ * Copyright © 2018, VideoLAN and dav1d authors
++ * Copyright © 2018, Two Orioles, LLC
++ * All rights reserved.
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright notice, this
++ *    list of conditions and the following disclaimer.
++ *
++ * 2. Redistributions in binary form must reproduce the above copyright notice,
++ *    this list of conditions and the following disclaimer in the documentation
++ *    and/or other materials provided with the distribution.
++ *
++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
++ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ */
++
++#ifndef DAV1D_CLI_DXVA_H
++#define DAV1D_CLI_DXVA_H
++
++#include "dav1d/dav1d.h"
++#include <windows.h>
++#include <d3d11.h>
++
++#define MAX_SAMPLE_POOL_SIZE 10
++
++typedef struct { // D3D11/DXVA state passed to dxva_init() and dxva_lock()
++    ID3D11Device *device;
++    ID3D11DeviceContext *context; // used by dxva_lock() for the staging copy and Map
++    ID3D11VideoDevice *vdevice;
++    ID3D11VideoContext *vcontext;
++    ID3D11Texture2D *textures; // dxva_lock() copies subresource dxva_picture_index out of this
++    ID3D11VideoDecoder *decoder;
++    ID3D11VideoDecoderOutputView *views[MAX_SAMPLE_POOL_SIZE]; // presumably one view per pool slot - TODO confirm
++    ID3D11Texture2D *stage; // texture that dxva_lock() copies into and maps for CPU reads
++    D3D11_VIDEO_DECODER_DESC currentDecoderDesc;
++    D3D11_TEXTURE2D_DESC currentTextureDesc; // dxva_lock() reads Format and Height from this
++    Dav1dPicAllocator defaultAllocator; // dxva_lock() allocates its output picture through this
++
++    // Samples not currently used for decode
++    uint16_t freeList[MAX_SAMPLE_POOL_SIZE];
++    uint16_t freeListRead; // presumably the next freeList slot to read - TODO confirm
++    uint16_t freeListWrite; // presumably the next freeList slot to write - TODO confirm
++} Dav1dDXVAInfo;
++
++// Configures the decoder to use DXVA
++int dxva_init(Dav1dSettings *settings, Dav1dDXVAInfo *dxva);
++
++// Copies the DXVA texture associated with this picture, via a staging texture,
++//  into CPU memory newly allocated for the Dav1dPicture's planes
++int dxva_lock(Dav1dPicture *p, Dav1dDXVAInfo *dxva);
++
++#endif // DAV1D_CLI_DXVA_H
+diff --git a/tools/dav1d_cli_parse.c b/tools/dav1d_cli_parse.c
+index f363033..fa357cc 100644
+--- a/tools/dav1d_cli_parse.c
++++ b/tools/dav1d_cli_parse.c
+@@ -57,6 +57,7 @@ enum {
+     ARG_ALL_LAYERS,
+     ARG_SIZE_LIMIT,
+     ARG_CPU_MASK,
++    ARG_DXVA,
+ };
+ 
+ static const struct option long_opts[] = {
+@@ -79,6 +80,7 @@ static const struct option long_opts[] = {
+     { "alllayers",      1, NULL, ARG_ALL_LAYERS },
+     { "sizelimit",      1, NULL, ARG_SIZE_LIMIT },
+     { "cpumask",        1, NULL, ARG_CPU_MASK },
++    { "dxva",           0, NULL, ARG_DXVA },
+     { NULL,             0, NULL, 0 },
+ };
+ 
+@@ -122,7 +124,11 @@ static void usage(const char *const app, const char *const reason, ...) {
+             " --alllayers $num:     output all spatial layers of a scalable AV1 bitstream (default: 1)\n"
+             " --sizelimit $num:     stop decoding if the frame size exceeds the specified limit\n"
+             " --verify $md5:        verify decoded md5. implies --muxer md5, no output\n"
+-            " --cpumask $mask:      restrict permitted CPU instruction sets (0" ALLOWED_CPU_MASKS "; default: -1)\n");
++            " --cpumask $mask:      restrict permitted CPU instruction sets (0" ALLOWED_CPU_MASKS "; default: -1)\n"
++#ifdef _WIN32
++            " --dxva                use DXVA accelerated decode (Win32 only)\n"
++#endif
++        );
+     exit(1);
+ }
+ 
+@@ -321,6 +327,10 @@ void parse(const int argc, char *const *const argv,
+             lib_settings->frame_size_limit = (unsigned) res;
+             break;
+         }
++        case ARG_DXVA: {
++            cli_settings->dxva = 1;
++            break;
++        }
+         case 'v':
+             fprintf(stderr, "%s\n", dav1d_version());
+             exit(0);
+diff --git a/tools/dav1d_cli_parse.h b/tools/dav1d_cli_parse.h
+index 11e88e1..036a5d7 100644
+--- a/tools/dav1d_cli_parse.h
++++ b/tools/dav1d_cli_parse.h
+@@ -46,6 +46,7 @@ typedef struct {
+     } realtime;
+     double realtime_fps;
+     unsigned realtime_cache;
++    int dxva;
+ } CLISettings;
+ 
+ void parse(const int argc, char *const *const argv,
+diff --git a/tools/meson.build b/tools/meson.build
+index 4b4217a..7592983 100644
+--- a/tools/meson.build
++++ b/tools/meson.build
+@@ -77,12 +77,18 @@ dav1d_sources = files(
+     'dav1d_cli_parse.c',
+ )
+ 
++if host_machine.system() == 'windows'
++    dav1d_sources += files(
++        'dav1d_cli_dxva.c',
++    )
++endif
++
+ dav1d = executable('dav1d',
+     dav1d_sources,
+     rev_target, cli_config_h_target,
+ 
+     link_with : [libdav1d, dav1d_input_objs, dav1d_output_objs],
+     include_directories : [dav1d_inc_dirs],
+-    dependencies : [getopt_dependency, thread_dependency, rt_dependency],
++    dependencies : [getopt_dependency, thread_dependency, rt_dependency, d3d11_dependency],
+     install : true,
+ )
+-- 
+2.27.0.windows.1
+
diff --git a/contrib/src/dav1d/rules.mak b/contrib/src/dav1d/rules.mak
index fe0e222b166..5137f138df5 100644
--- a/contrib/src/dav1d/rules.mak
+++ b/contrib/src/dav1d/rules.mak
@@ -19,6 +19,7 @@ $(TARBALLS)/dav1d-$(DAV1D_VERSION).tar.xz:
 dav1d: dav1d-$(DAV1D_VERSION).tar.xz .sum-dav1d
 	$(UNPACK)
 	$(APPLY) $(SRC)/dav1d/0001-SSE2-PIC-464ca6c2.patch
+	$(APPLY) $(SRC)/dav1d/0001-Add-DXVA-support.patch
 	$(MOVE)
 
 .dav1d: dav1d crossfile.meson
-- 
2.26.2