[vlc-devel] [PATCH 1/2] contrib: dav1d: add DXVA support
Steve Lhomme
robux4 at ycbcr.xyz
Thu Sep 3 15:44:03 CEST 2020
Based on (unmerged) code from Matthew Wozniak
https://code.videolan.org/mwozniak/dav1d/-/tree/dxva
---
contrib/src/dav1d/0001-Add-DXVA-support.patch | 1800 +++++++++++++++++
contrib/src/dav1d/rules.mak | 1 +
2 files changed, 1801 insertions(+)
create mode 100644 contrib/src/dav1d/0001-Add-DXVA-support.patch
diff --git a/contrib/src/dav1d/0001-Add-DXVA-support.patch b/contrib/src/dav1d/0001-Add-DXVA-support.patch
new file mode 100644
index 00000000000..91b1f6b2eaf
--- /dev/null
+++ b/contrib/src/dav1d/0001-Add-DXVA-support.patch
@@ -0,0 +1,1800 @@
+From 0117cce8907432fe673ab511bf11ed81e8c81b2a Mon Sep 17 00:00:00 2001
+From: Matt Wozniak <mwozniak at microsoft.com>
+Date: Tue, 16 Jun 2020 20:39:12 +0200
+Subject: [PATCH] Add DXVA support A new set of callbacks have been added which
+ allow an application to have dAV1d generate AV1 DXVA structures. The
+ application can then take these generated structures and use them to call
+ DXVA APIs (in DX9, DX11, or DX12). When using DXVA to decode dAV1d software
+ based decoding is disabled and replaced with the DXVA callback functions.
+
+There are 5 new callbacks:
+ * decode_callback - called to perform DXVA decode (using SubmitBuffers)
+ * alloc_callback / release_callback - for allocating and releasing AV1 DXVA picture structs
+ * alloc_bitstream_callback / release_bitstream_callback - used for allocating and releasing
+ GPU backed memory for bitstream data.
+
+ These callbacks are designed to be compatible with all 3 variants of DXVA decoding: DX9, DX11 and DX12.
+ Currently when DXVA decode is enabled (by setting up the DXVA callback structure), the following decoder behaviours are changed:
+ * decoder always operates in single threaded mode (multi-threaded operation only works for DX12)
+ * decoder will skip software decode (dav1d_decode_frame)
+
+ Also included is an update to the dav1d.exe CLI application that adds support for DXVA decode on Windows platforms.
+ This is an example of how to implement a complete DXVA decoding solution using the new callbacks.
+
+ This change has been validated to work with AV1 DXVA hardware.
+---
+ include/dav1d/dav1d.h | 71 +++++++
+ include/dav1d/dxva_av1.h | 302 ++++++++++++++++++++++++++
+ include/dav1d/meson.build | 20 +-
+ include/dav1d/picture.h | 7 +
+ meson.build | 6 +-
+ src/decode.c | 27 ++-
+ src/dxva.c | 432 ++++++++++++++++++++++++++++++++++++++
+ src/dxva.h | 44 ++++
+ src/internal.h | 8 +-
+ src/lib.c | 19 +-
+ src/meson.build | 6 +
+ tools/dav1d.c | 23 ++
+ tools/dav1d_cli_dxva.c | 403 +++++++++++++++++++++++++++++++++++
+ tools/dav1d_cli_dxva.h | 63 ++++++
+ tools/dav1d_cli_parse.c | 12 +-
+ tools/dav1d_cli_parse.h | 1 +
+ tools/meson.build | 8 +-
+ 17 files changed, 1436 insertions(+), 16 deletions(-)
+ create mode 100644 include/dav1d/dxva_av1.h
+ create mode 100644 src/dxva.c
+ create mode 100644 src/dxva.h
+ create mode 100644 tools/dav1d_cli_dxva.c
+ create mode 100644 tools/dav1d_cli_dxva.h
+
+diff --git a/include/dav1d/dav1d.h b/include/dav1d/dav1d.h
+index 32fe8c3..1143c1f 100644
+--- a/include/dav1d/dav1d.h
++++ b/include/dav1d/dav1d.h
+@@ -43,9 +43,25 @@ extern "C" {
+ typedef struct Dav1dContext Dav1dContext;
+ typedef struct Dav1dRef Dav1dRef;
+
++typedef struct _DXVA_PicParams_AV1 DXVA_PicParams_AV1;
++typedef struct _DXVA_Tile_AV1 DXVA_Tile_AV1;
++
+ #define DAV1D_MAX_FRAME_THREADS 256
+ #define DAV1D_MAX_TILE_THREADS 64
+
++typedef struct Dav1dTileGroup {
++ /* One entry of the tile_groups array handed to Dav1dDXVA::decode_callback. */
++ /**
++ * Bitstream data representing the tiles in this group.
++ */
++ Dav1dData data;
++
++ /**
++ * First and last tile index of this group. NOTE(review): end appears inclusive, confirm.
++ */
++ int start, end;
++} Dav1dTileGroup;
++
+ typedef struct Dav1dLogger {
+ void *cookie; ///< Custom data to pass to the callback.
+ /**
+@@ -58,6 +74,60 @@ typedef struct Dav1dLogger {
+ void (*callback)(void *cookie, const char *format, va_list ap);
+ } Dav1dLogger;
+
++typedef struct Dav1dDXVA {
++ void *cookie; ///< Custom data passed as the first argument to every callback below.
++ /**
++ * DXVA decoding callback. Called to generate a picture using DXVA.
++ *
++ * @param cookie Custom pointer passed to all calls.
++ * @param picture_parameters Filled DXVA picture parameters struct.
++ * @param tiles Array of tile information.
++ * @param n_tiles Number of tiles in array.
++ * @param output_picture Picture to decode into.
++ * @param tile_groups Array of bitstream data.
++ * @param tile_group_count Number of entries in tile_groups.
++ *
++ * @note Setting this callback disables all software decoding in the
++ * decoder, which then operates in single-threaded mode regardless
++ * of the n_frame_threads setting.
++ *
++ * The tiles array is filled assuming the bitstream data is copied into
++ * one contiguous buffer, in order, from the tile_groups array.
++ *
++ * @return 0 on success, or < 0 (a negative DAV1D_ERR code) on error.
++ */
++ int(*decode_callback)(void *cookie, DXVA_PicParams_AV1 *picture_parameters, DXVA_Tile_AV1 *tiles, const int n_tiles, Dav1dPicture *output_picture, Dav1dTileGroup *tile_groups, int tile_group_count);
++
++ /**
++ * Allocate the DXVA picture parameters buffer and the tile array.
++ *
++ * @param cookie Custom pointer passed to all calls.
++ * @param picture Picture to allocate DXVA buffers for.
++ * @param picparams Pointer to store the allocated buffer in.
++ * @param tiles Pointer to store tile array in.
++ * @param n_tiles Size of tile array to allocate.
++ *
++ * @note With DX11 the driver should allocate this buffer, so set this field.
++ * With DX12 the default may be kept. NOTE(review): dav1d calls this
++ * unconditionally once decode_callback is set; confirm a default exists.
++ *
++ * @return 0 on success. A negative DAV1D_ERR value on error.
++ */
++ int(*alloc_callback)(void *cookie, Dav1dPicture* picture, DXVA_PicParams_AV1 **picparams, DXVA_Tile_AV1 **tiles, int n_tiles);
++
++ /**
++ * Release the DXVA picture parameters buffer and the tile array.
++ *
++ * @param cookie Custom pointer passed to all calls.
++ * @param picparams The picture params buffer that was
++ * allocated by alloc_callback().
++ * @param tiles The tiles array that was allocated
++ * by alloc_callback().
++ */
++ void(*release_callback)(void *cookie, DXVA_PicParams_AV1 *picparams, DXVA_Tile_AV1 *tiles);
++
++} Dav1dDXVA;
++
+ typedef struct Dav1dSettings {
+ int n_frame_threads;
+ int n_tile_threads;
+@@ -68,6 +138,7 @@ typedef struct Dav1dSettings {
+ uint8_t reserved[32]; ///< reserved for future use
+ Dav1dPicAllocator allocator; ///< Picture allocator callback.
+ Dav1dLogger logger; ///< Logger callback.
++ Dav1dDXVA dxva; ///< DXVA callbacks
+ } Dav1dSettings;
+
+ /**
+diff --git a/include/dav1d/dxva_av1.h b/include/dav1d/dxva_av1.h
+new file mode 100644
+index 0000000..0a37c41
+--- /dev/null
++++ b/include/dav1d/dxva_av1.h
+@@ -0,0 +1,302 @@
++//------------------------------------------------------------------------------
++// File: DXVA_AV1.h
++//
++// Desc: DirectX Video Acceleration header file.
++// This file is a copied excerpt of the Win32 API, defining DXVA
++// structures necessary for decode of AV1 bitstreams.
++//
++// Copyright (c) 2020, Microsoft Corporation.
++//------------------------------------------------------------------------------
++
++#ifndef _DIRECTX_AV1_VA_
++#define _DIRECTX_AV1_VA_
++
++#if defined(_WIN32)
++
++#include <windows.h>
++#include <dxva.h>
++
++#ifdef __cplusplus
++extern "C" {
++#endif
++
++/* AV1 picture entry data structure; one per entry of DXVA_PicParams_AV1::frame_refs */
++typedef struct _DXVA_PicEntry_AV1 {
++
++ UINT width; // dav1d: width (w) of the referenced picture
++ UINT height; // dav1d: height (h) of the referenced picture
++
++ // Global motion parameters
++ INT wmmat[6]; // dav1d: copied from the frame header's gmv[i].matrix
++ union {
++ struct {
++ UCHAR wminvalid : 1; // dav1d: set when the gmv type is DAV1D_WM_TYPE_IDENTITY
++ UCHAR wmtype : 2; // dav1d: gmv[i].type, cast directly
++ UCHAR Reserved : 5;
++ };
++ UCHAR GlobalMotionFlags;
++ } DUMMYUNIONNAME;
++
++ UCHAR Index; // dav1d: reference slot (refidx), or 0xFF when that slot holds no picture
++ UINT16 Reserved16Bits;
++
++} DXVA_PicEntry_AV1, * LPDXVA_PicEntry_AV1;
++
++/* AV1 picture parameters structure; dav1d populates it in fill_picparams_struct() (src/dxva.c) */
++typedef struct _DXVA_PicParams_AV1 {
++ UINT width;
++ UINT height;
++
++ UINT max_width;
++ UINT max_height;
++
++ UCHAR CurrPicTextureIndex; // dav1d: dxva_picture_index of the picture being decoded
++ UCHAR superres_denom;
++ UCHAR bitdepth; // dav1d writes 8, 10 or 12
++ UCHAR seq_profile;
++
++ // Tiles:
++ struct {
++ UCHAR cols;
++ UCHAR rows;
++ USHORT context_update_id;
++ USHORT widths[64]; // per tile column, in superblocks
++ USHORT heights[64]; // per tile row, in superblocks
++ } tiles;
++
++ // Coding Tools
++ union {
++ struct {
++ UINT use_128x128_superblock : 1;
++ UINT intra_edge_filter : 1;
++ UINT interintra_compound : 1;
++ UINT masked_compound : 1;
++ UINT warped_motion : 1;
++ UINT dual_filter : 1;
++ UINT jnt_comp : 1;
++ UINT screen_content_tools : 1;
++ UINT integer_mv : 1;
++ UINT cdef : 1;
++ UINT restoration : 1;
++ UINT film_grain : 1;
++ UINT intrabc : 1;
++ UINT high_precision_mv : 1;
++ UINT switchable_motion_mode : 1;
++ UINT filter_intra : 1;
++ UINT disable_frame_end_update_cdf : 1; // dav1d: !refresh_context
++ UINT disable_cdf_update : 1;
++ UINT reference_mode : 1; // dav1d: switchable_comp_refs
++ UINT skip_mode : 1;
++ UINT reduced_tx_set : 1;
++ UINT superres : 1;
++ UINT tx_mode : 2;
++ UINT use_ref_frame_mvs : 1;
++ UINT enable_ref_frame_mvs : 1;
++ UINT reference_frame_update : 1; // dav1d: cleared only for a shown-existing key frame
++ UINT Reserved : 5;
++ };
++ UINT32 CodingParamToolFlags;
++ } coding;
++
++ // Format & Picture Info flags
++ union {
++ struct {
++ UCHAR frame_type : 2;
++ UCHAR show_frame : 1;
++ UCHAR showable_frame : 1;
++ UCHAR subsampling_x : 1;
++ UCHAR subsampling_y : 1;
++ UCHAR mono_chrome : 1;
++ UCHAR Reserved : 1;
++ };
++ UCHAR FormatAndPictureInfoFlags;
++ } format;
++
++ // References
++ UCHAR primary_ref_frame;
++ UCHAR order_hint;
++ UCHAR order_hint_bits;
++
++ DXVA_PicEntry_AV1 frame_refs[7];
++ UCHAR RefFrameMapTextureIndex[8]; // per reference slot; dav1d presets 0xFF, then stores dxva_picture_index
++
++ // Loop filter parameters
++ struct {
++ UCHAR filter_level[2];
++ UCHAR filter_level_u;
++ UCHAR filter_level_v;
++
++ UCHAR sharpness_level;
++ union {
++ struct {
++ UCHAR mode_ref_delta_enabled : 1;
++ UCHAR mode_ref_delta_update : 1;
++ UCHAR delta_lf_multi : 1;
++ UCHAR delta_lf_present : 1;
++ UCHAR Reserved : 4;
++ };
++ UCHAR ControlFlags;
++ } DUMMYUNIONNAME;
++ CHAR ref_deltas[8];
++ CHAR mode_deltas[2];
++ UCHAR delta_lf_res;
++ UCHAR frame_restoration_type[3]; // dav1d maps: 0 none, 1 Wiener, 2 SGRPROJ, 3 switchable
++ USHORT log2_restoration_unit_size[3]; // dav1d writes 8 when all three restoration types are zero
++ UINT16 Reserved16Bits;
++ } loop_filter;
++
++ // Quantization
++ struct {
++ union {
++ struct {
++ UCHAR delta_q_present : 1;
++ UCHAR delta_q_res : 2;
++ UCHAR Reserved : 5;
++ };
++ UCHAR ControlFlags;
++ } DUMMYUNIONNAME;
++
++ UCHAR base_qindex;
++ CHAR y_dc_delta_q;
++ CHAR u_dc_delta_q;
++ CHAR v_dc_delta_q;
++ CHAR u_ac_delta_q;
++ CHAR v_ac_delta_q;
++ // using_qmatrix:
++ UCHAR qm_y; // dav1d writes 0xFF to qm_y/qm_u/qm_v when quant.qm is off
++ UCHAR qm_u;
++ UCHAR qm_v;
++ UINT16 Reserved16Bits;
++ } quantization;
++
++ // Cdef parameters
++ struct {
++ union {
++ struct {
++ UCHAR damping : 2; // dav1d stores cdef.damping - 3
++ UCHAR bits : 2;
++ UCHAR Reserved : 4;
++ };
++ UCHAR ControlFlags;
++ } DUMMYUNIONNAME;
++
++ union {
++ struct {
++ UCHAR primary : 6; // dav1d: y_strength >> 2
++ UCHAR secondary : 2; // dav1d: y_strength & 0x3
++ };
++ UCHAR combined;
++ } y_strengths[8];
++
++ union {
++ struct {
++ UCHAR primary : 6; // dav1d: uv_strength >> 2
++ UCHAR secondary : 2; // dav1d: uv_strength & 0x3
++ };
++ UCHAR combined;
++ } uv_strengths[8];
++
++ } cdef;
++
++ UCHAR interp_filter;
++
++ // Segmentation
++ struct {
++ union {
++ struct {
++ UCHAR enabled : 1;
++ UCHAR update_map : 1;
++ UCHAR update_data : 1;
++ UCHAR temporal_update : 1;
++ UCHAR Reserved : 4;
++ };
++ UCHAR ControlFlags;
++ } DUMMYUNIONNAME;
++ UCHAR Reserved24Bits[3];
++
++ union {
++ struct {
++ UCHAR alt_q : 1;
++ UCHAR alt_lf_y_v : 1;
++ UCHAR alt_lf_y_h : 1;
++ UCHAR alt_lf_u : 1;
++ UCHAR alt_lf_v : 1;
++ UCHAR ref_frame : 1;
++ UCHAR skip : 1;
++ UCHAR globalmv : 1;
++ };
++ UCHAR mask;
++ } feature_mask[8];
++
++ SHORT feature_data[8][8]; // [segment][feature]; dav1d indexes features in feature_mask bit order
++
++ } segmentation;
++
++ struct { // Film grain; dav1d fills the grain parameters only when the frame header signals film grain
++ union {
++ struct {
++ USHORT apply_grain : 1;
++ USHORT scaling_shift_minus8 : 2;
++ USHORT chroma_scaling_from_luma : 1;
++ USHORT ar_coeff_lag : 2;
++ USHORT ar_coeff_shift_minus6 : 2;
++ USHORT grain_scale_shift : 2;
++ USHORT overlap_flag : 1;
++ USHORT clip_to_restricted_range : 1;
++ USHORT matrix_coeff_is_identity : 1;
++ USHORT Reserved : 3;
++ };
++ USHORT ControlFlags;
++ } DUMMYUNIONNAME;
++
++ USHORT grain_seed;
++ UCHAR scaling_points_y[14][2];
++ UCHAR num_y_points;
++ UCHAR scaling_points_cb[10][2];
++ UCHAR num_cb_points;
++ UCHAR scaling_points_cr[10][2];
++ UCHAR num_cr_points;
++ UCHAR ar_coeffs_y[24]; // dav1d stores coefficient + 128
++ UCHAR ar_coeffs_cb[25]; // dav1d stores coefficient + 128
++ UCHAR ar_coeffs_cr[25]; // dav1d stores coefficient + 128
++ UCHAR cb_mult; // dav1d stores uv_mult[0] + 128
++ UCHAR cb_luma_mult; // dav1d stores uv_luma_mult[0] + 128
++ UCHAR cr_mult; // dav1d stores uv_mult[1] + 128
++ UCHAR cr_luma_mult; // dav1d stores uv_luma_mult[1] + 128
++ UCHAR Reserved8Bits;
++ SHORT cb_offset; // dav1d stores uv_offset[0] + 256
++ SHORT cr_offset; // dav1d stores uv_offset[1] + 256
++ } film_grain;
++
++ UINT Reserved32Bits;
++ UINT StatusReportFeedbackNumber; // not written by fill_picparams_struct()
++} DXVA_PicParams_AV1, * LPDXVA_PicParams_AV1;
++
++/* AV1 tile structure; dav1d_decode_frame_dxva() writes one entry per tile */
++typedef struct _DXVA_Tile_AV1 {
++ UINT DataOffset; // offset of the tile's data in the contiguous bitstream buffer
++ UINT DataSize; // size of the tile's data
++ USHORT row; // tile row
++ USHORT column; // tile column
++ UINT16 Reserved16Bits;
++ UCHAR anchor_frame;
++ UCHAR Reserved8Bits;
++} DXVA_Tile_AV1, * LPDXVA_Tile_AV1;
++
++/* AV1 status reporting data structure (part of the copied Win32 API excerpt) */
++typedef struct _DXVA_Status_AV1 {
++ UINT StatusReportFeedbackNumber;
++ DXVA_PicEntry_AV1 CurrPic;
++ UCHAR BufType;
++ UCHAR Status;
++ UCHAR Reserved8Bits;
++ USHORT NumMbsAffected;
++} DXVA_Status_AV1, * LPDXVA_Status_AV1;
++
++#ifdef __cplusplus
++}
++#endif
++
++#endif // defined(_WIN32)
++
++#endif // _DIRECTX_AV1_VA_
+diff --git a/include/dav1d/meson.build b/include/dav1d/meson.build
+index b5649d3..275c8a2 100644
+--- a/include/dav1d/meson.build
++++ b/include/dav1d/meson.build
+@@ -31,11 +31,21 @@ version_h_target = configure_file(input: 'version.h.in',
+ output: 'version.h',
+ configuration: version_h_data)
+
++dav1d_api_headers = files(
++ 'common.h',
++ 'data.h',
++ 'dav1d.h',
++ 'headers.h',
++ 'picture.h',
++ )
++
++if host_machine.system() == 'windows'
++ dav1d_api_headers += files(
++ 'dxva_av1.h',
++ )
++endif
++
+ # install headers
+-install_headers('common.h',
+- 'data.h',
+- 'dav1d.h',
+- 'headers.h',
+- 'picture.h',
++install_headers(dav1d_api_headers,
+ version_h_target,
+ subdir : 'dav1d')
+diff --git a/include/dav1d/picture.h b/include/dav1d/picture.h
+index 98e5eb5..3e75607 100644
+--- a/include/dav1d/picture.h
++++ b/include/dav1d/picture.h
+@@ -93,6 +93,13 @@ typedef struct Dav1dPicture {
+ struct Dav1dRef *ref; ///< Frame data allocation origin
+
+ void *allocator_data; ///< pointer managed by the allocator
++
++ /**
++ * DXVA texture array index of this picture when decoding with DXVA.
++ * The allocator should store an index here; it is used to fill out
++ * DXVA_PicParams_AV1::CurrPicTextureIndex and ::RefFrameMapTextureIndex.
++ */
++ uint16_t dxva_picture_index;
+ } Dav1dPicture;
+
+ typedef struct Dav1dPicAllocator {
+diff --git a/meson.build b/meson.build
+index d5366f9..defda53 100644
+--- a/meson.build
++++ b/meson.build
+@@ -30,7 +30,7 @@ project('dav1d', ['c'],
+ 'b_ndebug=if-release'],
+ meson_version: '>= 0.47.0')
+
+-dav1d_soname_version = '4.0.2'
++dav1d_soname_version = '5.0.0'
+ dav1d_api_version_array = dav1d_soname_version.split('.')
+ dav1d_api_version_major = dav1d_api_version_array[0]
+ dav1d_api_version_minor = dav1d_api_version_array[1]
+@@ -156,6 +156,10 @@ if host_machine.system() == 'linux'
+ endif
+ endif
+
++d3d11_dependency = []
++if host_machine.system() == 'windows'
++ d3d11_dependency = cc.find_library('d3d11', required: true)
++endif
+
+ # Header checks
+
+diff --git a/src/decode.c b/src/decode.c
+index f678215..345f23e 100644
+--- a/src/decode.c
++++ b/src/decode.c
+@@ -50,6 +50,7 @@
+ #include "src/tables.h"
+ #include "src/thread_task.h"
+ #include "src/warpmv.h"
++#include "src/dxva.h"
+
+ static void init_quant_tables(const Dav1dSequenceHeader *const seq_hdr,
+ const Dav1dFrameHeader *const frame_hdr,
+@@ -3441,6 +3442,15 @@ int dav1d_submit_frame(Dav1dContext *const c) {
+ f->resize_start[1] = get_upscale_x0(in_cw, out_cw, f->resize_step[1]);
+ }
+
++#ifdef _WIN32
++ // allocate and fill DXVA structures if using DXVA
++ if (c->dxva.decode_callback)
++ {
++ res = dav1d_allocate_frame_dxva(f, c);
++ if (res < 0) goto error;
++ }
++#endif // _WIN32
++
+ // move f->cur into output queue
+ if (c->n_fc == 1) {
+ if (f->frame_hdr->show_frame)
+@@ -3585,8 +3595,21 @@ int dav1d_submit_frame(Dav1dContext *const c) {
+ }
+ }
+
+- if (c->n_fc == 1) {
+- if ((res = dav1d_decode_frame(f)) < 0) {
++ // FIXME currently DXVA based decode is always single-threaded
++ // In DX12 it is possible to schedule DXVA decode multi-threaded.
++ // This is not relevant to DX11/DX9 as those only support single-threaded operation.
++ if (c->dxva.decode_callback || c->n_fc == 1) {
++#ifdef _WIN32
++ if (c->dxva.decode_callback)
++ {
++ res = dav1d_decode_frame_dxva(f);
++ }
++ else
++#endif // _WIN32
++ {
++ res = dav1d_decode_frame(f);
++ }
++ if (res < 0) {
+ dav1d_picture_unref_internal(&c->out);
+ for (int i = 0; i < 8; i++) {
+ if (refresh_frame_flags & (1 << i)) {
+diff --git a/src/dxva.c b/src/dxva.c
+new file mode 100644
+index 0000000..4d39fc9
+--- /dev/null
++++ b/src/dxva.c
+@@ -0,0 +1,432 @@
++/*
++ * Copyright © 2020, VideoLAN and dav1d authors
++ * All rights reserved.
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright notice, this
++ * list of conditions and the following disclaimer.
++ *
++ * 2. Redistributions in binary form must reproduce the above copyright notice,
++ * this list of conditions and the following disclaimer in the documentation
++ * and/or other materials provided with the distribution.
++ *
++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
++ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ */
++
++#include "config.h"
++
++#include <errno.h>
++#include <stdint.h>
++#include <stdio.h>
++#include <stdlib.h>
++#include <string.h>
++
++#include "dav1d/headers.h"
++#include "common/intops.h"
++#include "common/mem.h"
++#include "common/validate.h"
++
++#if defined(_WIN32)
++
++#include "src/internal.h"
++#include "src/log.h"
++#include "src/picture.h"
++#include "src/ref.h"
++#include "src/dxva.h"
++#include "dav1d/dxva_av1.h"
++
++#define DXVA_INVALID_PICTURE_INDEX 0xFFu
++#define DXVA_INVALID_QM 0xFFu
++
++// Translate dav1d's restoration type into the code written to
++// DXVA_PicParams_AV1::loop_filter.frame_restoration_type[].
++static int dav1d_restoration_type_to_dxva(const enum Dav1dRestorationType type) {
++    if (type == DAV1D_RESTORATION_WIENER) return 1;
++    if (type == DAV1D_RESTORATION_SGRPROJ) return 2;
++    if (type == DAV1D_RESTORATION_SWITCHABLE) return 3;
++    // DAV1D_RESTORATION_NONE and anything unexpected
++    return 0;
++}
++
++// Fill the DXVA picture parameters for the frame held by fc from its sequence and
++// frame headers and from the reference slots in c. Returns 0 on success, or
++// DAV1D_ERR(EINVAL) for an unsupported bit depth index or pixel layout.
++static int fill_picparams_struct(DXVA_PicParams_AV1* params, const Dav1dFrameContext *fc, Dav1dContext *const c) {
++    params->CurrPicTextureIndex = (unsigned char)fc->cur.dxva_picture_index;
++
++    // Basics
++    const int bitdepthTranslation[] = { 8,10,12 };
++    params->width = fc->frame_hdr->width[0];
++    params->height = fc->frame_hdr->height;
++    params->max_width = fc->seq_hdr->max_width;
++    params->max_height = fc->seq_hdr->max_height;
++
++    params->superres_denom = fc->frame_hdr->super_res.width_scale_denominator;
++
++    if (fc->seq_hdr->hbd > 2)
++        return DAV1D_ERR(EINVAL);
++
++    params->bitdepth = bitdepthTranslation[fc->seq_hdr->hbd];
++    params->seq_profile = fc->seq_hdr->profile;
++    params->interp_filter = (int)fc->frame_hdr->subpel_filter_mode; // enum same as dxva
++
++    // Tiles
++    params->tiles.cols = fc->frame_hdr->tiling.cols;
++    params->tiles.rows = fc->frame_hdr->tiling.rows;
++    params->tiles.context_update_id = fc->frame_hdr->tiling.update;
++
++    // AV1 DXVA defines tiles in terms of tile width / height in SBs.
++    // dAV1d specifies the start offset of each tile
++    uint16_t last = fc->frame_hdr->tiling.col_start_sb[0];
++    for (int w = 1; w <= fc->frame_hdr->tiling.cols; w++) {
++        params->tiles.widths[w-1] = fc->frame_hdr->tiling.col_start_sb[w] - last;
++        last = fc->frame_hdr->tiling.col_start_sb[w];
++    }
++    last = fc->frame_hdr->tiling.row_start_sb[0];
++    for (int h = 1; h <= fc->frame_hdr->tiling.rows; h++) {
++        params->tiles.heights[h-1] = fc->frame_hdr->tiling.row_start_sb[h] - last;
++        last = fc->frame_hdr->tiling.row_start_sb[h];
++    }
++
++    // Coding Tools
++    params->coding.use_128x128_superblock = fc->seq_hdr->sb128;
++    params->coding.intra_edge_filter = fc->seq_hdr->intra_edge_filter;
++    params->coding.interintra_compound = fc->seq_hdr->inter_intra;
++    params->coding.masked_compound = fc->seq_hdr->masked_compound;
++    params->coding.warped_motion = fc->frame_hdr->warp_motion;
++    params->coding.dual_filter = fc->seq_hdr->dual_filter;
++    params->coding.jnt_comp = fc->seq_hdr->jnt_comp;
++    params->coding.screen_content_tools = fc->frame_hdr->allow_screen_content_tools;
++    params->coding.integer_mv = fc->frame_hdr->force_integer_mv;
++    params->coding.cdef = fc->seq_hdr->cdef;
++    params->coding.restoration = fc->seq_hdr->restoration;
++    params->coding.film_grain = fc->seq_hdr->film_grain_present;
++    params->coding.intrabc = fc->frame_hdr->allow_intrabc;
++    params->coding.high_precision_mv = fc->frame_hdr->hp;
++    params->coding.switchable_motion_mode = fc->frame_hdr->switchable_motion_mode;
++    params->coding.filter_intra = fc->seq_hdr->filter_intra;
++    params->coding.disable_frame_end_update_cdf = !fc->frame_hdr->refresh_context;
++    params->coding.disable_cdf_update = fc->frame_hdr->disable_cdf_update;
++    params->coding.reference_mode = fc->frame_hdr->switchable_comp_refs;
++    params->coding.skip_mode = fc->frame_hdr->skip_mode_enabled;
++    params->coding.reduced_tx_set = fc->frame_hdr->reduced_txtp_set;
++    params->coding.superres = fc->frame_hdr->super_res.enabled;
++    params->coding.tx_mode = (int)fc->frame_hdr->txfm_mode; // enum same as dxva
++    params->coding.use_ref_frame_mvs = fc->frame_hdr->use_ref_frame_mvs;
++    params->coding.enable_ref_frame_mvs = fc->seq_hdr->ref_frame_mvs;
++    params->coding.reference_frame_update = !(fc->frame_hdr->show_existing_frame == 1 && fc->frame_hdr->frame_type == DAV1D_FRAME_TYPE_KEY);
++
++    // Format & Picture Info flags
++    params->format.frame_type = (int)fc->frame_hdr->frame_type; // enum is same as dxva
++    params->format.show_frame = fc->frame_hdr->show_frame;
++    params->format.showable_frame = fc->frame_hdr->showable_frame;
++    switch (fc->cur.p.layout) {
++    case DAV1D_PIXEL_LAYOUT_I400:
++    case DAV1D_PIXEL_LAYOUT_I420:
++        params->format.subsampling_x = 1;
++        params->format.subsampling_y = 1;
++        break;
++    case DAV1D_PIXEL_LAYOUT_I422:
++        params->format.subsampling_x = 1;
++        params->format.subsampling_y = 0;
++        break;
++    case DAV1D_PIXEL_LAYOUT_I444:
++        params->format.subsampling_x = 0;
++        params->format.subsampling_y = 0;
++        break;
++    default:
++        return DAV1D_ERR(EINVAL);
++    }
++    params->format.mono_chrome = fc->seq_hdr->monochrome;
++
++    // References
++    params->order_hint = fc->frame_hdr->frame_offset;
++    params->order_hint_bits = fc->seq_hdr->order_hint_n_bits;
++
++    params->primary_ref_frame = fc->frame_hdr->primary_ref_frame;
++
++    memset(params->RefFrameMapTextureIndex, 0xFF, sizeof(params->RefFrameMapTextureIndex));
++    for (int i = 0; i < DAV1D_REFS_PER_FRAME; i++) {
++        const int idx = fc->frame_hdr->refidx[i];
++        // Index names a reference slot; the invalid marker flags a slot that holds no picture
++        params->frame_refs[i].Index = c->refs[idx].p.p.data[0] ? idx : DXVA_INVALID_PICTURE_INDEX;
++        memcpy(params->frame_refs[i].wmmat, fc->frame_hdr->gmv[i].matrix, 6 * sizeof(int));
++        params->frame_refs[i].wmtype = (int)fc->frame_hdr->gmv[i].type; // enum same as dxva
++        params->frame_refs[i].wminvalid = fc->frame_hdr->gmv[i].type == DAV1D_WM_TYPE_IDENTITY;
++        params->frame_refs[i].width = c->refs[idx].p.p.p.w;
++        params->frame_refs[i].height = c->refs[idx].p.p.p.h;
++    }
++    // Slots without a picture must keep the 0xFF marker written by the memset above.
++    for (int i = 0; i < 8; i++) {
++        if (c->refs[i].p.p.data[0])
++            params->RefFrameMapTextureIndex[i] = (uint8_t)c->refs[i].p.p.dxva_picture_index;
++    }
++
++    // Loop filter parameters
++    params->loop_filter.filter_level[0] = fc->frame_hdr->loopfilter.level_y[0];
++    params->loop_filter.filter_level[1] = fc->frame_hdr->loopfilter.level_y[1];
++    params->loop_filter.filter_level_u = fc->frame_hdr->loopfilter.level_u;
++    params->loop_filter.filter_level_v = fc->frame_hdr->loopfilter.level_v;
++    params->loop_filter.sharpness_level = fc->frame_hdr->loopfilter.sharpness;
++    params->loop_filter.mode_ref_delta_enabled = fc->frame_hdr->loopfilter.mode_ref_delta_enabled;
++    params->loop_filter.mode_ref_delta_update = fc->frame_hdr->loopfilter.mode_ref_delta_update;
++    params->loop_filter.delta_lf_multi = fc->frame_hdr->delta.lf.multi;
++    params->loop_filter.delta_lf_present = fc->frame_hdr->delta.lf.present;
++    params->loop_filter.mode_deltas[0] = fc->frame_hdr->loopfilter.mode_ref_deltas.mode_delta[0];
++    params->loop_filter.mode_deltas[1] = fc->frame_hdr->loopfilter.mode_ref_deltas.mode_delta[1];
++    for (int i = 0; i < DAV1D_TOTAL_REFS_PER_FRAME; i++) {
++        params->loop_filter.ref_deltas[i] = fc->frame_hdr->loopfilter.mode_ref_deltas.ref_delta[i];
++    }
++    params->loop_filter.delta_lf_res = fc->frame_hdr->delta.lf.res_log2;
++    char haverestoration = fc->frame_hdr->restoration.type[0] || fc->frame_hdr->restoration.type[1] || fc->frame_hdr->restoration.type[2];
++    for (int i = 0; i < 3; i++) {
++        params->loop_filter.frame_restoration_type[i] = dav1d_restoration_type_to_dxva(fc->frame_hdr->restoration.type[i]);
++        params->loop_filter.log2_restoration_unit_size[i] = haverestoration ? fc->frame_hdr->restoration.unit_size[i ? 1 : 0] : 8; // dav1d only tracks y and uv not y,u,v
++    }
++
++    // Quantization
++    params->quantization.delta_q_present = fc->frame_hdr->delta.q.present;
++    params->quantization.delta_q_res = fc->frame_hdr->delta.q.res_log2;
++    params->quantization.base_qindex = fc->frame_hdr->quant.yac;
++    params->quantization.y_dc_delta_q = fc->frame_hdr->quant.ydc_delta;
++    params->quantization.u_dc_delta_q = fc->frame_hdr->quant.udc_delta;
++    params->quantization.v_dc_delta_q = fc->frame_hdr->quant.vdc_delta;
++    params->quantization.u_ac_delta_q = fc->frame_hdr->quant.uac_delta;
++    params->quantization.v_ac_delta_q = fc->frame_hdr->quant.vac_delta;
++    if (fc->frame_hdr->quant.qm) {
++        params->quantization.qm_y = fc->frame_hdr->quant.qm_y;
++        params->quantization.qm_u = fc->frame_hdr->quant.qm_u;
++        params->quantization.qm_v = fc->frame_hdr->quant.qm_v;
++    } else {
++        params->quantization.qm_y = DXVA_INVALID_QM;
++        params->quantization.qm_u = DXVA_INVALID_QM;
++        params->quantization.qm_v = DXVA_INVALID_QM;
++    }
++
++    // Cdef parameters
++    params->cdef.damping = fc->frame_hdr->cdef.damping - 3;
++    params->cdef.bits = fc->frame_hdr->cdef.n_bits;
++    for (int i = 0; i < DAV1D_MAX_CDEF_STRENGTHS; i++) {
++        params->cdef.y_strengths[i].primary = fc->frame_hdr->cdef.y_strength[i] >> 2;
++        params->cdef.y_strengths[i].secondary = fc->frame_hdr->cdef.y_strength[i] & 0x3;
++        params->cdef.uv_strengths[i].primary = fc->frame_hdr->cdef.uv_strength[i] >> 2;
++        params->cdef.uv_strengths[i].secondary = fc->frame_hdr->cdef.uv_strength[i] & 0x3;
++    }
++
++    // Segmentation
++    params->segmentation.enabled = fc->frame_hdr->segmentation.enabled;
++    params->segmentation.update_map = fc->frame_hdr->segmentation.update_map;
++    params->segmentation.update_data = fc->frame_hdr->segmentation.update_data;
++    params->segmentation.temporal_update = fc->frame_hdr->segmentation.temporal;
++    for (int i = 0; i < DAV1D_MAX_SEGMENTS; i++) {
++        params->segmentation.feature_mask[i].alt_q = fc->frame_hdr->segmentation.seg_data.d[i].delta_q != 0;
++        params->segmentation.feature_data[i][0] = fc->frame_hdr->segmentation.seg_data.d[i].delta_q;
++
++        params->segmentation.feature_mask[i].alt_lf_y_v = fc->frame_hdr->segmentation.seg_data.d[i].delta_lf_y_v != 0;
++        params->segmentation.feature_data[i][1] = fc->frame_hdr->segmentation.seg_data.d[i].delta_lf_y_v;
++
++        params->segmentation.feature_mask[i].alt_lf_y_h = fc->frame_hdr->segmentation.seg_data.d[i].delta_lf_y_h != 0;
++        params->segmentation.feature_data[i][2] = fc->frame_hdr->segmentation.seg_data.d[i].delta_lf_y_h;
++
++        params->segmentation.feature_mask[i].alt_lf_u = fc->frame_hdr->segmentation.seg_data.d[i].delta_lf_u != 0;
++        params->segmentation.feature_data[i][3] = fc->frame_hdr->segmentation.seg_data.d[i].delta_lf_u;
++
++        params->segmentation.feature_mask[i].alt_lf_v = fc->frame_hdr->segmentation.seg_data.d[i].delta_lf_v != 0;
++        params->segmentation.feature_data[i][4] = fc->frame_hdr->segmentation.seg_data.d[i].delta_lf_v;
++
++        params->segmentation.feature_mask[i].ref_frame = fc->frame_hdr->segmentation.seg_data.d[i].ref != -1;
++        params->segmentation.feature_data[i][5] = fc->frame_hdr->segmentation.seg_data.d[i].ref == -1 ? 0 : fc->frame_hdr->segmentation.seg_data.d[i].ref;
++
++        params->segmentation.feature_mask[i].skip = fc->frame_hdr->segmentation.seg_data.d[i].skip != 0;
++        params->segmentation.feature_data[i][6] = fc->frame_hdr->segmentation.seg_data.d[i].skip;
++
++        params->segmentation.feature_mask[i].globalmv = fc->frame_hdr->segmentation.seg_data.d[i].globalmv != 0;
++        params->segmentation.feature_data[i][7] = fc->frame_hdr->segmentation.seg_data.d[i].globalmv;
++    }
++
++    // Film Grain
++    // apply_grain is written unconditionally: the buffer comes from alloc_callback and is not cleared here
++    params->film_grain.apply_grain = fc->frame_hdr->film_grain.present ? 1 : 0;
++    if (fc->frame_hdr->film_grain.present) {
++        params->film_grain.scaling_shift_minus8 = fc->frame_hdr->film_grain.data.scaling_shift - 8;
++        params->film_grain.chroma_scaling_from_luma = fc->frame_hdr->film_grain.data.chroma_scaling_from_luma;
++        params->film_grain.ar_coeff_lag = fc->frame_hdr->film_grain.data.ar_coeff_lag;
++        params->film_grain.ar_coeff_shift_minus6 = (uint16_t)(fc->frame_hdr->film_grain.data.ar_coeff_shift - 6);
++        params->film_grain.grain_scale_shift = fc->frame_hdr->film_grain.data.grain_scale_shift;
++        params->film_grain.overlap_flag = fc->frame_hdr->film_grain.data.overlap_flag;
++        params->film_grain.clip_to_restricted_range = fc->frame_hdr->film_grain.data.clip_to_restricted_range;
++        params->film_grain.matrix_coeff_is_identity = fc->seq_hdr->mtrx == DAV1D_MC_IDENTITY;
++        params->film_grain.grain_seed = fc->frame_hdr->film_grain.data.seed;
++        memcpy(params->film_grain.scaling_points_y, fc->frame_hdr->film_grain.data.y_points, 14 * 2);
++        params->film_grain.num_y_points = fc->frame_hdr->film_grain.data.num_y_points;
++        memcpy(params->film_grain.scaling_points_cb, fc->frame_hdr->film_grain.data.uv_points[0], 10 * 2);
++        params->film_grain.num_cb_points = fc->frame_hdr->film_grain.data.num_uv_points[0];
++        memcpy(params->film_grain.scaling_points_cr, fc->frame_hdr->film_grain.data.uv_points[1], 10 * 2);
++        params->film_grain.num_cr_points = fc->frame_hdr->film_grain.data.num_uv_points[1];
++        for (int i = 0; i < 24; i++) {
++            params->film_grain.ar_coeffs_y[i] = (UCHAR)((int)fc->frame_hdr->film_grain.data.ar_coeffs_y[i] + 128);
++        }
++        for (int i = 0; i < 25; i++) {
++            params->film_grain.ar_coeffs_cb[i] = fc->frame_hdr->film_grain.data.ar_coeffs_uv[0][i] + 128;
++            params->film_grain.ar_coeffs_cr[i] = fc->frame_hdr->film_grain.data.ar_coeffs_uv[1][i] + 128;
++        }
++        params->film_grain.cb_mult = fc->frame_hdr->film_grain.data.uv_mult[0] + 128;
++        params->film_grain.cb_luma_mult = fc->frame_hdr->film_grain.data.uv_luma_mult[0] + 128;
++        params->film_grain.cr_mult = fc->frame_hdr->film_grain.data.uv_mult[1] + 128;
++        params->film_grain.cr_luma_mult = fc->frame_hdr->film_grain.data.uv_luma_mult[1] + 128;
++        params->film_grain.cb_offset = fc->frame_hdr->film_grain.data.uv_offset[0] + 256;
++        params->film_grain.cr_offset = fc->frame_hdr->film_grain.data.uv_offset[1] + 256;
++    }
++
++    return 0;
++}
++
++// Obtain DXVA buffers for f->cur via alloc_callback and fill the picture parameters.
++// Returns 0 with f->dxva_params / f->dxva_tiles set, or a negative DAV1D_ERR code.
++int dav1d_allocate_frame_dxva(Dav1dFrameContext *const f, Dav1dContext *const c) {
++    DXVA_PicParams_AV1 *params = NULL;
++    DXVA_Tile_AV1 *tiles = NULL;
++    const int tile_count = (1 << f->frame_hdr->tiling.log2_cols) * (1 << f->frame_hdr->tiling.log2_rows);
++
++    int retval = c->dxva.alloc_callback(c->dxva.cookie, &f->cur, &params, &tiles, tile_count);
++    if (retval < 0)
++        return retval;
++
++    if ((retval = fill_picparams_struct(params, f, c)) < 0) {
++        c->dxva.release_callback(c->dxva.cookie, params, tiles); // hand the buffers back on failure
++        return retval;
++    }
++
++    f->dxva_params = params;
++    f->dxva_tiles = tiles;
++    return 0;
++}
++
++/*
++ * Builds the DXVA tile list for frame f, passes the frame to the
++ * application's decode_callback, then drops everything the frame context
++ * was holding (DXVA buffers, reference pictures, CDFs, headers, tile data).
++ *
++ * The cleanup runs on success and on failure alike. Returns 0 on success,
++ * the callback's negative error code if it fails, or DAV1D_ERR(ENOMEM) when
++ * the tile data is inconsistent with the frame header.
++ */
++int dav1d_decode_frame_dxva(Dav1dFrameContext *const f) {
++    int retval = DAV1D_ERR(ENOMEM);
++    const int tile_count = (1 << f->frame_hdr->tiling.log2_cols) * (1 << f->frame_hdr->tiling.log2_rows);
++
++    // Construct tile list
++    // Tiles are copied into GPU memory as one contiguous block, with each
++    // tile having an entry in the tile list (DXVA_Tile_AV1); specifying its offset
++    // into the GPU buffer and its size.
++    int tile_row = 0, tile_col = 0, tile_index = 0;
++    size_t total_data_offset = 0;
++    DXVA_Tile_AV1 *current_tile = f->dxva_tiles;
++    for (int i = 0; i < f->n_tile_data; i++) {
++        const uint8_t *data = f->tile[i].data.data;
++        size_t size = f->tile[i].data.sz;
++
++        for (int j = f->tile[i].start; j <= f->tile[i].end; j++) {
++            size_t tile_sz;
++            if (j == f->tile[i].end) {
++                // The last tile of a group takes whatever is left
++                tile_sz = size;
++            } else {
++                // Every other tile is preceded by a little-endian
++                // (size - 1) field of tiling.n_bytes bytes
++                if (f->frame_hdr->tiling.n_bytes > size)
++                    goto error;
++                tile_sz = 0;
++                for (unsigned k = 0; k < f->frame_hdr->tiling.n_bytes; k++) {
++                    tile_sz |= (unsigned)*data++ << (k * 8);
++                    total_data_offset++;
++                }
++                tile_sz++;
++                size -= f->frame_hdr->tiling.n_bytes;
++                if (tile_sz > size)
++                    goto error;
++            }
++
++            // The tile array was requested with tile_count entries, so
++            // index tile_count is already one past its end
++            if (tile_index >= tile_count)
++                goto error;
++
++            current_tile->DataOffset = (uint32_t)total_data_offset;
++            current_tile->DataSize = (uint32_t)tile_sz;
++            current_tile->row = tile_row;
++            current_tile->column = tile_col++;
++            // large scale tile decoding process is not supported
++            current_tile->anchor_frame = DXVA_INVALID_PICTURE_INDEX;
++
++            if (tile_col == f->frame_hdr->tiling.cols) {
++                tile_col = 0;
++                tile_row++;
++            }
++
++            total_data_offset += tile_sz;
++            size -= tile_sz;
++            data += tile_sz;
++            tile_index++;
++            current_tile++;
++        }
++    }
++
++    if ((retval = f->c->dxva.decode_callback(f->c->dxva.cookie, f->dxva_params, f->dxva_tiles, tile_count, &f->cur, f->tile, f->n_tile_data)) < 0)
++        goto error;
++
++    retval = 0;
++
++error:
++    // Reached on success as well: the frame is finished either way
++    f->c->dxva.release_callback(f->c->dxva.cookie, f->dxva_params, f->dxva_tiles);
++
++    for (int i = 0; i < 7; i++) {
++        if (f->refp[i].p.data[0])
++            dav1d_thread_picture_unref(&f->refp[i]);
++        dav1d_ref_dec(&f->ref_mvs_ref[i]);
++    }
++
++    dav1d_picture_unref_internal(&f->cur);
++    dav1d_thread_picture_unref(&f->sr_cur);
++    dav1d_cdf_thread_unref(&f->in_cdf);
++    if (f->frame_hdr->refresh_context) {
++        dav1d_cdf_thread_signal(&f->out_cdf);
++        dav1d_cdf_thread_unref(&f->out_cdf);
++    }
++    dav1d_ref_dec(&f->cur_segmap_ref);
++    dav1d_ref_dec(&f->prev_segmap_ref);
++    dav1d_ref_dec(&f->mvs_ref);
++    dav1d_ref_dec(&f->seq_hdr_ref);
++    dav1d_ref_dec(&f->frame_hdr_ref);
++
++    for (int i = 0; i < f->n_tile_data; i++)
++        dav1d_data_unref_internal(&f->tile[i].data);
++
++    return retval;
++}
++
++/*
++ * Default alloc_callback: zero-initialised heap storage for one
++ * DXVA_PicParams_AV1 and an array of n_tiles DXVA_Tile_AV1 entries.
++ *
++ * cookie and picture are not used. Returns 0 and stores both pointers on
++ * success. On failure nothing stays allocated, *pic and *tiles are set to
++ * NULL and DAV1D_ERR(ENOMEM) is returned.
++ */
++int dav1d_default_dxva_alloc(void *cookie, Dav1dPicture* picture, DXVA_PicParams_AV1 **pic, DXVA_Tile_AV1 **tiles, int n_tiles) {
++    DXVA_PicParams_AV1 *const new_pic = calloc(1, sizeof(*new_pic));
++    DXVA_Tile_AV1 *const new_tiles = calloc(n_tiles, sizeof(*new_tiles));
++
++    if (!new_pic || !new_tiles) {
++        // Whichever half succeeded must not outlive the failure
++        free(new_pic);
++        free(new_tiles);
++        *pic = NULL;
++        *tiles = NULL;
++        return DAV1D_ERR(ENOMEM);
++    }
++
++    *pic = new_pic;
++    *tiles = new_tiles;
++    return 0;
++}
++
++/*
++ * Default release_callback: frees the two heap blocks passed in.
++ * cookie is not used.
++ */
++void dav1d_default_dxva_release(void *cookie, DXVA_PicParams_AV1 *pic, DXVA_Tile_AV1 *tiles) {
++    (void)cookie;
++    free(tiles);
++    free(pic);
++}
++
++#else // defined(_WIN32)
++
++// Fallback branch: every DXVA entry point declared in dxva.h exists but
++// reports DAV1D_ERR(EINVAL) (or does nothing, for the release hook).
++
++int dav1d_default_dxva_alloc(void *cookie, Dav1dPicture* picture, DXVA_PicParams_AV1 **pic, DXVA_Tile_AV1 **tiles, int n_tiles) {
++    return DAV1D_ERR(EINVAL);
++}
++
++void dav1d_default_dxva_release(void *cookie, DXVA_PicParams_AV1 *pic, DXVA_Tile_AV1 *tiles) {
++}
++
++// Declared in dxva.h alongside the other three, so it needs a stub here too
++int dav1d_allocate_frame_dxva(Dav1dFrameContext *const f, Dav1dContext *const c) {
++    return DAV1D_ERR(EINVAL);
++}
++
++int dav1d_decode_frame_dxva(Dav1dFrameContext *const f) {
++    return DAV1D_ERR(EINVAL);
++}
++
++#endif
+diff --git a/src/dxva.h b/src/dxva.h
+new file mode 100644
+index 0000000..b695a9e
+--- /dev/null
++++ b/src/dxva.h
+@@ -0,0 +1,44 @@
++/*
++ * Copyright © 2020, VideoLAN and dav1d authors
++ * All rights reserved.
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright notice, this
++ * list of conditions and the following disclaimer.
++ *
++ * 2. Redistributions in binary form must reproduce the above copyright notice,
++ * this list of conditions and the following disclaimer in the documentation
++ * and/or other materials provided with the distribution.
++ *
++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
++ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ */
++
++#ifndef DAV1D_DXVA_H
++#define DAV1D_DXVA_H
++
++#include <stddef.h>
++#include <stdint.h>
++
++#include "common.h"
++#include "headers.h"
++
++// Gets the DXVA picture-parameter struct and tile array for frame f from
++// c->dxva.alloc_callback, fills in the picture parameters and stores both
++// buffers in f. Returns 0 on success or a negative error code.
++int dav1d_allocate_frame_dxva(Dav1dFrameContext *const f, Dav1dContext *const c);
++
++// Builds the tile list for frame f, invokes the DXVA decode_callback and then
++// releases the DXVA buffers and the frame's references, on success and on
++// failure alike. Returns 0 on success or a negative error code.
++int dav1d_decode_frame_dxva(Dav1dFrameContext *const f);
++
++// Default alloc_callback: calloc()-backed storage for one picture-parameter
++// struct and n_tiles tile entries. The non-Windows build of dxva.c returns
++// DAV1D_ERR(EINVAL) instead.
++int dav1d_default_dxva_alloc(void *cookie, Dav1dPicture* picture, DXVA_PicParams_AV1 **pic, DXVA_Tile_AV1 **tiles, int n_tiles);
++
++// Default release_callback: free()s what dav1d_default_dxva_alloc() returned.
++void dav1d_default_dxva_release(void *cookie, DXVA_PicParams_AV1 *pic, DXVA_Tile_AV1 *tiles);
++
++#endif /* DAV1D_DXVA_H */
+diff --git a/src/internal.h b/src/internal.h
+index 07f5676..043d28c 100644
+--- a/src/internal.h
++++ b/src/internal.h
+@@ -67,11 +67,6 @@ typedef struct Dav1dDSPContext {
+ Dav1dLoopRestorationDSPContext lr;
+ } Dav1dDSPContext;
+
+-struct Dav1dTileGroup {
+- Dav1dData data;
+- int start, end;
+-};
+-
+ struct Dav1dContext {
+ Dav1dFrameContext *fc;
+ unsigned n_fc;
+@@ -135,6 +130,7 @@ struct Dav1dContext {
+ int drain;
+
+ Dav1dLogger logger;
++ Dav1dDXVA dxva;
+ };
+
+ struct Dav1dFrameContext {
+@@ -157,6 +153,8 @@ struct Dav1dFrameContext {
+ struct Dav1dTileGroup *tile;
+ int n_tile_data_alloc;
+ int n_tile_data;
++ DXVA_PicParams_AV1 *dxva_params; // DXVA
++ DXVA_Tile_AV1 *dxva_tiles; // DXVA
+
+ // for scalable references
+ struct ScalableMotionParams {
+diff --git a/src/lib.c b/src/lib.c
+index 82af64a..88f4da2 100644
+--- a/src/lib.c
++++ b/src/lib.c
+@@ -50,6 +50,7 @@
+ #include "src/ref.h"
+ #include "src/thread_task.h"
+ #include "src/wedge.h"
++#include "src/dxva.h"
+
+ static COLD void init_internal(void) {
+ dav1d_init_cpu();
+@@ -75,6 +76,15 @@ COLD void dav1d_default_settings(Dav1dSettings *const s) {
+ s->operating_point = 0;
+ s->all_layers = 1; // just until the tests are adjusted
+ s->frame_size_limit = 0;
++ s->dxva.cookie = NULL;
++ s->dxva.decode_callback = NULL;
++#ifdef _WIN32
++ s->dxva.alloc_callback = dav1d_default_dxva_alloc;
++ s->dxva.release_callback = dav1d_default_dxva_release;
++#else // !_WIN32
++ s->dxva.alloc_callback = NULL;
++ s->dxva.release_callback = NULL;
++#endif // !_WIN32
+ }
+
+ static void close_internal(Dav1dContext **const c_out, int flush);
+@@ -111,6 +121,12 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
+ DAV1D_ERR(EINVAL));
+ validate_input_or_ret(s->operating_point >= 0 &&
+ s->operating_point <= 31, DAV1D_ERR(EINVAL));
++ validate_input_or_ret(s->dxva.decode_callback == NULL ||
++ s->dxva.alloc_callback != NULL,
++ DAV1D_ERR(EINVAL));
++ validate_input_or_ret(s->dxva.decode_callback == NULL ||
++ s->dxva.release_callback != NULL,
++ DAV1D_ERR(EINVAL));
+
+ pthread_attr_t thread_attr;
+ if (pthread_attr_init(&thread_attr)) return DAV1D_ERR(ENOMEM);
+@@ -124,6 +140,7 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
+
+ c->allocator = s->allocator;
+ c->logger = s->logger;
++ c->dxva = s->dxva;
+ c->apply_grain = s->apply_grain;
+ c->operating_point = s->operating_point;
+ c->all_layers = s->all_layers;
+@@ -142,7 +159,7 @@ COLD int dav1d_open(Dav1dContext **const c_out, const Dav1dSettings *const s) {
+
+ c->frame_thread.flush = &c->frame_thread.flush_mem;
+ atomic_init(c->frame_thread.flush, 0);
+- c->n_fc = s->n_frame_threads;
++ c->n_fc = s->dxva.decode_callback ? 1 : s->n_frame_threads;
+ c->fc = dav1d_alloc_aligned(sizeof(*c->fc) * s->n_frame_threads, 32);
+ if (!c->fc) goto error;
+ memset(c->fc, 0, sizeof(*c->fc) * s->n_frame_threads);
+diff --git a/src/meson.build b/src/meson.build
+index fd8ad02..91cb744 100644
+--- a/src/meson.build
++++ b/src/meson.build
+@@ -50,6 +50,12 @@ libdav1d_sources = files(
+ 'wedge.c',
+ )
+
++if host_machine.system() == 'windows'
++ libdav1d_sources += files(
++ 'dxva.c',
++ )
++endif
++
+ # libdav1d bitdepth source files
+ # These files are compiled for each bitdepth with
+ # `BITDEPTH` defined to the currently built bitdepth.
+diff --git a/tools/dav1d.c b/tools/dav1d.c
+index 4b97a9f..3a121e1 100644
+--- a/tools/dav1d.c
++++ b/tools/dav1d.c
+@@ -57,6 +57,10 @@
+
+ #include "dav1d_cli_parse.h"
+
++#ifdef _WIN32
++#include "dav1d_cli_dxva.h"
++#endif
++
+ static uint64_t get_time_nanos(void) {
+ #ifdef _WIN32
+ LARGE_INTEGER frequency;
+@@ -150,6 +154,9 @@ int main(const int argc, char *const *const argv) {
+ double i_fps;
+ FILE *frametimes = NULL;
+ const char *version = dav1d_version();
++#ifdef _WIN32
++ Dav1dDXVAInfo dxva;
++#endif
+
+ if (strcmp(version, DAV1D_VERSION)) {
+ fprintf(stderr, "Version mismatch (library: %s, executable: %s)\n",
+@@ -197,6 +204,12 @@ int main(const int argc, char *const *const argv) {
+ if (cli_settings.limit != 0 && cli_settings.limit < total)
+ total = cli_settings.limit;
+
++#ifdef _WIN32
++ if (cli_settings.dxva) {
++ dxva_init(&lib_settings, &dxva);
++ }
++#endif
++
+ if ((res = dav1d_open(&c, &lib_settings)))
+ return EXIT_FAILURE;
+
+@@ -235,6 +248,11 @@ int main(const int argc, char *const *const argv) {
+ }
+ res = 0;
+ } else {
++#ifdef _WIN32
++ if (cli_settings.dxva) {
++ dxva_lock(&p, &dxva);
++ }
++#endif
+ if (!n_out) {
+ if ((res = output_open(&out, cli_settings.muxer,
+ cli_settings.outputfile,
+@@ -272,6 +290,11 @@ int main(const int argc, char *const *const argv) {
+ break;
+ }
+ } else {
++#ifdef _WIN32
++ if (cli_settings.dxva) {
++ dxva_lock(&p, &dxva);
++ }
++#endif
+ if (!n_out) {
+ if ((res = output_open(&out, cli_settings.muxer,
+ cli_settings.outputfile,
+diff --git a/tools/dav1d_cli_dxva.c b/tools/dav1d_cli_dxva.c
+new file mode 100644
+index 0000000..7415cd6
+--- /dev/null
++++ b/tools/dav1d_cli_dxva.c
+@@ -0,0 +1,403 @@
++/*
++ * Copyright © 2018, VideoLAN and dav1d authors
++ * Copyright © 2018, Two Orioles, LLC
++ * All rights reserved.
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright notice, this
++ * list of conditions and the following disclaimer.
++ *
++ * 2. Redistributions in binary form must reproduce the above copyright notice,
++ * this list of conditions and the following disclaimer in the documentation
++ * and/or other materials provided with the distribution.
++ *
++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
++ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ */
++
++#ifdef _WIN32
++#define COBJMACROS // this makes COM in C more reasonable
++#include <initguid.h>
++#include "config.h"
++#include "vcs_version.h"
++#include "cli_config.h"
++#include "dav1d_cli_dxva.h"
++#include "dav1d/dxva_av1.h"
++
++#define DXVA_INVALID_PICTURE_INDEX 0xFFu
++
++// These may not be defined depending on the Windows SDK version used
++// We can just re-define them without issue as they will never change
++#if (WDK_NTDDI_VERSION <= NTDDI_WIN10_19H1)
++DEFINE_GUID(DXVA_ModeAV1_VLD_Profile0, 0xb8be4ccb, 0xcf53, 0x46ba, 0x8d, 0x59, 0xd6, 0xb8, 0xa6, 0xda, 0x5d, 0x2a);
++DEFINE_GUID(DXVA_ModeAV1_VLD_Profile1, 0x6936ff0f, 0x45b1, 0x4163, 0x9c, 0xc1, 0x64, 0x6e, 0xf6, 0x94, 0x61, 0x08);
++DEFINE_GUID(DXVA_ModeAV1_VLD_Profile2, 0x0c5f2aa1, 0xe541, 0x4089, 0xbb, 0x7b, 0x98, 0x11, 0x0a, 0x19, 0xd7, 0xc8);
++DEFINE_GUID(DXVA_ModeAV1_VLD_12bit_Profile2, 0x17127009, 0xa00f, 0x4ce1, 0x99, 0x4e, 0xbf, 0x40, 0x81, 0xf6, 0xf3, 0xf0);
++DEFINE_GUID(DXVA_ModeAV1_VLD_12bit_Profile2_420, 0x2d80bed6, 0x9cac, 0x4835, 0x9e, 0x91, 0x32, 0x7b, 0xbc, 0x4f, 0x9e, 0xe8);
++#endif
++
++// This is an example implementation of DXVA using the DX11 interface. DX9 is similar to this,
++// while DX12 is very different (most seemingly unused parameters in the callbacks are for DX12).
++// To keep the sample simple, this does not implement other important features like:
++// HW-DRM, histogram generation, device loss checks, array of textures support,
++// downsampling, and no-recreate on DRC
++
++// NOTE: this file uses quite a lot of COM, since that is how D3D11 & DXVA are defined
++// it is highly recommended to use C++ for this, since the code will be cleaner.
++
++// Appends sample index `index` to the ring of free samples and advances the
++// write cursor. Always returns 0.
++int insert_freelist(int index, Dav1dDXVAInfo *dxva)
++{
++    const unsigned slot = dxva->freeListWrite++ % MAX_SAMPLE_POOL_SIZE;
++    dxva->freeList[slot] = index;
++    return 0;
++}
++
++// Makes sure a D3D11 video decoder matching p's sequence header exists,
++// together with the decode texture array, the staging texture and one output
++// view per array slice. An existing decoder with an identical description is
++// reused as is.
++//
++// Returns 0 on success, DAV1D_ERR(1) for an unsupported profile/layout or a
++// decoder creation failure, DAV1D_ERR(ENOMEM) when a texture or view cannot
++// be created. After a failed rebuild the decoder is dropped, so the next call
++// rebuilds from scratch instead of reusing half-created state.
++int dxva_create_decoder(Dav1dPicture *p, Dav1dDXVAInfo *dxva) {
++    // Zeroed so the memcmp() against the cached description below never
++    // compares indeterminate bytes
++    D3D11_VIDEO_DECODER_DESC desc = {0};
++    D3D11_VIDEO_DECODER_CONFIG config = {0};
++    int err = DAV1D_ERR(ENOMEM);
++
++    // DXVA needs to know the maximums for this sequence
++    desc.SampleWidth = p->seq_hdr->max_width;
++    desc.SampleHeight = p->seq_hdr->max_height;
++    switch (p->seq_hdr->profile) {
++    case 0:
++        desc.Guid = DXVA_ModeAV1_VLD_Profile0;
++        break;
++    case 1:
++        desc.Guid = DXVA_ModeAV1_VLD_Profile1;
++        break;
++    case 2:
++        desc.Guid = DXVA_ModeAV1_VLD_Profile2;
++        break;
++    default:
++        return DAV1D_ERR(1);
++    }
++    switch (p->seq_hdr->layout) {
++    case DAV1D_PIXEL_LAYOUT_I400:
++        desc.OutputFormat = p->seq_hdr->hbd > 0 ? DXGI_FORMAT_R16_UNORM : DXGI_FORMAT_R8_UNORM;
++        break;
++    case DAV1D_PIXEL_LAYOUT_I420:
++        desc.OutputFormat = p->seq_hdr->hbd > 0 ? DXGI_FORMAT_P016 : DXGI_FORMAT_NV12;
++        break;
++    case DAV1D_PIXEL_LAYOUT_I422:
++        desc.OutputFormat = p->seq_hdr->hbd > 0 ? DXGI_FORMAT_Y216 : DXGI_FORMAT_YUY2;
++        break;
++    case DAV1D_PIXEL_LAYOUT_I444:
++        desc.OutputFormat = p->seq_hdr->hbd > 0 ? DXGI_FORMAT_Y416 : DXGI_FORMAT_AYUV;
++        break;
++    default:
++        return DAV1D_ERR(1);
++    }
++
++    // only re-create everything if configuration changed
++    if (dxva->decoder != NULL && memcmp(&desc, &dxva->currentDecoderDesc, sizeof(desc)) == 0)
++        return 0;
++    config.ConfigBitstreamRaw = 1;
++
++    if (dxva->decoder != NULL) {
++        ID3D11VideoDecoder_Release(dxva->decoder);
++        dxva->decoder = NULL;
++    }
++    if (FAILED(ID3D11VideoDevice_CreateVideoDecoder(dxva->vdevice, &desc, &config, &dxva->decoder)))
++        return DAV1D_ERR(1);
++    dxva->currentDecoderDesc = desc;
++
++    D3D11_TEXTURE2D_DESC texture = {0};
++    texture.Width = p->seq_hdr->max_width;
++    texture.Height = p->seq_hdr->max_height;
++    texture.MipLevels = 1;
++    texture.ArraySize = MAX_SAMPLE_POOL_SIZE;
++    texture.Format = desc.OutputFormat;
++    texture.SampleDesc.Count = 1;
++    texture.Usage = D3D11_USAGE_DEFAULT;
++    // In applications that display video or do further processing on the GPU
++    // you should also be setting D3D11_BIND_SHADER_RESOURCE here.
++    texture.BindFlags = D3D11_BIND_DECODER;
++
++    if (dxva->textures != NULL) {
++        ID3D11Texture2D_Release(dxva->textures);
++        dxva->textures = NULL;
++    }
++    if (FAILED(ID3D11Device_CreateTexture2D(dxva->device, &texture, NULL, &dxva->textures)))
++        goto fail;
++    dxva->currentTextureDesc = texture;
++
++    // This staging texture lets us readback from the GPU
++    texture.ArraySize = 1;
++    texture.Usage = D3D11_USAGE_STAGING;
++    texture.BindFlags = 0;
++    texture.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
++
++    if (dxva->stage != NULL) {
++        ID3D11Texture2D_Release(dxva->stage);
++        dxva->stage = NULL;
++    }
++    if (FAILED(ID3D11Device_CreateTexture2D(dxva->device, &texture, NULL, &dxva->stage)))
++        goto fail;
++
++    dxva->freeListRead = 0;
++    dxva->freeListWrite = 0;
++
++    D3D11_VIDEO_DECODER_OUTPUT_VIEW_DESC view = {0};
++    view.DecodeProfile = desc.Guid;
++    view.ViewDimension = D3D11_VDOV_DIMENSION_TEXTURE2D;
++
++    for (int i = 0; i < MAX_SAMPLE_POOL_SIZE; i++) {
++        if (dxva->views[i] != NULL) {
++            ID3D11VideoDecoderOutputView_Release(dxva->views[i]);
++            dxva->views[i] = NULL;
++        }
++        // One view per array slice: dxva_lock() reads back the slice whose
++        // index equals the picture index, so view i must decode into slice i
++        view.Texture2D.ArraySlice = i;
++        if (FAILED(ID3D11VideoDevice_CreateVideoDecoderOutputView(dxva->vdevice, (ID3D11Resource*)dxva->textures, &view, dxva->views + i)))
++            goto fail;
++
++        insert_freelist(i, dxva);
++    }
++
++    return 0;
++
++fail:
++    // Drop the decoder so the "configuration unchanged" shortcut above cannot
++    // match while textures or views are missing
++    ID3D11VideoDecoder_Release(dxva->decoder);
++    dxva->decoder = NULL;
++    return err;
++}
++
++// Picture allocation hook: makes sure the decoder exists, takes the next free
++// sample, begins a decode frame on its output view and records the sample in
++// pic (dxva_picture_index and data[0]).
++// Returns 0 on success, DAV1D_ERR(ENOMEM) if the decoder cannot be set up or
++// no sample is free, DAV1D_ERR(1) if DecoderBeginFrame fails.
++int av1_dxva_alloc_picture(Dav1dPicture *pic, void *cookie) {
++    Dav1dDXVAInfo *const info = cookie;
++
++    if (dxva_create_decoder(pic, info) != 0)
++        return DAV1D_ERR(ENOMEM);
++
++    // Equal cursors mean the ring is empty: no free samples!
++    if (info->freeListWrite == info->freeListRead)
++        return DAV1D_ERR(ENOMEM);
++
++    const uint16_t slot = info->freeList[info->freeListRead % MAX_SAMPLE_POOL_SIZE];
++    ID3D11VideoDecoderOutputView *const target = info->views[slot];
++    if (FAILED(ID3D11VideoContext_DecoderBeginFrame(info->vcontext, info->decoder, target, 0, NULL)))
++        return DAV1D_ERR(1);
++
++    // The sample is only consumed once the frame has actually begun
++    info->freeListRead++;
++    pic->dxva_picture_index = slot;
++    pic->data[0] = (void*)&info->views[slot];
++    return 0;
++}
++
++// Picture release hook: puts the picture's sample back on the free list, then,
++// if the picture still has a data pointer, releases it through the default
++// allocator and unmaps the staging texture.
++void av1_dxva_release_picture(Dav1dPicture *pic, void *cookie) {
++    Dav1dDXVAInfo *const info = cookie;
++
++    if (pic->dxva_picture_index != DXVA_INVALID_PICTURE_INDEX) {
++        insert_freelist(pic->dxva_picture_index, info);
++        pic->dxva_picture_index = DXVA_INVALID_PICTURE_INDEX;
++    }
++
++    if (pic->data[0] == NULL)
++        return;
++
++    info->defaultAllocator.release_picture_callback(pic, info->defaultAllocator.cookie);
++    ID3D11DeviceContext_Unmap(info->context, (ID3D11Resource*)info->stage, 0);
++    pic->data[0] = NULL;
++}
++
++// DXVA decode_callback for the D3D11 sample: uploads the concatenated tile
++// group payloads into the decoder's bitstream buffer and submits the picture
++// parameter, slice control and bitstream buffers.
++//
++// picture_parameters, tiles and output_picture are not read here; the D3D11
++// API works on the decoder buffers obtained earlier. Returns 0 on success,
++// DAV1D_ERR(ENOMEM) if the bitstream buffer is unavailable or too small,
++// DAV1D_ERR(1) if the submit call fails.
++int av1_dxva_decode(void *cookie, DXVA_PicParams_AV1 *picture_parameters, DXVA_Tile_AV1 *tiles, const int n_tiles, Dav1dPicture *output_picture, Dav1dTileGroup* tile_groups, int tile_group_count) {
++    Dav1dDXVAInfo *dxva = (Dav1dDXVAInfo*)cookie;
++    void *buffer = NULL;
++    UINT size_allocated = 0;
++    size_t bitstream_size = 0;
++    int retval = 0;
++
++    for (int i = 0; i < tile_group_count; i++)
++        bitstream_size += tile_groups[i].data.sz;
++
++    if (FAILED(ID3D11VideoContext_GetDecoderBuffer(dxva->vcontext, dxva->decoder, D3D11_VIDEO_DECODER_BUFFER_BITSTREAM, &size_allocated, &buffer)))
++        return DAV1D_ERR(ENOMEM);
++    if (bitstream_size > size_allocated) {
++        // The buffer was handed out, so it has to be given back
++        ID3D11VideoContext_ReleaseDecoderBuffer(dxva->vcontext, dxva->decoder, D3D11_VIDEO_DECODER_BUFFER_BITSTREAM);
++        return DAV1D_ERR(ENOMEM);
++    }
++
++    uint8_t *bitstream_target = buffer;
++    memset(bitstream_target, 0, size_allocated);
++
++    // this is a GPU bitstream upload
++    for (int i = 0; i < tile_group_count; i++) {
++        const uint8_t *data = tile_groups[i].data.data;
++        size_t size = tile_groups[i].data.sz;
++        memcpy(bitstream_target, data, size);
++        bitstream_target += size;
++    }
++
++    // Note: this API is badly non-intuitive. DX12's version (which actually uses the pointers provided above)
++    // is quite a bit more sane.
++    // Zeroed: only BufferType and DataSize are filled in below, and the
++    // remaining fields (DataOffset etc.) must not be indeterminate
++    D3D11_VIDEO_DECODER_BUFFER_DESC decodeDesc[3] = {0};
++    decodeDesc[0].BufferType = D3D11_VIDEO_DECODER_BUFFER_PICTURE_PARAMETERS;
++    decodeDesc[0].DataSize = sizeof(DXVA_PicParams_AV1);
++    decodeDesc[1].BufferType = D3D11_VIDEO_DECODER_BUFFER_SLICE_CONTROL;
++    decodeDesc[1].DataSize = (UINT)(sizeof(DXVA_Tile_AV1) * n_tiles);
++    decodeDesc[2].BufferType = D3D11_VIDEO_DECODER_BUFFER_BITSTREAM;
++    decodeDesc[2].DataSize = (UINT)bitstream_size;
++
++    // NOTE(review): the buffers are still held while submitting (the other
++    // two are released in av1_dxva_release) — confirm drivers accept this
++    if (FAILED(ID3D11VideoContext_SubmitDecoderBuffers(dxva->vcontext, dxva->decoder, 3, decodeDesc)))
++        retval = DAV1D_ERR(1);
++
++    ID3D11VideoContext_ReleaseDecoderBuffer(dxva->vcontext, dxva->decoder, D3D11_VIDEO_DECODER_BUFFER_BITSTREAM);
++    return retval;
++}
++
++// DXVA alloc_callback for the D3D11 sample: hands out the decoder's own
++// picture-parameter and slice-control buffers, zero-filled.
++//
++// picture is not used. Returns 0 with *pic / *tiles set on success. On
++// failure every buffer obtained so far is released again, both outputs are
++// set to NULL and DAV1D_ERR(ENOMEM) is returned.
++int av1_dxva_alloc(void *cookie, Dav1dPicture* picture, DXVA_PicParams_AV1 **pic, DXVA_Tile_AV1 **tiles, int n_tiles) {
++    Dav1dDXVAInfo *dxva = (Dav1dDXVAInfo*)cookie;
++    void *buffer = NULL;
++    UINT size = 0;
++
++    *pic = NULL;
++    *tiles = NULL;
++    if (n_tiles < 0)
++        return DAV1D_ERR(ENOMEM);
++
++    if (FAILED(ID3D11VideoContext_GetDecoderBuffer(dxva->vcontext, dxva->decoder, D3D11_VIDEO_DECODER_BUFFER_PICTURE_PARAMETERS, &size, &buffer)))
++        return DAV1D_ERR(ENOMEM);
++    if (size < sizeof(DXVA_PicParams_AV1))
++        goto release_params;
++    memset(buffer, 0, size);
++    DXVA_PicParams_AV1 *const params = buffer;
++
++    buffer = NULL;
++    size = 0;
++    if (FAILED(ID3D11VideoContext_GetDecoderBuffer(dxva->vcontext, dxva->decoder, D3D11_VIDEO_DECODER_BUFFER_SLICE_CONTROL, &size, &buffer)))
++        goto release_params;
++    if (size < sizeof(DXVA_Tile_AV1) * (size_t)n_tiles)
++        goto release_tiles;
++    memset(buffer, 0, size);
++
++    *pic = params;
++    *tiles = buffer;
++    return 0;
++
++release_tiles:
++    ID3D11VideoContext_ReleaseDecoderBuffer(dxva->vcontext, dxva->decoder, D3D11_VIDEO_DECODER_BUFFER_SLICE_CONTROL);
++release_params:
++    ID3D11VideoContext_ReleaseDecoderBuffer(dxva->vcontext, dxva->decoder, D3D11_VIDEO_DECODER_BUFFER_PICTURE_PARAMETERS);
++    return DAV1D_ERR(ENOMEM);
++}
++
++// DXVA release_callback for the D3D11 sample: gives the picture-parameter and
++// slice-control buffers back to the decoder and ends the current frame.
++// pic and tiles are not read; the buffers are identified by type.
++void av1_dxva_release(void *cookie, DXVA_PicParams_AV1 *pic, DXVA_Tile_AV1 *tiles) {
++    static const D3D11_VIDEO_DECODER_BUFFER_TYPE held[] = {
++        D3D11_VIDEO_DECODER_BUFFER_PICTURE_PARAMETERS,
++        D3D11_VIDEO_DECODER_BUFFER_SLICE_CONTROL,
++    };
++    Dav1dDXVAInfo *const info = cookie;
++
++    for (size_t i = 0; i < sizeof(held) / sizeof(held[0]); i++)
++        ID3D11VideoContext_ReleaseDecoderBuffer(info->vcontext, info->decoder, held[i]);
++
++    ID3D11VideoContext_DecoderEndFrame(info->vcontext, info->decoder);
++}
++
++// Resets *dxva, creates a hardware D3D11 device with video support, obtains
++// its video device/context interfaces and points the DXVA callbacks in
++// settings at this file's implementation.
++//
++// Returns 0 on success. On failure returns DAV1D_ERR(1); every interface
++// acquired so far is released again and settings is left untouched.
++int dxva_init(Dav1dSettings *settings, Dav1dDXVAInfo *dxva)
++{
++    // Also zeroes the free-list cursors and all interface pointers
++    memset(dxva, 0, sizeof(*dxva));
++    dxva->defaultAllocator = settings->allocator;
++
++    if (FAILED(D3D11CreateDevice(NULL, D3D_DRIVER_TYPE_HARDWARE, NULL, D3D11_CREATE_DEVICE_VIDEO_SUPPORT, NULL, 0, D3D11_SDK_VERSION, &dxva->device, NULL, &dxva->context)))
++        goto fail;
++    if (FAILED(ID3D11Device_QueryInterface(dxva->device, &IID_ID3D11VideoDevice, (void**)&dxva->vdevice)))
++        goto fail;
++    if (FAILED(ID3D11DeviceContext_QueryInterface(dxva->context, &IID_ID3D11VideoContext, (void**)&dxva->vcontext)))
++        goto fail;
++
++    settings->dxva.cookie = (void*)dxva;
++    settings->dxva.decode_callback = av1_dxva_decode;
++    settings->dxva.alloc_callback = av1_dxva_alloc;
++    settings->dxva.release_callback = av1_dxva_release;
++    // NOTE(review): only the allocator cookie is replaced here; the picture
++    // alloc/release callbacks are left as they were — confirm whether
++    // av1_dxva_alloc_picture / av1_dxva_release_picture should be installed
++    settings->allocator.cookie = (void*)dxva;
++
++    // Note that actual decoder creation is delayed until we have the first picture
++    // this can be inefficient because decoder creation on the hardware can be slow
++    // but it keeps this code simple.
++    // Time to first picture can be reduced if you know the decoder Profile ahead of time
++    // and pre-create the DXVA decoder, for example from a container like MP4 etc...
++    return 0;
++
++fail:
++    if (dxva->vcontext != NULL) {
++        ID3D11VideoContext_Release(dxva->vcontext);
++        dxva->vcontext = NULL;
++    }
++    if (dxva->vdevice != NULL) {
++        ID3D11VideoDevice_Release(dxva->vdevice);
++        dxva->vdevice = NULL;
++    }
++    if (dxva->context != NULL) {
++        ID3D11DeviceContext_Release(dxva->context);
++        dxva->context = NULL;
++    }
++    if (dxva->device != NULL) {
++        ID3D11Device_Release(dxva->device);
++        dxva->device = NULL;
++    }
++    return DAV1D_ERR(1);
++}
++
++// Releases every D3D11 interface held in *dxva and clears the pointers, so a
++// repeated call (or a later dxva_create_decoder) never releases one twice.
++void dxva_shutdown(Dav1dDXVAInfo *dxva)
++{
++    if (dxva->device != NULL) {
++        ID3D11Device_Release(dxva->device);
++        dxva->device = NULL;
++    }
++    if (dxva->context != NULL) {
++        ID3D11DeviceContext_Release(dxva->context);
++        dxva->context = NULL;
++    }
++    if (dxva->vdevice != NULL) {
++        ID3D11VideoDevice_Release(dxva->vdevice);
++        dxva->vdevice = NULL;
++    }
++    if (dxva->vcontext != NULL) {
++        ID3D11VideoContext_Release(dxva->vcontext);
++        dxva->vcontext = NULL;
++    }
++    if (dxva->decoder != NULL) {
++        ID3D11VideoDecoder_Release(dxva->decoder);
++        dxva->decoder = NULL;
++    }
++    if (dxva->stage != NULL) {
++        ID3D11Texture2D_Release(dxva->stage);
++        dxva->stage = NULL;
++    }
++    if (dxva->textures != NULL) {
++        ID3D11Texture2D_Release(dxva->textures);
++        dxva->textures = NULL;
++    }
++    for (int i = 0; i < MAX_SAMPLE_POOL_SIZE; i++) {
++        if (dxva->views[i] != NULL) {
++            ID3D11VideoDecoderOutputView_Release(dxva->views[i]);
++            dxva->views[i] = NULL;
++        }
++    }
++}
++
++// Bits per component of the output formats used by dxva_create_decoder():
++// 16 or 8, and 0 for any other format.
++int dxgi_texture_bitdepth(DXGI_FORMAT fmt) {
++    int bits;
++
++    switch (fmt) {
++    case DXGI_FORMAT_R8_UNORM:
++    case DXGI_FORMAT_NV12:
++    case DXGI_FORMAT_YUY2:
++    case DXGI_FORMAT_AYUV:
++        bits = 8;
++        break;
++    case DXGI_FORMAT_R16_UNORM:
++    case DXGI_FORMAT_P016:
++    case DXGI_FORMAT_Y216:
++    case DXGI_FORMAT_Y416:
++        bits = 16;
++        break;
++    default:
++        bits = 0;
++        break;
++    }
++    return bits;
++}
++
++// Returns 1 for NV12 and P016, 0 for every other format.
++int dxgi_texture_is_2plane(DXGI_FORMAT fmt) {
++    switch (fmt) {
++    case DXGI_FORMAT_NV12:
++    case DXGI_FORMAT_P016:
++        return 1;
++    default:
++        return 0;
++    }
++}
++
++// Returns 1 for the I420 and I422 layouts, 0 for every other layout.
++int layout_to_subsampleH(enum Dav1dPixelLayout layout) {
++    return layout == DAV1D_PIXEL_LAYOUT_I420 ||
++           layout == DAV1D_PIXEL_LAYOUT_I422;
++}
++
++// Returns 1 for the I420 layout only, 0 for every other layout.
++int layout_to_subsampleW(enum Dav1dPixelLayout layout) {
++    if (layout == DAV1D_PIXEL_LAYOUT_I420)
++        return 1;
++    return 0;
++}
++
++// Where each component sits inside one row of a DXGI video format, counted in
++// samples (multiply by the bytes per sample to get byte offsets).
++typedef struct {
++    uint8_t y_first, y_step;          // luma: first sample, distance between samples
++    uint8_t u_first, v_first, c_step; // chroma: first U / V sample, distance between samples
++    uint8_t has_chroma;               // 0 for the single-channel formats
++} DxvaRowLayout;
++
++// Looks up the row layout of the output formats dxva_create_decoder() can
++// select. Returns 0 on success, -1 for any other format.
++static int dxva_row_layout(DXGI_FORMAT fmt, DxvaRowLayout *layout)
++{
++    switch (fmt) {
++    case DXGI_FORMAT_R8_UNORM:
++    case DXGI_FORMAT_R16_UNORM: // Y only
++        *layout = (DxvaRowLayout){ .y_first = 0, .y_step = 1 };
++        return 0;
++    case DXGI_FORMAT_NV12:
++    case DXGI_FORMAT_P016:      // Y plane, then a plane of interleaved U V pairs
++        *layout = (DxvaRowLayout){ .y_first = 0, .y_step = 1, .u_first = 0, .v_first = 1, .c_step = 2, .has_chroma = 1 };
++        return 0;
++    case DXGI_FORMAT_YUY2:
++    case DXGI_FORMAT_Y216:      // packed Y0 U Y1 V
++        *layout = (DxvaRowLayout){ .y_first = 0, .y_step = 2, .u_first = 1, .v_first = 3, .c_step = 4, .has_chroma = 1 };
++        return 0;
++    case DXGI_FORMAT_AYUV:      // packed V U Y A
++        *layout = (DxvaRowLayout){ .y_first = 2, .y_step = 4, .u_first = 1, .v_first = 0, .c_step = 4, .has_chroma = 1 };
++        return 0;
++    case DXGI_FORMAT_Y416:      // packed U Y V A
++        *layout = (DxvaRowLayout){ .y_first = 1, .y_step = 4, .u_first = 0, .v_first = 2, .c_step = 4, .has_chroma = 1 };
++        return 0;
++    default:
++        return -1;
++    }
++}
++
++// Copies a w x h grid of samples from a mapped texture into a tightly packed
++// output plane. Source samples are in_bytes wide and src_step bytes apart;
++// each value is shifted right by `shift` and stored as out_bytes bytes
++// (little-endian when out_bytes is 2).
++static void dxva_copy_plane(uint8_t *dst, ptrdiff_t dst_stride,
++                            const uint8_t *src, size_t src_stride, size_t src_step,
++                            int w, int h, int in_bytes, int out_bytes, int shift)
++{
++    for (int y = 0; y < h; y++) {
++        const uint8_t *s = src;
++        uint8_t *d = dst;
++        for (int x = 0; x < w; x++) {
++            unsigned v = s[0];
++            if (in_bytes == 2)
++                v |= (unsigned)s[1] << 8;
++            v >>= shift;
++            d[0] = (uint8_t)v;
++            if (out_bytes == 2)
++                d[1] = (uint8_t)(v >> 8);
++            s += src_step;
++            d += out_bytes;
++        }
++        src += src_stride;
++        dst += dst_stride;
++    }
++}
++
++// Reads the decoded picture back from the GPU: copies the texture slice
++// selected by p->dxva_picture_index into the staging texture, maps it,
++// allocates CPU planes for p through the default picture allocator and
++// converts the mapped pixels into them.
++//
++// Returns 0 on success, DAV1D_ERR(1) if the texture format is unknown or the
++// mapping fails, DAV1D_ERR(ENOMEM) if the CPU planes cannot be allocated (the
++// staging texture is unmapped again in that case). On success the staging
++// texture is left mapped, as before.
++int dxva_lock(Dav1dPicture *p, Dav1dDXVAInfo *dxva)
++{
++    const DXGI_FORMAT fmt = dxva->currentTextureDesc.Format;
++    DxvaRowLayout layout;
++    if (dxva_row_layout(fmt, &layout))
++        return DAV1D_ERR(1);
++
++    D3D11_MAPPED_SUBRESOURCE mapped;
++    ID3D11DeviceContext_CopySubresourceRegion(dxva->context, (ID3D11Resource*)dxva->stage, 0, 0, 0, 0, (ID3D11Resource*)dxva->textures, p->dxva_picture_index, NULL);
++    if (FAILED(ID3D11DeviceContext_Map(dxva->context, (ID3D11Resource*)dxva->stage, 0, D3D11_MAP_READ, 0, &mapped)))
++        return DAV1D_ERR(1);
++
++    if (dxva->defaultAllocator.alloc_picture_callback(p, dxva->defaultAllocator.cookie)) {
++        ID3D11DeviceContext_Unmap(dxva->context, (ID3D11Resource*)dxva->stage, 0);
++        return DAV1D_ERR(ENOMEM);
++    }
++
++    // This is not an optimal surface format conversion, it is just as simple as possible
++    const int in_bytes = dxgi_texture_bitdepth(fmt) > 8 ? 2 : 1;
++    const int out_bytes = p->p.bpc > 8 ? 2 : 1;
++    // NOTE(review): assumes 16-bit texture samples carry the value in their
++    // most significant bits (as in the P010/P016 family) — confirm on hardware
++    const int shift = in_bytes == 2 ? 16 - p->p.bpc : 0;
++    const uint8_t *const base = mapped.pData;
++
++    dxva_copy_plane(p->data[0], p->stride[0],
++                    base + layout.y_first * in_bytes, mapped.RowPitch,
++                    (size_t)layout.y_step * in_bytes,
++                    p->p.w, p->p.h, in_bytes, out_bytes, shift);
++
++    if (layout.has_chroma && p->data[1] && p->data[2]) {
++        // 4:2:0 halves both axes, 4:2:2 only the width, 4:4:4 neither
++        const int ss_hor = p->p.layout != DAV1D_PIXEL_LAYOUT_I444;
++        const int ss_ver = p->p.layout == DAV1D_PIXEL_LAYOUT_I420;
++        const int cw = (p->p.w + ss_hor) >> ss_hor;
++        const int ch = (p->p.h + ss_ver) >> ss_ver;
++        const size_t c_step = (size_t)layout.c_step * in_bytes;
++
++        // Two-plane formats keep chroma below the luma rows of the texture;
++        // packed formats interleave it with luma in the same rows
++        const uint8_t *chroma = base;
++        if (dxgi_texture_is_2plane(fmt))
++            chroma += (size_t)dxva->currentTextureDesc.Height * mapped.RowPitch;
++
++        dxva_copy_plane(p->data[1], p->stride[1],
++                        chroma + layout.u_first * in_bytes, mapped.RowPitch, c_step,
++                        cw, ch, in_bytes, out_bytes, shift);
++        dxva_copy_plane(p->data[2], p->stride[1],
++                        chroma + layout.v_first * in_bytes, mapped.RowPitch, c_step,
++                        cw, ch, in_bytes, out_bytes, shift);
++    }
++    return 0;
++}
++
++#endif // _WIN32
+diff --git a/tools/dav1d_cli_dxva.h b/tools/dav1d_cli_dxva.h
+new file mode 100644
+index 0000000..a33ad5b
+--- /dev/null
++++ b/tools/dav1d_cli_dxva.h
+@@ -0,0 +1,63 @@
++/*
++ * Copyright © 2018, VideoLAN and dav1d authors
++ * Copyright © 2018, Two Orioles, LLC
++ * All rights reserved.
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions are met:
++ *
++ * 1. Redistributions of source code must retain the above copyright notice, this
++ * list of conditions and the following disclaimer.
++ *
++ * 2. Redistributions in binary form must reproduce the above copyright notice,
++ * this list of conditions and the following disclaimer in the documentation
++ * and/or other materials provided with the distribution.
++ *
++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
++ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
++ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
++ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
++ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
++ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
++ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
++ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
++ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ */
++
++#ifndef DAV1D_CLI_DXVA_H
++#define DAV1D_CLI_DXVA_H
++
++#include "dav1d/dav1d.h"
++#include <windows.h>
++#include <d3d11.h>
++
++#define MAX_SAMPLE_POOL_SIZE 10
++
++// State of the D3D11 DXVA sample; used as the cookie for the DXVA callbacks.
++typedef struct {
++ // Device and immediate context created by dxva_init()
++ ID3D11Device *device;
++ ID3D11DeviceContext *context;
++ // Video interfaces queried from the device / context above
++ ID3D11VideoDevice *vdevice;
++ ID3D11VideoContext *vcontext;
++ // Decode target: one texture array with MAX_SAMPLE_POOL_SIZE slices
++ ID3D11Texture2D *textures;
++ ID3D11VideoDecoder *decoder;
++ // One decoder output view per sample
++ ID3D11VideoDecoderOutputView *views[MAX_SAMPLE_POOL_SIZE];
++ // CPU-readable staging texture used by dxva_lock() for readback
++ ID3D11Texture2D *stage;
++ // Descriptions the current decoder / texture array were created with
++ D3D11_VIDEO_DECODER_DESC currentDecoderDesc;
++ D3D11_TEXTURE2D_DESC currentTextureDesc;
++ // Copy of the allocator found in the settings when dxva_init() ran
++ Dav1dPicAllocator defaultAllocator;
++
++ // Samples not currently used for decode
++ // (ring buffer: cursors are reduced modulo MAX_SAMPLE_POOL_SIZE when used,
++ // and equal cursors mean the list is empty)
++ uint16_t freeList[MAX_SAMPLE_POOL_SIZE];
++ uint16_t freeListRead;
++ uint16_t freeListWrite;
++} Dav1dDXVAInfo;
++
++// Configures the decoder to use DXVA: resets *dxva, creates the D3D11
++// device and its video interfaces, and installs the DXVA callbacks and
++// cookie in settings. Returns 0 on success, a negative error code otherwise.
++int dxva_init(Dav1dSettings *settings, Dav1dDXVAInfo *dxva);
++
++// Copies the DXVA texture slice associated with this picture into the
++// staging texture, maps that into CPU memory, allocates CPU planes for the
++// picture with the default allocator and converts the pixels into them.
++// Returns 0 on success, a negative error code otherwise.
++int dxva_lock(Dav1dPicture *p, Dav1dDXVAInfo *dxva);
++
++#endif // DAV1D_CLI_DXVA_H
+diff --git a/tools/dav1d_cli_parse.c b/tools/dav1d_cli_parse.c
+index f363033..fa357cc 100644
+--- a/tools/dav1d_cli_parse.c
++++ b/tools/dav1d_cli_parse.c
+@@ -57,6 +57,7 @@ enum {
+ ARG_ALL_LAYERS,
+ ARG_SIZE_LIMIT,
+ ARG_CPU_MASK,
++ ARG_DXVA,
+ };
+
+ static const struct option long_opts[] = {
+@@ -79,6 +80,7 @@ static const struct option long_opts[] = {
+ { "alllayers", 1, NULL, ARG_ALL_LAYERS },
+ { "sizelimit", 1, NULL, ARG_SIZE_LIMIT },
+ { "cpumask", 1, NULL, ARG_CPU_MASK },
++ { "dxva", 0, NULL, ARG_DXVA },
+ { NULL, 0, NULL, 0 },
+ };
+
+@@ -122,7 +124,11 @@ static void usage(const char *const app, const char *const reason, ...) {
+ " --alllayers $num: output all spatial layers of a scalable AV1 bitstream (default: 1)\n"
+ " --sizelimit $num: stop decoding if the frame size exceeds the specified limit\n"
+ " --verify $md5: verify decoded md5. implies --muxer md5, no output\n"
+- " --cpumask $mask: restrict permitted CPU instruction sets (0" ALLOWED_CPU_MASKS "; default: -1)\n");
++ " --cpumask $mask: restrict permitted CPU instruction sets (0" ALLOWED_CPU_MASKS "; default: -1)\n"
++#ifdef _WIN32
++ " --dxva use DXVA accelerated decode (Win32 only)\n"
++#endif
++ );
+ exit(1);
+ }
+
+@@ -321,6 +327,10 @@ void parse(const int argc, char *const *const argv,
+ lib_settings->frame_size_limit = (unsigned) res;
+ break;
+ }
++ case ARG_DXVA: {
++ cli_settings->dxva = 1;
++ break;
++ }
+ case 'v':
+ fprintf(stderr, "%s\n", dav1d_version());
+ exit(0);
+diff --git a/tools/dav1d_cli_parse.h b/tools/dav1d_cli_parse.h
+index 11e88e1..036a5d7 100644
+--- a/tools/dav1d_cli_parse.h
++++ b/tools/dav1d_cli_parse.h
+@@ -46,6 +46,7 @@ typedef struct {
+ } realtime;
+ double realtime_fps;
+ unsigned realtime_cache;
++ int dxva;
+ } CLISettings;
+
+ void parse(const int argc, char *const *const argv,
+diff --git a/tools/meson.build b/tools/meson.build
+index 4b4217a..7592983 100644
+--- a/tools/meson.build
++++ b/tools/meson.build
+@@ -77,12 +77,18 @@ dav1d_sources = files(
+ 'dav1d_cli_parse.c',
+ )
+
++if host_machine.system() == 'windows'
++ dav1d_sources += files(
++ 'dav1d_cli_dxva.c',
++ )
++endif
++
+ dav1d = executable('dav1d',
+ dav1d_sources,
+ rev_target, cli_config_h_target,
+
+ link_with : [libdav1d, dav1d_input_objs, dav1d_output_objs],
+ include_directories : [dav1d_inc_dirs],
+- dependencies : [getopt_dependency, thread_dependency, rt_dependency],
++ dependencies : [getopt_dependency, thread_dependency, rt_dependency, d3d11_dependency],
+ install : true,
+ )
+--
+2.27.0.windows.1
+
diff --git a/contrib/src/dav1d/rules.mak b/contrib/src/dav1d/rules.mak
index fe0e222b166..5137f138df5 100644
--- a/contrib/src/dav1d/rules.mak
+++ b/contrib/src/dav1d/rules.mak
@@ -19,6 +19,7 @@ $(TARBALLS)/dav1d-$(DAV1D_VERSION).tar.xz:
dav1d: dav1d-$(DAV1D_VERSION).tar.xz .sum-dav1d
$(UNPACK)
$(APPLY) $(SRC)/dav1d/0001-SSE2-PIC-464ca6c2.patch
+ $(APPLY) $(SRC)/dav1d/0001-Add-DXVA-support.patch
$(MOVE)
.dav1d: dav1d crossfile.meson
--
2.26.2
More information about the vlc-devel
mailing list