[x265] [PATCH 2 of 3] Pulling x264 weight decision into/for x265 lookahead

Wed Nov 13 00:43:42 CET 2013

On Tue, Nov 12, 2013 at 5:49 AM, <shazeb at multicorewareinc.com> wrote:

> # HG changeset patch
> # User Shazeb Nawaz Khan <shazeb at multicorewareinc.com>
> # Date 1384256247 -19800
> #      Tue Nov 12 17:07:27 2013 +0530
> # Node ID 21596a519ba8cc521dbc81f693c867cbca03fd3f
> # Parent  e7319fd46128b3bfcc826ea9be02896b316ed966
> Pulling x264 weight decision into/for x265 lookahead
>
> diff -r e7319fd46128 -r 21596a519ba8 source/encoder/slicetype.cpp
> --- a/source/encoder/slicetype.cpp      Tue Nov 12 17:06:03 2013 +0530
> +++ b/source/encoder/slicetype.cpp      Tue Nov 12 17:07:27 2013 +0530
> @@ -45,6 +45,14 @@
>
>  using namespace x265;
>
> +#define SET_WEIGHT( w, b, s, d, o )\
> +{\
> +    (w).inputWeight = (s);\
> +    (w).log2WeightDenom = (d);\
> +    (w).inputOffset = (o);\
> +    (w).bPresentFlag = b;\
> +}
> +
>  static inline int16_t median(int16_t a, int16_t b, int16_t c)
>  {
>      int16_t t = (a - b) & ((a - b) >> 31);
> @@ -190,16 +198,329 @@
>      return pic->m_lowres.satdCost;
>  }
>
> +/* makes a non-h265 weight (i.e. fix7), into an h265 weight */
> +static void x265_weight_get_h265( int weight_nonh264, int offset,
> wpScalingParam *w )
>

This should be made into a wpScalingParam method, something like
setFromWeightAndOffset().

> +{
> +    w->inputOffset = offset;
> +    w->log2WeightDenom = 7;
> +    w->inputWeight = weight_nonh264;
> +    while( w->log2WeightDenom > 0 && (w->inputWeight > 127) )
> +    {
> +        w->log2WeightDenom--;
> +        w->inputWeight >>= 1;
> +    }
> +    w->inputWeight = X265_MIN( w->inputWeight, 127 );
> +}
> +
> +pixel* Lookahead::x265_weight_cost_init_luma( int b, int p0, pixel *dest )
>

Rename to weightCostInit().

white-space

> +{
> +    Lowres *fenc, *ref;
> +    fenc = frames[b];
> +    ref  = frames[p0];
> +    int ref0_distance = b - p0 - 1;
> +    /* Note: this will never run during lookahead as weights_analyse is
> only called if no
> +     * motion search has been done. */
> +    if( fenc->lowresMvs[0][ref0_distance][0].x != 0x7FFF )
> +    {
> +        int i_stride = fenc->lumaStride;
> +        int i_lines = fenc->lines;
> +        int i_width = fenc->width;
> +        int i_mb_xy = 0;
> +        pixel *p = dest;
> +
> +        for( int y = 0; y < i_lines; y += 8, p += i_stride*8 )
> +            for( int x = 0; x < i_width; x += 8, i_mb_xy++ )
> +            {
> +                int mvx = fenc->lowresMvs[0][ref0_distance][i_mb_xy].x;
> +                int mvy = fenc->lowresMvs[0][ref0_distance][i_mb_xy].y;
> +                mvx;mvy;
> +                //h->mc.mc_luma( p+x, i_stride, ref->lowresPlane,
> i_stride,
> +                //               mvx+(x<<2), mvy+(y<<2), 8, 8,
> x264_weight_none );
> +            }
> +        x265_emms();
> +        return dest;
> +    }
> +    x265_emms();
> +    return ref->lowresPlane[0];
> +}
>

this whole function is not used for lookahead and should be removed, not to
mention it is unfinished

> +
> +static void mc_weight( pixel *dst, intptr_t i_dst_stride, pixel *src,
> intptr_t i_src_stride,
> +                       const wpScalingParam *weight, int i_width, int
> i_height )
>

white-space, do not copy i_ prefixes from x264

> +{
> +    int offset = weight->inputOffset << (g_bitDepth - 8);
>

use X265_DEPTH, not g_bitDepth

> +    int scale = weight->inputWeight;
> +    int denom = weight->log2WeightDenom;
> +    int corection = (IF_INTERNAL_PREC - X265_DEPTH);
>

Typo: "corection" should be spelled "correction".

> +    if( denom >= 1 )
>

White-space; please run uncrustify on the file before re-submitting.

> +    {
> +        primitives.weightpUniPixel(src, dst, i_src_stride, i_dst_stride,
> i_width, i_height, scale, (1<<(denom - 1 + corection)), (denom +
> corection), offset);
> +    }
> +    else
> +    {
> +        primitives.weightpUniPixel(src, dst, i_src_stride, i_dst_stride,
> i_width, i_height, scale, 0 + corection, 0 + corection, offset);
> +    }
> +}
> +
> +unsigned int Lookahead::x265_weight_cost_luma( int b, pixel *src,
> wpScalingParam *w )
>

Rename to weightCostLuma().

> +{
> +    Lowres *fenc = frames[b];
> +    unsigned int cost = 0;
> +    int i_stride = fenc->lumaStride;
> +    int i_lines = fenc->lines;
> +    int i_width = fenc->width;
>

no i_ prefixes

> +    pixel *fenc_plane = fenc->lowresPlane[0];
> +    ALIGN_VAR_16( pixel, buf[8*8]);
> +    int pixoff = 0;
> +    int i_mb = 0;
> +
> +    if( w )
> +    {
> +        for( int y = 0; y < i_lines; y += 8, pixoff = y*i_stride )
> +            for( int x = 0; x < i_width; x += 8, i_mb++, pixoff += 8)
> +            {
> +                // TO DO prepare full weighted plane
> +                mc_weight(buf, 8, &src[pixoff], i_stride, w, 8, 8);
> +                int cmp = primitives.satd[LUMA_8x8]( buf, 8,
> &fenc_plane[pixoff], i_stride );
> +                cost += X265_MIN( cmp, fenc->intraCost[i_mb] );
> +            }
> +    }
> +    else
> +        for( int y = 0; y < i_lines; y += 8, pixoff = y*i_stride )
> +            for( int x = 0; x < i_width; x += 8, i_mb++, pixoff += 8 )
> +            {
> +                int cmp = primitives.satd[LUMA_8x8]( &src[pixoff],
> i_stride, &fenc_plane[pixoff], i_stride );
> +                cost += X265_MIN( cmp, fenc->intraCost[i_mb] );
> +            }
> +    x265_emms();
> +    return cost;
> +}
>

this function is also not used, and should be removed

> +void Lookahead::weightsAnalyse(int b, int p0, int b_lookahead,
> wpScalingParam* w)
> +{
>

remove b_lookahead, assume it is always 1, simplify the code below.

> +    //int i_delta_index = b - p0 - 1;
>

remove

> +    Lowres *fenc, *ref;
> +    fenc = frames[b];
> +    ref  = frames[p0];
> +    /* epsilon is chosen to require at least a numerator of 127 (with
> denominator = 128) */
> +    const float epsilon = 1.f/128.f;
> +    wpScalingParam *weights = w;
>

Yuck; just rename the function argument to "weights" instead of aliasing it.

> +    SET_WEIGHT( weights[0], 0, 1, 0, 0 );
>

> +    //SET_WEIGHT( weights[1], 0, 1, 0, 0 );
> +    //SET_WEIGHT( weights[2], 0, 1, 0, 0 );
>

remove these two

> +    float guess_scale[3];
> +    float fenc_mean[3];
> +    float ref_mean[3];
> +    for( int plane = 0; plane <= 2*!b_lookahead; plane++ )
> +    {
> +        float fenc_var = (float) fenc->wp_ssd[plane] +
> !ref->wp_ssd[plane];
> +        float ref_var  = (float)  ref->wp_ssd[plane] +
> !ref->wp_ssd[plane];
> +        guess_scale[plane] = sqrtf( fenc_var / ref_var );
> +        fenc_mean[plane] = (float)fenc->wp_sum[plane] /
> ((fenc->lines>>(plane?1:0)) * (fenc->width>>(plane?1:0))) /*/ (1 <<
> (BIT_DEPTH - 8))*/;
> +        ref_mean[plane]  = (float) ref->wp_sum[plane] /
> ((fenc->lines>>(plane?1:0)) * (fenc->width>>(plane?1:0))) /*/ (1 <<
> (BIT_DEPTH - 8))*/;
> +    }
> +
> +    //int chroma_denom = 7;
> +    //if( !b_lookahead )
> +    //{
> +    //    /* make sure both our scale factors fit */
> +    //    while( chroma_denom > 0 )
> +    //    {
> +    //        float thresh = 127.f / (1<<chroma_denom);
> +    //        if( guess_scale[1] < thresh && guess_scale[2] < thresh )
> +    //            break;
> +    //        chroma_denom--;
> +    //    }
> +    //}
>

Remove this commented-out block.

> +
> +    /* Don't check chroma in lookahead, or if there wasn't a luma weight.
> */
> +    for( int plane = 0; plane <= 2 && !( plane && (
> /*!weights[0].weightfn*/ !fenc->isWeighted || b_lookahead ) ); plane++ )
> +    {
> +        int minoff, minscale, mindenom;
> +        unsigned int minscore = 0, origscore = 1;
> +        origscore;
> +        int found;
> +
> +        //early termination
> +        if( fabsf( ref_mean[plane] - fenc_mean[plane] ) < 0.5f && fabsf(
> 1.f - guess_scale[plane] ) < epsilon )
> +        {
> +            SET_WEIGHT( weights[plane], 0, 1, 0, 0 );
> +            //printf("\nEarly\n");
> +            continue;
> +        }
> +
> +        if( plane )
> +        {
> +            //weights[plane].i_denom = chroma_denom;
> +            //weights[plane].i_scale = x264_clip3( round(
> guess_scale[plane] * (1<<chroma_denom) ), 0, 255 );
> +            //if( weights[plane].i_scale > 127 )
> +            //{
> +            //    weights[1].weightfn = weights[2].weightfn = NULL;
> +            //    break;
> +            //}
> +        }
> +        else
> +            x265_weight_get_h265( (int)( guess_scale[plane] * 128 + 0.5),
> 0, &weights[plane] );
> +
> +        found = 0;
> +        mindenom = weights[plane].log2WeightDenom;
> +        minscale = weights[plane].inputWeight;
> +        minoff = 0;
> +
> +        pixel *mcbuf = NULL;
> +        mcbuf;
> +        if( !plane )
> +        {
> +            if( !fenc->bIntraCalculated )
> +            {
> +                estimateFrameCost(b,b,b,0);
> +            }
> +            mcbuf = x265_weight_cost_init_luma( b, p0, NULL );
> +            origscore = minscore = x265_weight_cost_luma( b, mcbuf, NULL
> );
> +        }
> +        else
> +        {
> +            //if( CHROMA444 )
> +            //{
> +            //    mcbuf = x264_weight_cost_init_chroma444( h, fenc, ref,
> h->mb.p_weight_buf[0], plane );
> +            //    origscore = minscore = x264_weight_cost_chroma444( h,
> fenc, mcbuf, NULL, plane );
> +            //}
> +            //else
> +            //{
> +                //pixel *dstu = h->mb.p_weight_buf[0];
> +                //pixel *dstv =
> h->mb.p_weight_buf[0]+fenc->i_stride[1]*fenc->i_lines[1];
> +                //if( !chroma_initted++ )
> +                //    x264_weight_cost_init_chroma( h, fenc, ref, dstu,
> dstv );
> +                //mcbuf = plane == 1 ? dstu : dstv;
> +                //origscore = minscore = x264_weight_cost_chroma( h,
> fenc, mcbuf, NULL );
> +            //}
>
remove

> +        }
> +
> +        if( !minscore )
> +            continue;
> +
> +        /* Picked somewhat arbitrarily */
> +        static const uint8_t weight_check_distance[][2] =
> +        {
> +            {0,0},{0,0},{0,1},{0,1},
> +            {0,1},{0,1},{0,1},{1,1},
> +            {1,1},{2,1},{2,1},{4,2}
> +        };
> +        int scale_dist =  b_lookahead ? 0 :
> weight_check_distance[cfg->param.subpelRefine][0];
> +        int offset_dist = b_lookahead ? 0 :
> weight_check_distance[cfg->param.subpelRefine][1];
>

With b_lookahead always 1, both distances are always 0, so this all goes away.

> +
> +        int start_scale  = Clip3( 0, 127, minscale - scale_dist );
> +        int end_scale    = Clip3( 0, 127, minscale + scale_dist );
> +        unsigned int s=0;
> +        for( int i_scale = start_scale; i_scale <= end_scale; i_scale++ )
> +        {
> +            int cur_scale = i_scale;
> +            int cur_offset = (int) (fenc_mean[plane] - ref_mean[plane] *
> cur_scale / (1 << mindenom) + 0.5f * b_lookahead);
> +            if( cur_offset < - 128 || cur_offset > 127 )
> +            {
> +                /* Rescale considering the constraints on cur_offset. We
> do it in this order
> +                 * because scale has a much wider range than offset
> (because of denom), so
> +                 * it should almost never need to be clamped. */
> +                cur_offset = Clip3( -128, 127, cur_offset );
> +                cur_scale = (int) ((1 << mindenom) * (fenc_mean[plane] -
> cur_offset) / ref_mean[plane] + 0.5f);
> +                cur_scale = Clip3( 0, 127, cur_scale );
> +            }
> +            int start_offset = Clip3( -128, 127, cur_offset - offset_dist
> );
> +            int end_offset   = Clip3( -128, 127, cur_offset + offset_dist
> );
> +            for( int i_off = start_offset; i_off <= end_offset; i_off++ )
> +            {
> +                SET_WEIGHT( weights[plane], 1, cur_scale, mindenom, i_off
> );
> +                //unsigned int s;
> +                //if( plane )
> +                //{
> +                //    //if( CHROMA444 )
> +                //    //    s = x264_weight_cost_chroma444( h, fenc,
> mcbuf, &weights[plane], plane );
> +                //    //else
> +                //        s = x264_weight_cost_chroma( h, fenc, mcbuf,
> &weights[plane] );
> +                //}
> +                //else
> +                {
> +                    s = x265_weight_cost_luma( b, mcbuf, &weights[plane]
> );
> +                }
> +                COPY4_IF_LT( minscore, s, minscale, cur_scale, minoff,
> i_off, found, 1 );
> +
> +                // Don't check any more offsets if the previous one had a
> lower cost than the current one
> +                if( minoff == start_offset && i_off != start_offset )
> +                    break;
> +            }
> +        }
> +        x265_emms();
> +
> +        /* Use a smaller denominator if possible */
> +        if( !plane )
> +        {
> +            while( mindenom > 0 && !(minscale&1) )
> +            {
> +                mindenom--;
> +                minscale >>= 1;
> +            }
> +        }
> +
> +        /* FIXME: More analysis can be done here on SAD vs. SATD
> termination. */
> +        /* 0.2% termination derived experimentally to avoid weird weights
> in frames that are mostly intra. */
> +        if( !found || (minscale == 1 << mindenom && minoff == 0) ||
> (float)minscore / origscore > 0.998f )
> +        {
> +            SET_WEIGHT( weights[plane], 0, 1, 0, 0 );
> +            continue;
> +        }
> +        else
> +        {
> +            SET_WEIGHT( weights[plane], 1, minscale, mindenom, minoff );
> +        }
> +
> +        //if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE &&
> weights[0].weightfn && !plane )
> +        //    fenc->f_weighted_cost_delta[i_delta_index] =
> (float)minscore / origscore;
> +    }
> +
> +    ///* Optimize and unify denominator */
> +    //if( weights[1].weightfn || weights[2].weightfn )
> +    //{
> +    //    int denom = weights[1].weightfn ? weights[1].i_denom :
> weights[2].i_denom;
> +    //    int both_weighted = weights[1].weightfn && weights[2].weightfn;
> +    //    /* If only one plane is weighted, the other has an implicit
> scale of 1<<denom.
> +    //     * With denom==7, this comes out to 128, which is invalid, so
> don't allow that. */
> +    //    while( (!both_weighted && denom==7) ||
> +    //           (denom > 0 && !(weights[1].weightfn &&
> (weights[1].i_scale&1))
> +    //                     && !(weights[2].weightfn &&
> (weights[2].i_scale&1))) )
> +    //    {
> +    //        denom--;
> +    //        for( int i = 1; i <= 2; i++ )
> +    //            if( weights[i].weightfn )
> +    //            {
> +    //                weights[i].i_scale >>= 1;
> +    //                weights[i].i_denom = denom;
> +    //            }
> +    //    }
> +    //}
> +    //for( int i = 1; i <= 2; i++ )
> +    //    if( weights[i].weightfn )
> +    //        h->mc.weight_cache( h, &weights[i] );
> +
> +    //if( weights[0].weightfn && b_lookahead )
> +    //{
> +    //    //scale lowres in lookahead for slicetype_frame_cost
> +    //    pixel *src = ref->buffer_lowres[0];
> +    //    pixel *dst = h->mb.p_weight_buf[0];
> +    //    int width = ref->i_width_lowres + PADH*2;
> +    //    int height = ref->i_lines_lowres + PADV*2;
> +    //    x264_weight_scale_plane( h, dst, ref->i_stride_lowres, src,
> ref->i_stride_lowres,
> +    //                             width, height, &weights[0] );
> +    //    fenc->weighted[0] = h->mb.p_weight_buf[0] + PADH +
> ref->i_stride_lowres * PADV;
> +    //}
>

remove; we should never have commented code blocks

> +}
> +
>  #define NUM_CUS (widthInCU > 2 && heightInCU > 2 ? (widthInCU - 2) *
> (heightInCU - 2) : widthInCU * heightInCU)
>
>  int Lookahead::estimateFrameCost(int p0, int p1, int b, bool
> bIntraPenalty)
>  {
>      int score = 0;
>      Lowres *fenc = frames[b];
> -
> -    curb = b;
> -    curp0 = p0;
> -    curp1 = p1;
> +    wpScalingParam wp;
> +    wp.bPresentFlag = false;
>
>      if (fenc->costEst[b - p0][p1 - b] >= 0 && fenc->rowSatds[b - p0][p1 -
> b][0] != -1)
>          score = fenc->costEst[b - p0][p1 - b];
> @@ -209,9 +530,23 @@
>          bDoSearch[0] = b != p0 && fenc->lowresMvs[0][b - p0 - 1][0].x ==
> 0x7FFF;
>          bDoSearch[1] = b != p1 && fenc->lowresMvs[1][p1 - b - 1][0].x ==
> 0x7FFF;
>
> -        if (bDoSearch[0]) fenc->lowresMvs[0][b - p0 - 1][0].x = 0;
> +        if (bDoSearch[0])
> +        {
> +            if( cfg->param.bEnableWeightedPred && b==p1)
>

white-space, I'll quit commenting on this

> +            {
> +                wp.bPresentFlag = false;
> +                wp.inputWeight = 0;
>

this re-initialization is unnecessary

> +                weightsAnalyse(b, p0, 1, &wp);
> +            }
> +        bDoSearch[0] = b != p0 && fenc->lowresMvs[0][b - p0 - 1][0].x ==
> 0x7FFF;
> +        bDoSearch[1] = b != p1 && fenc->lowresMvs[1][p1 - b - 1][0].x ==
> 0x7FFF;
>

These two lines are out of place (they duplicate the bDoSearch assignments
just above) and probably need to be deleted.

> +            fenc->lowresMvs[0][b - p0 - 1][0].x = 0;
> +        }
>          if (bDoSearch[1]) fenc->lowresMvs[1][p1 - b - 1][0].x = 0;
>
> +        curb = b;
> +        curp0 = p0;
> +        curp1 = p1;
>

these look unused

>          fenc->costEst[b - p0][p1 - b] = 0;
>          fenc->costEstAq[b - p0][p1 - b] = 0;
>          // TODO: use lowres MVs as motion candidates in full-res search
> @@ -572,6 +907,15 @@
>              brefs++;
>          } */
>
> +        ///* Analyse for weighted P frames */
> +        //if (/*!h->param.rc.b_stat_read &&*/ frames[bframes]->sliceType
> == X265_TYPE_P && cfg->param.bEnableWeightedPred
> +        //    /*&& h->param.analyse.i_weighted_pred >=
> X264_WEIGHTP_SIMPLE*/)
> +        //{
> +        //    x265_emms();
> +        //    //x264_weights_analyse(h, h->lookahead->next.list[bframes],
> h->lookahead->last_nonb, 0);
> +        //    weightsAnalyse(bframes, 0, 1);
> +        //}
> +
>

this will not be called from here, please remove

>          /* calculate the frame costs ahead of time for
> x264_rc_analyse_slice while we still have lowres */
>          if (cfg->param.rc.rateControlMode != X265_RC_CQP)
>          {
> @@ -613,14 +957,6 @@
>              } */
>          }
>
> -        /* Analyse for weighted P frames
> -        if (!h->param.rc.b_stat_read &&
> h->lookahead->next.list[bframes]->i_type == X264_TYPE_P
> -            && h->param.analyse.i_weighted_pred >= X264_WEIGHTP_SIMPLE)
> -        {
> -            x265_emms();
> -            x264_weights_analyse(h, h->lookahead->next.list[bframes],
> h->lookahead->last_nonb, 0);
> -        }*/
> -
>          /* dequeue all frames from inputQueue that are about to be
> enqueued
>           * in the output queue.  The order is important because TComPic
> can
>           * only be in one list at a time */
> diff -r e7319fd46128 -r 21596a519ba8 source/encoder/slicetype.h
> --- a/source/encoder/slicetype.h        Tue Nov 12 17:06:03 2013 +0530
> +++ b/source/encoder/slicetype.h        Tue Nov 12 17:07:27 2013 +0530
> @@ -47,11 +47,13 @@
>      int                 costIntra;      // Estimated Intra cost for all
> CUs in a row
>      int                 costIntraAq;    // Estimated weighted Aq Intra
> cost for all CUs in a row
>      int                 intraMbs;       // Number of Intra CUs
> +    TEncCfg             *cfg;
>
>      Lowres** frames;
>      int widthInCU;
>      int heightInCU;
>      int merange;
> +    Lowres *weightedRef;
>
>      LookaheadRow()
>      {
> @@ -82,6 +84,9 @@
>      int              widthInCU;       // width of lowres frame in
> downscale CUs
>      int              heightInCU;      // height of lowres frame in
> downscale CUs
>
> +    Lowres *weightedRef;
> +    int numWRefs;
> +
>      PicList inputQueue;  // input pictures in order received
>      PicList outputQueue; // pictures to be encoded, in encode order
>
> @@ -110,6 +115,11 @@
>      int slicetypePathCost(char *path, int threshold);
>
>      void processRow(int row);
> +
> +    void weightsAnalyse(int b, int p0, int b_lookahead, wpScalingParam
> *w);
> +    unsigned int x265_weight_cost_luma( int b, pixel *src, wpScalingParam
> *w );
> +    pixel* x265_weight_cost_init_luma( int b, int p0, pixel *dest );
> +    int x265_weight_slice_header_cost(wpScalingParam *w, int b_chroma );
>  };
>  }
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>

-- 
Steve Borho
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131112/e48d854e/attachment-0001.html>