[x265] [PATCH 2 of 3] Pulling x264 weight decision into/for x265 lookahead

shazeb at multicorewareinc.com shazeb at multicorewareinc.com
Tue Nov 12 12:49:44 CET 2013


# HG changeset patch
# User Shazeb Nawaz Khan <shazeb at multicorewareinc.com>
# Date 1384256247 -19800
#      Tue Nov 12 17:07:27 2013 +0530
# Node ID 21596a519ba8cc521dbc81f693c867cbca03fd3f
# Parent  e7319fd46128b3bfcc826ea9be02896b316ed966
Pulling x264 weight decision into/for x265 lookahead

diff -r e7319fd46128 -r 21596a519ba8 source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp	Tue Nov 12 17:06:03 2013 +0530
+++ b/source/encoder/slicetype.cpp	Tue Nov 12 17:07:27 2013 +0530
@@ -45,6 +45,14 @@
 
 using namespace x265;
 
+#define SET_WEIGHT( w, b, s, d, o ) /* fill wpScalingParam w: b = present flag, s = weight, d = log2 denominator, o = offset */\
+{\
+    (w).inputWeight = (s);\
+    (w).log2WeightDenom = (d);\
+    (w).inputOffset = (o);\
+    (w).bPresentFlag = (b); /* parenthesized like the other arguments so expression arguments expand safely */\
+}
+
 static inline int16_t median(int16_t a, int16_t b, int16_t c)
 {
     int16_t t = (a - b) & ((a - b) >> 31);
@@ -190,16 +198,329 @@
     return pic->m_lowres.satdCost;
 }
 
+/* Converts a fix7 weight (7 fractional bits, 128 == 1.0) into a weight/log2-denominator pair in *w whose weight is at most 127; bPresentFlag is not touched */
+static void x265_weight_get_h265( int weight_nonh264, int offset, wpScalingParam *w )
+{
+    w->inputOffset = offset;
+    w->log2WeightDenom = 7;  /* start at full fix7 precision */
+    w->inputWeight = weight_nonh264;
+    while( w->log2WeightDenom > 0 && (w->inputWeight > 127) )  /* halve weight and denominator together until the weight is <= 127 */
+    {
+        w->log2WeightDenom--;
+        w->inputWeight >>= 1;
+    }
+    w->inputWeight = X265_MIN( w->inputWeight, 127 );  /* still above 127 at denominator 0: saturate */
+}
+
+pixel* Lookahead::x265_weight_cost_init_luma( int b, int p0, pixel *dest )  // returns the luma plane that frames[b] is compared against when costing weights for reference frames[p0]
+{
+    Lowres *fenc, *ref;
+    fenc = frames[b];
+    ref  = frames[p0];
+    int ref0_distance = b - p0 - 1;
+    /* Note: the branch below is not expected to run during lookahead, because weightsAnalyse is
+     * only called while the first MV is still 0x7FFF, i.e. before any motion search has been done. */
+    if( fenc->lowresMvs[0][ref0_distance][0].x != 0x7FFF )
+    {
+        int i_stride = fenc->lumaStride;
+        int i_lines = fenc->lines;
+        int i_width = fenc->width;
+        int i_mb_xy = 0;
+        pixel *p = dest;
+
+        for( int y = 0; y < i_lines; y += 8, p += i_stride*8 )  // walk the lowres plane in 8x8 blocks
+            for( int x = 0; x < i_width; x += 8, i_mb_xy++ )
+            {
+                int mvx = fenc->lowresMvs[0][ref0_distance][i_mb_xy].x;
+                int mvy = fenc->lowresMvs[0][ref0_distance][i_mb_xy].y;
+                mvx;mvy;  // no-op statements; presumably here only to silence unused-variable warnings while the MC call is disabled
+                //h->mc.mc_luma( p+x, i_stride, ref->lowresPlane, i_stride,
+                //               mvx+(x<<2), mvy+(y<<2), 8, 8, x264_weight_none );
+            }
+        x265_emms();
+        return dest;  // NOTE(review): dest is never written because the MC call above is commented out
+    }
+    x265_emms();
+    return ref->lowresPlane[0];  // no motion vectors yet: compare against the unmodified reference plane
+}
+
+static void mc_weight( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride,
+                       const wpScalingParam *weight, int i_width, int i_height )  // applies *weight to an i_width x i_height block of src, writing dst, via primitives.weightpUniPixel
+{
+    int offset = weight->inputOffset << (g_bitDepth - 8);  // NOTE(review): uses g_bitDepth here but X265_DEPTH below - confirm they always agree
+    int scale = weight->inputWeight;
+    int denom = weight->log2WeightDenom;
+    int corection = (IF_INTERNAL_PREC - X265_DEPTH);  // extra bits added to both the 8th and 9th arguments below ("corection" is spelled this way in the code)
+    if( denom >= 1 )
+    {
+        primitives.weightpUniPixel(src, dst, i_src_stride, i_dst_stride, i_width, i_height, scale, (1<<(denom - 1 + corection)), (denom + corection), offset);  // 8th/9th args are presumably round and shift - confirm against the primitive
+    }
+    else
+    {
+        primitives.weightpUniPixel(src, dst, i_src_stride, i_dst_stride, i_width, i_height, scale, 0 + corection, 0 + corection, offset);  // NOTE(review): 8th arg is corection itself, not a power of two as in the branch above - confirm intended
+    }
+}
+
+unsigned int Lookahead::x265_weight_cost_luma( int b, pixel *src, wpScalingParam *w )  // sums, over 8x8 blocks, min(SATD(src [weighted by w if non-NULL], frames[b] luma), intraCost of that block)
+{
+    Lowres *fenc = frames[b];
+    unsigned int cost = 0;
+    int i_stride = fenc->lumaStride;  // also used as the stride of src; assumes src shares fenc's stride - TODO confirm
+    int i_lines = fenc->lines;
+    int i_width = fenc->width;
+    pixel *fenc_plane = fenc->lowresPlane[0];
+    ALIGN_VAR_16( pixel, buf[8*8]);  // scratch block holding one weighted 8x8 block
+    int pixoff = 0;  // offset of the current block's top-left pixel in both planes
+    int i_mb = 0;    // running block index into fenc->intraCost
+
+    if( w )
+    {
+        for( int y = 0; y < i_lines; y += 8, pixoff = y*i_stride )  // y is advanced first, then pixoff is reset to the start of the new block row
+            for( int x = 0; x < i_width; x += 8, i_mb++, pixoff += 8)
+            {
+                // TODO: prepare the full weighted plane once instead of weighting each 8x8 block here
+                mc_weight(buf, 8, &src[pixoff], i_stride, w, 8, 8);
+                int cmp = primitives.satd[LUMA_8x8]( buf, 8, &fenc_plane[pixoff], i_stride );
+                cost += X265_MIN( cmp, fenc->intraCost[i_mb] );  // a block never costs more than its intra cost
+            }
+    }
+    else
+        for( int y = 0; y < i_lines; y += 8, pixoff = y*i_stride )  // unweighted baseline: compare src directly
+            for( int x = 0; x < i_width; x += 8, i_mb++, pixoff += 8 )
+            {
+                int cmp = primitives.satd[LUMA_8x8]( &src[pixoff], i_stride, &fenc_plane[pixoff], i_stride );
+                cost += X265_MIN( cmp, fenc->intraCost[i_mb] );
+            }
+    x265_emms();
+    return cost;
+}
+
+void Lookahead::weightsAnalyse(int b, int p0, int b_lookahead, wpScalingParam* w)  // chooses weight/denominator/offset for predicting frames[b] from frames[p0]; results go to w[plane] (only luma is live in this port)
+{
+    //int i_delta_index = b - p0 - 1;
+    Lowres *fenc, *ref;
+    fenc = frames[b];
+    ref  = frames[p0];
+    /* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */
+    const float epsilon = 1.f/128.f;
+    wpScalingParam *weights = w;  // indexed by plane; NOTE(review): the caller visible in this patch passes one struct with b_lookahead=1, so only weights[0] may be touched
+    SET_WEIGHT( weights[0], 0, 1, 0, 0 );  // default: luma not weighted
+    //SET_WEIGHT( weights[1], 0, 1, 0, 0 );
+    //SET_WEIGHT( weights[2], 0, 1, 0, 0 );
+    float guess_scale[3];
+    float fenc_mean[3];
+    float ref_mean[3];
+    for( int plane = 0; plane <= 2*!b_lookahead; plane++ )  // luma only when b_lookahead is set, all three planes otherwise
+    {
+        float fenc_var = (float) fenc->wp_ssd[plane] + !ref->wp_ssd[plane];  // both terms get +1 when ref's value is 0, so the division below never divides by zero
+        float ref_var  = (float)  ref->wp_ssd[plane] + !ref->wp_ssd[plane];
+        guess_scale[plane] = sqrtf( fenc_var / ref_var );
+        fenc_mean[plane] = (float)fenc->wp_sum[plane] / ((fenc->lines>>(plane?1:0)) * (fenc->width>>(plane?1:0))) /*/ (1 << (BIT_DEPTH - 8))*/;
+        ref_mean[plane]  = (float) ref->wp_sum[plane] / ((fenc->lines>>(plane?1:0)) * (fenc->width>>(plane?1:0))) /*/ (1 << (BIT_DEPTH - 8))*/;
+    }
+
+    //int chroma_denom = 7;
+    //if( !b_lookahead )
+    //{
+    //    /* make sure both our scale factors fit */
+    //    while( chroma_denom > 0 )
+    //    {
+    //        float thresh = 127.f / (1<<chroma_denom);
+    //        if( guess_scale[1] < thresh && guess_scale[2] < thresh )
+    //            break;
+    //        chroma_denom--;
+    //    }
+    //}
+
+    /* Don't check chroma in lookahead, or if there wasn't a luma weight. */
+    for( int plane = 0; plane <= 2 && !( plane && ( /*!weights[0].weightfn*/ !fenc->isWeighted || b_lookahead ) ); plane++ )
+    {
+        int minoff, minscale, mindenom;
+        unsigned int minscore = 0, origscore = 1;
+        origscore;  // no-op statement; presumably here only to silence a compiler warning
+        int found;
+
+        // early termination: means differ by less than 0.5 and the scale guess is within epsilon of 1, so leave the plane unweighted
+        if( fabsf( ref_mean[plane] - fenc_mean[plane] ) < 0.5f && fabsf( 1.f - guess_scale[plane] ) < epsilon )
+        {
+            SET_WEIGHT( weights[plane], 0, 1, 0, 0 );
+            //printf("\nEarly\n");
+            continue;
+        }
+
+        if( plane )
+        {
+            //weights[plane].i_denom = chroma_denom;
+            //weights[plane].i_scale = x264_clip3( round( guess_scale[plane] * (1<<chroma_denom) ), 0, 255 );
+            //if( weights[plane].i_scale > 127 )
+            //{
+            //    weights[1].weightfn = weights[2].weightfn = NULL;
+            //    break;
+            //}
+        }
+        else
+            x265_weight_get_h265( (int)( guess_scale[plane] * 128 + 0.5), 0, &weights[plane] );  // initial luma guess: scale rounded to fix7, offset 0
+        
+        found = 0;
+        mindenom = weights[plane].log2WeightDenom;
+        minscale = weights[plane].inputWeight;
+        minoff = 0;
+
+        pixel *mcbuf = NULL;
+        mcbuf;  // no-op statement; presumably here only to silence a compiler warning
+        if( !plane )
+        {
+            if( !fenc->bIntraCalculated )
+            {
+                estimateFrameCost(b,b,b,0);  // presumably fills the intra costs that x265_weight_cost_luma reads - TODO confirm
+            }
+            mcbuf = x265_weight_cost_init_luma( b, p0, NULL );  // NOTE(review): dest is NULL; that function returns dest unchanged if motion vectors already exist
+            origscore = minscore = x265_weight_cost_luma( b, mcbuf, NULL );  // unweighted baseline cost
+        }
+        else
+        {
+            //if( CHROMA444 )
+            //{
+            //    mcbuf = x264_weight_cost_init_chroma444( h, fenc, ref, h->mb.p_weight_buf[0], plane );
+            //    origscore = minscore = x264_weight_cost_chroma444( h, fenc, mcbuf, NULL, plane );
+            //}
+            //else
+            //{
+                //pixel *dstu = h->mb.p_weight_buf[0];
+                //pixel *dstv = h->mb.p_weight_buf[0]+fenc->i_stride[1]*fenc->i_lines[1];
+                //if( !chroma_initted++ )
+                //    x264_weight_cost_init_chroma( h, fenc, ref, dstu, dstv );
+                //mcbuf = plane == 1 ? dstu : dstv;
+                //origscore = minscore = x264_weight_cost_chroma( h, fenc, mcbuf, NULL );
+            //}
+        }
+
+        if( !minscore )  // zero baseline cost: nothing to gain (always taken for chroma planes, whose costing is commented out above)
+            continue;
+
+        /* Picked somewhat arbitrarily */
+        static const uint8_t weight_check_distance[][2] =
+        {
+            {0,0},{0,0},{0,1},{0,1},
+            {0,1},{0,1},{0,1},{1,1},
+            {1,1},{2,1},{2,1},{4,2}
+        };
+        int scale_dist =  b_lookahead ? 0 : weight_check_distance[cfg->param.subpelRefine][0];  // table has 12 rows; assumes subpelRefine is in 0..11 - TODO confirm
+        int offset_dist = b_lookahead ? 0 : weight_check_distance[cfg->param.subpelRefine][1];
+        
+        int start_scale  = Clip3( 0, 127, minscale - scale_dist );
+        int end_scale    = Clip3( 0, 127, minscale + scale_dist );
+        unsigned int s=0;
+        for( int i_scale = start_scale; i_scale <= end_scale; i_scale++ )
+        {
+            int cur_scale = i_scale;
+            int cur_offset = (int) (fenc_mean[plane] - ref_mean[plane] * cur_scale / (1 << mindenom) + 0.5f * b_lookahead);  // offset that maps the scaled reference mean onto the fenc mean
+            if( cur_offset < - 128 || cur_offset > 127 )
+            {
+                /* Rescale considering the constraints on cur_offset. We do it in this order
+                 * because scale has a much wider range than offset (because of denom), so
+                 * it should almost never need to be clamped. */
+                cur_offset = Clip3( -128, 127, cur_offset );
+                cur_scale = (int) ((1 << mindenom) * (fenc_mean[plane] - cur_offset) / ref_mean[plane] + 0.5f);  // NOTE(review): divides by ref_mean with no zero guard
+                cur_scale = Clip3( 0, 127, cur_scale );
+            }
+            int start_offset = Clip3( -128, 127, cur_offset - offset_dist );
+            int end_offset   = Clip3( -128, 127, cur_offset + offset_dist );
+            for( int i_off = start_offset; i_off <= end_offset; i_off++ )
+            {
+                SET_WEIGHT( weights[plane], 1, cur_scale, mindenom, i_off );  // candidate weight to be costed
+                //unsigned int s;
+                //if( plane )
+                //{
+                //    //if( CHROMA444 )
+                //    //    s = x264_weight_cost_chroma444( h, fenc, mcbuf, &weights[plane], plane );
+                //    //else
+                //        s = x264_weight_cost_chroma( h, fenc, mcbuf, &weights[plane] );
+                //}
+                //else
+                {
+                    s = x265_weight_cost_luma( b, mcbuf, &weights[plane] );
+                }
+                COPY4_IF_LT( minscore, s, minscale, cur_scale, minoff, i_off, found, 1 );  // presumably records the candidate when s < minscore - macro defined elsewhere
+
+                // Don't check any more offsets if the previous one had a lower cost than the current one
+                if( minoff == start_offset && i_off != start_offset )
+                    break;
+            }
+        }
+        x265_emms();
+
+        /* Use a smaller denominator if possible */
+        if( !plane )
+        {
+            while( mindenom > 0 && !(minscale&1) )  // an even scale can be halved together with the denominator without changing the ratio
+            {
+                mindenom--;
+                minscale >>= 1;
+            }
+        }
+
+        /* FIXME: More analysis can be done here on SAD vs. SATD termination. */
+        /* 0.2% termination derived experimentally to avoid weird weights in frames that are mostly intra. */
+        if( !found || (minscale == 1 << mindenom && minoff == 0) || (float)minscore / origscore > 0.998f )  // nothing better found, identity weight, or gain under 0.2%: leave unweighted
+        {
+            SET_WEIGHT( weights[plane], 0, 1, 0, 0 );
+            continue;
+        }
+        else
+        {
+            SET_WEIGHT( weights[plane], 1, minscale, mindenom, minoff );  // commit the best candidate
+        }
+
+        //if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE && weights[0].weightfn && !plane )
+        //    fenc->f_weighted_cost_delta[i_delta_index] = (float)minscore / origscore;
+    }
+
+    ///* Optimize and unify denominator */
+    //if( weights[1].weightfn || weights[2].weightfn )
+    //{
+    //    int denom = weights[1].weightfn ? weights[1].i_denom : weights[2].i_denom;
+    //    int both_weighted = weights[1].weightfn && weights[2].weightfn;
+    //    /* If only one plane is weighted, the other has an implicit scale of 1<<denom.
+    //     * With denom==7, this comes out to 128, which is invalid, so don't allow that. */
+    //    while( (!both_weighted && denom==7) ||
+    //           (denom > 0 && !(weights[1].weightfn && (weights[1].i_scale&1))
+    //                     && !(weights[2].weightfn && (weights[2].i_scale&1))) )
+    //    {
+    //        denom--;
+    //        for( int i = 1; i <= 2; i++ )
+    //            if( weights[i].weightfn )
+    //            {
+    //                weights[i].i_scale >>= 1;
+    //                weights[i].i_denom = denom;
+    //            }
+    //    }
+    //}
+    //for( int i = 1; i <= 2; i++ )
+    //    if( weights[i].weightfn )
+    //        h->mc.weight_cache( h, &weights[i] );
+
+    //if( weights[0].weightfn && b_lookahead )
+    //{
+    //    //scale lowres in lookahead for slicetype_frame_cost
+    //    pixel *src = ref->buffer_lowres[0];
+    //    pixel *dst = h->mb.p_weight_buf[0];
+    //    int width = ref->i_width_lowres + PADH*2;
+    //    int height = ref->i_lines_lowres + PADV*2;
+    //    x264_weight_scale_plane( h, dst, ref->i_stride_lowres, src, ref->i_stride_lowres,
+    //                             width, height, &weights[0] );
+    //    fenc->weighted[0] = h->mb.p_weight_buf[0] + PADH + ref->i_stride_lowres * PADV;
+    //}
+}
+
 #define NUM_CUS (widthInCU > 2 && heightInCU > 2 ? (widthInCU - 2) * (heightInCU - 2) : widthInCU * heightInCU)
 
 int Lookahead::estimateFrameCost(int p0, int p1, int b, bool bIntraPenalty)
 {
     int score = 0;
     Lowres *fenc = frames[b];
-
-    curb = b;
-    curp0 = p0;
-    curp1 = p1;
+    wpScalingParam wp;
+    wp.bPresentFlag = false;
 
     if (fenc->costEst[b - p0][p1 - b] >= 0 && fenc->rowSatds[b - p0][p1 - b][0] != -1)
         score = fenc->costEst[b - p0][p1 - b];
@@ -209,9 +530,23 @@
         bDoSearch[0] = b != p0 && fenc->lowresMvs[0][b - p0 - 1][0].x == 0x7FFF;
         bDoSearch[1] = b != p1 && fenc->lowresMvs[1][p1 - b - 1][0].x == 0x7FFF;
 
-        if (bDoSearch[0]) fenc->lowresMvs[0][b - p0 - 1][0].x = 0;
+        if (bDoSearch[0])
+        {
+            if( cfg->param.bEnableWeightedPred && b==p1)
+            {
+                wp.bPresentFlag = false;
+                wp.inputWeight = 0;
+                weightsAnalyse(b, p0, 1, &wp);
+            }
+        bDoSearch[0] = b != p0 && fenc->lowresMvs[0][b - p0 - 1][0].x == 0x7FFF;
+        bDoSearch[1] = b != p1 && fenc->lowresMvs[1][p1 - b - 1][0].x == 0x7FFF;
+            fenc->lowresMvs[0][b - p0 - 1][0].x = 0;
+        }
         if (bDoSearch[1]) fenc->lowresMvs[1][p1 - b - 1][0].x = 0;
 
+        curb = b;
+        curp0 = p0;
+        curp1 = p1;
         fenc->costEst[b - p0][p1 - b] = 0;
         fenc->costEstAq[b - p0][p1 - b] = 0;
         // TODO: use lowres MVs as motion candidates in full-res search
@@ -572,6 +907,15 @@
             brefs++;
         } */
 
+        ///* Analyse for weighted P frames */
+        //if (/*!h->param.rc.b_stat_read &&*/ frames[bframes]->sliceType == X265_TYPE_P && cfg->param.bEnableWeightedPred
+        //    /*&& h->param.analyse.i_weighted_pred >= X264_WEIGHTP_SIMPLE*/)
+        //{
+        //    x265_emms();
+        //    //x264_weights_analyse(h, h->lookahead->next.list[bframes], h->lookahead->last_nonb, 0);
+        //    weightsAnalyse(bframes, 0, 1);
+        //}
+
         /* calculate the frame costs ahead of time for x264_rc_analyse_slice while we still have lowres */
         if (cfg->param.rc.rateControlMode != X265_RC_CQP)
         {
@@ -613,14 +957,6 @@
             } */
         }
 
-        /* Analyse for weighted P frames
-        if (!h->param.rc.b_stat_read && h->lookahead->next.list[bframes]->i_type == X264_TYPE_P
-            && h->param.analyse.i_weighted_pred >= X264_WEIGHTP_SIMPLE)
-        {
-            x265_emms();
-            x264_weights_analyse(h, h->lookahead->next.list[bframes], h->lookahead->last_nonb, 0);
-        }*/
-
         /* dequeue all frames from inputQueue that are about to be enqueued
          * in the output queue.  The order is important because TComPic can
          * only be in one list at a time */
diff -r e7319fd46128 -r 21596a519ba8 source/encoder/slicetype.h
--- a/source/encoder/slicetype.h	Tue Nov 12 17:06:03 2013 +0530
+++ b/source/encoder/slicetype.h	Tue Nov 12 17:07:27 2013 +0530
@@ -47,11 +47,13 @@
     int                 costIntra;      // Estimated Intra cost for all CUs in a row
     int                 costIntraAq;    // Estimated weighted Aq Intra cost for all CUs in a row
     int                 intraMbs;       // Number of Intra CUs
+    TEncCfg             *cfg;
 
     Lowres** frames;
     int widthInCU;
     int heightInCU;
     int merange;
+    Lowres *weightedRef;
 
     LookaheadRow()
     {
@@ -82,6 +84,9 @@
     int              widthInCU;       // width of lowres frame in downscale CUs
     int              heightInCU;      // height of lowres frame in downscale CUs
 
+    Lowres *weightedRef;
+    int numWRefs;
+
     PicList inputQueue;  // input pictures in order received
     PicList outputQueue; // pictures to be encoded, in encode order
 
@@ -110,6 +115,11 @@
     int slicetypePathCost(char *path, int threshold);
 
     void processRow(int row);
+
+    void weightsAnalyse(int b, int p0, int b_lookahead, wpScalingParam *w);
+    unsigned int x265_weight_cost_luma( int b, pixel *src, wpScalingParam *w );
+    pixel* x265_weight_cost_init_luma( int b, int p0, pixel *dest );
+    int x265_weight_slice_header_cost(wpScalingParam *w, int b_chroma );
 };
 }
 


More information about the x265-devel mailing list