[x265] [PATCH 2 of 3] Pulling x264 weight decision into/for x265 lookahead
shazeb at multicorewareinc.com
shazeb at multicorewareinc.com
Tue Nov 12 12:49:44 CET 2013
# HG changeset patch
# User Shazeb Nawaz Khan <shazeb at multicorewareinc.com>
# Date 1384256247 -19800
# Tue Nov 12 17:07:27 2013 +0530
# Node ID 21596a519ba8cc521dbc81f693c867cbca03fd3f
# Parent e7319fd46128b3bfcc826ea9be02896b316ed966
Pulling x264 weight decision into/for x265 lookahead
diff -r e7319fd46128 -r 21596a519ba8 source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp Tue Nov 12 17:06:03 2013 +0530
+++ b/source/encoder/slicetype.cpp Tue Nov 12 17:07:27 2013 +0530
@@ -45,6 +45,14 @@
using namespace x265;
+#define SET_WEIGHT( w, b, s, d, o ) /* fills w with scale s, denominator d, offset o and present flag b */\
+{\
+    (w).inputWeight = (s);\
+    (w).log2WeightDenom = (d);\
+    (w).inputOffset = (o);\
+    (w).bPresentFlag = (b);\
+}
+
static inline int16_t median(int16_t a, int16_t b, int16_t c)
{
int16_t t = (a - b) & ((a - b) >> 31);
@@ -190,16 +198,329 @@
return pic->m_lowres.satdCost;
}
+/* Converts a fix7 weight (implicit denominator of 128) into the scale/denominator pair stored in *w */
+static void x265_weight_get_h265( int weight_nonh264, int offset, wpScalingParam *w )
+{
+    int scale = weight_nonh264;
+    int denom = 7;
+    /* halve the scale and give up one denominator bit until the scale fits or no bits are left */
+    for( ; denom > 0 && scale > 127; denom-- )
+        scale >>= 1;
+    /* the scale can still be too large once the denominator is exhausted, so saturate it */
+    w->inputWeight = X265_MIN( scale, 127 );
+    w->log2WeightDenom = denom;
+    w->inputOffset = offset;
+}
+
+pixel* Lookahead::x265_weight_cost_init_luma( int b, int p0, pixel *dest )
+{
+    Lowres *fenc = frames[b];
+    Lowres *ref = frames[p0];
+    int ref0_distance = b - p0 - 1;
+    /* Returns the luma plane that weights for frame b are costed against (reference frame p0).
+     * Note: the branch below will never run during lookahead as weights_analyse is only called if no
+     * motion search has been done. It needs a caller buffer: a NULL dest must never be returned. */
+    if( dest && fenc->lowresMvs[0][ref0_distance][0].x != 0x7FFF )
+    {
+        int i_stride = fenc->lumaStride;
+        int i_lines = fenc->lines;
+        int i_width = fenc->width;
+        int i_mb_xy = 0;
+        pixel *p = dest;
+        for( int y = 0; y < i_lines; y += 8, p += i_stride*8 )
+            for( int x = 0; x < i_width; x += 8, i_mb_xy++ )
+            {
+                int mvx = fenc->lowresMvs[0][ref0_distance][i_mb_xy].x;
+                int mvy = fenc->lowresMvs[0][ref0_distance][i_mb_xy].y;
+                mvx;mvy; /* no-op references, presumably to silence unused-variable warnings */
+                // TODO: motion compensation is not ported yet, so dest is NOT written by this loop:
+                //h->mc.mc_luma( p+x, i_stride, ref->lowresPlane, i_stride,
+                // mvx+(x<<2), mvy+(y<<2), 8, 8, x264_weight_none );
+            }
+        x265_emms();
+        return dest;
+    }
+    x265_emms();
+    return ref->lowresPlane[0];
+}
+
+/* Runs primitives.weightpUniPixel on an i_width x i_height block with parameters derived from *weight */
+static void mc_weight( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride,
+                       const wpScalingParam *weight, int i_width, int i_height )
+{
+    const int denom = weight->log2WeightDenom;
+    const int scale = weight->inputWeight;
+    const int offset = weight->inputOffset << (g_bitDepth - 8);
+    /* added to both the rounding exponent and the shift passed to the primitive */
+    const int correction = (IF_INTERNAL_PREC - X265_DEPTH);
+    const bool hasDenom = denom >= 1;
+    /* NOTE(review): with denom == 0 the rounding argument is the correction itself - confirm intent */
+    const int round = hasDenom ? (1 << (denom - 1 + correction)) : 0 + correction;
+    const int shift = hasDenom ? (denom + correction) : 0 + correction;
+
+    primitives.weightpUniPixel(src, dst, i_src_stride, i_dst_stride, i_width, i_height, scale, round, shift, offset);
+}
+
+/* Sums, over the 8x8 blocks of frame b, the smaller of each block's intra cost and the SATD between
+ * the frame's luma plane and src (src is passed through mc_weight first when w is non-NULL). */
+unsigned int Lookahead::x265_weight_cost_luma( int b, pixel *src, wpScalingParam *w )
+{
+    Lowres *fenc = frames[b];
+    const int stride = fenc->lumaStride;
+    const int lines = fenc->lines;
+    const int width = fenc->width;
+    pixel *curPlane = fenc->lowresPlane[0];
+    ALIGN_VAR_16( pixel, buf[8*8]);
+    unsigned int cost = 0;
+    int blockIdx = 0;
+
+    for( int y = 0; y < lines; y += 8 )
+    {
+        for( int x = 0; x < width; x += 8, blockIdx++ )
+        {
+            const int pixoff = y * stride + x;
+            int cmp;
+            if( w )
+            {
+                // TO DO prepare full weighted plane
+                mc_weight(buf, 8, &src[pixoff], stride, w, 8, 8);
+                cmp = primitives.satd[LUMA_8x8]( buf, 8, &curPlane[pixoff], stride );
+            }
+            else
+                cmp = primitives.satd[LUMA_8x8]( &src[pixoff], stride, &curPlane[pixoff], stride );
+            cost += X265_MIN( cmp, fenc->intraCost[blockIdx] );
+        }
+    }
+    x265_emms();
+    return cost;
+}
+
+void Lookahead::weightsAnalyse(int b, int p0, int b_lookahead, wpScalingParam* w)
+{ // Searches a weight (scale, denominator, offset) for frame b predicted from frame p0; results go to w[plane]
+ //int i_delta_index = b - p0 - 1;
+ Lowres *fenc, *ref;
+ fenc = frames[b];
+ ref = frames[p0];
+ /* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */
+ const float epsilon = 1.f/128.f;
+ wpScalingParam *weights = w;
+ SET_WEIGHT( weights[0], 0, 1, 0, 0 ); // luma starts out as "no weight"; w[1] and w[2] are not reset here
+ //SET_WEIGHT( weights[1], 0, 1, 0, 0 );
+ //SET_WEIGHT( weights[2], 0, 1, 0, 0 );
+ float guess_scale[3];
+ float fenc_mean[3];
+ float ref_mean[3];
+ for( int plane = 0; plane <= 2*!b_lookahead; plane++ ) // luma only when b_lookahead is set, else all three planes
+ {
+ float fenc_var = (float) fenc->wp_ssd[plane] + !ref->wp_ssd[plane];
+ float ref_var = (float) ref->wp_ssd[plane] + !ref->wp_ssd[plane]; // !ssd adds 1 when the reference ssd is 0, so the divisor below is never 0
+ guess_scale[plane] = sqrtf( fenc_var / ref_var );
+ fenc_mean[plane] = (float)fenc->wp_sum[plane] / ((fenc->lines>>(plane?1:0)) * (fenc->width>>(plane?1:0))) /*/ (1 << (BIT_DEPTH - 8))*/;
+ ref_mean[plane] = (float) ref->wp_sum[plane] / ((fenc->lines>>(plane?1:0)) * (fenc->width>>(plane?1:0))) /*/ (1 << (BIT_DEPTH - 8))*/;
+ }
+
+ //int chroma_denom = 7;
+ //if( !b_lookahead )
+ //{
+ // /* make sure both our scale factors fit */
+ // while( chroma_denom > 0 )
+ // {
+ // float thresh = 127.f / (1<<chroma_denom);
+ // if( guess_scale[1] < thresh && guess_scale[2] < thresh )
+ // break;
+ // chroma_denom--;
+ // }
+ //}
+
+ /* Don't check chroma in lookahead, or if there wasn't a luma weight. */
+ for( int plane = 0; plane <= 2 && !( plane && ( /*!weights[0].weightfn*/ !fenc->isWeighted || b_lookahead ) ); plane++ )
+ {
+ int minoff, minscale, mindenom;
+ unsigned int minscore = 0, origscore = 1;
+ origscore; // no-op reference, presumably to silence an unused-variable warning
+ int found;
+
+ //early termination
+ if( fabsf( ref_mean[plane] - fenc_mean[plane] ) < 0.5f && fabsf( 1.f - guess_scale[plane] ) < epsilon )
+ {
+ SET_WEIGHT( weights[plane], 0, 1, 0, 0 );
+ //printf("\nEarly\n");
+ continue;
+ }
+
+ if( plane ) // chroma path is not ported: the body is empty, so weights[plane] is read below exactly as the caller left it
+ {
+ //weights[plane].i_denom = chroma_denom;
+ //weights[plane].i_scale = x264_clip3( round( guess_scale[plane] * (1<<chroma_denom) ), 0, 255 );
+ //if( weights[plane].i_scale > 127 )
+ //{
+ // weights[1].weightfn = weights[2].weightfn = NULL;
+ // break;
+ //}
+ }
+ else
+ x265_weight_get_h265( (int)( guess_scale[plane] * 128 + 0.5), 0, &weights[plane] );
+
+ found = 0;
+ mindenom = weights[plane].log2WeightDenom;
+ minscale = weights[plane].inputWeight;
+ minoff = 0;
+
+ pixel *mcbuf = NULL;
+ mcbuf; // no-op reference, presumably to silence an unused-variable warning
+ if( !plane )
+ {
+ if( !fenc->bIntraCalculated )
+ {
+ estimateFrameCost(b,b,b,0); // presumably fills fenc->intraCost, which x265_weight_cost_luma reads - TODO confirm
+ }
+ mcbuf = x265_weight_cost_init_luma( b, p0, NULL );
+ origscore = minscore = x265_weight_cost_luma( b, mcbuf, NULL ); // unweighted cost: the baseline a weight has to beat
+ }
+ else
+ {
+ //if( CHROMA444 )
+ //{
+ // mcbuf = x264_weight_cost_init_chroma444( h, fenc, ref, h->mb.p_weight_buf[0], plane );
+ // origscore = minscore = x264_weight_cost_chroma444( h, fenc, mcbuf, NULL, plane );
+ //}
+ //else
+ //{
+ //pixel *dstu = h->mb.p_weight_buf[0];
+ //pixel *dstv = h->mb.p_weight_buf[0]+fenc->i_stride[1]*fenc->i_lines[1];
+ //if( !chroma_initted++ )
+ // x264_weight_cost_init_chroma( h, fenc, ref, dstu, dstv );
+ //mcbuf = plane == 1 ? dstu : dstv;
+ //origscore = minscore = x264_weight_cost_chroma( h, fenc, mcbuf, NULL );
+ //}
+ }
+
+ if( !minscore ) // zero baseline leaves nothing to improve; always taken for chroma, whose score is never computed above
+ continue;
+
+ /* Picked somewhat arbitrarily */
+ static const uint8_t weight_check_distance[][2] =
+ {
+ {0,0},{0,0},{0,1},{0,1},
+ {0,1},{0,1},{0,1},{1,1},
+ {1,1},{2,1},{2,1},{4,2}
+ };
+ int scale_dist = b_lookahead ? 0 : weight_check_distance[cfg->param.subpelRefine][0]; // 0 in lookahead: only the guessed scale is tried
+ int offset_dist = b_lookahead ? 0 : weight_check_distance[cfg->param.subpelRefine][1]; // 0 in lookahead: only the derived offset is tried
+
+ int start_scale = Clip3( 0, 127, minscale - scale_dist );
+ int end_scale = Clip3( 0, 127, minscale + scale_dist );
+ unsigned int s=0;
+ for( int i_scale = start_scale; i_scale <= end_scale; i_scale++ )
+ {
+ int cur_scale = i_scale;
+ int cur_offset = (int) (fenc_mean[plane] - ref_mean[plane] * cur_scale / (1 << mindenom) + 0.5f * b_lookahead);
+ if( cur_offset < - 128 || cur_offset > 127 )
+ {
+ /* Rescale considering the constraints on cur_offset. We do it in this order
+ * because scale has a much wider range than offset (because of denom), so
+ * it should almost never need to be clamped. */
+ cur_offset = Clip3( -128, 127, cur_offset );
+ cur_scale = (int) ((1 << mindenom) * (fenc_mean[plane] - cur_offset) / ref_mean[plane] + 0.5f);
+ cur_scale = Clip3( 0, 127, cur_scale );
+ }
+ int start_offset = Clip3( -128, 127, cur_offset - offset_dist );
+ int end_offset = Clip3( -128, 127, cur_offset + offset_dist );
+ for( int i_off = start_offset; i_off <= end_offset; i_off++ )
+ {
+ SET_WEIGHT( weights[plane], 1, cur_scale, mindenom, i_off );
+ //unsigned int s;
+ //if( plane )
+ //{
+ // //if( CHROMA444 )
+ // // s = x264_weight_cost_chroma444( h, fenc, mcbuf, &weights[plane], plane );
+ // //else
+ // s = x264_weight_cost_chroma( h, fenc, mcbuf, &weights[plane] );
+ //}
+ //else
+ {
+ s = x265_weight_cost_luma( b, mcbuf, &weights[plane] );
+ }
+ COPY4_IF_LT( minscore, s, minscale, cur_scale, minoff, i_off, found, 1 );
+
+ // Don't check any more offsets if the previous one had a lower cost than the current one
+ if( minoff == start_offset && i_off != start_offset )
+ break;
+ }
+ }
+ x265_emms();
+
+ /* Use a smaller denominator if possible */
+ if( !plane )
+ {
+ while( mindenom > 0 && !(minscale&1) )
+ {
+ mindenom--;
+ minscale >>= 1;
+ }
+ }
+
+ /* FIXME: More analysis can be done here on SAD vs. SATD termination. */
+ /* 0.2% termination derived experimentally to avoid weird weights in frames that are mostly intra. */
+ if( !found || (minscale == 1 << mindenom && minoff == 0) || (float)minscore / origscore > 0.998f )
+ {
+ SET_WEIGHT( weights[plane], 0, 1, 0, 0 ); // no candidate won, the winner is the identity weight, or the gain is under 0.2%
+ continue;
+ }
+ else
+ {
+ SET_WEIGHT( weights[plane], 1, minscale, mindenom, minoff );
+ }
+
+ //if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE && weights[0].weightfn && !plane )
+ // fenc->f_weighted_cost_delta[i_delta_index] = (float)minscore / origscore;
+ }
+
+ ///* Optimize and unify denominator */
+ //if( weights[1].weightfn || weights[2].weightfn )
+ //{
+ // int denom = weights[1].weightfn ? weights[1].i_denom : weights[2].i_denom;
+ // int both_weighted = weights[1].weightfn && weights[2].weightfn;
+ // /* If only one plane is weighted, the other has an implicit scale of 1<<denom.
+ // * With denom==7, this comes out to 128, which is invalid, so don't allow that. */
+ // while( (!both_weighted && denom==7) ||
+ // (denom > 0 && !(weights[1].weightfn && (weights[1].i_scale&1))
+ // && !(weights[2].weightfn && (weights[2].i_scale&1))) )
+ // {
+ // denom--;
+ // for( int i = 1; i <= 2; i++ )
+ // if( weights[i].weightfn )
+ // {
+ // weights[i].i_scale >>= 1;
+ // weights[i].i_denom = denom;
+ // }
+ // }
+ //}
+ //for( int i = 1; i <= 2; i++ )
+ // if( weights[i].weightfn )
+ // h->mc.weight_cache( h, &weights[i] );
+
+ //if( weights[0].weightfn && b_lookahead )
+ //{
+ // //scale lowres in lookahead for slicetype_frame_cost
+ // pixel *src = ref->buffer_lowres[0];
+ // pixel *dst = h->mb.p_weight_buf[0];
+ // int width = ref->i_width_lowres + PADH*2;
+ // int height = ref->i_lines_lowres + PADV*2;
+ // x264_weight_scale_plane( h, dst, ref->i_stride_lowres, src, ref->i_stride_lowres,
+ // width, height, &weights[0] );
+ // fenc->weighted[0] = h->mb.p_weight_buf[0] + PADH + ref->i_stride_lowres * PADV;
+ //}
+}
+
#define NUM_CUS (widthInCU > 2 && heightInCU > 2 ? (widthInCU - 2) * (heightInCU - 2) : widthInCU * heightInCU)
int Lookahead::estimateFrameCost(int p0, int p1, int b, bool bIntraPenalty)
{
int score = 0;
Lowres *fenc = frames[b];
-
- curb = b;
- curp0 = p0;
- curp1 = p1;
+ wpScalingParam wp;
+ wp.bPresentFlag = false;
if (fenc->costEst[b - p0][p1 - b] >= 0 && fenc->rowSatds[b - p0][p1 - b][0] != -1)
score = fenc->costEst[b - p0][p1 - b];
@@ -209,9 +530,23 @@
bDoSearch[0] = b != p0 && fenc->lowresMvs[0][b - p0 - 1][0].x == 0x7FFF;
bDoSearch[1] = b != p1 && fenc->lowresMvs[1][p1 - b - 1][0].x == 0x7FFF;
- if (bDoSearch[0]) fenc->lowresMvs[0][b - p0 - 1][0].x = 0;
+ if (bDoSearch[0])
+ {
+ if( cfg->param.bEnableWeightedPred && b==p1)
+ {
+ wp.bPresentFlag = false;
+ wp.inputWeight = 0;
+ weightsAnalyse(b, p0, 1, &wp);
+ }
+ bDoSearch[0] = b != p0 && fenc->lowresMvs[0][b - p0 - 1][0].x == 0x7FFF;
+ bDoSearch[1] = b != p1 && fenc->lowresMvs[1][p1 - b - 1][0].x == 0x7FFF;
+ fenc->lowresMvs[0][b - p0 - 1][0].x = 0;
+ }
if (bDoSearch[1]) fenc->lowresMvs[1][p1 - b - 1][0].x = 0;
+ curb = b;
+ curp0 = p0;
+ curp1 = p1;
fenc->costEst[b - p0][p1 - b] = 0;
fenc->costEstAq[b - p0][p1 - b] = 0;
// TODO: use lowres MVs as motion candidates in full-res search
@@ -572,6 +907,15 @@
brefs++;
} */
+ ///* Analyse for weighted P frames */
+ //if (/*!h->param.rc.b_stat_read &&*/ frames[bframes]->sliceType == X265_TYPE_P && cfg->param.bEnableWeightedPred
+ // /*&& h->param.analyse.i_weighted_pred >= X264_WEIGHTP_SIMPLE*/)
+ //{
+ // x265_emms();
+ // //x264_weights_analyse(h, h->lookahead->next.list[bframes], h->lookahead->last_nonb, 0);
+ // weightsAnalyse(bframes, 0, 1);
+ //}
+
/* calculate the frame costs ahead of time for x264_rc_analyse_slice while we still have lowres */
if (cfg->param.rc.rateControlMode != X265_RC_CQP)
{
@@ -613,14 +957,6 @@
} */
}
- /* Analyse for weighted P frames
- if (!h->param.rc.b_stat_read && h->lookahead->next.list[bframes]->i_type == X264_TYPE_P
- && h->param.analyse.i_weighted_pred >= X264_WEIGHTP_SIMPLE)
- {
- x265_emms();
- x264_weights_analyse(h, h->lookahead->next.list[bframes], h->lookahead->last_nonb, 0);
- }*/
-
/* dequeue all frames from inputQueue that are about to be enqueued
* in the output queue. The order is important because TComPic can
* only be in one list at a time */
diff -r e7319fd46128 -r 21596a519ba8 source/encoder/slicetype.h
--- a/source/encoder/slicetype.h Tue Nov 12 17:06:03 2013 +0530
+++ b/source/encoder/slicetype.h Tue Nov 12 17:07:27 2013 +0530
@@ -47,11 +47,13 @@
int costIntra; // Estimated Intra cost for all CUs in a row
int costIntraAq; // Estimated weighted Aq Intra cost for all CUs in a row
int intraMbs; // Number of Intra CUs
+ TEncCfg *cfg;
Lowres** frames;
int widthInCU;
int heightInCU;
int merange;
+ Lowres *weightedRef;
LookaheadRow()
{
@@ -82,6 +84,9 @@
int widthInCU; // width of lowres frame in downscale CUs
int heightInCU; // height of lowres frame in downscale CUs
+ Lowres *weightedRef;
+ int numWRefs;
+
PicList inputQueue; // input pictures in order received
PicList outputQueue; // pictures to be encoded, in encode order
@@ -110,6 +115,11 @@
int slicetypePathCost(char *path, int threshold);
void processRow(int row);
+
+ void weightsAnalyse(int b, int p0, int b_lookahead, wpScalingParam *w);
+ unsigned int x265_weight_cost_luma( int b, pixel *src, wpScalingParam *w );
+ pixel* x265_weight_cost_init_luma( int b, int p0, pixel *dest );
+ int x265_weight_slice_header_cost(wpScalingParam *w, int b_chroma );
};
}
More information about the x265-devel
mailing list