[x264-devel] commit: Much faster weightp (Jason Garrett-Glaser)
git version control
git at videolan.org
Fri Nov 27 06:22:46 CET 2009
x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Wed Nov 25 10:40:08 2009 -0800| [d487de421807a0d384b29a0d9dc6c90f180f0e7b] | committer: Jason Garrett-Glaser
Much faster weightp
Move sum/ssd calculation out of lookahead and do it only once per frame.
Also various minor optimizations, cosmetics, and cleanups.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=d487de421807a0d384b29a0d9dc6c90f180f0e7b
---
common/frame.h | 2 +
encoder/analyse.h | 2 +-
encoder/encoder.c | 4 +++
encoder/slicetype.c | 65 +++++++++++++++++++++-----------------------------
4 files changed, 34 insertions(+), 39 deletions(-)
diff --git a/common/frame.h b/common/frame.h
index 6d7cfe9..e255bb6 100644
--- a/common/frame.h
+++ b/common/frame.h
@@ -102,6 +102,8 @@ typedef struct x264_frame
uint16_t *i_inv_qscale_factor;
int b_scenecut; /* Set to zero if the frame cannot possibly be part of a real scenecut. */
float f_weighted_cost_delta[X264_BFRAME_MAX+2];
+ uint32_t i_pixel_sum;
+ uint64_t i_pixel_ssd;
/* vbv */
uint8_t i_planned_type[X264_LOOKAHEAD_MAX+1];
diff --git a/encoder/analyse.h b/encoder/analyse.h
index 8fc2738..0e9ad7d 100644
--- a/encoder/analyse.h
+++ b/encoder/analyse.h
@@ -32,7 +32,7 @@ void x264_slicetype_decide( x264_t *h );
void x264_slicetype_analyse( x264_t *h, int keyframe );
int x264_weighted_reference_duplicate( x264_t *h, int i_ref, const x264_weight_t *w );
-void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead );
+void x264_weight_plane_analyse( x264_t *h, x264_frame_t *frame );
int x264_lookahead_init( x264_t *h, int i_slicetype_length );
int x264_lookahead_is_empty( x264_t *h );
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 06e08fa..fdbaedc 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -1935,7 +1935,11 @@ int x264_encoder_encode( x264_t *h,
fenc->i_frame = h->frames.i_input++;
if( h->frames.b_have_lowres )
+ {
+ if( h->param.analyse.i_weighted_pred )
+ x264_weight_plane_analyse( h, fenc );
x264_frame_init_lowres( h, fenc );
+ }
if( h->param.rc.b_mb_tree && h->param.rc.b_stat_read )
{
diff --git a/encoder/slicetype.c b/encoder/slicetype.c
index 64c47cb..277da79 100644
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -52,7 +52,7 @@ static void x264_lowres_context_init( x264_t *h, x264_mb_analysis_t *a )
}
/* makes a non-h264 weight (i.e. fix7), into an h264 weight */
-static void get_h264_weight( unsigned int weight_nonh264, int offset, x264_weight_t *w )
+static void x264_weight_get_h264( unsigned int weight_nonh264, int offset, x264_weight_t *w )
{
w->i_offset = offset;
w->i_denom = 7;
@@ -65,31 +65,24 @@ static void get_h264_weight( unsigned int weight_nonh264, int offset, x264_weigh
w->i_scale = X264_MIN( w->i_scale, 127 );
}
-static NOINLINE void weights_plane_analyse( x264_t *h, uint8_t *plane, int width, int height, int stride, unsigned int *sum, uint64_t *var )
+void x264_weight_plane_analyse( x264_t *h, x264_frame_t *frame )
{
int x,y;
- uint64_t sad = 0;
+ uint32_t sad = 0;
uint64_t ssd = 0;
- uint8_t *p = plane;
+ uint8_t *p = frame->plane[0];
+ int stride = frame->i_stride[0];
+ int width = frame->i_width[0];
+ int height = frame->i_lines[0];
for( y = 0; y < height>>4; y++, p += stride*16 )
- for( x = 0; x < width; x+=16 )
+ for( x = 0; x < width; x += 16 )
{
uint64_t res = h->pixf.var[PIXEL_16x16]( p + x, stride );
sad += (uint32_t)res;
ssd += res >> 32;
}
-
- *sum = sad;
- *var = ssd - ((uint64_t)sad * sad + width * height / 2) / (width * height);
- x264_emms();
-}
-
-#define LOAD_HPELS_LUMA(dst, src) \
-{ \
- (dst)[0] = &(src)[0][i_pel_offset]; \
- (dst)[1] = &(src)[1][i_pel_offset]; \
- (dst)[2] = &(src)[2][i_pel_offset]; \
- (dst)[3] = &(src)[3][i_pel_offset]; \
+ frame->i_pixel_sum = sad;
+ frame->i_pixel_ssd = ssd - ((uint64_t)sad * sad + width * height / 2) / (width * height);
}
static NOINLINE uint8_t *x264_weight_cost_init_luma( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, uint8_t *dest )
@@ -99,23 +92,20 @@ static NOINLINE uint8_t *x264_weight_cost_init_luma( x264_t *h, x264_frame_t *fe
* motion search has been done. */
if( fenc->lowres_mvs[0][ref0_distance][0][0] != 0x7FFF )
{
- uint8_t *src[4];
int i_stride = fenc->i_stride_lowres;
int i_lines = fenc->i_lines_lowres;
int i_width = fenc->i_width_lowres;
int i_mb_xy = 0;
int x,y;
- int i_pel_offset = 0;
+ uint8_t *p = dest;
- for( y = 0; y < i_lines; y += 8, i_pel_offset = y*i_stride )
- for( x = 0; x < i_width; x += 8, i_mb_xy++, i_pel_offset += 8 )
+ for( y = 0; y < i_lines; y += 8, p += i_stride*8 )
+ for( x = 0; x < i_width; x += 8, i_mb_xy++ )
{
- uint8_t *pix = &dest[ i_pel_offset ];
int mvx = fenc->lowres_mvs[0][ref0_distance][i_mb_xy][0];
int mvy = fenc->lowres_mvs[0][ref0_distance][i_mb_xy][1];
- LOAD_HPELS_LUMA( src, ref->lowres );
- h->mc.mc_luma( pix, i_stride, src, i_stride,
- mvx, mvy, 8, 8, weight_none );
+ h->mc.mc_luma( p+x, i_stride, ref->lowres, i_stride,
+ mvx+(x<<2), mvy+(y<<2), 8, 8, weight_none );
}
x264_emms();
return dest;
@@ -123,7 +113,6 @@ static NOINLINE uint8_t *x264_weight_cost_init_luma( x264_t *h, x264_frame_t *fe
x264_emms();
return ref->lowres[0];
}
-#undef LOAD_HPELS_LUMA
static NOINLINE unsigned int x264_weight_cost( x264_t *h, x264_frame_t *fenc, uint8_t *src, x264_weight_t *w )
{
@@ -168,9 +157,7 @@ static NOINLINE unsigned int x264_weight_cost( x264_t *h, x264_frame_t *fenc, ui
void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead )
{
- unsigned int fenc_sum, ref_sum;
- float fenc_mean, ref_mean;
- uint64_t fenc_var, ref_var;
+ float fenc_mean, ref_mean, fenc_var, ref_var;
int i_off, offset_search;
int minoff, minscale, mindenom;
unsigned int minscore, origscore;
@@ -181,22 +168,20 @@ void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int
int found;
x264_weight_t *weights = fenc->weight[0];
- weights_plane_analyse( h, fenc->plane[0], fenc->i_width[0], fenc->i_lines[0], fenc->i_stride[0], &fenc_sum, &fenc_var );
- weights_plane_analyse( h, ref->plane[0], ref->i_width[0], ref->i_lines[0], ref->i_stride[0], &ref_sum, &ref_var );
- fenc_var = round( sqrt( fenc_var ) );
- ref_var = round( sqrt( ref_var ) );
- fenc_mean = (float)fenc_sum / (fenc->i_lines[0] * fenc->i_width[0]);
- ref_mean = (float) ref_sum / (fenc->i_lines[0] * fenc->i_width[0]);
+ fenc_var = round( sqrt( fenc->i_pixel_ssd ) );
+ ref_var = round( sqrt( ref->i_pixel_ssd ) );
+ fenc_mean = (float)fenc->i_pixel_sum / (fenc->i_lines[0] * fenc->i_width[0]);
+ ref_mean = (float) ref->i_pixel_sum / (fenc->i_lines[0] * fenc->i_width[0]);
//early termination
- if( fabs( ref_mean - fenc_mean ) < 0.5 && fabsf( 1 - (float)fenc_var / ref_var ) < epsilon )
+ if( fabs( ref_mean - fenc_mean ) < 0.5 && fabs( 1 - fenc_var / ref_var ) < epsilon )
{
SET_WEIGHT( weights[0], 0, 1, 0, 0 );
return;
}
- guess_scale = ref_var ? (float)fenc_var/ref_var : 0;
- get_h264_weight( round( guess_scale * 128 ), 0, &weights[0] );
+ guess_scale = ref_var ? fenc_var/ref_var : 0;
+ x264_weight_get_h264( round( guess_scale * 128 ), 0, &weights[0] );
found = 0;
mindenom = weights[0].i_denom;
@@ -527,6 +512,7 @@ static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
if( ( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART
|| h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE ) && b == p1 )
{
+ x264_emms();
x264_weights_analyse( h, frames[b], frames[p0], 1 );
w = frames[b]->weight[0];
}
@@ -1276,7 +1262,10 @@ void x264_slicetype_decide( x264_t *h )
/* Analyse for weighted P frames */
if( !h->param.rc.b_stat_read && h->lookahead->next.list[bframes]->i_type == X264_TYPE_P
&& h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
+ {
+ x264_emms();
x264_weights_analyse( h, h->lookahead->next.list[bframes], h->lookahead->last_nonb, 0 );
+ }
/* shift sequence to coded order.
use a small temporary list to avoid shifting the entire next buffer around */
More information about the x264-devel mailing list