[x264-devel] commit: Prevent some cases of cache aliasing. (Loren Merritt)
git at videolan.org
git at videolan.org
Wed Jul 21 20:26:53 CEST 2010
x264 | branch: stable | Loren Merritt <pengvado at akuvian.org> | Thu Jul 15 23:49:03 2010 -0700| [87c0f8fa66c1d7fb7f25e1991341fd7fe78b1bcd] | committer: Jason Garrett-Glaser
Prevent some cases of cache aliasing.
Avoid cases where image strides were a large power of 2.
Core 2: +3% speed at widths 898..960, +6% at widths 1922..1984, most other resolutions unaffected.
Nehalem and AMD: similar amount of speedup, but fewer resolutions affected.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=87c0f8fa66c1d7fb7f25e1991341fd7fe78b1bcd
---
common/frame.c | 35 +++++++++++++++++++++++++----------
common/macroblock.c | 9 +++------
encoder/encoder.c | 6 +++---
3 files changed, 31 insertions(+), 19 deletions(-)
diff --git a/common/frame.c b/common/frame.c
index b144e48..03182a5 100644
--- a/common/frame.c
+++ b/common/frame.c
@@ -24,6 +24,21 @@
#include "common.h"
+static int align_stride( int x, int align, int disalign )
+{
+ x = ALIGN( x, align );
+ if( !(x&(disalign-1)) )
+ x += align;
+ return x;
+}
+
+static int align_plane_size( int x, int disalign )
+{
+ if( !(x&(disalign-1)) )
+ x += 128;
+ return x;
+}
+
x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
{
x264_frame_t *frame;
@@ -31,25 +46,29 @@ x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
int i_mb_count = h->mb.i_mb_count;
int i_stride, i_width, i_lines;
int i_padv = PADV << h->param.b_interlaced;
- int luma_plane_size;
- int chroma_plane_size;
+ int luma_plane_size, chroma_plane_size;
int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16;
+ int disalign = h->param.cpu&X264_CPU_ALTIVEC ? 1<<9 : 1<<10;
CHECKED_MALLOCZERO( frame, sizeof(x264_frame_t) );
/* allocate frame data (+64 for extra data for me) */
i_width = h->mb.i_mb_width*16;
- i_stride = ALIGN( i_width + 2*PADH, align );
i_lines = h->mb.i_mb_height*16;
+ i_stride = align_stride( i_width + 2*PADH, align, disalign );
frame->i_plane = 2;
for( int i = 0; i < 2; i++ )
{
- frame->i_stride[i] = ALIGN( i_stride, align );
frame->i_width[i] = i_width >> i;
frame->i_lines[i] = i_lines >> i;
+ frame->i_stride[i] = i_stride;
}
+ frame->i_width_lowres = frame->i_width[0]/2;
+ frame->i_lines_lowres = frame->i_lines[0]/2;
+ frame->i_stride_lowres = align_stride( frame->i_width_lowres + 2*PADH, align, disalign<<1 );
+
for( int i = 0; i < h->param.i_bframe + 2; i++ )
for( int j = 0; j < h->param.i_bframe + 2; j++ )
CHECKED_MALLOC( frame->i_row_satds[i][j], i_lines/16 * sizeof(int) );
@@ -73,7 +92,7 @@ x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
frame->orig = frame;
- luma_plane_size = (frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv));
+ luma_plane_size = align_plane_size( frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv), disalign );
chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + i_padv));
CHECKED_MALLOC( frame->buffer[1], chroma_plane_size * sizeof(pixel) );
@@ -128,11 +147,7 @@ x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
{
if( h->frames.b_have_lowres )
{
- frame->i_width_lowres = frame->i_width[0]/2;
- frame->i_stride_lowres = ALIGN( frame->i_width_lowres + 2*PADH, align );
- frame->i_lines_lowres = frame->i_lines[0]/2;
-
- luma_plane_size = frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*PADV);
+ luma_plane_size = align_plane_size( frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*PADV), disalign );
CHECKED_MALLOC( frame->buffer_lowres[0], 4 * luma_plane_size * sizeof(pixel) );
for( int i = 0; i < 4; i++ )
diff --git a/common/macroblock.c b/common/macroblock.c
index 92c16c5..5899b15 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -250,8 +250,7 @@ int x264_macroblock_cache_allocate( x264_t *h )
if( h->param.analyse.i_weighted_pred )
{
int i_padv = PADV << h->param.b_interlaced;
- int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16;
- int i_stride, luma_plane_size = 0;
+ int luma_plane_size = 0;
int numweightbuf;
if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE )
@@ -260,8 +259,7 @@ int x264_macroblock_cache_allocate( x264_t *h )
if( !h->param.i_sync_lookahead || h == h->thread[h->param.i_threads] )
{
// Fake analysis only works on lowres
- i_stride = ALIGN( h->mb.i_mb_width*8 + 2*PADH, align );
- luma_plane_size = i_stride * (h->mb.i_mb_height*8+2*i_padv);
+ luma_plane_size = h->fdec->i_stride_lowres * (h->mb.i_mb_height*8+2*i_padv);
// Only need 1 buffer for analysis
numweightbuf = 1;
}
@@ -270,8 +268,7 @@ int x264_macroblock_cache_allocate( x264_t *h )
}
else
{
- i_stride = ALIGN( h->mb.i_mb_width*16 + 2*PADH, align );
- luma_plane_size = i_stride * (h->mb.i_mb_height*16+2*i_padv);
+ luma_plane_size = h->fdec->i_stride[0] * (h->mb.i_mb_height*16+2*i_padv);
if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
//SMART can weight one ref and one offset -1
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 641cf89..701a590 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -1096,9 +1096,6 @@ x264_t *x264_encoder_open( x264_param_t *param )
for( int i = 1; i < h->param.i_threads + !!h->param.i_sync_lookahead; i++ )
CHECKED_MALLOC( h->thread[i], sizeof(x264_t) );
- if( x264_lookahead_init( h, i_slicetype_length ) )
- goto fail;
-
for( int i = 0; i < h->param.i_threads; i++ )
{
int init_nal_count = h->param.i_slice_count + 3;
@@ -1124,6 +1121,9 @@ x264_t *x264_encoder_open( x264_param_t *param )
goto fail;
}
+ if( x264_lookahead_init( h, i_slicetype_length ) )
+ goto fail;
+
for( int i = 0; i < h->param.i_threads; i++ )
if( x264_macroblock_thread_allocate( h->thread[i], 0 ) < 0 )
goto fail;
More information about the x264-devel mailing list