[x264-devel] commit: Prevent some cases of cache aliasing. (Loren Merritt)

Wed Jul 21 20:26:53 CEST 2010

x264 | branch: stable | Loren Merritt <pengvado at akuvian.org> | Thu Jul 15 23:49:03 2010 -0700| [87c0f8fa66c1d7fb7f25e1991341fd7fe78b1bcd] | committer: Jason Garrett-Glaser 

Prevent some cases of cache aliasing.
Avoid cases where image strides were a multiple of a large power of 2.
Core 2: +3% speed at widths 898..960, +6% at widths 1922..1984, most other resolutions unaffected.
Nehalem and AMD: similar amount of speedup, but fewer resolutions affected.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=87c0f8fa66c1d7fb7f25e1991341fd7fe78b1bcd
---

 common/frame.c      |   35 +++++++++++++++++++++++++----------
 common/macroblock.c |    9 +++------
 encoder/encoder.c   |    6 +++---
 3 files changed, 31 insertions(+), 19 deletions(-)

diff --git a/common/frame.c b/common/frame.c
index b144e48..03182a5 100644
--- a/common/frame.c
+++ b/common/frame.c
@@ -24,6 +24,21 @@
 
 #include "common.h"
 
+static int align_stride( int x, int align, int disalign )
+{
+    x = ALIGN( x, align );
+    if( !(x&(disalign-1)) )
+        x += align;
+    return x;
+}
+
+static int align_plane_size( int x, int disalign )
+{
+    if( !(x&(disalign-1)) )
+        x += 128;
+    return x;
+}
+
 x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
 {
     x264_frame_t *frame;
@@ -31,25 +46,29 @@ x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
     int i_mb_count = h->mb.i_mb_count;
     int i_stride, i_width, i_lines;
     int i_padv = PADV << h->param.b_interlaced;
-    int luma_plane_size;
-    int chroma_plane_size;
+    int luma_plane_size, chroma_plane_size;
     int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16;
+    int disalign = h->param.cpu&X264_CPU_ALTIVEC ? 1<<9 : 1<<10;
 
     CHECKED_MALLOCZERO( frame, sizeof(x264_frame_t) );
 
     /* allocate frame data (+64 for extra data for me) */
     i_width  = h->mb.i_mb_width*16;
-    i_stride = ALIGN( i_width + 2*PADH, align );
     i_lines  = h->mb.i_mb_height*16;
+    i_stride = align_stride( i_width + 2*PADH, align, disalign );
 
     frame->i_plane = 2;
     for( int i = 0; i < 2; i++ )
     {
-        frame->i_stride[i] = ALIGN( i_stride, align );
         frame->i_width[i] = i_width >> i;
         frame->i_lines[i] = i_lines >> i;
+        frame->i_stride[i] = i_stride;
     }
 
+    frame->i_width_lowres = frame->i_width[0]/2;
+    frame->i_lines_lowres = frame->i_lines[0]/2;
+    frame->i_stride_lowres = align_stride( frame->i_width_lowres + 2*PADH, align, disalign<<1 );
+
     for( int i = 0; i < h->param.i_bframe + 2; i++ )
         for( int j = 0; j < h->param.i_bframe + 2; j++ )
             CHECKED_MALLOC( frame->i_row_satds[i][j], i_lines/16 * sizeof(int) );
@@ -73,7 +92,7 @@ x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
 
     frame->orig = frame;
 
-    luma_plane_size = (frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv));
+    luma_plane_size = align_plane_size( frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv), disalign );
     chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + i_padv));
 
     CHECKED_MALLOC( frame->buffer[1], chroma_plane_size * sizeof(pixel) );
@@ -128,11 +147,7 @@ x264_frame_t *x264_frame_new( x264_t *h, int b_fdec )
     {
         if( h->frames.b_have_lowres )
         {
-            frame->i_width_lowres = frame->i_width[0]/2;
-            frame->i_stride_lowres = ALIGN( frame->i_width_lowres + 2*PADH, align );
-            frame->i_lines_lowres = frame->i_lines[0]/2;
-
-            luma_plane_size = frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*PADV);
+            luma_plane_size = align_plane_size( frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*PADV), disalign );
 
             CHECKED_MALLOC( frame->buffer_lowres[0], 4 * luma_plane_size * sizeof(pixel) );
             for( int i = 0; i < 4; i++ )
diff --git a/common/macroblock.c b/common/macroblock.c
index 92c16c5..5899b15 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -250,8 +250,7 @@ int x264_macroblock_cache_allocate( x264_t *h )
     if( h->param.analyse.i_weighted_pred )
     {
         int i_padv = PADV << h->param.b_interlaced;
-        int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16;
-        int i_stride, luma_plane_size = 0;
+        int luma_plane_size = 0;
         int numweightbuf;
 
         if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_FAKE )
@@ -260,8 +259,7 @@ int x264_macroblock_cache_allocate( x264_t *h )
             if( !h->param.i_sync_lookahead || h == h->thread[h->param.i_threads] )
             {
                 // Fake analysis only works on lowres
-                i_stride = ALIGN( h->mb.i_mb_width*8 + 2*PADH, align );
-                luma_plane_size = i_stride * (h->mb.i_mb_height*8+2*i_padv);
+                luma_plane_size = h->fdec->i_stride_lowres * (h->mb.i_mb_height*8+2*i_padv);
                 // Only need 1 buffer for analysis
                 numweightbuf = 1;
             }
@@ -270,8 +268,7 @@ int x264_macroblock_cache_allocate( x264_t *h )
         }
         else
         {
-            i_stride = ALIGN( h->mb.i_mb_width*16 + 2*PADH, align );
-            luma_plane_size = i_stride * (h->mb.i_mb_height*16+2*i_padv);
+            luma_plane_size = h->fdec->i_stride[0] * (h->mb.i_mb_height*16+2*i_padv);
 
             if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
                 //SMART can weight one ref and one offset -1
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 641cf89..701a590 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -1096,9 +1096,6 @@ x264_t *x264_encoder_open( x264_param_t *param )
     for( int i = 1; i < h->param.i_threads + !!h->param.i_sync_lookahead; i++ )
         CHECKED_MALLOC( h->thread[i], sizeof(x264_t) );
 
-    if( x264_lookahead_init( h, i_slicetype_length ) )
-        goto fail;
-
     for( int i = 0; i < h->param.i_threads; i++ )
     {
         int init_nal_count = h->param.i_slice_count + 3;
@@ -1124,6 +1121,9 @@ x264_t *x264_encoder_open( x264_param_t *param )
             goto fail;
     }
 
+    if( x264_lookahead_init( h, i_slicetype_length ) )
+        goto fail;
+
     for( int i = 0; i < h->param.i_threads; i++ )
         if( x264_macroblock_thread_allocate( h->thread[i], 0 ) < 0 )
             goto fail;