[x264-devel] commit: Align lowres planes for improved cacheline split performance (Jason Garrett-Glaser)
git version control
git at videolan.org
Thu Jul 24 15:58:45 CEST 2008
x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Thu Jul 17 07:55:24 2008 -0600| [aa0e6277769b3fedb77d1cb8efa43f9c957646c0]
Align lowres planes for improved cacheline split performance
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=aa0e6277769b3fedb77d1cb8efa43f9c957646c0
---
common/frame.c | 33 ++++++++++++++-------------------
common/frame.h | 4 ++--
2 files changed, 16 insertions(+), 21 deletions(-)
diff --git a/common/frame.c b/common/frame.c
index dd77c89..1d5ef24 100644
--- a/common/frame.c
+++ b/common/frame.c
@@ -23,6 +23,8 @@
#include "common.h"
+#define ALIGN(x,a) (((x)+((a)-1))&~((a)-1))
+
x264_frame_t *x264_frame_new( x264_t *h )
{
x264_frame_t *frame = x264_malloc( sizeof(x264_frame_t) );
@@ -32,22 +34,16 @@ x264_frame_t *x264_frame_new( x264_t *h )
int i_stride, i_width, i_lines;
int i_padv = PADV << h->param.b_interlaced;
int luma_plane_size;
+ int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16;
if( !frame ) return NULL;
memset( frame, 0, sizeof(x264_frame_t) );
/* allocate frame data (+64 for extra data for me) */
- i_width = ( ( h->param.i_width + 15 ) & -16 );
- i_stride = i_width + 2*PADH;
- i_lines = ( ( h->param.i_height + 15 ) & -16 );
- if( h->param.b_interlaced )
- i_lines = ( i_lines + 31 ) & -32;
-
- if( h->param.cpu&X264_CPU_CACHELINE_64 )
- i_stride = (i_stride + 63) & ~63;
- else if( h->param.cpu&X264_CPU_CACHELINE_32 )
- i_stride = (i_stride + 31) & ~31;
+ i_width = ALIGN( h->param.i_width, 16 );
+ i_stride = ALIGN( i_width + 2*PADH, align );
+ i_lines = ALIGN( h->param.i_height, 16<<h->param.b_interlaced );
frame->i_plane = 3;
for( i = 0; i < 3; i++ )
@@ -61,27 +57,26 @@ x264_frame_t *x264_frame_new( x264_t *h )
for( i = 1; i < 3; i++ )
{
CHECKED_MALLOC( frame->buffer[i], luma_plane_size/4 );
- frame->plane[i] = (uint8_t*)frame->buffer[i] + (frame->i_stride[i] * i_padv + PADH)/2;
+ frame->plane[i] = frame->buffer[i] + (frame->i_stride[i] * i_padv + PADH)/2;
}
/* all 4 luma planes allocated together, since the cacheline split code
* requires them to be in-phase wrt cacheline alignment. */
CHECKED_MALLOC( frame->buffer[0], 4*luma_plane_size);
for( i = 0; i < 4; i++ )
- frame->filtered[i] = (uint8_t*)frame->buffer[0] + i*luma_plane_size + frame->i_stride[0] * i_padv + PADH;
+ frame->filtered[i] = frame->buffer[0] + i*luma_plane_size + frame->i_stride[0] * i_padv + PADH;
frame->plane[0] = frame->filtered[0];
if( h->frames.b_have_lowres )
{
frame->i_width_lowres = frame->i_width[0]/2;
- frame->i_stride_lowres = (frame->i_width_lowres + 2*PADH + 15) & ~15;
+ frame->i_stride_lowres = ALIGN( frame->i_width_lowres + 2*PADH, align );
frame->i_lines_lowres = frame->i_lines[0]/2;
+
+ luma_plane_size = frame->i_stride_lowres * ( frame->i_lines[0]/2 + 2*i_padv );
+
+ CHECKED_MALLOC( frame->buffer_lowres[0], 4 * luma_plane_size );
for( i = 0; i < 4; i++ )
- {
- CHECKED_MALLOC( frame->buffer_lowres[i],
- frame->i_stride_lowres * ( frame->i_lines[0]/2 + 2*i_padv ) );
- frame->lowres[i] = ((uint8_t*)frame->buffer_lowres[i]) +
- frame->i_stride_lowres * i_padv + PADH;
- }
+ frame->lowres[i] = frame->buffer_lowres[0] + (frame->i_stride_lowres * i_padv + PADH) + i * luma_plane_size;
}
if( h->param.analyse.i_me_method >= X264_ME_ESA )
diff --git a/common/frame.h b/common/frame.h
index 6da740a..6a0c928 100644
--- a/common/frame.h
+++ b/common/frame.h
@@ -56,8 +56,8 @@ typedef struct
/* for unrestricted mv we allocate more data than needed
* allocated data are stored in buffer */
- void *buffer[4];
- void *buffer_lowres[4];
+ uint8_t *buffer[4];
+ uint8_t *buffer_lowres[4];
/* motion data */
int8_t *mb_type;
More information about the x264-devel mailing list