[x264-devel] commit: Prefetch MB data in cache_load (Jason Garrett-Glaser)
git at videolan.org
Sat Apr 24 00:40:00 CEST 2010
x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Wed Apr 14 14:43:25 2010 -0700| [a7ec5e2352c516f8c14bdd79ea9d1522d3ddae16] | committer: Jason Garrett-Glaser
Prefetch MB data in cache_load
Dramatically reduces L1 cache misses.
~10% faster cache_load.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=a7ec5e2352c516f8c14bdd79ea9d1522d3ddae16
---
common/macroblock.c | 33 ++++++++++++++++++++++++++++-----
common/osdep.h | 16 ++++++++++++++++
2 files changed, 44 insertions(+), 5 deletions(-)
diff --git a/common/macroblock.c b/common/macroblock.c
index 56ad4ce..f1e24a0 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -941,6 +941,7 @@ static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x
static void inline x264_macroblock_cache_load_neighbours( x264_t *h, int mb_x, int mb_y )
{
int top = (mb_y - (1 << h->mb.b_interlaced)) * h->mb.i_mb_stride + mb_x;
+
h->mb.i_mb_x = mb_x;
h->mb.i_mb_y = mb_y;
h->mb.i_mb_xy = mb_y * h->mb.i_mb_stride + mb_x;
@@ -986,6 +987,16 @@ static void inline x264_macroblock_cache_load_neighbours( x264_t *h, int mb_x, i
if( !h->param.b_constrained_intra || IS_INTRA( h->mb.i_mb_type_top ) )
h->mb.i_neighbour_intra |= MB_TOP;
+
+ /* We only need to prefetch the top blocks because the left was just written
+ * to as part of the previous cache_save. Since most target CPUs use write-allocate
+ * caches, left blocks are near-guaranteed to be in L1 cache. Top--not so much. */
+ x264_prefetch( &h->mb.cbp[top] );
+ x264_prefetch( h->mb.intra4x4_pred_mode[top] );
+ x264_prefetch( &h->mb.non_zero_count[top][12] );
+ /* These aren't always allocated, but prefetching an invalid address can't hurt. */
+ x264_prefetch( &h->mb.mb_transform_size[top] );
+ x264_prefetch( &h->mb.skipbp[top] );
}
}
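
To make the write-allocate reasoning in the comment above a bit more concrete, here is a rough stand-alone sketch of the access pattern. The struct layout and names are illustrative only, not x264's actual data structures (x264 keeps cbp, intra4x4_pred_mode, non_zero_count etc. in separate per-frame arrays): in raster order the left neighbour's bookkeeping was just written by cache_save() for the previous macroblock, so a write-allocate cache still holds it in L1, while the top neighbour's data was last touched a full macroblock row earlier and, together with everything else touched in between, has usually been evicted by the time the row below needs it.

#include <stdint.h>

/* Illustrative per-MB bookkeeping, not x264's real layout. */
typedef struct {
    int16_t cbp;
    int8_t  pred_mode[8];
    uint8_t nnz[48];
} mb_info_t;

static inline void prefetch( const void *p )
{
#ifdef __GNUC__
    __builtin_prefetch( p );
#else
    (void)p;
#endif
}

void cache_load_neighbours( mb_info_t *mb_info, int mb_stride, int mb_x, int mb_y )
{
    int left = mb_y * mb_stride + mb_x - 1;   /* just written by cache_save(): still hot in L1 */
    int top  = (mb_y - 1) * mb_stride + mb_x; /* written a whole MB row ago: almost certainly cold */

    prefetch( &mb_info[top] );   /* start the load early and hide the L1 miss behind other work */

    /* ... the actual neighbour loads follow later, by which point the
     *     prefetched line has hopefully arrived ... */
    (void)left;
}
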
@@ -1025,6 +1036,9 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
int left = h->mb.i_mb_left_xy;
int top = h->mb.i_mb_top_xy;
+ int top_y = mb_y - (1 << h->mb.b_interlaced);
+ int top_8x8 = (2*top_y+1) * h->mb.i_b8_stride + 2*mb_x;
+ int top_4x4 = (4*top_y+3) * h->mb.i_b4_stride + 4*mb_x;
/* GCC pessimizes direct loads from heap-allocated arrays due to aliasing. */
/* By only dereferencing them once, we avoid this issue. */
@@ -1079,6 +1093,18 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
h->mb.cache.non_zero_count[x264_scan8[16+4+0] - 1] = nnz[left][16+4+1];
h->mb.cache.non_zero_count[x264_scan8[16+4+2] - 1] = nnz[left][16+4+3];
+
+ /* Finish the prefetching */
+ if( h->sh.i_type != SLICE_TYPE_I )
+ for( int l = 0; l < (h->sh.i_type == SLICE_TYPE_B) + 1; l++ )
+ {
+ x264_prefetch( &h->mb.mv[l][top_4x4-1] );
+ /* Top right being not in the same cacheline as top left will happen
+ * once every 4 MBs, so one extra prefetch is worthwhile */
+ x264_prefetch( &h->mb.mv[l][top_4x4+4] );
+ x264_prefetch( &h->mb.ref[l][top_8x8-1] );
+ x264_prefetch( &h->mb.mvd[l][top] );
+ }
}
else
{
@@ -1143,11 +1169,8 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
/* load ref/mv/mvd */
if( h->sh.i_type != SLICE_TYPE_I )
{
- const int s8x8 = h->mb.i_b8_stride;
- const int s4x4 = h->mb.i_b4_stride;
- const int top_y = mb_y - (1 << h->mb.b_interlaced);
- const int top_8x8 = (2*top_y+1) * h->mb.i_b8_stride + 2*mb_x;
- const int top_4x4 = (4*top_y+3) * h->mb.i_b4_stride + 4*mb_x;
+ int s8x8 = h->mb.i_b8_stride;
+ int s4x4 = h->mb.i_b4_stride;
for( int l = 0; l < (h->sh.i_type == SLICE_TYPE_B) + 1; l++ )
{
diff --git a/common/osdep.h b/common/osdep.h
index f97547f..4f49d30 100644
--- a/common/osdep.h
+++ b/common/osdep.h
@@ -251,6 +251,22 @@ static int ALWAYS_INLINE x264_ctz( uint32_t x )
}
#endif
+#if defined(__GNUC__) && defined(HAVE_MMX)
+/* Don't use __builtin_prefetch; even as recent as 4.3.4, GCC seems incapable of
+ * using complex address modes properly unless we use inline asm. */
+static ALWAYS_INLINE void x264_prefetch( void *p )
+{
+ asm volatile( "prefetcht0 %0"::"m"(*(uint8_t*)p) );
+}
+/* We require that prefetch not fault on invalid reads, so we only enable it on
+ * known architectures. */
+#elif defined(__GNUC__) && (__GNUC__ > 3 || __GNUC__ == 3 && __GNUC_MINOR__ > 1) &&\
+ (defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_ARM) || defined(ARCH_PPC))
+#define x264_prefetch(x) __builtin_prefetch(x)
+#else
+#define x264_prefetch(x)
+#endif
+
#ifdef USE_REAL_PTHREAD
#ifdef SYS_MINGW
#define x264_lower_thread_priority(p)\
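
For anyone who wants to try the osdep.h change outside of x264, here is a stand-alone version of the same idea; the function name is made up, and only the x86 inline-asm path mirrors the committed code. The "m" constraint hands the whole addressing expression to the assembler, so GCC can fold base + index*scale + displacement straight into the prefetcht0 memory operand instead of first computing the address into a register, which is the __builtin_prefetch behaviour the comment above complains about.

#include <stdint.h>

static inline void my_prefetch( const void *p )   /* hypothetical name */
{
#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
    /* Mirrors the committed x264_prefetch(): prefetcht0 is only a hint and
     * does not fault even on an unmapped address, so it is safe to issue
     * speculatively for data that may not be allocated. */
    __asm__ volatile( "prefetcht0 %0" :: "m"(*(const uint8_t *)p) );
#elif defined(__GNUC__)
    __builtin_prefetch( p );
#else
    (void)p;   /* no-op where we don't know the compiler or architecture */
#endif
}

void prefetch_row( const uint8_t *buf, intptr_t stride, int row, int col )
{
    /* The whole buf + row*stride + col computation can end up inside the
     * prefetch instruction's memory operand on x86. */
    my_prefetch( &buf[row * stride + col] );
}
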