[x264-devel] commit: Add CP128/M128 macros using SSE (Jason Garrett-Glaser)
git at videolan.org
git at videolan.org
Sat Apr 24 00:40:00 CEST 2010
x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Fri Apr 23 19:09:18 2010 +0000| [674926f10f65c28f9a32045be2fd89ee7bc5d8ec] | committer: Jason Garrett-Glaser
Add CP128/M128 macros using SSE
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=674926f10f65c28f9a32045be2fd89ee7bc5d8ec
---
common/common.h | 5 ++++
common/macroblock.c | 53 ++++++++++++++++++++++++--------------------------
common/x86/util.h | 8 +++++++
3 files changed, 38 insertions(+), 28 deletions(-)
diff --git a/common/common.h b/common/common.h
index b8c6dfd..2fc453d 100644
--- a/common/common.h
+++ b/common/common.h
@@ -88,12 +88,17 @@ do {\
typedef union { uint16_t i; uint8_t c[2]; } MAY_ALIAS x264_union16_t;
typedef union { uint32_t i; uint16_t b[2]; uint8_t c[4]; } MAY_ALIAS x264_union32_t;
typedef union { uint64_t i; uint32_t a[2]; uint16_t b[4]; uint8_t c[8]; } MAY_ALIAS x264_union64_t;
+typedef struct { uint64_t i[2]; } x264_uint128_t;
+typedef union { x264_uint128_t i; uint64_t a[2]; uint32_t b[4]; uint16_t c[8]; uint8_t d[16]; } MAY_ALIAS x264_union128_t;
#define M16(src) (((x264_union16_t*)(src))->i)
#define M32(src) (((x264_union32_t*)(src))->i)
#define M64(src) (((x264_union64_t*)(src))->i)
+#define M128(src) (((x264_union128_t*)(src))->i)
+#define M128_ZERO ((x264_uint128_t){{0,0}})
#define CP16(dst,src) M16(dst) = M16(src)
#define CP32(dst,src) M32(dst) = M32(src)
#define CP64(dst,src) M64(dst) = M64(src)
+#define CP128(dst,src) M128(dst) = M128(src)
#include "x264.h"
#include "bs.h"
diff --git a/common/macroblock.c b/common/macroblock.c
index 0b9b903..047558e 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -1165,13 +1165,11 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
h->mb.cache.ref[l][i8+1] = h->mb.ref[l][top_8x8 + 0];
h->mb.cache.ref[l][i8+2] =
h->mb.cache.ref[l][i8+3] = h->mb.ref[l][top_8x8 + 1];
- CP64( h->mb.cache.mv[l][i8+0], h->mb.mv[l][top_4x4+0] );
- CP64( h->mb.cache.mv[l][i8+2], h->mb.mv[l][top_4x4+2] );
+ CP128( h->mb.cache.mv[l][i8], h->mb.mv[l][top_4x4] );
}
else
{
- M64( h->mb.cache.mv[l][i8+0] ) = 0;
- M64( h->mb.cache.mv[l][i8+2] ) = 0;
+ M128( h->mb.cache.mv[l][i8] ) = M128_ZERO;
M32( &h->mb.cache.ref[l][i8] ) = (uint8_t)(-2) * 0x01010101U;
}
@@ -1355,35 +1353,38 @@ void x264_macroblock_cache_save( x264_t *h )
h->mb.ref[0][i_mb_8x8+1+0*s8x8] = h->mb.cache.ref[0][x264_scan8[4]];
h->mb.ref[0][i_mb_8x8+0+1*s8x8] = h->mb.cache.ref[0][x264_scan8[8]];
h->mb.ref[0][i_mb_8x8+1+1*s8x8] = h->mb.cache.ref[0][x264_scan8[12]];
- for( int y = 0; y < 4; y++ )
- {
- CP64( h->mb.mv[0][i_mb_4x4+y*s4x4+0], h->mb.cache.mv[0][x264_scan8[0]+8*y+0] );
- CP64( h->mb.mv[0][i_mb_4x4+y*s4x4+2], h->mb.cache.mv[0][x264_scan8[0]+8*y+2] );
- }
+ CP128( h->mb.mv[0][i_mb_4x4+0*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*0] );
+ CP128( h->mb.mv[0][i_mb_4x4+1*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*1] );
+ CP128( h->mb.mv[0][i_mb_4x4+2*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*2] );
+ CP128( h->mb.mv[0][i_mb_4x4+3*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*3] );
if( h->sh.i_type == SLICE_TYPE_B )
{
h->mb.ref[1][i_mb_8x8+0+0*s8x8] = h->mb.cache.ref[1][x264_scan8[0]];
h->mb.ref[1][i_mb_8x8+1+0*s8x8] = h->mb.cache.ref[1][x264_scan8[4]];
h->mb.ref[1][i_mb_8x8+0+1*s8x8] = h->mb.cache.ref[1][x264_scan8[8]];
h->mb.ref[1][i_mb_8x8+1+1*s8x8] = h->mb.cache.ref[1][x264_scan8[12]];
- for( int y = 0; y < 4; y++ )
- {
- CP64( h->mb.mv[1][i_mb_4x4+y*s4x4+0], h->mb.cache.mv[1][x264_scan8[0]+8*y+0] );
- CP64( h->mb.mv[1][i_mb_4x4+y*s4x4+2], h->mb.cache.mv[1][x264_scan8[0]+8*y+2] );
- }
+ CP128( h->mb.mv[1][i_mb_4x4+0*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*0] );
+ CP128( h->mb.mv[1][i_mb_4x4+1*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*1] );
+ CP128( h->mb.mv[1][i_mb_4x4+2*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*2] );
+ CP128( h->mb.mv[1][i_mb_4x4+3*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*3] );
}
}
else
{
- for( int i_list = 0; i_list < (h->sh.i_type == SLICE_TYPE_B ? 2 : 1 ); i_list++ )
+ M16( &h->mb.ref[0][i_mb_8x8+0*s8x8] ) = (uint8_t)(-1) * 0x0101;
+ M16( &h->mb.ref[0][i_mb_8x8+1*s8x8] ) = (uint8_t)(-1) * 0x0101;
+ M128( h->mb.mv[0][i_mb_4x4+0*s4x4] ) = M128_ZERO;
+ M128( h->mb.mv[0][i_mb_4x4+1*s4x4] ) = M128_ZERO;
+ M128( h->mb.mv[0][i_mb_4x4+2*s4x4] ) = M128_ZERO;
+ M128( h->mb.mv[0][i_mb_4x4+3*s4x4] ) = M128_ZERO;
+ if( h->sh.i_type == SLICE_TYPE_B )
{
- M16( &h->mb.ref[i_list][i_mb_8x8+0*s8x8] ) = (uint8_t)(-1) * 0x0101;
- M16( &h->mb.ref[i_list][i_mb_8x8+1*s8x8] ) = (uint8_t)(-1) * 0x0101;
- for( int y = 0; y < 4; y++ )
- {
- M64( h->mb.mv[i_list][i_mb_4x4+y*s4x4+0] ) = 0;
- M64( h->mb.mv[i_list][i_mb_4x4+y*s4x4+2] ) = 0;
- }
+ M16( &h->mb.ref[1][i_mb_8x8+0*s8x8] ) = (uint8_t)(-1) * 0x0101;
+ M16( &h->mb.ref[1][i_mb_8x8+1*s8x8] ) = (uint8_t)(-1) * 0x0101;
+ M128( h->mb.mv[1][i_mb_4x4+0*s4x4] ) = M128_ZERO;
+ M128( h->mb.mv[1][i_mb_4x4+1*s4x4] ) = M128_ZERO;
+ M128( h->mb.mv[1][i_mb_4x4+2*s4x4] ) = M128_ZERO;
+ M128( h->mb.mv[1][i_mb_4x4+3*s4x4] ) = M128_ZERO;
}
}
}
@@ -1411,13 +1412,9 @@ void x264_macroblock_cache_save( x264_t *h )
}
else
{
- M64( h->mb.mvd[0][i_mb_xy][0] ) = 0;
- M64( h->mb.mvd[0][i_mb_xy][4] ) = 0;
+ M128( h->mb.mvd[0][i_mb_xy][0] ) = M128_ZERO;
if( h->sh.i_type == SLICE_TYPE_B )
- {
- M64( h->mb.mvd[1][i_mb_xy][0] ) = 0;
- M64( h->mb.mvd[1][i_mb_xy][4] ) = 0;
- }
+ M128( h->mb.mvd[1][i_mb_xy][0] ) = M128_ZERO;
}
if( h->sh.i_type == SLICE_TYPE_B )
diff --git a/common/x86/util.h b/common/x86/util.h
index ccc0733..4c4d168 100644
--- a/common/x86/util.h
+++ b/common/x86/util.h
@@ -25,6 +25,9 @@
#define X264_X86_UTIL_H
#ifdef __GNUC__
+
+#include <xmmintrin.h>
+
#define x264_median_mv x264_median_mv_mmxext
static ALWAYS_INLINE void x264_median_mv_mmxext( int16_t *dst, int16_t *a, int16_t *b, int16_t *c )
{
@@ -100,6 +103,11 @@ static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum_mmxext(uint8_t *mvdleft, uint8_
);
return amvd;
}
+#undef M128_ZERO
+#define M128_ZERO ((__m128){0,0,0,0})
+#define x264_union128_t x264_union128_sse_t
+typedef union { __m128 i; uint64_t a[2]; uint32_t b[4]; uint16_t c[8]; uint8_t d[16]; } MAY_ALIAS x264_union128_sse_t;
+
#endif
#endif
More information about the x264-devel mailing list