[x264-devel] commit: Add CP128/M128 macros using SSE (Jason Garrett-Glaser)

Sat Apr 24 00:40:00 CEST 2010

x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Fri Apr 23 19:09:18 2010 +0000| [674926f10f65c28f9a32045be2fd89ee7bc5d8ec] | committer: Jason Garrett-Glaser 

Add CP128/M128 macros using SSE

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=674926f10f65c28f9a32045be2fd89ee7bc5d8ec
---

 common/common.h     |    5 ++++
 common/macroblock.c |   53 ++++++++++++++++++++++++--------------------------
 common/x86/util.h   |    8 +++++++
 3 files changed, 38 insertions(+), 28 deletions(-)

diff --git a/common/common.h b/common/common.h
index b8c6dfd..2fc453d 100644
--- a/common/common.h
+++ b/common/common.h
@@ -88,12 +88,17 @@ do {\
 typedef union { uint16_t i; uint8_t  c[2]; } MAY_ALIAS x264_union16_t;
 typedef union { uint32_t i; uint16_t b[2]; uint8_t  c[4]; } MAY_ALIAS x264_union32_t;
 typedef union { uint64_t i; uint32_t a[2]; uint16_t b[4]; uint8_t c[8]; } MAY_ALIAS x264_union64_t;
+typedef struct { uint64_t i[2]; } x264_uint128_t;
+typedef union { x264_uint128_t i; uint64_t a[2]; uint32_t b[4]; uint16_t c[8]; uint8_t d[16]; } MAY_ALIAS x264_union128_t;
 #define M16(src) (((x264_union16_t*)(src))->i)
 #define M32(src) (((x264_union32_t*)(src))->i)
 #define M64(src) (((x264_union64_t*)(src))->i)
+#define M128(src) (((x264_union128_t*)(src))->i)
+#define M128_ZERO ((x264_uint128_t){{0,0}})
 #define CP16(dst,src) M16(dst) = M16(src)
 #define CP32(dst,src) M32(dst) = M32(src)
 #define CP64(dst,src) M64(dst) = M64(src)
+#define CP128(dst,src) M128(dst) = M128(src)
 
 #include "x264.h"
 #include "bs.h"
diff --git a/common/macroblock.c b/common/macroblock.c
index 0b9b903..047558e 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -1165,13 +1165,11 @@ void x264_macroblock_cache_load( x264_t *h, int mb_x, int mb_y )
                 h->mb.cache.ref[l][i8+1] = h->mb.ref[l][top_8x8 + 0];
                 h->mb.cache.ref[l][i8+2] =
                 h->mb.cache.ref[l][i8+3] = h->mb.ref[l][top_8x8 + 1];
-                CP64( h->mb.cache.mv[l][i8+0], h->mb.mv[l][top_4x4+0] );
-                CP64( h->mb.cache.mv[l][i8+2], h->mb.mv[l][top_4x4+2] );
+                CP128( h->mb.cache.mv[l][i8], h->mb.mv[l][top_4x4] );
             }
             else
             {
-                M64( h->mb.cache.mv[l][i8+0] ) = 0;
-                M64( h->mb.cache.mv[l][i8+2] ) = 0;
+                M128( h->mb.cache.mv[l][i8] ) = M128_ZERO;
                 M32( &h->mb.cache.ref[l][i8] ) = (uint8_t)(-2) * 0x01010101U;
             }
 
@@ -1355,35 +1353,38 @@ void x264_macroblock_cache_save( x264_t *h )
             h->mb.ref[0][i_mb_8x8+1+0*s8x8] = h->mb.cache.ref[0][x264_scan8[4]];
             h->mb.ref[0][i_mb_8x8+0+1*s8x8] = h->mb.cache.ref[0][x264_scan8[8]];
             h->mb.ref[0][i_mb_8x8+1+1*s8x8] = h->mb.cache.ref[0][x264_scan8[12]];
-            for( int y = 0; y < 4; y++ )
-            {
-                CP64( h->mb.mv[0][i_mb_4x4+y*s4x4+0], h->mb.cache.mv[0][x264_scan8[0]+8*y+0] );
-                CP64( h->mb.mv[0][i_mb_4x4+y*s4x4+2], h->mb.cache.mv[0][x264_scan8[0]+8*y+2] );
-            }
+            CP128( h->mb.mv[0][i_mb_4x4+0*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*0] );
+            CP128( h->mb.mv[0][i_mb_4x4+1*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*1] );
+            CP128( h->mb.mv[0][i_mb_4x4+2*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*2] );
+            CP128( h->mb.mv[0][i_mb_4x4+3*s4x4], h->mb.cache.mv[0][x264_scan8[0]+8*3] );
             if( h->sh.i_type == SLICE_TYPE_B )
             {
                 h->mb.ref[1][i_mb_8x8+0+0*s8x8] = h->mb.cache.ref[1][x264_scan8[0]];
                 h->mb.ref[1][i_mb_8x8+1+0*s8x8] = h->mb.cache.ref[1][x264_scan8[4]];
                 h->mb.ref[1][i_mb_8x8+0+1*s8x8] = h->mb.cache.ref[1][x264_scan8[8]];
                 h->mb.ref[1][i_mb_8x8+1+1*s8x8] = h->mb.cache.ref[1][x264_scan8[12]];
-                for( int y = 0; y < 4; y++ )
-                {
-                    CP64( h->mb.mv[1][i_mb_4x4+y*s4x4+0], h->mb.cache.mv[1][x264_scan8[0]+8*y+0] );
-                    CP64( h->mb.mv[1][i_mb_4x4+y*s4x4+2], h->mb.cache.mv[1][x264_scan8[0]+8*y+2] );
-                }
+                CP128( h->mb.mv[1][i_mb_4x4+0*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*0] );
+                CP128( h->mb.mv[1][i_mb_4x4+1*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*1] );
+                CP128( h->mb.mv[1][i_mb_4x4+2*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*2] );
+                CP128( h->mb.mv[1][i_mb_4x4+3*s4x4], h->mb.cache.mv[1][x264_scan8[0]+8*3] );
             }
         }
         else
         {
-            for( int i_list = 0; i_list < (h->sh.i_type == SLICE_TYPE_B ? 2  : 1 ); i_list++ )
+            M16( &h->mb.ref[0][i_mb_8x8+0*s8x8] ) = (uint8_t)(-1) * 0x0101;
+            M16( &h->mb.ref[0][i_mb_8x8+1*s8x8] ) = (uint8_t)(-1) * 0x0101;
+            M128( h->mb.mv[0][i_mb_4x4+0*s4x4] ) = M128_ZERO;
+            M128( h->mb.mv[0][i_mb_4x4+1*s4x4] ) = M128_ZERO;
+            M128( h->mb.mv[0][i_mb_4x4+2*s4x4] ) = M128_ZERO;
+            M128( h->mb.mv[0][i_mb_4x4+3*s4x4] ) = M128_ZERO;
+            if( h->sh.i_type == SLICE_TYPE_B )
             {
-                M16( &h->mb.ref[i_list][i_mb_8x8+0*s8x8] ) = (uint8_t)(-1) * 0x0101;
-                M16( &h->mb.ref[i_list][i_mb_8x8+1*s8x8] ) = (uint8_t)(-1) * 0x0101;
-                for( int y = 0; y < 4; y++ )
-                {
-                    M64( h->mb.mv[i_list][i_mb_4x4+y*s4x4+0] ) = 0;
-                    M64( h->mb.mv[i_list][i_mb_4x4+y*s4x4+2] ) = 0;
-                }
+                M16( &h->mb.ref[1][i_mb_8x8+0*s8x8] ) = (uint8_t)(-1) * 0x0101;
+                M16( &h->mb.ref[1][i_mb_8x8+1*s8x8] ) = (uint8_t)(-1) * 0x0101;
+                M128( h->mb.mv[1][i_mb_4x4+0*s4x4] ) = M128_ZERO;
+                M128( h->mb.mv[1][i_mb_4x4+1*s4x4] ) = M128_ZERO;
+                M128( h->mb.mv[1][i_mb_4x4+2*s4x4] ) = M128_ZERO;
+                M128( h->mb.mv[1][i_mb_4x4+3*s4x4] ) = M128_ZERO;
             }
         }
     }
@@ -1411,13 +1412,9 @@ void x264_macroblock_cache_save( x264_t *h )
         }
         else
         {
-            M64( h->mb.mvd[0][i_mb_xy][0] ) = 0;
-            M64( h->mb.mvd[0][i_mb_xy][4] ) = 0;
+            M128( h->mb.mvd[0][i_mb_xy][0] ) = M128_ZERO;
             if( h->sh.i_type == SLICE_TYPE_B )
-            {
-                M64( h->mb.mvd[1][i_mb_xy][0] ) = 0;
-                M64( h->mb.mvd[1][i_mb_xy][4] ) = 0;
-            }
+                M128( h->mb.mvd[1][i_mb_xy][0] ) = M128_ZERO;
         }
 
         if( h->sh.i_type == SLICE_TYPE_B )
diff --git a/common/x86/util.h b/common/x86/util.h
index ccc0733..4c4d168 100644
--- a/common/x86/util.h
+++ b/common/x86/util.h
@@ -25,6 +25,9 @@
 #define X264_X86_UTIL_H
 
 #ifdef __GNUC__
+
+#include <xmmintrin.h>
+
 #define x264_median_mv x264_median_mv_mmxext
 static ALWAYS_INLINE void x264_median_mv_mmxext( int16_t *dst, int16_t *a, int16_t *b, int16_t *c )
 {
@@ -100,6 +103,11 @@ static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum_mmxext(uint8_t *mvdleft, uint8_
     );
     return amvd;
 }
+#undef M128_ZERO
+#define M128_ZERO ((__m128){0,0,0,0})
+#define x264_union128_t x264_union128_sse_t
+typedef union { __m128 i; uint64_t a[2]; uint32_t b[4]; uint16_t c[8]; uint8_t d[16]; } MAY_ALIAS x264_union128_sse_t;
+
 #endif
 
 #endif