[x264-devel] Add SSE support to rectangle.h for 16-byte stores

Wed Apr 13 04:04:29 CEST 2011

x264 | branch: master | Jason Garrett-Glaser <jason at x264.com> | Tue Mar 29 05:33:44 2011 -0700| [f422ec93254ed3f9883acac0bb3f67e3b4ea960c] | committer: Jason Garrett-Glaser

Add SSE support to rectangle.h for 16-byte stores
Uses GCC vector extensions (the vector_size attribute); may be suboptimal on particularly old GCC versions.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=f422ec93254ed3f9883acac0bb3f67e3b4ea960c
---

 common/common.h    |    3 ++-
 common/rectangle.h |   10 ++++++++++
 common/x86/util.h  |    3 +++
 configure          |    4 +++-
 4 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/common/common.h b/common/common.h
index fcf0250..496542e 100644
--- a/common/common.h
+++ b/common/common.h
@@ -851,11 +851,12 @@ struct x264_t
 
 // included at the end because it needs x264_t
 #include "macroblock.h"
-#include "rectangle.h"
 
 #if HAVE_MMX
 #include "x86/util.h"
 #endif
 
+#include "rectangle.h"
+
 #endif
 
diff --git a/common/rectangle.h b/common/rectangle.h
index aeaa2b9..770de2c 100644
--- a/common/rectangle.h
+++ b/common/rectangle.h
@@ -80,6 +80,15 @@ static ALWAYS_INLINE void x264_macroblock_cache_rect( void *dst, int w, int h, i
     {
         /* height 1, width 16 doesn't occur */
         assert( h != 1 );
+#if HAVE_VECTOREXT && defined(__SSE__)
+        v4si v16 = {v,v,v,v};
+
+        M128( d+s*0+0 ) = (__m128)v16;
+        M128( d+s*1+0 ) = (__m128)v16;
+        if( h == 2 ) return;
+        M128( d+s*2+0 ) = (__m128)v16;
+        M128( d+s*3+0 ) = (__m128)v16;
+#else
         if( WORD_SIZE == 8 )
         {
             do
@@ -103,6 +112,7 @@ static ALWAYS_INLINE void x264_macroblock_cache_rect( void *dst, int w, int h, i
                 d += s;
             } while( --h );
         }
+#endif
     }
     else
         assert(0);
diff --git a/common/x86/util.h b/common/x86/util.h
index 0b786cf..1e91c3b 100644
--- a/common/x86/util.h
+++ b/common/x86/util.h
@@ -154,6 +154,9 @@ static void ALWAYS_INLINE x264_predictor_roundclip_mmxext( int16_t (*dst)[2], in
 #define M128_ZERO ((__m128){0,0,0,0})
 #define x264_union128_t x264_union128_sse_t
 typedef union { __m128 i; uint64_t a[2]; uint32_t b[4]; uint16_t c[8]; uint8_t d[16]; } MAY_ALIAS x264_union128_sse_t;
+#if HAVE_VECTOREXT
+typedef uint32_t v4si __attribute__((vector_size (16)));
+#endif
 #endif
 
 #endif
diff --git a/configure b/configure
index 29977a2..1a7cb33 100755
--- a/configure
+++ b/configure
@@ -223,7 +223,7 @@ cross_prefix=""
 EXE=""
 
 # list of all preprocessor HAVE values we can define
-CONFIG_HAVE="MALLOC_H ALTIVEC ALTIVEC_H MMX ARMV6 ARMV6T2 NEON BEOSTHREAD POSIXTHREAD WIN32THREAD THREAD LOG2F VISUALIZE SWSCALE LAVF FFMS GPAC GF_MALLOC AVS GPL"
+CONFIG_HAVE="MALLOC_H ALTIVEC ALTIVEC_H MMX ARMV6 ARMV6T2 NEON BEOSTHREAD POSIXTHREAD WIN32THREAD THREAD LOG2F VISUALIZE SWSCALE LAVF FFMS GPAC GF_MALLOC AVS GPL VECTOREXT"
 
 # parse options
 
@@ -812,6 +812,8 @@ if [ "$avs" = "auto" ] ; then
     fi
 fi
 
+cc_check "stdint.h" "" "uint32_t test_vec __attribute__ ((vector_size (16))) = {0,1,2,3};" && define HAVE_VECTOREXT
+
 if [ "$pic" = "yes" ] ; then
     CFLAGS="$CFLAGS -fPIC"
     ASFLAGS="$ASFLAGS -DPIC"