[x264-devel] commit: MMX code for predictor rounding/clipping (Jason Garrett-Glaser )

Sat Apr 24 00:40:01 CEST 2010

x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Fri Apr 16 12:06:07 2010 -0700| [aaf2194fcd9efacbb835af5fe3390ad48d19cc8c] | committer: Jason Garrett-Glaser 

MMX code for predictor rounding/clipping
Faster predictor checking at subme < 3.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=aaf2194fcd9efacbb835af5fe3390ad48d19cc8c
---

 common/common.h   |   11 +++++++++++
 common/x86/util.h |   41 +++++++++++++++++++++++++++++++++++++++++
 encoder/me.c      |   11 ++++++-----
 3 files changed, 58 insertions(+), 5 deletions(-)

diff --git a/common/common.h b/common/common.h
index c63fbd9..3973558 100644
--- a/common/common.h
+++ b/common/common.h
@@ -188,6 +188,17 @@ static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum( uint8_t *mvdleft, uint8_t *mvd
     return amvd0 + (amvd1<<8);
 }
 
+static void ALWAYS_INLINE x264_predictor_roundclip( int16_t (*mvc)[2], int i_mvc, int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max )
+{   /* In place, for each of the i_mvc entries of mvc[]: round both components with (v+2)>>2, then pass x and y through x264_clip3 with their own min/max. */
+    for( int i = 0; i < i_mvc; i++ )
+    {
+        int mx = (mvc[i][0] + 2) >> 2; /* add 2 before the shift so the >>2 rounds to nearest */
+        int my = (mvc[i][1] + 2) >> 2;
+        mvc[i][0] = x264_clip3( mx, mv_x_min, mv_x_max );
+        mvc[i][1] = x264_clip3( my, mv_y_min, mv_y_max ); /* y is stored to [1]; writing it to [0] would clobber x and leave y unrounded */
+    }
+}
+
 extern const uint8_t x264_exp2_lut[64];
 extern const float x264_log2_lut[128];
 extern const float x264_log2_lz_lut[32];
diff --git a/common/x86/util.h b/common/x86/util.h
index 4c4d168..8fb1e84 100644
--- a/common/x86/util.h
+++ b/common/x86/util.h
@@ -45,6 +45,7 @@ static ALWAYS_INLINE void x264_median_mv_mmxext( int16_t *dst, int16_t *a, int16
         :"m"(M32( a )), "m"(M32( b )), "m"(M32( c ))
     );
 }
+
 #define x264_predictor_difference x264_predictor_difference_mmxext
 static ALWAYS_INLINE int x264_predictor_difference_mmxext( int16_t (*mvc)[2], intptr_t i_mvc )
 {
@@ -80,6 +81,7 @@ static ALWAYS_INLINE int x264_predictor_difference_mmxext( int16_t (*mvc)[2], in
     );
     return sum;
 }
+
 #define x264_cabac_mvd_sum x264_cabac_mvd_sum_mmxext
 static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum_mmxext(uint8_t *mvdleft, uint8_t *mvdtop)
 {
@@ -103,6 +105,45 @@ static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum_mmxext(uint8_t *mvdleft, uint8_
     );
     return amvd;
 }
+
+#define x264_predictor_roundclip x264_predictor_roundclip_mmxext
+static void ALWAYS_INLINE x264_predictor_roundclip_mmxext( int16_t (*mvc)[2], int i_mvc, int mv_x_min, int mv_x_max, int mv_y_min, int mv_y_max )
+{   /* MMX version: in place, applies (v+2)>>2 to every 16-bit component of mvc[0..i_mvc-1] and clamps it between the packed min and max. */
+    uint32_t mv_min = pack16to32_mask( mv_x_min, mv_y_min ); /* x/y minimum packed into one 32-bit value */
+    uint32_t mv_max = pack16to32_mask( mv_x_max, mv_y_max ); /* x/y maximum packed into one 32-bit value */
+    static const uint64_t pw_2 = 0x0002000200020002ULL; /* four 16-bit words, each equal to 2 (the rounding offset) */
+    intptr_t i = i_mvc; /* counts down to 0; must start nonzero, since the 2-at-a-time loop below runs before testing and steps by 2 */
+    asm(
+        "movd    %2, %%mm5       \n" /* mm5 low dword = mv_min */
+        "movd    %3, %%mm6       \n" /* mm6 low dword = mv_max */
+        "movq    %4, %%mm7       \n" /* mm7 = pw_2 */
+        "punpckldq %%mm5, %%mm5  \n" /* duplicate mv_min into both dwords of mm5 */
+        "punpckldq %%mm6, %%mm6  \n" /* duplicate mv_max into both dwords of mm6 */
+        "test $1, %0             \n" /* is the remaining count odd? */
+        "jz 1f                   \n" /* even: go straight to the paired loop */
+        "movd -4(%5,%0,4), %%mm0 \n" /* odd: load the last entry (32 bits at mvc + 4*i - 4) */
+        "paddw %%mm7, %%mm0      \n" /* + 2 per word */
+        "psraw $2, %%mm0         \n" /* arithmetic >> 2 per word */
+        "pmaxsw %%mm5, %%mm0     \n" /* signed max against mv_min */
+        "pminsw %%mm6, %%mm0     \n" /* signed min against mv_max */
+        "movd %%mm0, -4(%5,%0,4) \n" /* store the entry back in place */
+        "dec %0                  \n" /* one entry consumed */
+        "jz 2f                   \n" /* nothing left: done */
+        "1:                      \n" /* paired loop: two entries (64 bits) per iteration, walking backwards */
+        "movq -8(%5,%0,4), %%mm0 \n" /* load the two entries ending at index i */
+        "paddw %%mm7, %%mm0      \n" /* + 2 per word */
+        "psraw $2, %%mm0         \n" /* arithmetic >> 2 per word */
+        "pmaxsw %%mm5, %%mm0     \n" /* signed max against mv_min */
+        "pminsw %%mm6, %%mm0     \n" /* signed min against mv_max */
+        "movq %%mm0, -8(%5,%0,4) \n" /* store both entries back in place */
+        "sub $2, %0              \n" /* two entries consumed */
+        "jnz 1b                  \n" /* repeat until the count reaches zero */
+        "2:                      \n" /* exit */
+        :"+r"(i), "+m"(M64( mvc ))
+        :"g"(mv_min), "g"(mv_max), "m"(pw_2), "r"(mvc)
+    );
+}
+
 #undef M128_ZERO
 #define M128_ZERO ((__m128){0,0,0,0})
 #define x264_union128_t x264_union128_sse_t
diff --git a/encoder/me.c b/encoder/me.c
index 6788022..0b519ea 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -241,14 +241,15 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
          * sensible to omit the cost of the MV from the rounded MVP to avoid unfairly
          * biasing against use of the predicted motion vector. */
         bcost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, &p_fref_w[bmy*stride+bmx], stride );
+        uint32_t bmv = pack16to32_mask( bmx, bmy );
+        if( i_mvc )
+            x264_predictor_roundclip( mvc, i_mvc, mv_x_min, mv_x_max, mv_y_min, mv_y_max );
         for( int i = 0; i < i_mvc; i++ )
         {
-            int mx = (mvc[i][0] + 2) >> 2;
-            int my = (mvc[i][1] + 2) >> 2;
-            if( (mx | my) && ((mx-bmx) | (my-bmy)) )
+            if( M32( mvc[i] ) && (bmv - M32( mvc[i] )) )
             {
-                mx = x264_clip3( mx, mv_x_min, mv_x_max );
-                my = x264_clip3( my, mv_y_min, mv_y_max );
+                int mx = mvc[i][0];
+                int my = mvc[i][1];
                 COST_MV( mx, my );
             }
         }