[x264-devel] commit: Optimize variance asm + minor changes (Jason Garrett-Glaser)

git version control git at videolan.org
Wed Dec 24 18:30:54 CET 2008


x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Tue Dec 23 22:33:28 2008 -0500| [42070dff1bc3019a6f56773fce3dd6e328e3a61b] | committer: Jason Garrett-Glaser 

Optimize variance asm + minor changes
Remove SAD argument from var, not needed anymore.
Speed up var asm a bit by eliminating psadbw and instead HADDWing at end.
Eliminate all remaining warnings on gcc 3.4 on cygwin
Port another minor optimization from lavc (pskip)

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=42070dff1bc3019a6f56773fce3dd6e328e3a61b
---

 common/macroblock.c    |    4 ++--
 common/pixel.c         |    3 +--
 common/pixel.h         |    2 +-
 common/x86/pixel-a.asm |   28 ++++++++++------------------
 common/x86/pixel.h     |    4 ++--
 encoder/ratecontrol.c  |    6 +++---
 muxers.c               |    2 +-
 tools/checkasm.c       |   11 +++++------
 8 files changed, 25 insertions(+), 35 deletions(-)

diff --git a/common/macroblock.c b/common/macroblock.c
index 9ddc997..795367b 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -140,8 +140,8 @@ void x264_mb_predict_mv_pskip( x264_t *h, int16_t mv[2] )
     int16_t *mv_b  = h->mb.cache.mv[0][X264_SCAN8_0 - 8];
 
     if( i_refa == -2 || i_refb == -2 ||
-        ( i_refa == 0 && *(uint32_t*)mv_a == 0 ) ||
-        ( i_refb == 0 && *(uint32_t*)mv_b == 0 ) )
+        !( i_refa | *(uint32_t*)mv_a ) ||
+        !( i_refb | *(uint32_t*)mv_b ) )
     {
         *(uint32_t*)mv = 0;
     }
diff --git a/common/pixel.c b/common/pixel.c
index 746c95a..1c37b31 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -140,7 +140,7 @@ int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1
  * pixel_var_wxh
  ****************************************************************************/
 #define PIXEL_VAR_C( name, w, shift ) \
-static int name( uint8_t *pix, int i_stride, uint32_t *sad ) \
+static int name( uint8_t *pix, int i_stride ) \
 {                                             \
     uint32_t var = 0, sum = 0, sqr = 0;       \
     int x, y;                                 \
@@ -154,7 +154,6 @@ static int name( uint8_t *pix, int i_stride, uint32_t *sad ) \
         pix += i_stride;                      \
     }                                         \
     var = sqr - (sum * sum >> shift);         \
-    *sad = sum;                               \
     return var;                               \
 }
 
diff --git a/common/pixel.h b/common/pixel.h
index 4e157ef..1a2cefd 100644
--- a/common/pixel.h
+++ b/common/pixel.h
@@ -74,7 +74,7 @@ typedef struct
     x264_pixel_cmp_x4_t fpelcmp_x4[7];
     x264_pixel_cmp_t sad_aligned[7]; /* Aligned SAD for mbcmp */
 
-    int (*var[4])( uint8_t *pix, int stride, uint32_t *sad );
+    int (*var[4])( uint8_t *pix, int stride );
     uint64_t (*hadamard_ac[4])( uint8_t *pix, int stride );
 
     void (*ssim_4x4x2_core)( const uint8_t *pix1, int stride1,
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index d249f12..6314e56 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -237,13 +237,8 @@ cglobal x264_pixel_ssd_4x4_sse4, 4,4
 %endmacro
 
 %macro VAR_END 1
-%if mmsize == 16
-    movhlps m0, m5
-    paddw   m5, m0
-%endif
-    movifnidn r2d, r2m
+    HADDW   m5, m7
     movd   r1d, m5
-    movd  [r2], m5  ; return sum
     imul   r1d, r1d
     HADDD   m6, m1
     shr    r1d, %1
@@ -258,27 +253,25 @@ cglobal x264_pixel_ssd_4x4_sse4, 4,4
     mova      m0, [r0]
     mova      m1, m0
     mova      m3, [r0+%1]
-    mova      m2, m0
-    punpcklbw m0, m7
     mova      m4, m3
+    punpcklbw m0, m7
     punpckhbw m1, m7
 %ifidn %1, r1
     lea       r0, [r0+%1*2]
 %else
     add       r0, r1
 %endif
-    punpckhbw m4, m7
-    psadbw    m2, m7
-    paddw     m5, m2
-    mova      m2, m3
     punpcklbw m3, m7
+    punpckhbw m4, m7
+    paddw     m5, m0
     dec t3d
-    psadbw    m2, m7
     pmaddwd   m0, m0
-    paddw     m5, m2
+    paddw     m5, m1
     pmaddwd   m1, m1
+    paddw     m5, m3
     paddd     m6, m0
     pmaddwd   m3, m3
+    paddw     m5, m4
     paddd     m6, m1
     pmaddwd   m4, m4
     paddd     m6, m3
@@ -287,7 +280,7 @@ cglobal x264_pixel_ssd_4x4_sse4, 4,4
 %endmacro
 
 ;-----------------------------------------------------------------------------
-; int x264_pixel_var_wxh_mmxext( uint8_t *, int, int * )
+; int x264_pixel_var_wxh_mmxext( uint8_t *, int )
 ;-----------------------------------------------------------------------------
 INIT_MMX
 cglobal x264_pixel_var_16x16_mmxext, 2,3
@@ -315,13 +308,12 @@ cglobal x264_pixel_var_8x8_sse2, 2,3
     lea       r0, [r0+r1*2]
     mova      m1, m0
     punpcklbw m0, m7
-    mova      m2, m1
     punpckhbw m1, m7
     dec t3d
+    paddw     m5, m0
+    paddw     m5, m1
     pmaddwd   m0, m0
     pmaddwd   m1, m1
-    psadbw    m2, m7
-    paddw     m5, m2
     paddd     m6, m0
     paddd     m6, m1
     jnz .loop
diff --git a/common/x86/pixel.h b/common/x86/pixel.h
index e0c9733..5bc81c7 100644
--- a/common/x86/pixel.h
+++ b/common/x86/pixel.h
@@ -67,8 +67,8 @@ DECL_X4( sad, cache64_mmxext );
 DECL_X4( sad, cache64_sse2 );
 DECL_X4( sad, cache64_ssse3 );
 
-DECL_PIXELS( int, var, mmxext, ( uint8_t *pix, int i_stride, uint32_t *sad ))
-DECL_PIXELS( int, var, sse2,   ( uint8_t *pix, int i_stride, uint32_t *sad ))
+DECL_PIXELS( int, var, mmxext, ( uint8_t *pix, int i_stride ))
+DECL_PIXELS( int, var, sse2,   ( uint8_t *pix, int i_stride ))
 DECL_PIXELS( uint64_t, hadamard_ac, mmxext, ( uint8_t *pix, int i_stride ))
 DECL_PIXELS( uint64_t, hadamard_ac, sse2,   ( uint8_t *pix, int i_stride ))
 DECL_PIXELS( uint64_t, hadamard_ac, ssse3,  ( uint8_t *pix, int i_stride ))
diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
index 712a721..5a74172 100644
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
@@ -174,8 +174,8 @@ static NOINLINE int ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_frame_t *f
      * and putting it after floating point ops.  As a result, we put the emms at the end of the
      * function and make sure that its always called before the float math.  Noinline makes
      * sure no reordering goes on. */
-    unsigned int var=0, sad, i;
-    for( i=0; i<3; i++ )
+    unsigned int var = 0, i;
+    for( i = 0; i < 3; i++ )
     {
         int w = i ? 8 : 16;
         int stride = frame->i_stride[i];
@@ -184,7 +184,7 @@ static NOINLINE int ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_frame_t *f
             : w * (mb_x + mb_y * stride);
         int pix = i ? PIXEL_8x8 : PIXEL_16x16;
         stride <<= h->mb.b_interlaced;
-        var += h->pixf.var[pix]( frame->plane[i]+offset, stride, &sad );
+        var += h->pixf.var[pix]( frame->plane[i]+offset, stride );
     }
     var = X264_MAX(var,1);
     x264_emms();
diff --git a/muxers.c b/muxers.c
index 51f82bb..fffa99b 100644
--- a/muxers.c
+++ b/muxers.c
@@ -290,7 +290,7 @@ int read_frame_y4m( x264_picture_t *p_pic, hnd_t handle, int i_frame )
     header[slen] = 0;
     if (strncmp(header, Y4M_FRAME_MAGIC, slen))
     {
-        fprintf(stderr, "Bad header magic (%08X <=> %s)\n",
+        fprintf(stderr, "Bad header magic (%"PRIx32" <=> %s)\n",
                 *((uint32_t*)header), header);
         return -1;
     }
diff --git a/tools/checkasm.c b/tools/checkasm.c
index fff9844..fd38f1f 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -330,16 +330,15 @@ static int check_pixel( int cpu_ref, int cpu_new )
 #define TEST_PIXEL_VAR( i ) \
     if( pixel_asm.var[i] != pixel_ref.var[i] ) \
     { \
-        uint32_t res_c, res_asm; \
-        uint32_t sad_c, sad_asm; \
+        int res_c, res_asm; \
         set_func_name( "%s_%s", "var", pixel_names[i] ); \
         used_asm = 1; \
-        res_c   = call_c( pixel_c.var[i], buf1, 16, &sad_c ); \
-        res_asm = call_a( pixel_asm.var[i], buf1, 16, &sad_asm ); \
-        if( (res_c != res_asm) || (sad_c != sad_asm) ) \
+        res_c   = call_c( pixel_c.var[i], buf1, 16 ); \
+        res_asm = call_a( pixel_asm.var[i], buf1, 16 ); \
+        if( res_c != res_asm ) \
         { \
             ok = 0; \
-            fprintf( stderr, "var[%d]: %d,%d != %d,%d [FAILED]\n", i, res_c, sad_c, res_asm, sad_asm ); \
+            fprintf( stderr, "var[%d]: %d != %d [FAILED]\n", i, res_c, res_asm ); \
         } \
     }
 



More information about the x264-devel mailing list