[x264-devel] commit: Optimize variance asm + minor changes (Jason Garrett-Glaser)
git version control
git at videolan.org
Wed Dec 24 18:30:54 CET 2008
x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Tue Dec 23 22:33:28 2008 -0500| [42070dff1bc3019a6f56773fce3dd6e328e3a61b] | committer: Jason Garrett-Glaser
Optimize variance asm + minor changes
Remove SAD argument from var, not needed anymore.
Speed up var asm a bit by eliminating psadbw and instead HADDWing at end.
Eliminate all remaining warnings on gcc 3.4 on cygwin.
Port another minor optimization from lavc (pskip).
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=42070dff1bc3019a6f56773fce3dd6e328e3a61b
---
common/macroblock.c | 4 ++--
common/pixel.c | 3 +--
common/pixel.h | 2 +-
common/x86/pixel-a.asm | 28 ++++++++++------------------
common/x86/pixel.h | 4 ++--
encoder/ratecontrol.c | 6 +++---
muxers.c | 2 +-
tools/checkasm.c | 11 +++++------
8 files changed, 25 insertions(+), 35 deletions(-)
diff --git a/common/macroblock.c b/common/macroblock.c
index 9ddc997..795367b 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -140,8 +140,8 @@ void x264_mb_predict_mv_pskip( x264_t *h, int16_t mv[2] )
int16_t *mv_b = h->mb.cache.mv[0][X264_SCAN8_0 - 8];
if( i_refa == -2 || i_refb == -2 ||
- ( i_refa == 0 && *(uint32_t*)mv_a == 0 ) ||
- ( i_refb == 0 && *(uint32_t*)mv_b == 0 ) )
+ !( i_refa | *(uint32_t*)mv_a ) ||
+ !( i_refb | *(uint32_t*)mv_b ) )
{
*(uint32_t*)mv = 0;
}
diff --git a/common/pixel.c b/common/pixel.c
index 746c95a..1c37b31 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -140,7 +140,7 @@ int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1
* pixel_var_wxh
****************************************************************************/
#define PIXEL_VAR_C( name, w, shift ) \
-static int name( uint8_t *pix, int i_stride, uint32_t *sad ) \
+static int name( uint8_t *pix, int i_stride ) \
{ \
uint32_t var = 0, sum = 0, sqr = 0; \
int x, y; \
@@ -154,7 +154,6 @@ static int name( uint8_t *pix, int i_stride, uint32_t *sad ) \
pix += i_stride; \
} \
var = sqr - (sum * sum >> shift); \
- *sad = sum; \
return var; \
}
diff --git a/common/pixel.h b/common/pixel.h
index 4e157ef..1a2cefd 100644
--- a/common/pixel.h
+++ b/common/pixel.h
@@ -74,7 +74,7 @@ typedef struct
x264_pixel_cmp_x4_t fpelcmp_x4[7];
x264_pixel_cmp_t sad_aligned[7]; /* Aligned SAD for mbcmp */
- int (*var[4])( uint8_t *pix, int stride, uint32_t *sad );
+ int (*var[4])( uint8_t *pix, int stride );
uint64_t (*hadamard_ac[4])( uint8_t *pix, int stride );
void (*ssim_4x4x2_core)( const uint8_t *pix1, int stride1,
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index d249f12..6314e56 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -237,13 +237,8 @@ cglobal x264_pixel_ssd_4x4_sse4, 4,4
%endmacro
%macro VAR_END 1
-%if mmsize == 16
- movhlps m0, m5
- paddw m5, m0
-%endif
- movifnidn r2d, r2m
+ HADDW m5, m7
movd r1d, m5
- movd [r2], m5 ; return sum
imul r1d, r1d
HADDD m6, m1
shr r1d, %1
@@ -258,27 +253,25 @@ cglobal x264_pixel_ssd_4x4_sse4, 4,4
mova m0, [r0]
mova m1, m0
mova m3, [r0+%1]
- mova m2, m0
- punpcklbw m0, m7
mova m4, m3
+ punpcklbw m0, m7
punpckhbw m1, m7
%ifidn %1, r1
lea r0, [r0+%1*2]
%else
add r0, r1
%endif
- punpckhbw m4, m7
- psadbw m2, m7
- paddw m5, m2
- mova m2, m3
punpcklbw m3, m7
+ punpckhbw m4, m7
+ paddw m5, m0
dec t3d
- psadbw m2, m7
pmaddwd m0, m0
- paddw m5, m2
+ paddw m5, m1
pmaddwd m1, m1
+ paddw m5, m3
paddd m6, m0
pmaddwd m3, m3
+ paddw m5, m4
paddd m6, m1
pmaddwd m4, m4
paddd m6, m3
@@ -287,7 +280,7 @@ cglobal x264_pixel_ssd_4x4_sse4, 4,4
%endmacro
;-----------------------------------------------------------------------------
-; int x264_pixel_var_wxh_mmxext( uint8_t *, int, int * )
+; int x264_pixel_var_wxh_mmxext( uint8_t *, int )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal x264_pixel_var_16x16_mmxext, 2,3
@@ -315,13 +308,12 @@ cglobal x264_pixel_var_8x8_sse2, 2,3
lea r0, [r0+r1*2]
mova m1, m0
punpcklbw m0, m7
- mova m2, m1
punpckhbw m1, m7
dec t3d
+ paddw m5, m0
+ paddw m5, m1
pmaddwd m0, m0
pmaddwd m1, m1
- psadbw m2, m7
- paddw m5, m2
paddd m6, m0
paddd m6, m1
jnz .loop
diff --git a/common/x86/pixel.h b/common/x86/pixel.h
index e0c9733..5bc81c7 100644
--- a/common/x86/pixel.h
+++ b/common/x86/pixel.h
@@ -67,8 +67,8 @@ DECL_X4( sad, cache64_mmxext );
DECL_X4( sad, cache64_sse2 );
DECL_X4( sad, cache64_ssse3 );
-DECL_PIXELS( int, var, mmxext, ( uint8_t *pix, int i_stride, uint32_t *sad ))
-DECL_PIXELS( int, var, sse2, ( uint8_t *pix, int i_stride, uint32_t *sad ))
+DECL_PIXELS( int, var, mmxext, ( uint8_t *pix, int i_stride ))
+DECL_PIXELS( int, var, sse2, ( uint8_t *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, mmxext, ( uint8_t *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, sse2, ( uint8_t *pix, int i_stride ))
DECL_PIXELS( uint64_t, hadamard_ac, ssse3, ( uint8_t *pix, int i_stride ))
diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
index 712a721..5a74172 100644
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
@@ -174,8 +174,8 @@ static NOINLINE int ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_frame_t *f
* and putting it after floating point ops. As a result, we put the emms at the end of the
* function and make sure that its always called before the float math. Noinline makes
* sure no reordering goes on. */
- unsigned int var=0, sad, i;
- for( i=0; i<3; i++ )
+ unsigned int var = 0, i;
+ for( i = 0; i < 3; i++ )
{
int w = i ? 8 : 16;
int stride = frame->i_stride[i];
@@ -184,7 +184,7 @@ static NOINLINE int ac_energy_mb( x264_t *h, int mb_x, int mb_y, x264_frame_t *f
: w * (mb_x + mb_y * stride);
int pix = i ? PIXEL_8x8 : PIXEL_16x16;
stride <<= h->mb.b_interlaced;
- var += h->pixf.var[pix]( frame->plane[i]+offset, stride, &sad );
+ var += h->pixf.var[pix]( frame->plane[i]+offset, stride );
}
var = X264_MAX(var,1);
x264_emms();
diff --git a/muxers.c b/muxers.c
index 51f82bb..fffa99b 100644
--- a/muxers.c
+++ b/muxers.c
@@ -290,7 +290,7 @@ int read_frame_y4m( x264_picture_t *p_pic, hnd_t handle, int i_frame )
header[slen] = 0;
if (strncmp(header, Y4M_FRAME_MAGIC, slen))
{
- fprintf(stderr, "Bad header magic (%08X <=> %s)\n",
+ fprintf(stderr, "Bad header magic (%"PRIx32" <=> %s)\n",
*((uint32_t*)header), header);
return -1;
}
diff --git a/tools/checkasm.c b/tools/checkasm.c
index fff9844..fd38f1f 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -330,16 +330,15 @@ static int check_pixel( int cpu_ref, int cpu_new )
#define TEST_PIXEL_VAR( i ) \
if( pixel_asm.var[i] != pixel_ref.var[i] ) \
{ \
- uint32_t res_c, res_asm; \
- uint32_t sad_c, sad_asm; \
+ int res_c, res_asm; \
set_func_name( "%s_%s", "var", pixel_names[i] ); \
used_asm = 1; \
- res_c = call_c( pixel_c.var[i], buf1, 16, &sad_c ); \
- res_asm = call_a( pixel_asm.var[i], buf1, 16, &sad_asm ); \
- if( (res_c != res_asm) || (sad_c != sad_asm) ) \
+ res_c = call_c( pixel_c.var[i], buf1, 16 ); \
+ res_asm = call_a( pixel_asm.var[i], buf1, 16 ); \
+ if( res_c != res_asm ) \
{ \
ok = 0; \
- fprintf( stderr, "var[%d]: %d,%d != %d,%d [FAILED]\n", i, res_c, sad_c, res_asm, sad_asm ); \
+ fprintf( stderr, "var[%d]: %d != %d [FAILED]\n", i, res_c, res_asm ); \
} \
}
More information about the x264-devel mailing list