[x264-devel] MBAFF: Create a VSAD DSP function
Simon Horlick
git at videolan.org
Thu May 12 08:39:06 CEST 2011
x264 | branch: master | Simon Horlick <simonhorlick at gmail.com> | Sat Apr 23 10:44:04 2011 +0100| [6ec00221dd253e70c698d57a6207a8363262561e] | committer: Jason Garrett-Glaser
MBAFF: Create a VSAD DSP function
x86 assembly by Jason Garrett-Glaser. This gives a roughly 30x speed
increase over the C version.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=6ec00221dd253e70c698d57a6207a8363262561e
---
common/pixel.c | 11 +++++++++
common/pixel.h | 1 +
common/x86/pixel.h | 2 +
common/x86/sad-a.asm | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++
tools/checkasm.c | 16 +++++++++++++
5 files changed, 89 insertions(+), 0 deletions(-)
diff --git a/common/pixel.c b/common/pixel.c
index adc1eb2..58439af 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -641,6 +641,14 @@ float x264_pixel_ssim_wxh( x264_pixel_function_t *pf,
return ssim;
}
+int pixel_vsad( pixel *src, int stride ) /* Vertical SAD: sums |pixel - pixel directly below| over a 16-pixel-wide block spanning 16 rows; stride is the row pitch in pixels. */
+{
+ int score = 0;
+ for( int i = 1; i < 16; i++, src += stride ) /* 15 passes: row k against row k+1 for k = 0..14 */
+ for( int j = 0; j < 16; j++ ) /* 16 pixels per row */
+ score += abs(src[j] - src[j+stride]); /* src[j+stride] is the pixel one row below src[j] */
+ return score; /* total over all 15 row pairs */
+}
/****************************************************************************
* successive elimination
@@ -746,6 +754,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->ssim_4x4x2_core = ssim_4x4x2_core;
pixf->ssim_end4 = ssim_end4;
pixf->var2_8x8 = pixel_var2_8x8;
+ pixf->vsad = pixel_vsad;
pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4;
pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4;
@@ -873,6 +882,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmxext;
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_mmxext;
pixf->var2_8x8 = x264_pixel_var2_8x8_mmxext;
+ pixf->vsad = x264_pixel_vsad_mmxext;
if( cpu&X264_CPU_CACHELINE_32 )
{
@@ -921,6 +931,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2;
#endif
pixf->var2_8x8 = x264_pixel_var2_8x8_sse2;
+ pixf->vsad = x264_pixel_vsad_sse2;
}
if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
diff --git a/common/pixel.h b/common/pixel.h
index 7b2f1b1..c464345 100644
--- a/common/pixel.h
+++ b/common/pixel.h
@@ -82,6 +82,7 @@ typedef struct
x264_pixel_cmp_x3_t fpelcmp_x3[7];
x264_pixel_cmp_x4_t fpelcmp_x4[7];
x264_pixel_cmp_t sad_aligned[7]; /* Aligned SAD for mbcmp */
+ int (*vsad)( pixel *, int );
int (*var2_8x8)( pixel *, int, pixel *, int, int * );
uint64_t (*var[4])( pixel *pix, int stride );
diff --git a/common/x86/pixel.h b/common/x86/pixel.h
index 4e1b8c5..bc369ed 100644
--- a/common/x86/pixel.h
+++ b/common/x86/pixel.h
@@ -138,6 +138,8 @@ float x264_pixel_ssim_end4_avx( int sum0[5][4], int sum1[5][4], int width );
int x264_pixel_var2_8x8_mmxext( pixel *, int, pixel *, int, int * );
int x264_pixel_var2_8x8_sse2( pixel *, int, pixel *, int, int * );
int x264_pixel_var2_8x8_ssse3( uint8_t *, int, uint8_t *, int, int * );
+int x264_pixel_vsad_mmxext( pixel *src, int stride );
+int x264_pixel_vsad_sse2( pixel *src, int stride );
#define DECL_ADS( size, suffix ) \
int x264_pixel_ads##size##_##suffix( int enc_dc[size], uint16_t *sums, int delta,\
diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm
index e88f0c2..a214b47 100644
--- a/common/x86/sad-a.asm
+++ b/common/x86/sad-a.asm
@@ -273,6 +273,65 @@ cglobal pixel_sad_8x16_sse2, 4,4
RET
;-----------------------------------------------------------------------------
+; int pixel_vsad( pixel *src, int stride );
+;-----------------------------------------------------------------------------
+
+%ifndef ARCH_X86_64
+INIT_MMX ; presumably selects 64-bit MMX registers for m0-m5 (x86inc macro, not shown here)
+cglobal pixel_vsad_mmxext, 2,3 ; vertical SAD of a 16-byte-wide block; built only when ARCH_X86_64 is undefined. r0 = src, r1 = stride (presumably mapped by cglobal), r2 = loop counter
+ mova m0, [r0+0] ; row 0, first 8 bytes
+ mova m1, [r0+8] ; row 0, second 8 bytes
+ mova m2, [r0+r1+0] ; row 1, first 8 bytes
+ mova m3, [r0+r1+8] ; row 1, second 8 bytes
+ lea r0, [r0+r1*2] ; advance the base pointer by two rows
+ psadbw m0, m2 ; m0 = SAD(row 0, row 1) for the first half; doubles as the running total
+ psadbw m1, m3 ; m1 = same for the second half
+ mov r2d, 7 ; 7 iterations x 2 row pairs + the pair above = 15 row pairs
+.loop:
+ mova m4, [r0+0] ; next row (even index), first half
+ mova m5, [r0+8] ; next row (even index), second half
+ psadbw m2, m4 ; previous odd row vs. this even row
+ psadbw m3, m5
+ paddw m0, m2 ; accumulate first-half sum
+ paddw m1, m3 ; accumulate second-half sum
+ mova m2, [r0+r1+0] ; following row (odd index); stays in m2/m3 for the next iteration
+ mova m3, [r0+r1+8]
+ lea r0, [r0+r1*2] ; advance the base pointer by two rows
+ psadbw m4, m2 ; even row vs. the odd row below it
+ psadbw m5, m3
+ paddw m0, m4
+ paddw m1, m5
+ dec r2d
+ jg .loop ; repeat while the counter is still positive
+ paddw m0, m1 ; combine the two half-row totals
+ movd eax, m0 ; result returned in eax
+ RET
+%endif
+
+INIT_XMM ; presumably selects 128-bit XMM registers for m0-m2 (x86inc macro, not shown here)
+cglobal pixel_vsad_sse2, 2,2 ; vertical SAD of a 16-byte-wide block, fully unrolled; r0 = src, r1 = stride (presumably mapped by cglobal)
+ mova m1, [r0] ; row 0 becomes the initial "previous row"
+%assign i 1
+%rep 15
+ mova m2, [r0+r1*(i&1)] ; load row i: at r0+r1 when i is odd, at r0 when i is even
+%if i&1
+ lea r0, [r0+r1*2] ; after an odd row, move the base pointer down two rows
+%endif
+ psadbw m1, m2 ; m1 = SAD(previous row, row i), one partial sum per 8-byte half
+%if i>1
+ paddw m0, m1 ; add this row pair to the running total
+%else
+ SWAP 0, 1 ; first pair: its SAD becomes the accumulator m0 (SWAP is an external macro; presumably renames registers)
+%endif
+ SWAP 1, 2 ; the row just loaded becomes the "previous row" for the next pass
+%assign i i+1
+%endrep
+ movhlps m1, m0 ; copy the high-half partial sum into the low half of m1
+ paddw m0, m1 ; add the two partial sums
+ movd eax, m0 ; result returned in eax
+ RET
+
+;-----------------------------------------------------------------------------
; void intra_sad_x3_4x4( uint8_t *fenc, uint8_t *fdec, int res[3] );
;-----------------------------------------------------------------------------
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 6ba78e1..7c9bddc 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -417,6 +417,22 @@ static int check_pixel( int cpu_ref, int cpu_new )
}
report( "pixel hadamard_ac :" );
+ ok = 1; used_asm = 0;
+ if( pixel_asm.vsad != pixel_ref.vsad )
+ {
+ int res_c, res_asm;
+ set_func_name( "vsad" );
+ used_asm = 1;
+ res_c = call_c( pixel_c.vsad, pbuf1, 16 );
+ res_asm = call_a( pixel_asm.vsad, pbuf1, 16 );
+ if( res_c != res_asm )
+ {
+ ok = 0;
+ fprintf( stderr, "vsad: %d != %d\n", res_c, res_asm );
+ }
+ }
+ report( "pixel vsad :" );
+
#define TEST_INTRA_MBCMP( name, pred, satd, i8x8, ... ) \
if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \
{ \
More information about the x264-devel
mailing list