[x264-devel] MBAFF: Create a VSAD DSP function

Simon Horlick git at videolan.org
Thu May 12 08:39:06 CEST 2011


x264 | branch: master | Simon Horlick <simonhorlick at gmail.com> | Sat Apr 23 10:44:04 2011 +0100| [6ec00221dd253e70c698d57a6207a8363262561e] | committer: Jason Garrett-Glaser

MBAFF: Create a VSAD DSP function

x86 assembly by Jason Garrett-Glaser. This gives a roughly 30x speed
increase over the C version.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=6ec00221dd253e70c698d57a6207a8363262561e
---

 common/pixel.c       |   11 +++++++++
 common/pixel.h       |    1 +
 common/x86/pixel.h   |    2 +
 common/x86/sad-a.asm |   59 ++++++++++++++++++++++++++++++++++++++++++++++++++
 tools/checkasm.c     |   16 +++++++++++++
 5 files changed, 89 insertions(+), 0 deletions(-)

diff --git a/common/pixel.c b/common/pixel.c
index adc1eb2..58439af 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -641,6 +641,14 @@ float x264_pixel_ssim_wxh( x264_pixel_function_t *pf,
     return ssim;
 }
 
+int pixel_vsad( pixel *src, int stride ) /* vertical SAD: sum of |row - next row| over a 16-wide, 16-row area */
+{
+    int score = 0;
+    for( int i = 1; i < 16; i++, src += stride ) /* 15 adjacent row pairs; src steps down one row per pass */
+        for( int j = 0; j < 16; j++ ) /* 16 pixels across each row */
+            score += abs(src[j] - src[j+stride]); /* pixel vs. the one directly below it */
+    return score;
+}
 
 /****************************************************************************
  * successive elimination
@@ -746,6 +754,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
     pixf->ssim_4x4x2_core = ssim_4x4x2_core;
     pixf->ssim_end4 = ssim_end4;
     pixf->var2_8x8 = pixel_var2_8x8;
+    pixf->vsad = pixel_vsad;
 
     pixf->intra_sad_x3_4x4    = x264_intra_sad_x3_4x4;
     pixf->intra_satd_x3_4x4   = x264_intra_satd_x3_4x4;
@@ -873,6 +882,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmxext;
         pixf->ssim_4x4x2_core  = x264_pixel_ssim_4x4x2_core_mmxext;
         pixf->var2_8x8 = x264_pixel_var2_8x8_mmxext;
+        pixf->vsad = x264_pixel_vsad_mmxext;
 
         if( cpu&X264_CPU_CACHELINE_32 )
         {
@@ -921,6 +931,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2;
 #endif
         pixf->var2_8x8 = x264_pixel_var2_8x8_sse2;
+        pixf->vsad = x264_pixel_vsad_sse2;
     }
 
     if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
diff --git a/common/pixel.h b/common/pixel.h
index 7b2f1b1..c464345 100644
--- a/common/pixel.h
+++ b/common/pixel.h
@@ -82,6 +82,7 @@ typedef struct
     x264_pixel_cmp_x3_t fpelcmp_x3[7];
     x264_pixel_cmp_x4_t fpelcmp_x4[7];
     x264_pixel_cmp_t sad_aligned[7]; /* Aligned SAD for mbcmp */
+    int (*vsad)( pixel *, int );
     int (*var2_8x8)( pixel *, int, pixel *, int, int * );
 
     uint64_t (*var[4])( pixel *pix, int stride );
diff --git a/common/x86/pixel.h b/common/x86/pixel.h
index 4e1b8c5..bc369ed 100644
--- a/common/x86/pixel.h
+++ b/common/x86/pixel.h
@@ -138,6 +138,8 @@ float x264_pixel_ssim_end4_avx( int sum0[5][4], int sum1[5][4], int width );
 int  x264_pixel_var2_8x8_mmxext( pixel *, int, pixel *, int, int * );
 int  x264_pixel_var2_8x8_sse2( pixel *, int, pixel *, int, int * );
 int  x264_pixel_var2_8x8_ssse3( uint8_t *, int, uint8_t *, int, int * );
+int  x264_pixel_vsad_mmxext( pixel *src, int stride );
+int  x264_pixel_vsad_sse2( pixel *src, int stride );
 
 #define DECL_ADS( size, suffix ) \
 int x264_pixel_ads##size##_##suffix( int enc_dc[size], uint16_t *sums, int delta,\
diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm
index e88f0c2..a214b47 100644
--- a/common/x86/sad-a.asm
+++ b/common/x86/sad-a.asm
@@ -273,6 +273,65 @@ cglobal pixel_sad_8x16_sse2, 4,4
     RET
 
 ;-----------------------------------------------------------------------------
+; int pixel_vsad( pixel *src, int stride );
+;-----------------------------------------------------------------------------
+
+%ifndef ARCH_X86_64
+INIT_MMX
+cglobal pixel_vsad_mmxext, 2,3
+    mova      m0, [r0+0]        ; row 0, offset +0
+    mova      m1, [r0+8]        ; row 0, offset +8
+    mova      m2, [r0+r1+0]     ; row 1, offset +0
+    mova      m3, [r0+r1+8]     ; row 1, offset +8
+    lea       r0, [r0+r1*2]     ; advance two rows
+    psadbw    m0, m2            ; m0/m1 start the running sums with SAD(row 0, row 1)
+    psadbw    m1, m3
+    mov      r2d, 7             ; 7 passes x 2 row pairs + the pair above = 15 row pairs
+.loop:
+    mova      m4, [r0+0]        ; next even row
+    mova      m5, [r0+8]
+    psadbw    m2, m4            ; SAD(previous odd row, this even row)
+    psadbw    m3, m5
+    paddw     m0, m2
+    paddw     m1, m3
+    mova      m2, [r0+r1+0]     ; next odd row; stays in m2/m3 for the following pass
+    mova      m3, [r0+r1+8]
+    lea       r0, [r0+r1*2]
+    psadbw    m4, m2            ; SAD(this even row, this odd row)
+    psadbw    m5, m3
+    paddw     m0, m4
+    paddw     m1, m5
+    dec      r2d
+    jg .loop
+    paddw     m0, m1            ; combine the two per-half running sums
+    movd     eax, m0            ; result returned in eax
+    RET
+%endif
+
+INIT_XMM
+cglobal pixel_vsad_sse2, 2,2
+    mova      m1, [r0]              ; row 0 becomes the initial "previous row"
+%assign i 1
+%rep 15
+    mova      m2, [r0+r1*(i&1)]     ; current row: r0+stride when i is odd, r0 itself when i is even
+%if i&1
+    lea       r0, [r0+r1*2]         ; after an odd row, step r0 forward two rows
+%endif
+    psadbw    m1, m2                ; m1 = SAD(previous row, current row)
+%if i>1
+    paddw     m0, m1                ; accumulate into the running sum
+%else
+    SWAP       0, 1                 ; first result simply becomes the accumulator m0
+%endif
+    SWAP       1, 2                 ; current row is renamed to m1: the next pass's previous row
+%assign i i+1
+%endrep
+    movhlps   m1, m0                ; bring the high-half partial sum down
+    paddw     m0, m1                ; add it to the low-half partial sum
+    movd     eax, m0                ; result returned in eax
+    RET
+
+;-----------------------------------------------------------------------------
 ; void intra_sad_x3_4x4( uint8_t *fenc, uint8_t *fdec, int res[3] );
 ;-----------------------------------------------------------------------------
 
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 6ba78e1..7c9bddc 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -417,6 +417,22 @@ static int check_pixel( int cpu_ref, int cpu_new )
         }
     report( "pixel hadamard_ac :" );
 
+    ok = 1; used_asm = 0;
+    if( pixel_asm.vsad != pixel_ref.vsad )
+    {
+        int res_c, res_asm;
+        set_func_name( "vsad" );
+        used_asm = 1;
+        res_c   = call_c( pixel_c.vsad,   pbuf1, 16 );
+        res_asm = call_a( pixel_asm.vsad, pbuf1, 16 );
+        if( res_c != res_asm )
+        {
+            ok = 0;
+            fprintf( stderr, "vsad: %d != %d\n", res_c, res_asm );
+        }
+    }
+    report( "pixel vsad :" );
+
 #define TEST_INTRA_MBCMP( name, pred, satd, i8x8, ... ) \
     if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \
     { \



More information about the x264-devel mailing list