[x264-devel] commit: Add dedicated variance function instead of using SAD+SSD ( David Pethes )

Sun Aug 17 00:52:01 CEST 2008

x264 | branch: master | David Pethes <imcold at centrum.sk> | Sat Aug 16 09:43:26 2008 -0600| [d994e0bca73aab306eb41f84483b9e1a36feb9c7] | committer: Jason Garrett-Glaser 

Add dedicated variance function instead of using SAD+SSD
Faster variance calculation

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=d994e0bca73aab306eb41f84483b9e1a36feb9c7
---

 common/pixel.c         |   35 +++++++++++++++-
 common/pixel.h         |    2 +
 common/x86/pixel-a.asm |  106 ++++++++++++++++++++++++++++++++++++++++++++++++
 common/x86/pixel.h     |    5 ++
 encoder/ratecontrol.c  |    6 +--
 tools/checkasm.c       |   21 +++++++++
 6 files changed, 170 insertions(+), 5 deletions(-)

diff --git a/common/pixel.c b/common/pixel.c
index afbf309..27575b5 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -153,6 +153,33 @@ static inline void pixel_sub_wxh( int16_t *diff, int i_size,
 
 
 /****************************************************************************
+ * pixel_var_wxh: sum of squared pixels minus (pixel sum)^2 >> shift, w x w block
+ ****************************************************************************/
+#define PIXEL_VAR_C( name, w, shift ) \
+static int name( uint8_t *pix, int i_stride, uint32_t *sad ) \
+{                                             \
+    uint32_t var = 0, sum = 0, sqr = 0;       /* sum: pixel total, sqr: total of squared pixels */ \
+    int x, y;                                 \
+    for( y = 0; y < w; y++ )                  /* w rows: w bounds both loops, so the block is square */ \
+    {                                         \
+        for( x = 0; x < w; x++ )              \
+        {                                     \
+            sum += pix[x];                    \
+            sqr += pix[x] * pix[x];           \
+        }                                     \
+        pix += i_stride;                      /* step to the next row */ \
+    }                                         \
+    var = sqr - (sum * sum >> shift);         /* both users pass shift = log2(w*w), i.e. sqr - sum*sum/(w*w) */ \
+    *sad = sum;                               /* despite its name, *sad receives the plain pixel sum */ \
+    return var;                               /* uint32_t result handed back through the int return type */ \
+}
+
+PIXEL_VAR_C( x264_pixel_var_16x16, 16, 8 ) /* 16*16 = 256 pixels = 1 << 8; max sum 255*256, so sum*sum fits in 32 bits */
+PIXEL_VAR_C( x264_pixel_var_8x8,    8, 6 ) /*  8*8  =  64 pixels = 1 << 6 */
+
+
+
+/****************************************************************************
  * pixel_satd_WxH: sum of 4x4 Hadamard transformed differences
  ****************************************************************************/
 static int pixel_satd_wxh( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height )
@@ -532,6 +559,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
     INIT4( sa8d, );
     INIT_ADS( );
 
+    pixf->var[PIXEL_16x16] = x264_pixel_var_16x16;
+    pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8;
+
     pixf->ssim_4x4x2_core = ssim_4x4x2_core;
     pixf->ssim_end4 = ssim_end4;
 
@@ -550,7 +580,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         INIT7( satd_x3, _mmxext );
         INIT7( satd_x4, _mmxext );
         INIT_ADS( _mmxext );
-
+        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmxext;
+        pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_mmxext;
 #ifdef ARCH_X86
         pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmxext;
         pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_mmxext;
@@ -592,6 +623,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         INIT2( sad_x3, _sse2 );
         INIT2( sad_x4, _sse2 );
         INIT_ADS( _sse2 );
+        pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2;
 
 #ifdef ARCH_X86
         if( cpu&X264_CPU_CACHELINE_64 )
@@ -608,6 +640,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         INIT5( satd, _sse2 );
         INIT5( satd_x3, _sse2 );
         INIT5( satd_x4, _sse2 );
+        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_sse2;
         pixf->ssim_4x4x2_core  = x264_pixel_ssim_4x4x2_core_sse2;
         pixf->ssim_end4        = x264_pixel_ssim_end4_sse2;
         pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2;
diff --git a/common/pixel.h b/common/pixel.h
index 3d94bf0..fd23680 100644
--- a/common/pixel.h
+++ b/common/pixel.h
@@ -26,6 +26,7 @@
 typedef int  (*x264_pixel_cmp_t) ( uint8_t *, int, uint8_t *, int );
 typedef void (*x264_pixel_cmp_x3_t) ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int[3] );
 typedef void (*x264_pixel_cmp_x4_t) ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int[4] );
+typedef int  (*x264_pixel_var_t) ( uint8_t *, int, uint32_t * );
 
 enum
 {
@@ -71,6 +72,7 @@ typedef struct
     x264_pixel_cmp_t fpelcmp[7]; /* either satd or sad for fullpel motion search */
     x264_pixel_cmp_x3_t fpelcmp_x3[7];
     x264_pixel_cmp_x4_t fpelcmp_x4[7];
+    x264_pixel_var_t var[4];
 
     void (*ssim_4x4x2_core)( const uint8_t *pix1, int stride1,
                              const uint8_t *pix2, int stride2, int sums[2][4] );
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index 6b087ec..7816e9a 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -162,6 +162,112 @@ SSD  8,  8, sse2
 SSD  8,  4, sse2
 
 
+;=============================================================================
+; variance
+;=============================================================================
+
+%macro VAR_START 0
+    pxor  m5, m5    ; m5: running sum of pixels
+    pxor  m6, m6    ; m6: running sum of squared pixels
+    pxor  m7, m7    ; m7: zero, used for byte->word unpacking and as the psadbw reference
+%ifdef ARCH_X86_64
+    %define t3d r3d
+%else
+    %define t3d r2d
+%endif
+%endmacro
+
+%macro VAR_END 1
+%if mmsize == 16
+    movhlps m0, m5  ; XMM only: bring the high qword's partial sum down...
+    paddw   m5, m0  ; ...and fold it into the low qword
+%endif
+    movifnidn r2d, r2m ; NOTE(review): presumably loads the output-pointer argument unless it is already in r2 -- confirm against x86inc
+    movd   r1d, m5  ; r1d = pixel sum (r1 was the row stride inside the loops)
+    movd  [r2], m5  ; return sum
+    imul   r1d, r1d ; sum * sum
+    HADDD   m6, m1  ; NOTE(review): presumably a horizontal add of m6's dwords using m1 as scratch -- confirm macro definition
+    shr    r1d, %1  ; %1 = shift chosen by the caller (8 for 16x16, 6 for 8x8)
+    movd   eax, m6  ; eax = sum of squares
+    sub    eax, r1d  ; sqr - (sum * sum >> shift)
+    RET
+%endmacro
+
+%macro VAR_2ROW 2
+    mov      t3d, %2        ; second macro argument = number of loop iterations
+.loop:
+    mova      m0, [r0]      ; first load of this iteration
+    mova      m1, m0
+    mova      m3, [r0+%1]   ; second load, offset by the first macro argument
+    mova      m2, m0
+    punpcklbw m0, m7        ; low bytes of load 1 widened to words
+    mova      m4, m3
+    punpckhbw m1, m7        ; high bytes of load 1 widened to words
+%ifidn %1, r1
+    lea       r0, [r0+%1*2] ; offset is the stride: the two loads were two rows, skip both
+%else
+    add       r0, r1        ; constant offset (only use: 8, MMX 16x16): advance by one stride only
+%endif
+    punpckhbw m4, m7
+    psadbw    m2, m7        ; SAD against zero = sum of the bytes of load 1
+    paddw     m5, m2
+    mova      m2, m3
+    punpcklbw m3, m7
+    dec t3d
+    psadbw    m2, m7        ; byte sum of load 2
+    pmaddwd   m0, m0        ; square each word and add adjacent pairs into dwords
+    paddw     m5, m2
+    pmaddwd   m1, m1
+    paddd     m6, m0
+    pmaddwd   m3, m3
+    paddd     m6, m1
+    pmaddwd   m4, m4
+    paddd     m6, m3
+    paddd     m6, m4
+    jg .loop                ; uses the flags left by "dec t3d"; the SIMD ops in between do not touch EFLAGS
+%endmacro
+
+;-----------------------------------------------------------------------------
+; int x264_pixel_var_wxh_{mmxext,sse2}( uint8_t *, int, uint32_t * )
+;-----------------------------------------------------------------------------
+INIT_MMX
+cglobal x264_pixel_var_16x16_mmxext, 2,3
+    VAR_START
+    VAR_2ROW 8, 16  ; 16 iterations, each reading [r0] and [r0+8]
+    VAR_END 8       ; shift 8 = log2(16*16)
+
+cglobal x264_pixel_var_8x8_mmxext, 2,3
+    VAR_START
+    VAR_2ROW r1, 4  ; 4 iterations, each reading [r0] and [r0+r1]
+    VAR_END 6       ; shift 6 = log2(8*8)
+
+INIT_XMM
+cglobal x264_pixel_var_16x16_sse2, 2,3
+    VAR_START
+    VAR_2ROW r1, 8  ; 8 iterations, each reading [r0] and [r0+r1]
+    VAR_END 8       ; shift 8 = log2(16*16)
+
+cglobal x264_pixel_var_8x8_sse2, 2,3
+    VAR_START
+    mov t3d, 4                  ; 4 iterations, two rows each
+.loop:
+    movh      m0, [r0]          ; row n (NOTE(review): movh is presumably the x86inc 8-byte load -- confirm)
+    movhps    m0, [r0+r1]       ; row n+1 into the high 8 bytes
+    lea       r0, [r0+r1*2]     ; advance two rows
+    mova      m1, m0
+    punpcklbw m0, m7            ; low 8 bytes widened to words
+    mova      m2, m1
+    punpckhbw m1, m7            ; high 8 bytes widened to words
+    dec t3d
+    pmaddwd   m0, m0            ; square each word and add adjacent pairs into dwords
+    pmaddwd   m1, m1
+    psadbw    m2, m7            ; SAD against zero = per-qword byte sums of both rows
+    paddw     m5, m2
+    paddd     m6, m0
+    paddd     m6, m1
+    jnz .loop                   ; uses the flags left by "dec t3d"
+    VAR_END 6
+
 
 ;=============================================================================
 ; SATD
diff --git a/common/x86/pixel.h b/common/x86/pixel.h
index 491b921..9326a84 100644
--- a/common/x86/pixel.h
+++ b/common/x86/pixel.h
@@ -68,6 +68,11 @@ DECL_X4( sad, cache64_ssse3 );
 #undef DECL_X1
 #undef DECL_X4
 
+int x264_pixel_var_16x16_mmxext( uint8_t *pix, int i_stride, uint32_t *sad );
+int x264_pixel_var_16x16_sse2  ( uint8_t *pix, int i_stride, uint32_t *sad );
+int x264_pixel_var_8x8_mmxext  ( uint8_t *pix, int i_stride, uint32_t *sad );
+int x264_pixel_var_8x8_sse2    ( uint8_t *pix, int i_stride, uint32_t *sad );
+
 void x264_intra_satd_x3_4x4_mmxext( uint8_t *, uint8_t *, int * );
 void x264_intra_satd_x3_4x4_ssse3( uint8_t *, uint8_t *, int * );
 void x264_intra_satd_x3_8x8c_mmxext( uint8_t *, uint8_t *, int * );
diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
index f631a95..d00545d 100644
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
@@ -187,7 +187,7 @@ static NOINLINE int ac_energy_mb( x264_t *h, int mb_x, int mb_y, int *satd )
     /* FIXME: This array is larger than necessary because a bug in GCC causes an all-zero
     * array to be placed in .bss despite .bss not being correctly aligned on some platforms (win32?) */
     DECLARE_ALIGNED_16( static uint8_t zero[17] ) = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1};
-    unsigned int var=0, sad, ssd, i;
+    unsigned int var=0, sad, i;
     if( satd || h->param.rc.i_aq_mode == X264_AQ_GLOBAL )
     {
         for( i=0; i<3; i++ )
@@ -199,9 +199,7 @@ static NOINLINE int ac_energy_mb( x264_t *h, int mb_x, int mb_y, int *satd )
                 : w * (mb_x + mb_y * stride);
             int pix = i ? PIXEL_8x8 : PIXEL_16x16;
             stride <<= h->mb.b_interlaced;
-            sad = h->pixf.sad[pix]( zero, 0, h->fenc->plane[i]+offset, stride );
-            ssd = h->pixf.ssd[pix]( zero, 0, h->fenc->plane[i]+offset, stride );
-            var += ssd - (sad * sad >> (i?6:8));
+            var += h->pixf.var[pix]( h->fenc->plane[i]+offset, stride, &sad );
             // SATD to represent the block's overall complexity (bit cost) for intra encoding.
             // exclude the DC coef, because nothing short of an actual intra prediction will estimate DC cost.
             if( var && satd )
diff --git a/tools/checkasm.c b/tools/checkasm.c
index ae6b7a6..4813abc 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -302,6 +302,27 @@ static int check_pixel( int cpu_ref, int cpu_new )
     TEST_PIXEL_X(3);
     TEST_PIXEL_X(4);
 
+#define TEST_PIXEL_VAR( i ) \
+    if( pixel_asm.var[i] != pixel_ref.var[i] ) /* skip sizes where both tables hold the same function pointer */ \
+    { \
+        uint32_t res_c, res_asm; \
+        uint32_t sad_c, sad_asm; \
+        set_func_name( "%s_%s", "var", pixel_names[i] ); \
+        used_asm = 1; \
+        res_c   = call_c( pixel_c.var[i], buf1, 16, &sad_c ); \
+        res_asm = call_a( pixel_asm.var[i], buf1, 16, &sad_asm ); \
+        if( (res_c != res_asm) || (sad_c != sad_asm) ) /* both the return value and the stored sum must agree */ \
+        { \
+            ok = 0; \
+            fprintf( stderr, "var[%d]: %u,%u != %u,%u [FAILED]\n", i, res_c, sad_c, res_asm, sad_asm ); /* %u: the four results are uint32_t */ \
+        } \
+    }
+
+    ok = 1; used_asm = 0;
+    TEST_PIXEL_VAR( PIXEL_16x16 );
+    TEST_PIXEL_VAR( PIXEL_8x8 );
+    report( "pixel var :" );
+
 #define TEST_INTRA_SATD( name, pred, satd, i8x8, ... ) \
     if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \
     { \