[x264-devel] commit: Add dedicated variance function instead of using SAD+SSD ( David Pethes )
git version control
git at videolan.org
Sun Aug 17 00:52:01 CEST 2008
x264 | branch: master | David Pethes <imcold at centrum.sk> | Sat Aug 16 09:43:26 2008 -0600| [d994e0bca73aab306eb41f84483b9e1a36feb9c7] | committer: Jason Garrett-Glaser
Add dedicated variance function instead of using SAD+SSD
Faster variance calculation
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=d994e0bca73aab306eb41f84483b9e1a36feb9c7
---
common/pixel.c | 35 +++++++++++++++-
common/pixel.h | 2 +
common/x86/pixel-a.asm | 106 ++++++++++++++++++++++++++++++++++++++++++++++++
common/x86/pixel.h | 5 ++
encoder/ratecontrol.c | 6 +--
tools/checkasm.c | 21 +++++++++
6 files changed, 170 insertions(+), 5 deletions(-)
diff --git a/common/pixel.c b/common/pixel.c
index afbf309..27575b5 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -153,6 +153,33 @@ static inline void pixel_sub_wxh( int16_t *diff, int i_size,
/****************************************************************************
+ * pixel_var_wxh
+ ****************************************************************************/
+#define PIXEL_VAR_C( name, w, shift ) \
+static int name( uint8_t *pix, int i_stride, uint32_t *sad ) \
+{ \
+ uint32_t var = 0, sum = 0, sqr = 0; \
+ int x, y; \
+ for( y = 0; y < w; y++ ) \
+ { \
+ for( x = 0; x < w; x++ ) \
+ { \
+ sum += pix[x]; \
+ sqr += pix[x] * pix[x]; \
+ } \
+ pix += i_stride; \
+ } \
+ var = sqr - (sum * sum >> shift); \
+ *sad = sum; \
+ return var; \
+}
+
+PIXEL_VAR_C( x264_pixel_var_16x16, 16, 8 )
+PIXEL_VAR_C( x264_pixel_var_8x8, 8, 6 )
+
+
+
+/****************************************************************************
* pixel_satd_WxH: sum of 4x4 Hadamard transformed differences
****************************************************************************/
static int pixel_satd_wxh( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height )
@@ -532,6 +559,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT4( sa8d, );
INIT_ADS( );
+ pixf->var[PIXEL_16x16] = x264_pixel_var_16x16;
+ pixf->var[PIXEL_8x8] = x264_pixel_var_8x8;
+
pixf->ssim_4x4x2_core = ssim_4x4x2_core;
pixf->ssim_end4 = ssim_end4;
@@ -550,7 +580,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT7( satd_x3, _mmxext );
INIT7( satd_x4, _mmxext );
INIT_ADS( _mmxext );
-
+ pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmxext;
+ pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_mmxext;
#ifdef ARCH_X86
pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmxext;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_mmxext;
@@ -592,6 +623,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT2( sad_x3, _sse2 );
INIT2( sad_x4, _sse2 );
INIT_ADS( _sse2 );
+ pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2;
#ifdef ARCH_X86
if( cpu&X264_CPU_CACHELINE_64 )
@@ -608,6 +640,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT5( satd, _sse2 );
INIT5( satd_x3, _sse2 );
INIT5( satd_x4, _sse2 );
+ pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_sse2;
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_sse2;
pixf->ssim_end4 = x264_pixel_ssim_end4_sse2;
pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2;
diff --git a/common/pixel.h b/common/pixel.h
index 3d94bf0..fd23680 100644
--- a/common/pixel.h
+++ b/common/pixel.h
@@ -26,6 +26,7 @@
typedef int (*x264_pixel_cmp_t) ( uint8_t *, int, uint8_t *, int );
typedef void (*x264_pixel_cmp_x3_t) ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int[3] );
typedef void (*x264_pixel_cmp_x4_t) ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int[4] );
+typedef int (*x264_pixel_var_t) ( uint8_t *, int, uint32_t * );
enum
{
@@ -71,6 +72,7 @@ typedef struct
x264_pixel_cmp_t fpelcmp[7]; /* either satd or sad for fullpel motion search */
x264_pixel_cmp_x3_t fpelcmp_x3[7];
x264_pixel_cmp_x4_t fpelcmp_x4[7];
+ x264_pixel_var_t var[4];
void (*ssim_4x4x2_core)( const uint8_t *pix1, int stride1,
const uint8_t *pix2, int stride2, int sums[2][4] );
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index 6b087ec..7816e9a 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -162,6 +162,112 @@ SSD 8, 8, sse2
SSD 8, 4, sse2
+;=============================================================================
+; variance
+;=============================================================================
+
+%macro VAR_START 0
+ pxor m5, m5 ; sum
+ pxor m6, m6 ; sum squared
+ pxor m7, m7 ; zero
+%ifdef ARCH_X86_64
+ %define t3d r3d
+%else
+ %define t3d r2d
+%endif
+%endmacro
+
+%macro VAR_END 1
+%if mmsize == 16
+ movhlps m0, m5
+ paddw m5, m0
+%endif
+ movifnidn r2d, r2m
+ movd r1d, m5
+ movd [r2], m5 ; return sum
+ imul r1d, r1d
+ HADDD m6, m1
+ shr r1d, %1
+ movd eax, m6
+ sub eax, r1d ; sqr - (sum * sum >> shift)
+ RET
+%endmacro
+
+%macro VAR_2ROW 2
+ mov t3d, %2
+.loop:
+ mova m0, [r0]
+ mova m1, m0
+ mova m3, [r0+%1]
+ mova m2, m0
+ punpcklbw m0, m7
+ mova m4, m3
+ punpckhbw m1, m7
+%ifidn %1, r1
+ lea r0, [r0+%1*2]
+%else
+ add r0, r1
+%endif
+ punpckhbw m4, m7
+ psadbw m2, m7
+ paddw m5, m2
+ mova m2, m3
+ punpcklbw m3, m7
+ dec t3d
+ psadbw m2, m7
+ pmaddwd m0, m0
+ paddw m5, m2
+ pmaddwd m1, m1
+ paddd m6, m0
+ pmaddwd m3, m3
+ paddd m6, m1
+ pmaddwd m4, m4
+ paddd m6, m3
+ paddd m6, m4
+ jg .loop
+%endmacro
+
+;-----------------------------------------------------------------------------
+; int x264_pixel_var_wxh_mmxext( uint8_t *, int, uint32_t * )
+;-----------------------------------------------------------------------------
+INIT_MMX
+cglobal x264_pixel_var_16x16_mmxext, 2,3
+ VAR_START
+ VAR_2ROW 8, 16
+ VAR_END 8
+
+cglobal x264_pixel_var_8x8_mmxext, 2,3
+ VAR_START
+ VAR_2ROW r1, 4
+ VAR_END 6
+
+INIT_XMM
+cglobal x264_pixel_var_16x16_sse2, 2,3
+ VAR_START
+ VAR_2ROW r1, 8
+ VAR_END 8
+
+cglobal x264_pixel_var_8x8_sse2, 2,3
+ VAR_START
+ mov t3d, 4
+.loop:
+ movh m0, [r0]
+ movhps m0, [r0+r1]
+ lea r0, [r0+r1*2]
+ mova m1, m0
+ punpcklbw m0, m7
+ mova m2, m1
+ punpckhbw m1, m7
+ dec t3d
+ pmaddwd m0, m0
+ pmaddwd m1, m1
+ psadbw m2, m7
+ paddw m5, m2
+ paddd m6, m0
+ paddd m6, m1
+ jnz .loop
+ VAR_END 6
+
;=============================================================================
; SATD
diff --git a/common/x86/pixel.h b/common/x86/pixel.h
index 491b921..9326a84 100644
--- a/common/x86/pixel.h
+++ b/common/x86/pixel.h
@@ -68,6 +68,11 @@ DECL_X4( sad, cache64_ssse3 );
#undef DECL_X1
#undef DECL_X4
+int x264_pixel_var_16x16_mmxext( uint8_t *pix, int i_stride, uint32_t *sad );
+int x264_pixel_var_16x16_sse2 ( uint8_t *pix, int i_stride, uint32_t *sad );
+int x264_pixel_var_8x8_mmxext ( uint8_t *pix, int i_stride, uint32_t *sad );
+int x264_pixel_var_8x8_sse2 ( uint8_t *pix, int i_stride, uint32_t *sad );
+
void x264_intra_satd_x3_4x4_mmxext( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_4x4_ssse3( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_8x8c_mmxext( uint8_t *, uint8_t *, int * );
diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
index f631a95..d00545d 100644
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
@@ -187,7 +187,7 @@ static NOINLINE int ac_energy_mb( x264_t *h, int mb_x, int mb_y, int *satd )
/* FIXME: This array is larger than necessary because a bug in GCC causes an all-zero
* array to be placed in .bss despite .bss not being correctly aligned on some platforms (win32?) */
DECLARE_ALIGNED_16( static uint8_t zero[17] ) = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1};
- unsigned int var=0, sad, ssd, i;
+ unsigned int var=0, sad, i;
if( satd || h->param.rc.i_aq_mode == X264_AQ_GLOBAL )
{
for( i=0; i<3; i++ )
@@ -199,9 +199,7 @@ static NOINLINE int ac_energy_mb( x264_t *h, int mb_x, int mb_y, int *satd )
: w * (mb_x + mb_y * stride);
int pix = i ? PIXEL_8x8 : PIXEL_16x16;
stride <<= h->mb.b_interlaced;
- sad = h->pixf.sad[pix]( zero, 0, h->fenc->plane[i]+offset, stride );
- ssd = h->pixf.ssd[pix]( zero, 0, h->fenc->plane[i]+offset, stride );
- var += ssd - (sad * sad >> (i?6:8));
+ var += h->pixf.var[pix]( h->fenc->plane[i]+offset, stride, &sad );
// SATD to represent the block's overall complexity (bit cost) for intra encoding.
// exclude the DC coef, because nothing short of an actual intra prediction will estimate DC cost.
if( var && satd )
diff --git a/tools/checkasm.c b/tools/checkasm.c
index ae6b7a6..4813abc 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -302,6 +302,27 @@ static int check_pixel( int cpu_ref, int cpu_new )
TEST_PIXEL_X(3);
TEST_PIXEL_X(4);
+#define TEST_PIXEL_VAR( i ) \
+ if( pixel_asm.var[i] != pixel_ref.var[i] ) \
+ { \
+ uint32_t res_c, res_asm; \
+ uint32_t sad_c, sad_asm; \
+ set_func_name( "%s_%s", "var", pixel_names[i] ); \
+ used_asm = 1; \
+ res_c = call_c( pixel_c.var[i], buf1, 16, &sad_c ); \
+ res_asm = call_a( pixel_asm.var[i], buf1, 16, &sad_asm ); \
+ if( (res_c != res_asm) || (sad_c != sad_asm) ) \
+ { \
+ ok = 0; \
+ fprintf( stderr, "var[%d]: %d,%d != %d,%d [FAILED]\n", i, res_c, sad_c, res_asm, sad_asm ); \
+ } \
+ }
+
+ ok = 1; used_asm = 0;
+ TEST_PIXEL_VAR( PIXEL_16x16 );
+ TEST_PIXEL_VAR( PIXEL_8x8 );
+ report( "pixel var :" );
+
#define TEST_INTRA_SATD( name, pred, satd, i8x8, ... ) \
if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \
{ \
More information about the x264-devel
mailing list