/*
 * Patch description (reviewer notes only; the diff text below is unchanged).
 *
 * Introduces a block "variance" primitive with the signature
 *     int fn( uint8_t *pix, int i_stride, unsigned int *sad )
 * exposed through the new x264_pixel_var_t typedef and the var[2] member
 * added to the pixel function table in common/pixel.h.
 *
 * common/pixel.c
 *   - PIXEL_VAR_C( name, w, shift ) expands to a static function that walks
 *     a w-by-w block (advancing pix by i_stride per row), accumulates the
 *     pixel sum and the sum of squared pixels, stores the plain sum through
 *     the `sad` pointer, and returns sqr - (sum * sum >> shift).
 *     Instantiated as x264_pixel_var_16x16 (w 16, shift 8) and
 *     x264_pixel_var_8x8 (w 8, shift 6).
 *   - x264_pixel_init hunks: var[0] / var[1] are set to the C 16x16 / 8x8
 *     versions; later hunks of the same function assign the _mmxext and
 *     _sse2 variants. The CPU-flag conditions guarding those hunks lie
 *     outside the visible context.
 *
 * common/x86/pixel-a.asm
 *   - VAR_START clears m5, m6, m7 (annotated there as sum, sum squared,
 *     zero).
 *   - VAR_ROW_REGSIZE loads one register-width of pixels at [r0 + offset],
 *     unpacks bytes to words against m7, adds the words into m5 and the
 *     pmaddwd squares into m6.
 *   - VAR_END horizontally adds m5, stores it to [r2], squares it with
 *     pmuludq, shifts right by the macro argument, subtracts that from the
 *     horizontally added m6 and moves the result to r0.
 *   - Four cglobal entry points: 16x16 / 8x8 under INIT_XMM (_sse2) and
 *     under INIT_MMX (_mmxext); the shift arguments (8 and 6) match the C
 *     instantiations.
 *
 * common/x86/pixel.h
 *   - Prototypes for the four assembly entry points.
 *
 * encoder/ratecontrol.c
 *   - In the ac_energy_mb hunk, when ARCH_X86 or ARCH_X86_64 is defined the
 *     per-plane variance comes from h->pixf.var[i ? 1 : 0]; otherwise the
 *     previous sad/ssd-against-`zero` computation is kept.
 *
 * NOTE(review) -- items to confirm before merging:
 *   - VAR_END ends with `movd r0, m6`. Whether r0 is the integer return
 *     register on every supported ABI depends on the cglobal register
 *     mapping, which is not part of this patch; presumably it is not on
 *     x86_64 -- verify, and consider writing eax explicitly.
 *   - The assembly functions use r1 (the int stride) directly in
 *     `add r0, r1`; confirm whether the stride needs sign extension on
 *     64-bit targets.
 *   - The assembly prototype comment lists the third argument as `int *`
 *     while the C prototypes use `unsigned int *`.
 *   - The output parameter is named `sad` but receives the pixel sum.
 *   - x264_pixel_init installs the C versions in var[], so the #if in
 *     ratecontrol.c looks unnecessary; confirm the non-x86 path could call
 *     h->pixf.var as well.
 *   - Every file in the patch changes mode 100644 -> 100755; looks
 *     unintentional -- confirm.
 */
diff --git a/common/pixel.c b/common/pixel.c old mode 100644 new mode 100755 index afbf309..3881b67 --- a/common/pixel.c +++ b/common/pixel.c @@ -153,6 +153,33 @@ static inline void pixel_sub_wxh( int16_t *diff, int i_size, /**************************************************************************** + * pixel_var_wxh + ****************************************************************************/ +#define PIXEL_VAR_C( name, w, shift ) \ +static int name( uint8_t *pix, int i_stride, unsigned int *sad ) \ +{ \ + unsigned int var = 0, sum = 0, sqr = 0; \ + int x, y; \ + for( y = 0; y < w; y++ ) \ + { \ + for( x = 0; x < w; x++ ) \ + { \ + sum += pix[x]; \ + sqr += pix[x] * pix[x]; \ + } \ + pix += i_stride; \ + } \ + var = sqr - (sum * sum >> shift); \ + *sad = sum; \ + return var; \ +} + +PIXEL_VAR_C( x264_pixel_var_16x16, 16, 8 ) +PIXEL_VAR_C( x264_pixel_var_8x8, 8, 6 ) + + + +/**************************************************************************** * pixel_satd_WxH: sum of 4x4 Hadamard transformed differences ****************************************************************************/ static int pixel_satd_wxh( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height ) @@ -532,6 +559,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) INIT4( sa8d, ); INIT_ADS( ); + pixf->var[0] = x264_pixel_var_16x16; + pixf->var[1] = x264_pixel_var_8x8; + pixf->ssim_4x4x2_core = ssim_4x4x2_core; pixf->ssim_end4 = ssim_end4; @@ -550,7 +580,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) INIT7( satd_x3, _mmxext ); INIT7( satd_x4, _mmxext ); INIT_ADS( _mmxext ); - + pixf->var[0] = x264_pixel_var_16x16_mmxext; + pixf->var[1] = x264_pixel_var_8x8_mmxext; #ifdef ARCH_X86 pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmxext; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_mmxext; @@ -592,7 +623,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) INIT2( sad_x3, _sse2 ); INIT2( sad_x4, _sse2 ); INIT_ADS( _sse2 ); - + 
pixf->var[1] = x264_pixel_var_8x8_sse2; #ifdef ARCH_X86 if( cpu&X264_CPU_CACHELINE_64 ) { @@ -612,6 +643,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf ) pixf->ssim_end4 = x264_pixel_ssim_end4_sse2; pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2; pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse2; + pixf->var[0] = x264_pixel_var_16x16_sse2; #ifdef ARCH_X86_64 pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2; #endif diff --git a/common/pixel.h b/common/pixel.h old mode 100644 new mode 100755 index 3d94bf0..ef2e34c --- a/common/pixel.h +++ b/common/pixel.h @@ -26,6 +26,7 @@ typedef int (*x264_pixel_cmp_t) ( uint8_t *, int, uint8_t *, int ); typedef void (*x264_pixel_cmp_x3_t) ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int[3] ); typedef void (*x264_pixel_cmp_x4_t) ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int[4] ); +typedef int (*x264_pixel_var_t) ( uint8_t *, int, unsigned int * ); enum { @@ -71,6 +72,7 @@ typedef struct x264_pixel_cmp_t fpelcmp[7]; /* either satd or sad for fullpel motion search */ x264_pixel_cmp_x3_t fpelcmp_x3[7]; x264_pixel_cmp_x4_t fpelcmp_x4[7]; + x264_pixel_var_t var[2]; void (*ssim_4x4x2_core)( const uint8_t *pix1, int stride1, const uint8_t *pix2, int stride2, int sums[2][4] ); diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm old mode 100644 new mode 100755 index dd398e3..9a8ba73 --- a/common/x86/pixel-a.asm +++ b/common/x86/pixel-a.asm @@ -164,6 +164,84 @@ SSD 8, 4, sse2 ;============================================================================= +; variance +;============================================================================= + +%macro VAR_START 0 + pxor m5, m5 ; sum + pxor m6, m6 ; sum squared + pxor m7, m7 ; zero +%endmacro + +%macro VAR_END 1 + HADDW m5, m0 ; sqr - sum * sum >> shift + movd [r2], m5 + pmuludq m5, m5 + psrlq m5, %1 + HADDD m6, m1 + psubd m6, m5 + movd r0, m6 + ret +%endmacro + +%macro VAR_ROW_REGSIZE 1 + mova m0, [r0 + %1] + mova m1, m0 + 
punpcklbw m0, m7 + punpckhbw m1, m7 + paddw m5, m0 + paddw m5, m1 + pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m6, m0 + paddd m6, m1 +%endmacro + +;----------------------------------------------------------------------------- +; int x264_pixel_var_wxh_sse2( uint8_t *, int, int * ) +;----------------------------------------------------------------------------- +INIT_XMM +cglobal x264_pixel_var_16x16_sse2, 3, 3 + VAR_START +%rep 16 + VAR_ROW_REGSIZE 0 + add r0, r1 +%endrep + VAR_END 8 + + +cglobal x264_pixel_var_8x8_sse2, 3, 3 + VAR_START +%rep 8 + movh m0, [r0] + punpcklbw m0, m7 + paddw m5, m0 + pmaddwd m0, m0 + paddd m6, m0 + add r0, r1 +%endrep + VAR_END 6 + +INIT_MMX +cglobal x264_pixel_var_16x16_mmxext, 3, 3 + VAR_START +%rep 16 + VAR_ROW_REGSIZE 0 + VAR_ROW_REGSIZE 8 + add r0, r1 +%endrep + VAR_END 8 + + +cglobal x264_pixel_var_8x8_mmxext, 3, 3 + VAR_START +%rep 8 + VAR_ROW_REGSIZE 0 + add r0, r1 +%endrep + VAR_END 6 + +;============================================================================= ; SATD ;============================================================================= diff --git a/common/x86/pixel.h b/common/x86/pixel.h old mode 100644 new mode 100755 index 491b921..2d092f8 --- a/common/x86/pixel.h +++ b/common/x86/pixel.h @@ -68,6 +68,11 @@ DECL_X4( sad, cache64_ssse3 ); #undef DECL_X1 #undef DECL_X4 +int x264_pixel_var_16x16_mmxext( uint8_t *pix, int i_stride, unsigned int *sad ); +int x264_pixel_var_16x16_sse2 ( uint8_t *pix, int i_stride, unsigned int *sad ); +int x264_pixel_var_8x8_mmxext ( uint8_t *pix, int i_stride, unsigned int *sad ); +int x264_pixel_var_8x8_sse2 ( uint8_t *pix, int i_stride, unsigned int *sad ); + void x264_intra_satd_x3_4x4_mmxext( uint8_t *, uint8_t *, int * ); void x264_intra_satd_x3_4x4_ssse3( uint8_t *, uint8_t *, int * ); void x264_intra_satd_x3_8x8c_mmxext( uint8_t *, uint8_t *, int * ); diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c old mode 100644 new mode 100755 index fc60c02..03f0b91 --- 
a/encoder/ratecontrol.c +++ b/encoder/ratecontrol.c @@ -199,9 +199,13 @@ static NOINLINE int ac_energy_mb( x264_t *h, int mb_x, int mb_y, int *satd ) : w * (mb_x + mb_y * stride); int pix = i ? PIXEL_8x8 : PIXEL_16x16; stride <<= h->mb.b_interlaced; +#if (defined(ARCH_X86) || defined(ARCH_X86_64)) //only x86 has optimized var + var += h->pixf.var[i ? 1 : 0]( h->fenc->plane[i]+offset, stride, &sad ); +#else sad = h->pixf.sad[pix]( zero, 0, h->fenc->plane[i]+offset, stride ); ssd = h->pixf.ssd[pix]( zero, 0, h->fenc->plane[i]+offset, stride ); var += ssd - (sad * sad >> (i?6:8)); +#endif // SATD to represent the block's overall complexity (bit cost) for intra encoding. // exclude the DC coef, because nothing short of an actual intra prediction will estimate DC cost. if( var && satd )