diff --git a/common/pixel.c b/common/pixel.c
old mode 100644
new mode 100755
index afbf309..3881b67
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -153,6 +153,33 @@ static inline void pixel_sub_wxh( int16_t *diff, int i_size,
 
 
 /****************************************************************************
+ * pixel_var_wxh
+ ****************************************************************************/
+#define PIXEL_VAR_C( name, w, shift ) \
+/* Returns sqr - (sum*sum >> shift) over a w x w block; stores sum in *sad. */ \
+static int name( uint8_t *pix, int i_stride, unsigned int *sad ) \
+{                                             \
+    unsigned int sum = 0, sqr = 0;            \
+    int x, y;                                 \
+    for( y = 0; y < (w); y++ )                \
+    {                                         \
+        for( x = 0; x < (w); x++ )            \
+        {                                     \
+            sum += pix[x];                    \
+            sqr += pix[x] * pix[x];           \
+        }                                     \
+        pix += i_stride;                      \
+    }                                         \
+    *sad = sum;                               \
+    return sqr - (sum * sum >> (shift));      \
+}
+
+PIXEL_VAR_C( x264_pixel_var_16x16, 16, 8 )
+PIXEL_VAR_C( x264_pixel_var_8x8,    8, 6 )
+
+
+
+/****************************************************************************
  * pixel_satd_WxH: sum of 4x4 Hadamard transformed differences
  ****************************************************************************/
 static int pixel_satd_wxh( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2, int i_width, int i_height )
@@ -532,6 +559,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
     INIT4( sa8d, );
     INIT_ADS( );
 
+    pixf->var[0] = x264_pixel_var_16x16;
+    pixf->var[1] = x264_pixel_var_8x8;
+
     pixf->ssim_4x4x2_core = ssim_4x4x2_core;
     pixf->ssim_end4 = ssim_end4;
 
@@ -550,7 +580,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         INIT7( satd_x3, _mmxext );
         INIT7( satd_x4, _mmxext );
         INIT_ADS( _mmxext );
-
+        pixf->var[0] = x264_pixel_var_16x16_mmxext;
+        pixf->var[1] = x264_pixel_var_8x8_mmxext;
 #ifdef ARCH_X86
         pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmxext;
         pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_mmxext;
@@ -592,7 +623,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         INIT2( sad_x3, _sse2 );
         INIT2( sad_x4, _sse2 );
         INIT_ADS( _sse2 );
-
+        pixf->var[1] = x264_pixel_var_8x8_sse2;
 #ifdef ARCH_X86
         if( cpu&X264_CPU_CACHELINE_64 )
         {
@@ -612,6 +643,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->ssim_end4        = x264_pixel_ssim_end4_sse2;
         pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2;
         pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_sse2;
+        pixf->var[0] = x264_pixel_var_16x16_sse2;
 #ifdef ARCH_X86_64
         pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2;
 #endif
diff --git a/common/pixel.h b/common/pixel.h
old mode 100644
new mode 100755
index 3d94bf0..ef2e34c
--- a/common/pixel.h
+++ b/common/pixel.h
@@ -26,6 +26,7 @@
 typedef int  (*x264_pixel_cmp_t) ( uint8_t *, int, uint8_t *, int );
 typedef void (*x264_pixel_cmp_x3_t) ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int[3] );
 typedef void (*x264_pixel_cmp_x4_t) ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int[4] );
+typedef int  (*x264_pixel_var_t) ( uint8_t *, int, unsigned int * ); /* pix, stride, out: pixel sum; returns sqr - (sum*sum >> shift) */
 
 enum
 {
@@ -71,6 +72,7 @@ typedef struct
     x264_pixel_cmp_t fpelcmp[7]; /* either satd or sad for fullpel motion search */
     x264_pixel_cmp_x3_t fpelcmp_x3[7];
     x264_pixel_cmp_x4_t fpelcmp_x4[7];
+    x264_pixel_var_t var[2];
 
     void (*ssim_4x4x2_core)( const uint8_t *pix1, int stride1,
                              const uint8_t *pix2, int stride2, int sums[2][4] );
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
old mode 100644
new mode 100755
index dd398e3..9a8ba73
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -164,6 +164,84 @@ SSD  8,  4, sse2
 
 
 ;=============================================================================
+; variance
+;=============================================================================
+
+%macro VAR_START 0
+    pxor  m5, m5    ; running sum of pixels (word lanes)
+    pxor  m6, m6    ; running sum of squared pixels (dword lanes)
+    pxor  m7, m7    ; zero, used to unpack bytes to words
+%endmacro
+
+%macro VAR_END 1           ; %1 = shift (8 for 16x16, 6 for 8x8)
+    HADDW   m5, m0         ; reduce word lanes to the total pixel sum (HADDW defined elsewhere; m0 is scratch)
+    movd  [r2], m5         ; *sad = sum
+    pmuludq m5, m5         ; sum * sum, unsigned 64-bit product
+    psrlq   m5, %1         ; (sum * sum) >> shift
+    HADDD   m6, m1         ; reduce dword lanes to the total sum of squares (m1 is scratch)
+    psubd   m6, m5         ; sqr - (sum * sum >> shift)
+    movd  eax, m6          ; return in eax: r0 is the first-arg register, which is eax only on x86_32
+    ret
+%endmacro
+
+%macro VAR_ROW_REGSIZE 1     ; %1 = byte offset from r0
+    mova  m0, [r0 + %1]      ; load one register's worth of pixels
+    mova  m1, m0
+    punpcklbw m0, m7         ; low half: bytes -> words
+    punpckhbw m1, m7         ; high half: bytes -> words
+    paddw   m5, m0           ; sum += pixels
+    paddw   m5, m1
+    pmaddwd m0, m0           ; square words, add adjacent pairs into dwords
+    pmaddwd m1, m1
+    paddd   m6, m0           ; sqr += squares
+    paddd   m6, m1
+%endmacro
+
+;-----------------------------------------------------------------------------
+; int x264_pixel_var_wxh_{mmxext,sse2}( uint8_t *pix, int i_stride, unsigned int *sad )
+;-----------------------------------------------------------------------------
+INIT_XMM
+cglobal x264_pixel_var_16x16_sse2, 3, 3
+    VAR_START
+%rep 16                      ; one row per iteration; NOTE(review): mova presumably needs 16-byte aligned pix/stride - confirm
+    VAR_ROW_REGSIZE 0
+    add   r0, r1             ; pix += i_stride
+%endrep
+    VAR_END 8                ; shift 8: 16*16 = 256 pixels
+
+
+cglobal x264_pixel_var_8x8_sse2, 3, 3
+    VAR_START
+%rep 8                       ; one row per iteration
+    movh    m0, [r0]         ; load the row's pixels (movh is defined elsewhere; presumably a 64-bit load)
+    punpcklbw m0, m7         ; bytes -> words
+    paddw   m5, m0           ; sum += pixels
+    pmaddwd m0, m0           ; square words, add adjacent pairs into dwords
+    paddd   m6, m0           ; sqr += squares
+    add   r0, r1             ; pix += i_stride
+%endrep
+    VAR_END 6                ; shift 6: 8*8 = 64 pixels
+
+INIT_MMX                     ; NOTE(review): the MMX routines below do not emms; confirm callers do
+cglobal x264_pixel_var_16x16_mmxext, 3, 3
+    VAR_START
+%rep 16                      ; one row per iteration, processed as two halves
+    VAR_ROW_REGSIZE 0        ; bytes at offset 0 of the row
+    VAR_ROW_REGSIZE 8        ; bytes at offset 8 of the row
+    add   r0, r1             ; pix += i_stride
+%endrep
+    VAR_END 8                ; shift 8: 16*16 = 256 pixels
+
+
+cglobal x264_pixel_var_8x8_mmxext, 3, 3
+    VAR_START
+%rep 8                       ; one row per iteration
+    VAR_ROW_REGSIZE 0
+    add   r0, r1             ; pix += i_stride
+%endrep
+    VAR_END 6                ; shift 6: 8*8 = 64 pixels
+
+;=============================================================================
 ; SATD
 ;=============================================================================
 
diff --git a/common/x86/pixel.h b/common/x86/pixel.h
old mode 100644
new mode 100755
index 491b921..2d092f8
--- a/common/x86/pixel.h
+++ b/common/x86/pixel.h
@@ -68,6 +68,11 @@ DECL_X4( sad, cache64_ssse3 );
 #undef DECL_X1
 #undef DECL_X4
 
+int x264_pixel_var_16x16_mmxext( uint8_t *pix, int i_stride, unsigned int *sad ); /* asm; installed into pixf->var[0] */
+int x264_pixel_var_16x16_sse2  ( uint8_t *pix, int i_stride, unsigned int *sad ); /* asm; installed into pixf->var[0] */
+int x264_pixel_var_8x8_mmxext  ( uint8_t *pix, int i_stride, unsigned int *sad ); /* asm; installed into pixf->var[1] */
+int x264_pixel_var_8x8_sse2    ( uint8_t *pix, int i_stride, unsigned int *sad ); /* asm; installed into pixf->var[1] */
+
 void x264_intra_satd_x3_4x4_mmxext( uint8_t *, uint8_t *, int * );
 void x264_intra_satd_x3_4x4_ssse3( uint8_t *, uint8_t *, int * );
 void x264_intra_satd_x3_8x8c_mmxext( uint8_t *, uint8_t *, int * );
diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
old mode 100644
new mode 100755
index fc60c02..03f0b91
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
@@ -199,9 +199,13 @@ static NOINLINE int ac_energy_mb( x264_t *h, int mb_x, int mb_y, int *satd )
                 : w * (mb_x + mb_y * stride);
             int pix = i ? PIXEL_8x8 : PIXEL_16x16;
             stride <<= h->mb.b_interlaced;
+#if (defined(ARCH_X86) || defined(ARCH_X86_64)) //only x86 has optimized var
+            var += h->pixf.var[i ? 1 : 0]( h->fenc->plane[i]+offset, stride, &sad );
+#else 
             sad = h->pixf.sad[pix]( zero, 0, h->fenc->plane[i]+offset, stride );
             ssd = h->pixf.ssd[pix]( zero, 0, h->fenc->plane[i]+offset, stride );
             var += ssd - (sad * sad >> (i?6:8));
+#endif
             // SATD to represent the block's overall complexity (bit cost) for intra encoding.
             // exclude the DC coef, because nothing short of an actual intra prediction will estimate DC cost.
             if( var && satd )