[x264-devel] Faster chroma weight cost calculation

Henrik Gramner git at videolan.org
Wed Apr 25 00:17:10 CEST 2012


x264 | branch: master | Henrik Gramner <hengar-6 at student.ltu.se> | Sat Apr  7 00:40:09 2012 +0200| [df6252cfed7c23fbe883456f4e0607a7f8e91ad8] | committer: Jason Garrett-Glaser

Faster chroma weight cost calculation

New assembly function with SSE2, SSSE3 and XOP implementations for calculating the absolute sum of differences between two 8-pixel-wide blocks.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=df6252cfed7c23fbe883456f4e0607a7f8e91ad8
---

 common/pixel.c         |   15 ++++++++++
 common/pixel.h         |    1 +
 common/x86/pixel-a.asm |   67 ++++++++++++++++++++++++++++++++++++++++++++++++
 common/x86/pixel.h     |    3 ++
 encoder/slicetype.c    |   15 +++--------
 tools/checkasm.c       |   15 ++++++++++
 6 files changed, 105 insertions(+), 11 deletions(-)
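
Unlike a SAD, which sums per-pixel magnitudes, an ASD takes a single
absolute value after summing the signed differences, so it measures the
gap between two blocks' overall brightness rather than their texture.
A minimal C sketch of the distinction (illustration only, using a single
shared stride; the committed reference implementation is pixel_asd8 in
common/pixel.c below):

    #include <stdint.h>
    #include <stdlib.h>

    /* SAD: per-pixel magnitude, sensitive to texture as well as DC. */
    static int sad8( const uint8_t *a, const uint8_t *b, int stride, int height )
    {
        int sum = 0;
        for( int y = 0; y < height; y++, a += stride, b += stride )
            for( int x = 0; x < 8; x++ )
                sum += abs( a[x] - b[x] );
        return sum;
    }

    /* ASD: one abs() of the summed signed differences, i.e. the gap
     * between the two blocks' pixel sums (a DC proxy). */
    static int asd8( const uint8_t *a, const uint8_t *b, int stride, int height )
    {
        int sum = 0;
        for( int y = 0; y < height; y++, a += stride, b += stride )
            for( int x = 0; x < 8; x++ )
                sum += a[x] - b[x];
        return abs( sum );
    }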

diff --git a/common/pixel.c b/common/pixel.c
index 9f52c79..d160921 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -703,6 +703,15 @@ int x264_field_vsad( x264_t *h, int mb_x, int mb_y )
     return (score_field < score_frame);
 }
 
+static int pixel_asd8( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height )
+{
+    int sum = 0;
+    for( int y = 0; y < height; y++, pix1 += stride1, pix2 += stride2 )
+        for( int x = 0; x < 8; x++ )
+            sum += pix1[x] - pix2[x];
+    return abs( sum );
+}
+
 /****************************************************************************
  * successive elimination
  ****************************************************************************/
@@ -814,6 +823,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
     pixf->ssim_4x4x2_core = ssim_4x4x2_core;
     pixf->ssim_end4 = ssim_end4;
     pixf->vsad = pixel_vsad;
+    pixf->asd8 = pixel_asd8;
 
     pixf->intra_sad_x3_4x4    = x264_intra_sad_x3_4x4;
     pixf->intra_satd_x3_4x4   = x264_intra_satd_x3_4x4;
@@ -888,6 +898,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
             INIT4( hadamard_ac, _sse2 );
         }
         pixf->vsad = x264_pixel_vsad_sse2;
+        pixf->asd8 = x264_pixel_asd8_sse2;
         pixf->intra_sad_x3_8x8    = x264_intra_sad_x3_8x8_sse2;
         pixf->intra_sad_x3_8x8c   = x264_intra_sad_x3_8x8c_sse2;
         pixf->intra_sad_x3_16x16  = x264_intra_sad_x3_16x16_sse2;
@@ -915,6 +926,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
             INIT4( hadamard_ac, _ssse3 );
         }
         pixf->vsad = x264_pixel_vsad_ssse3;
+        pixf->asd8 = x264_pixel_asd8_ssse3;
         pixf->intra_sad_x3_4x4  = x264_intra_sad_x3_4x4_ssse3;
         pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
         pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8_ssse3;
@@ -951,6 +963,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
     if( cpu&X264_CPU_XOP )
     {
         pixf->vsad = x264_pixel_vsad_xop;
+        pixf->asd8 = x264_pixel_asd8_xop;
     }
 #endif // HAVE_MMX
 #else // !HIGH_BIT_DEPTH
@@ -1035,6 +1048,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->var2[PIXEL_8x8]   = x264_pixel_var2_8x8_sse2;
         pixf->var2[PIXEL_8x16]  = x264_pixel_var2_8x16_sse2;
         pixf->vsad = x264_pixel_vsad_sse2;
+        pixf->asd8 = x264_pixel_asd8_sse2;
     }
 
     if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
@@ -1126,6 +1140,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->intra_sad_x3_8x8c   = x264_intra_sad_x3_8x8c_ssse3;
         pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_ssse3;
         pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_ssse3;
+        pixf->asd8 = x264_pixel_asd8_ssse3;
         if( cpu&X264_CPU_CACHELINE_64 )
         {
             INIT2( sad, _cache64_ssse3 );
diff --git a/common/pixel.h b/common/pixel.h
index 1375b90..90d1b4b 100644
--- a/common/pixel.h
+++ b/common/pixel.h
@@ -89,6 +89,7 @@ typedef struct
     x264_pixel_cmp_x4_t fpelcmp_x4[7];
     x264_pixel_cmp_t sad_aligned[8]; /* Aligned SAD for mbcmp */
     int (*vsad)( pixel *, intptr_t, int );
+    int (*asd8)( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
 
     uint64_t (*var[4])( pixel *pix, intptr_t stride );
     int (*var2[4])( pixel *pix1, intptr_t stride1,
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index b880323..40872d2 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -4001,6 +4001,73 @@ SSIM
 INIT_XMM avx
 SSIM
 
+;-----------------------------------------------------------------------------
+; int pixel_asd8( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
+;-----------------------------------------------------------------------------
+%macro ASD8 0
+cglobal pixel_asd8, 5,5
+    pxor     m0, m0
+    pxor     m1, m1
+.loop:
+%if HIGH_BIT_DEPTH
+    paddw    m0, [r0]
+    paddw    m1, [r2]
+    paddw    m0, [r0+2*r1]
+    paddw    m1, [r2+2*r3]
+    lea      r0, [r0+4*r1]
+    paddw    m0, [r0]
+    paddw    m1, [r2+4*r3]
+    lea      r2, [r2+4*r3]
+    paddw    m0, [r0+2*r1]
+    paddw    m1, [r2+2*r3]
+    lea      r0, [r0+4*r1]
+    lea      r2, [r2+4*r3]
+%else
+    movq     m2, [r0]
+    movq     m3, [r2]
+    movhps   m2, [r0+r1]
+    movhps   m3, [r2+r3]
+    lea      r0, [r0+2*r1]
+    psadbw   m2, m1
+    psadbw   m3, m1
+    movq     m4, [r0]
+    movq     m5, [r2+2*r3]
+    lea      r2, [r2+2*r3]
+    movhps   m4, [r0+r1]
+    movhps   m5, [r2+r3]
+    lea      r0, [r0+2*r1]
+    paddw    m0, m2
+    psubw    m0, m3
+    psadbw   m4, m1
+    psadbw   m5, m1
+    lea      r2, [r2+2*r3]
+    paddw    m0, m4
+    psubw    m0, m5
+%endif
+    sub     r4d, 4
+    jg .loop
+%if HIGH_BIT_DEPTH
+    psubw    m0, m1
+    HADDW    m0, m1
+    ABSD     m1, m0
+%else
+    movhlps  m1, m0
+    paddw    m0, m1
+    ABSW     m1, m0
+%endif
+    movd    eax, m1
+    RET
+%endmacro
+
+INIT_XMM sse2
+ASD8
+INIT_XMM ssse3
+ASD8
+%if HIGH_BIT_DEPTH
+INIT_XMM xop
+ASD8
+%endif
+
 ;=============================================================================
 ; Successive Elimination ADS
 ;=============================================================================
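
A note on the 8-bit path above: psadbw against an all-zero register (m1)
produces the horizontal sum of the unsigned bytes in each 64-bit lane,
so two 8-pixel rows packed into one register via movq/movhps are summed
by a single instruction; paddw/psubw then accumulate sum(pix1) -
sum(pix2) in m0, which is folded and ABSW'd after the loop. The same
trick in intrinsics form (a sketch assuming SSE2 and a single 8-pixel
row, not the committed code):

    #include <emmintrin.h>
    #include <stdint.h>

    /* _mm_sad_epu8( v, zero ) computes sum(|v[i] - 0|) = sum(v[i]),
     * leaving the result in the low 16 bits of each 64-bit lane. */
    static int row_sum8( const uint8_t *row )
    {
        __m128i v    = _mm_loadl_epi64( (const __m128i *)row );
        __m128i zero = _mm_setzero_si128();
        return _mm_cvtsi128_si32( _mm_sad_epu8( v, zero ) );
    }
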
diff --git a/common/x86/pixel.h b/common/x86/pixel.h
index 5f71c2a..c59e357 100644
--- a/common/x86/pixel.h
+++ b/common/x86/pixel.h
@@ -159,6 +159,9 @@ int  x264_pixel_vsad_mmx2 ( pixel *src, intptr_t stride, int height );
 int  x264_pixel_vsad_sse2 ( pixel *src, intptr_t stride, int height );
 int  x264_pixel_vsad_ssse3( pixel *src, intptr_t stride, int height );
 int  x264_pixel_vsad_xop  ( pixel *src, intptr_t stride, int height );
+int x264_pixel_asd8_sse2 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
+int x264_pixel_asd8_ssse3( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
+int x264_pixel_asd8_xop  ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
 
 #define DECL_ADS( size, suffix ) \
 int x264_pixel_ads##size##_##suffix( int enc_dc[size], uint16_t *sums, int delta,\
diff --git a/encoder/slicetype.c b/encoder/slicetype.c
index 9bd862d..1aa4891 100644
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -220,15 +220,12 @@ static NOINLINE unsigned int x264_weight_cost_chroma( x264_t *h, x264_frame_t *f
 {
     unsigned int cost = 0;
     int i_stride = fenc->i_stride[1];
-    int i_offset = i_stride / 2;
     int i_lines = fenc->i_lines[1];
     int i_width = fenc->i_width[1];
-    pixel *src = ref + i_offset;
+    pixel *src = ref + (i_stride >> 1);
     ALIGNED_ARRAY_16( pixel, buf, [8*16] );
     int pixoff = 0;
-    int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
     int height = 16 >> CHROMA_V_SHIFT;
-    ALIGNED_16( static pixel flat[8] ) = {0};
     if( w )
     {
         for( int y = 0; y < i_lines; y += height, pixoff = y*i_stride )
@@ -239,19 +236,15 @@ static NOINLINE unsigned int x264_weight_cost_chroma( x264_t *h, x264_frame_t *f
                  * But testing shows that for chroma the DC coefficient is by far the most
                  * important part of the coding cost.  Thus a more useful chroma weight is
                  * obtained by comparing each block's DC coefficient instead of the actual
-                 * pixels.
-                 *
-                 * FIXME: add a (faster) asm sum function to replace sad. */
-                cost += abs( h->pixf.sad_aligned[chromapix](          buf,        8, flat, 0 ) -
-                             h->pixf.sad_aligned[chromapix]( &src[pixoff], i_stride, flat, 0 ) );
+                 * pixels. */
+                cost += h->pixf.asd8( buf, 8, &src[pixoff], i_stride, height );
             }
         cost += x264_weight_slice_header_cost( h, w, 1 );
     }
     else
         for( int y = 0; y < i_lines; y += height, pixoff = y*i_stride )
             for( int x = 0; x < i_width; x += 8, pixoff += 8 )
-                cost += abs( h->pixf.sad_aligned[chromapix]( &ref[pixoff], i_stride, flat, 0 ) -
-                             h->pixf.sad_aligned[chromapix]( &src[pixoff], i_stride, flat, 0 ) );
+                cost += h->pixf.asd8( &ref[pixoff], i_stride, &src[pixoff], i_stride, height );
     x264_emms();
     return cost;
 }
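
The slicetype.c change is an algebraic simplification, not a behavioral
one: since flat is all zeros and pixels are nonnegative, a SAD against
flat is just the block's pixel sum, so the old
abs( sad(a, flat) - sad(b, flat) ) reduces to abs( sum(a) - sum(b) ),
which is exactly what asd8 computes in one pass; this also lets the flat
array and the chromapix lookup be dropped. A small self-check of that
identity (a sketch with simplified single-stride helpers, not x264's
function-pointer machinery):

    #include <assert.h>
    #include <stdint.h>
    #include <stdlib.h>

    /* SAD against an all-zero block is just the pixel sum, since
     * pixels are nonnegative. */
    static int sum8( const uint8_t *p, int stride, int height )
    {
        int sum = 0;
        for( int y = 0; y < height; y++, p += stride )
            for( int x = 0; x < 8; x++ )
                sum += p[x];
        return sum;
    }

    int main( void )
    {
        uint8_t a[8*8], b[8*8];
        srand( 0 );
        for( int i = 0; i < 8*8; i++ )
        {
            a[i] = rand() & 0xff;
            b[i] = rand() & 0xff;
        }
        /* Signed differences summed in one pass, as pixel_asd8 does. */
        int asd = 0;
        for( int i = 0; i < 8*8; i++ )
            asd += a[i] - b[i];
        assert( abs( sum8( a, 8, 8 ) - sum8( b, 8, 8 ) ) == abs( asd ) );
        return 0;
    }
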
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 630a01d..01e0dd3 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -475,6 +475,21 @@ static int check_pixel( int cpu_ref, int cpu_new )
     }
     report( "pixel vsad :" );
 
+    ok = 1; used_asm = 0;
+    if( pixel_asm.asd8 != pixel_ref.asd8 )
+    {
+        set_func_name( "asd8" );
+        used_asm = 1;
+        int res_c = call_c( pixel_c.asd8,   pbuf1, (intptr_t)8, pbuf2, (intptr_t)8, 16 );
+        int res_a = call_a( pixel_asm.asd8, pbuf1, (intptr_t)8, pbuf2, (intptr_t)8, 16 );
+        if( res_c != res_a )
+        {
+            ok = 0;
+            fprintf( stderr, "asd: %d != %d\n", res_c, res_a );
+        }
+    }
+    report( "pixel asd :" );
+
 #define TEST_INTRA_X3( name, i8x8, ... ) \
     if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \
     { \


