[x264-devel] commit: interleave psnr/ssim computation with reference frame filtering, to improve cache coherency (Loren Merritt)

Wed Jun 18 15:47:18 CEST 2008

x264 | branch: master | Loren Merritt <pengvado at akuvian.org> | Thu Jun 12 01:39:22 2008 -0600 | [22d3c0409deec7601292c56c7cd0a23427dbc107]

interleave psnr/ssim computation with reference frame filtering, to improve cache coherency

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=22d3c0409deec7601292c56c7cd0a23427dbc107
---

 common/common.h   |    3 +++
 common/pixel.c    |    2 +-
 encoder/encoder.c |   51 +++++++++++++++++++++++++++++++++++----------------
 tools/checkasm.c  |    2 +-
 4 files changed, 40 insertions(+), 18 deletions(-)

diff --git a/common/common.h b/common/common.h
index 04f5243..0636394 100644
--- a/common/common.h
+++ b/common/common.h
@@ -544,6 +544,9 @@ struct x264_t
             int i_mbs_analysed;
             /* Adaptive direct mv pred */
             int i_direct_score[2];
+            /* Metrics */
+            int64_t i_ssd[3];
+            double f_ssim;
         } frame;
 
         /* Cumulated stats */
diff --git a/common/pixel.c b/common/pixel.c
index 11d74a0..64a410e 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -441,7 +441,7 @@ float x264_pixel_ssim_wxh( x264_pixel_function_t *pf,
     }
     x264_free(sum0);
     x264_free(sum1);
-    return ssim / ((width-1) * (height-1));
+    return ssim;
 }
 
 
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 533e8a8..f2710ab 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -893,6 +893,7 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y )
     int b_deblock = !h->sh.i_disable_deblocking_filter_idc;
     int b_end = mb_y == h->sps->i_mb_height;
     int min_y = mb_y - (1 << h->sh.b_mbaff);
+    int max_y = b_end ? h->sps->i_mb_height : mb_y;
     b_deblock &= b_hpel || h->param.psz_dump_yuv;
     if( mb_y & h->sh.b_mbaff )
         return;
@@ -913,7 +914,6 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y )
 
     if( b_deblock )
     {
-        int max_y = b_end ? h->sps->i_mb_height : mb_y;
         int y;
         for( y = min_y; y < max_y; y += (1 << h->sh.b_mbaff) )
             x264_frame_deblock_row( h, y );
@@ -930,6 +930,33 @@ static void x264_fdec_filter_row( x264_t *h, int mb_y )
     {
         x264_frame_cond_broadcast( h->fdec, mb_y*16 + (b_end ? 10000 : -(X264_THREAD_HEIGHT << h->sh.b_mbaff)) );
     }
+
+    min_y = X264_MAX( min_y*16-8, 0 );
+    max_y = b_end ? h->param.i_height : mb_y*16-8;
+
+    if( h->param.analyse.b_psnr )
+    {
+        int i;
+        for( i=0; i<3; i++ )
+            h->stat.frame.i_ssd[i] +=
+                x264_pixel_ssd_wxh( &h->pixf,
+                    h->fdec->plane[i] + (min_y>>!!i) * h->fdec->i_stride[i], h->fdec->i_stride[i],
+                    h->fenc->plane[i] + (min_y>>!!i) * h->fenc->i_stride[i], h->fenc->i_stride[i],
+                    h->param.i_width >> !!i, (max_y-min_y) >> !!i );
+    }
+
+    if( h->param.analyse.b_ssim )
+    {
+        x264_emms();
+        /* offset by 2 pixels to avoid alignment of ssim blocks with dct blocks,
+         * and overlap by 4 */
+        min_y += min_y == 0 ? 2 : -6;
+        h->stat.frame.f_ssim +=
+            x264_pixel_ssim_wxh( &h->pixf,
+                h->fdec->plane[0] + 2+min_y*h->fdec->i_stride[0], h->fdec->i_stride[0],
+                h->fenc->plane[0] + 2+min_y*h->fenc->i_stride[0], h->fenc->i_stride[0],
+                h->param.i_width-2, max_y-min_y );
+    }
 }
 
 static inline void x264_reference_update( x264_t *h )
@@ -1659,16 +1686,11 @@ static void x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
     psz_message[0] = '\0';
     if( h->param.analyse.b_psnr )
     {
-        int64_t sqe[3];
-
-        for( i=0; i<3; i++ )
-        {
-            sqe[i] = x264_pixel_ssd_wxh( &h->pixf,
-                         h->fdec->plane[i], h->fdec->i_stride[i],
-                         h->fenc->plane[i], h->fenc->i_stride[i],
-                         h->param.i_width >> !!i, h->param.i_height >> !!i );
-        }
-        x264_emms();
+        int64_t sqe[3] = {
+            h->stat.frame.i_ssd[0],
+            h->stat.frame.i_ssd[1],
+            h->stat.frame.i_ssd[2],
+        };
 
         h->stat.i_sqe_global[h->sh.i_type] += sqe[0] + sqe[1] + sqe[2];
         h->stat.f_psnr_average[h->sh.i_type] += x264_psnr( sqe[0] + sqe[1] + sqe[2], 3 * h->param.i_width * h->param.i_height / 2 );
@@ -1684,11 +1706,8 @@ static void x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
 
     if( h->param.analyse.b_ssim )
     {
-        // offset by 2 pixels to avoid alignment of ssim blocks with dct blocks
-        float ssim_y = x264_pixel_ssim_wxh( &h->pixf,
-                         h->fdec->plane[0] + 2+2*h->fdec->i_stride[0], h->fdec->i_stride[0],
-                         h->fenc->plane[0] + 2+2*h->fenc->i_stride[0], h->fenc->i_stride[0],
-                         h->param.i_width-2, h->param.i_height-2 );
+        double ssim_y = h->stat.frame.f_ssim
+                      / (((h->param.i_width-6)>>2) * ((h->param.i_height-6)>>2));
         h->stat.f_ssim_mean_y[h->sh.i_type] += ssim_y;
         snprintf( psz_message + strlen(psz_message), 80 - strlen(psz_message),
                   " SSIM Y:%.5f", ssim_y );
diff --git a/tools/checkasm.c b/tools/checkasm.c
index c2c1661..b9e3205 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -314,7 +314,7 @@ static int check_pixel( int cpu_ref, int cpu_new )
         x264_emms();
         res_c = x264_pixel_ssim_wxh( &pixel_c,   buf1+2, 32, buf2+2, 32, 32, 28 );
         res_a = x264_pixel_ssim_wxh( &pixel_asm, buf1+2, 32, buf2+2, 32, 32, 28 );
-        if( fabs(res_c - res_a) > 1e-7 )
+        if( fabs(res_c - res_a) > 1e-6 )
         {
             ok = 0;
             fprintf( stderr, "ssim: %.7f != %.7f [FAILED]\n", res_c, res_a );