[x264-devel] commit: Faster, more accurate psy-RD caching (Anton Mitrofanov)

git version control git at videolan.org
Tue Feb 23 11:05:35 CET 2010


x264 | branch: master | Anton Mitrofanov <BugMaster at narod.ru> | Fri Feb 19 10:45:22 2010 -0800| [7a7fbeab0f1f3b8187d1d8ce56def91f1582d5d7] | committer: Jason Garrett-Glaser 

Faster, more accurate psy-RD caching
Keep more variants of cached Hadamard scores and only calculate them when necessary.
Results in more calculation, but simpler lookups.
Slightly more accurate due to internal rounding in SATD and SA8D functions.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=7a7fbeab0f1f3b8187d1d8ce56def91f1582d5d7
---

 common/common.h      |    8 ++---
 common/x86/mc-a2.asm |    6 +++-
 encoder/analyse.c    |   39 ++++++---------------------
 encoder/rdo.c        |   69 ++++++++++++++++++++++++++++---------------------
 4 files changed, 55 insertions(+), 67 deletions(-)
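
The core of the change: instead of eagerly computing a fixed grid of fenc
scores and summing sub-blocks on every lookup, each (block size, position)
pair now gets one flat cache slot, filled lazily on first use. A slot stores
its score plus one, so a zeroed slot unambiguously means "not computed yet"
and resetting the whole cache is a single memzero. A minimal sketch of that
pattern -- compute_score, score_cache and cached_score are placeholder names,
not x264 code:

    #include <stdint.h>

    static uint32_t score_cache[32];   /* zeroed once per macroblock */
    uint32_t compute_score( int idx ); /* hypothetical stand-in for a SATD/SA8D call */

    static uint32_t cached_score( int idx )
    {
        uint32_t v = score_cache[idx];
        if( v )                    /* nonzero: the slot holds score + 1 */
            return v - 1;
        v = compute_score( idx );
        score_cache[idx] = v + 1;  /* +1 bias keeps a real score of 0 distinct from "empty" */
        return v;
    }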

diff --git a/common/common.h b/common/common.h
index 661eda6..18c172e 100644
--- a/common/common.h
+++ b/common/common.h
@@ -583,11 +583,9 @@ struct x264_t
             ALIGNED_16( int16_t fenc_dct8[4][64] );
             ALIGNED_16( int16_t fenc_dct4[16][16] );
 
-            /* Psy RD SATD scores */
-            int fenc_satd[4][4];
-            int fenc_satd_sum;
-            int fenc_sa8d[2][2];
-            int fenc_sa8d_sum;
+            /* Psy RD SATD/SA8D scores cache */
+            ALIGNED_16( uint64_t fenc_hadamard_cache[9] );
+            ALIGNED_16( uint32_t fenc_satd_cache[32] );
 
             /* pointer over mb of the frame to be compressed */
             uint8_t *p_fenc[3];
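
The two new arrays replace four ad-hoc fields. Reconstructing the layout from
the shift/offset tables in encoder/rdo.c further down (my reading of the diff,
not documented in the code itself):

    /* fenc_hadamard_cache[9]: packed hadamard_ac results,
     * low 32 bits = SATD, high 32 bits = SA8D, one slot per position:
     *   [0]      16x16
     *   [1..2]   16x8  at index  1 + y/8
     *   [3..4]   8x16  at index  3 + x/8
     *   [5..8]   8x8   at index  5 + x/8 + 2*(y/8)
     *
     * fenc_satd_cache[32]: DC-less SATD scores for the small sizes:
     *   [0..7]   8x4   at index      x/8 + 2*(y/4)
     *   [8..15]  4x8   at index  8 + x/4 + 4*(y/8)
     *   [16..31] 4x4   at index 16 + x/4 + 4*(y/4)
     */
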
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index f2e69c0..d86d6ef 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -731,15 +731,17 @@ cglobal x264_memcpy_aligned_sse2, 3,3
 ;-----------------------------------------------------------------------------
 %macro MEMZERO 1
 cglobal x264_memzero_aligned_%1, 2,2
+    add  r0, r1
+    neg  r1
     pxor m0, m0
 .loop:
-    sub r1d, mmsize*8
 %assign i 0
 %rep 8
     mova [r0 + r1 + i], m0
 %assign i i+mmsize
 %endrep
-    jg .loop
+    add r1d, mmsize*8
+    jl .loop
     REP_RET
 %endmacro
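
The memzero change is the standard negative-offset loop idiom: bias the base
pointer one past the end of the buffer, negate the byte count, and count the
(negative) offset up toward zero. The iteration count is unchanged, but the
stores now walk forward through memory and the loop-carried work stays at one
add plus one conditional branch. Roughly, in C -- a sketch of the new loop
shape under those assumptions, not a rendering of the asm:

    #include <stdint.h>
    #include <string.h>

    /* n must be a multiple of the unrolled store width,
     * mmsize*8 = 128 bytes for the SSE2 variant. */
    static void memzero_aligned_sketch( uint8_t *dst, intptr_t n )
    {
        dst += n;          /* add  r0, r1 : point one past the end */
        intptr_t i = -n;   /* neg  r1     : negative byte offset   */
        do
        {
            memset( dst + i, 0, 128 );  /* the 8 unrolled mova stores */
            i += 128;                   /* add  r1d, mmsize*8         */
        } while( i < 0 );               /* jl   .loop                 */
    }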
 
diff --git a/encoder/analyse.c b/encoder/analyse.c
index 6ee5f8e..02fbf7c 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -578,34 +578,13 @@ static void inline x264_psy_trellis_init( x264_t *h, int do_both_dct )
         h->dctf.sub16x16_dct( h->mb.pic.fenc_dct4, h->mb.pic.p_fenc[0], zero );
 }
 
-/* Pre-calculate fenc satd scores for psy RD, minus DC coefficients */
-static inline void x264_mb_cache_fenc_satd( x264_t *h )
+/* Reset fenc satd scores cache for psy RD */
+static inline void x264_mb_init_fenc_cache( x264_t *h, int b_satd )
 {
-    ALIGNED_16( static uint8_t zero[16] ) = {0};
-    uint8_t *fenc;
-    int x, y, satd_sum = 0, sa8d_sum = 0;
-    if( h->param.analyse.i_trellis == 2 && h->mb.i_psy_trellis )
-        x264_psy_trellis_init( h, h->param.analyse.b_transform_8x8 );
-    if( !h->mb.i_psy_rd )
-        return;
-    for( y = 0; y < 4; y++ )
-        for( x = 0; x < 4; x++ )
-        {
-            fenc = h->mb.pic.p_fenc[0]+x*4+y*4*FENC_STRIDE;
-            h->mb.pic.fenc_satd[y][x] = h->pixf.satd[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )
-                                      - (h->pixf.sad[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )>>1);
-            satd_sum += h->mb.pic.fenc_satd[y][x];
-        }
-    for( y = 0; y < 2; y++ )
-        for( x = 0; x < 2; x++ )
-        {
-            fenc = h->mb.pic.p_fenc[0]+x*8+y*8*FENC_STRIDE;
-            h->mb.pic.fenc_sa8d[y][x] = h->pixf.sa8d[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )
-                                      - (h->pixf.sad[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )>>2);
-            sa8d_sum += h->mb.pic.fenc_sa8d[y][x];
-        }
-    h->mb.pic.fenc_satd_sum = satd_sum;
-    h->mb.pic.fenc_sa8d_sum = sa8d_sum;
+    /* Writes beyond the end of the array, but not a problem since fenc_satd_cache is right after. */
+    h->mc.memzero_aligned( h->mb.pic.fenc_hadamard_cache, sizeof(h->mb.pic.fenc_hadamard_cache) );
+    if( b_satd )
+        h->mc.memzero_aligned( h->mb.pic.fenc_satd_cache, sizeof(h->mb.pic.fenc_satd_cache) );
 }
 
 static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
@@ -1193,7 +1172,7 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a )
     h->mb.i_type = P_L0;
     if( a->i_mbrd )
     {
-        x264_mb_cache_fenc_satd( h );
+        x264_mb_init_fenc_cache( h, a->i_mbrd >= 2 || h->param.analyse.inter & X264_ANALYSE_PSUB8x8 );
         if( a->l0.me16x16.i_ref == 0 && M32( a->l0.me16x16.mv ) == M32( h->mb.cache.pskip_mv ) && !a->b_force_intra )
         {
             h->mb.i_partition = D_16x16;
@@ -2432,7 +2411,7 @@ void x264_macroblock_analyse( x264_t *h )
     {
 intra_analysis:
         if( analysis.i_mbrd )
-            x264_mb_cache_fenc_satd( h );
+            x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
         x264_mb_analyse_intra( h, &analysis, COST_MAX );
         if( analysis.i_mbrd )
             x264_intra_rd( h, &analysis, COST_MAX );
@@ -2749,7 +2728,7 @@ intra_analysis:
         int b_skip = 0;
 
         if( analysis.i_mbrd )
-            x264_mb_cache_fenc_satd( h );
+            x264_mb_init_fenc_cache( h, analysis.i_mbrd >= 2 );
 
         h->mb.i_type = B_SKIP;
         if( h->mb.b_direct_auto_write )
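
Two details of the new init function are worth spelling out. The b_satd flag
skips clearing the 128-byte SATD cache whenever nothing can reach
cached_satd(): the sub-8x8 SATD sizes are only scored with i_mbrd >= 2 or, in
P-frames, with sub-8x8 partitions enabled. And the "writes beyond the end"
comment follows from memzero_aligned clearing whole mmsize*8-byte chunks: the
72-byte hadamard cache is rounded up to one chunk, and the spill lands in
fenc_satd_cache, declared immediately after it. A quick standalone check of
that arithmetic (sizes from the common.h hunk above; the 128-byte chunk is the
SSE2 case):

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    int main( void )
    {
        size_t hadamard_bytes = 9 * sizeof(uint64_t);   /* 72 bytes */
        size_t chunk = 16 * 8;                          /* SSE2: mmsize*8 = 128 */
        size_t cleared = (hadamard_bytes + chunk - 1) / chunk * chunk; /* 128 */
        /* the 56 spilled bytes must fit inside fenc_satd_cache (128 bytes) */
        assert( cleared - hadamard_bytes <= 32 * sizeof(uint32_t) );
        return 0;
    }
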
diff --git a/encoder/rdo.c b/encoder/rdo.c
index e15f47d..fed2a28 100644
--- a/encoder/rdo.c
+++ b/encoder/rdo.c
@@ -61,36 +61,44 @@ static uint16_t cabac_size_5ones[128];
 #define COPY_CABAC h->mc.memcpy_aligned( &cabac_tmp.f8_bits_encoded, &h->cabac.f8_bits_encoded, \
         sizeof(x264_cabac_t) - offsetof(x264_cabac_t,f8_bits_encoded) )
 
-
-/* Sum the cached SATDs to avoid repeating them. */
-static inline int sum_satd( x264_t *h, int pixel, int x, int y )
+static inline uint64_t cached_hadamard( x264_t *h, int pixel, int x, int y )
 {
-    int satd = 0;
-    int min_x = x>>2;
-    int min_y = y>>2;
-    int max_x = (x>>2) + (x264_pixel_size[pixel].w>>2);
-    int max_y = (y>>2) + (x264_pixel_size[pixel].h>>2);
-    if( pixel == PIXEL_16x16 )
-        return h->mb.pic.fenc_satd_sum;
-    for( y = min_y; y < max_y; y++ )
-        for( x = min_x; x < max_x; x++ )
-            satd += h->mb.pic.fenc_satd[y][x];
-    return satd;
+    static const uint8_t hadamard_shift_x[4] = {4,   4,   3,   3};
+    static const uint8_t hadamard_shift_y[4] = {4-0, 3-0, 4-1, 3-1};
+    static const uint8_t  hadamard_offset[4] = {0,   1,   3,   5};
+    int cache_index = (x >> hadamard_shift_x[pixel]) + (y >> hadamard_shift_y[pixel])
+                    + hadamard_offset[pixel];
+    uint64_t res = h->mb.pic.fenc_hadamard_cache[cache_index];
+    if( res )
+        return res - 1;
+    else
+    {
+        uint8_t *fenc = h->mb.pic.p_fenc[0] + x + y*FENC_STRIDE;
+        res = h->pixf.hadamard_ac[pixel]( fenc, FENC_STRIDE );
+        h->mb.pic.fenc_hadamard_cache[cache_index] = res + 1;
+        return res;
+    }
 }
 
-static inline int sum_sa8d( x264_t *h, int pixel, int x, int y )
+static inline int cached_satd( x264_t *h, int pixel, int x, int y )
 {
-    int sa8d = 0;
-    int min_x = x>>3;
-    int min_y = y>>3;
-    int max_x = (x>>3) + (x264_pixel_size[pixel].w>>3);
-    int max_y = (y>>3) + (x264_pixel_size[pixel].h>>3);
-    if( pixel == PIXEL_16x16 )
-        return h->mb.pic.fenc_sa8d_sum;
-    for( y = min_y; y < max_y; y++ )
-        for( x = min_x; x < max_x; x++ )
-            sa8d += h->mb.pic.fenc_sa8d[y][x];
-    return sa8d;
+    static const uint8_t satd_shift_x[3] = {3,   2,   2};
+    static const uint8_t satd_shift_y[3] = {2-1, 3-2, 2-2};
+    static const uint8_t  satd_offset[3] = {0,   8,   16};
+    ALIGNED_16( static uint8_t zero[16] );
+    int cache_index = (x >> satd_shift_x[pixel - PIXEL_8x4]) + (y >> satd_shift_y[pixel - PIXEL_8x4])
+                    + satd_offset[pixel - PIXEL_8x4];
+    int res = h->mb.pic.fenc_satd_cache[cache_index];
+    if( res )
+        return res - 1;
+    else
+    {
+        uint8_t *fenc = h->mb.pic.p_fenc[0] + x + y*FENC_STRIDE;
+        int dc = h->pixf.sad[pixel]( fenc, FENC_STRIDE, zero, 0 ) >> 1;
+        res = h->pixf.satd[pixel]( fenc, FENC_STRIDE, zero, 0 ) - dc;
+        h->mb.pic.fenc_satd_cache[cache_index] = res + 1;
+        return res;
+    }
 }
 
 /* Psy RD distortion metric: SSD plus "Absolute Difference of Complexities" */
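
Compared to the old sum_satd()/sum_sa8d() pair, which summed a fixed grid of
precomputed 4x4/8x8 scores, each block size now has its own slot computed in
full on first use. That is the "more calculation, but simpler lookups" trade
from the commit message, and presumably also the accuracy gain: a direct 16x8
score need not equal the sum of its individually rounded sub-block scores. As
a sanity check on the index math, a standalone rendering of cached_hadamard's
table lookup (hadamard_index is a hypothetical name; constants copied from
above; PIXEL_16x16 = 0 through PIXEL_8x8 = 3 as in x264's pixel enum):

    #include <assert.h>
    #include <stdint.h>

    static int hadamard_index( int pixel, int x, int y )
    {
        static const uint8_t shift_x[4] = {4,   4,   3,   3};
        static const uint8_t shift_y[4] = {4-0, 3-0, 4-1, 3-1};
        static const uint8_t offset[4]  = {0,   1,   3,   5};
        return (x >> shift_x[pixel]) + (y >> shift_y[pixel]) + offset[pixel];
    }

    int main( void )
    {
        assert( hadamard_index( 0, 0, 0 ) == 0 );  /* 16x16: the lone slot  */
        assert( hadamard_index( 1, 0, 8 ) == 2 );  /* lower 16x8 half       */
        assert( hadamard_index( 3, 8, 8 ) == 8 );  /* bottom-right 8x8 slot */
        return 0;
    }
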
@@ -113,15 +121,16 @@ static inline int ssd_plane( x264_t *h, int size, int p, int x, int y )
         /* If the plane is smaller than 8x8, we can't do an SA8D; this probably isn't a big problem. */
         if( size <= PIXEL_8x8 )
         {
-            uint64_t acs = h->pixf.hadamard_ac[size]( fdec, FDEC_STRIDE );
-            satd = abs((int32_t)acs - sum_satd( h, size, x, y ))
-                 + abs((int32_t)(acs>>32) - sum_sa8d( h, size, x, y ));
+            uint64_t fdec_acs = h->pixf.hadamard_ac[size]( fdec, FDEC_STRIDE );
+            uint64_t fenc_acs = cached_hadamard( h, size, x, y );
+            satd = abs((int32_t)fdec_acs - (int32_t)fenc_acs)
+                 + abs((int32_t)(fdec_acs>>32) - (int32_t)(fenc_acs>>32));
             satd >>= 1;
         }
         else
         {
             int dc = h->pixf.sad[size]( fdec, FDEC_STRIDE, zero, 0 ) >> 1;
-            satd = abs(h->pixf.satd[size]( fdec, FDEC_STRIDE, zero, 0 ) - dc - sum_satd( h, size, x, y ));
+            satd = abs(h->pixf.satd[size]( fdec, FDEC_STRIDE, zero, 0 ) - dc - cached_satd( h, size, x, y ));
         }
         satd = (satd * h->mb.i_psy_rd * h->mb.i_psy_rd_lambda + 128) >> 8;
     }
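
Finally, the unchanged last line above is the fixed-point weighting of the psy
term: complexity difference times psy-RD strength times lambda, with the +128
rounding the >> 8. Assuming i_psy_rd is the usual Q8-scaled strength, i.e. 256
for 1.0 (my reading, not shown in this diff), strength 1.0 gives

    (satd * 256 * i_psy_rd_lambda + 128) >> 8  ==  satd * i_psy_rd_lambda

so the psy distortion reduces to the complexity difference weighted by lambda,
and lower strengths scale it down proportionally.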


