[x264-devel] Merge zero buffers

Thu Jan 18 21:22:57 CET 2018

x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Tue Jan 16 17:43:24 2018 +0100| [b019515ef4ad77022b849283c62612157e8458a7] | committer: Henrik Gramner

Merge zero buffers

Improves cache efficiency.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=b019515ef4ad77022b849283c62612157e8458a7
---

 common/mc.c       |  2 +-
 common/mc.h       |  3 +--
 common/tables.c   |  3 +++
 common/tables.h   |  2 ++
 encoder/analyse.c |  6 ++----
 encoder/me.c      |  3 +--
 encoder/rdo.c     | 10 ++++------
 7 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/common/mc.c b/common/mc.c
index 38875b9a..d9e4e233 100644
--- a/common/mc.c
+++ b/common/mc.c
@@ -155,7 +155,7 @@ static weight_fn_t mc_weight_wtab[6] =
     mc_weight_w16,
     mc_weight_w20,
 };
-const x264_weight_t x264_weight_none[3] = { {{0}} };
+
 static void mc_copy( pixel *src, intptr_t i_src_stride, pixel *dst, intptr_t i_dst_stride, int i_width, int i_height )
 {
     for( int y = 0; y < i_height; y++ )
diff --git a/common/mc.h b/common/mc.h
index 8a33929c..8ec00750 100644
--- a/common/mc.h
+++ b/common/mc.h
@@ -244,8 +244,7 @@ typedef struct x264_weight_t
     weight_fn_t *weightfn;
 } ALIGNED_16( x264_weight_t );
 
-#define x264_weight_none x264_template(weight_none)
-extern const x264_weight_t x264_weight_none[3];
+#define x264_weight_none ((const x264_weight_t*)x264_zero)
 
 #define SET_WEIGHT( w, b, s, d, o )\
 {\
diff --git a/common/tables.c b/common/tables.c
index 79d99517..0edcfcc1 100644
--- a/common/tables.c
+++ b/common/tables.c
@@ -2534,3 +2534,6 @@ const vlc_t x264_run_before_init[7][16] =
         { 0x1, 11 }, /* str=00000000001 */
     },
 };
+
+/* psy_trellis_init() has the largest size requirement of 16*FDEC_STRIDE*sizeof(pixel) */
+ALIGNED_64( uint8_t x264_zero[1024] ) = { 0 };
diff --git a/common/tables.h b/common/tables.h
index 6ca74eae..88248ae8 100644
--- a/common/tables.h
+++ b/common/tables.h
@@ -94,4 +94,6 @@ extern const vlc_t x264_total_zeros_2x2_dc[3][4];
 extern const vlc_t x264_total_zeros_2x4_dc[7][8];
 extern const vlc_t x264_run_before_init[7][16];
 
+extern uint8_t x264_zero[1024];
+
 #endif
diff --git a/encoder/analyse.c b/encoder/analyse.c
index 3577e5bc..ebae8a81 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -558,12 +558,10 @@ static ALWAYS_INLINE const int8_t *predict_4x4_mode_available( int force_intra,
 /* For trellis=2, we need to do this for both sizes of DCT, for trellis=1 we only need to use it on the chosen mode. */
 static void inline psy_trellis_init( x264_t *h, int do_both_dct )
 {
-    ALIGNED_64( static pixel zero[16*FDEC_STRIDE] ) = {0};
-
     if( do_both_dct || h->mb.b_transform_8x8 )
-        h->dctf.sub16x16_dct8( h->mb.pic.fenc_dct8, h->mb.pic.p_fenc[0], zero );
+        h->dctf.sub16x16_dct8( h->mb.pic.fenc_dct8, h->mb.pic.p_fenc[0], (pixel*)x264_zero );
     if( do_both_dct || !h->mb.b_transform_8x8 )
-        h->dctf.sub16x16_dct( h->mb.pic.fenc_dct4, h->mb.pic.p_fenc[0], zero );
+        h->dctf.sub16x16_dct( h->mb.pic.fenc_dct4, h->mb.pic.p_fenc[0], (pixel*)x264_zero );
 }
 
 /* Reset fenc satd scores cache for psy RD */
diff --git a/encoder/me.c b/encoder/me.c
index fbb694b6..79bd4062 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -633,7 +633,6 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
             /* successive elimination by comparing DC before a full SAD,
              * because sum(abs(diff)) >= abs(diff(sum)). */
             uint16_t *sums_base = m->integral;
-            ALIGNED_16( static pixel zero[8*FENC_STRIDE] ) = {0};
             ALIGNED_ARRAY_16( int, enc_dc,[4] );
             int sad_size = i_pixel <= PIXEL_8x8 ? PIXEL_8x8 : PIXEL_4x4;
             int delta = x264_pixel_size[sad_size].w;
@@ -641,7 +640,7 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc,
             int xn;
             uint16_t *cost_fpel_mvx = h->cost_mv_fpel[h->mb.i_qp][-m->mvp[0]&3] + (-m->mvp[0]>>2);
 
-            h->pixf.sad_x4[sad_size]( zero, p_fenc, p_fenc+delta,
+            h->pixf.sad_x4[sad_size]( (pixel*)x264_zero, p_fenc, p_fenc+delta,
                 p_fenc+delta*FENC_STRIDE, p_fenc+delta+delta*FENC_STRIDE,
                 FENC_STRIDE, enc_dc );
             if( delta == 4 )
diff --git a/encoder/rdo.c b/encoder/rdo.c
index 41a5dddb..d884ee63 100644
--- a/encoder/rdo.c
+++ b/encoder/rdo.c
@@ -96,7 +96,6 @@ static ALWAYS_INLINE int cached_satd( x264_t *h, int size, int x, int y )
     static const uint8_t satd_shift_x[3] = {3,   2,   2};
     static const uint8_t satd_shift_y[3] = {2-1, 3-2, 2-2};
     static const uint8_t  satd_offset[3] = {0,   8,   16};
-    ALIGNED_16( static pixel zero[16] ) = {0};
     int cache_index = (x >> satd_shift_x[size - PIXEL_8x4]) + (y >> satd_shift_y[size - PIXEL_8x4])
                     + satd_offset[size - PIXEL_8x4];
     int res = h->mb.pic.fenc_satd_cache[cache_index];
@@ -105,8 +104,8 @@ static ALWAYS_INLINE int cached_satd( x264_t *h, int size, int x, int y )
     else
     {
         pixel *fenc = h->mb.pic.p_fenc[0] + x + y*FENC_STRIDE;
-        int dc = h->pixf.sad[size]( fenc, FENC_STRIDE, zero, 0 ) >> 1;
-        res = h->pixf.satd[size]( fenc, FENC_STRIDE, zero, 0 ) - dc;
+        int dc = h->pixf.sad[size]( fenc, FENC_STRIDE, (pixel*)x264_zero, 0 ) >> 1;
+        res = h->pixf.satd[size]( fenc, FENC_STRIDE, (pixel*)x264_zero, 0 ) - dc;
         h->mb.pic.fenc_satd_cache[cache_index] = res + 1;
         return res;
     }
@@ -123,7 +122,6 @@ static ALWAYS_INLINE int cached_satd( x264_t *h, int size, int x, int y )
 
 static inline int ssd_plane( x264_t *h, int size, int p, int x, int y )
 {
-    ALIGNED_16( static pixel zero[16] ) = {0};
     int satd = 0;
     pixel *fdec = h->mb.pic.p_fdec[p] + x + y*FDEC_STRIDE;
     pixel *fenc = h->mb.pic.p_fenc[p] + x + y*FENC_STRIDE;
@@ -140,8 +138,8 @@ static inline int ssd_plane( x264_t *h, int size, int p, int x, int y )
         }
         else
         {
-            int dc = h->pixf.sad[size]( fdec, FDEC_STRIDE, zero, 0 ) >> 1;
-            satd = abs(h->pixf.satd[size]( fdec, FDEC_STRIDE, zero, 0 ) - dc - cached_satd( h, size, x, y ));
+            int dc = h->pixf.sad[size]( fdec, FDEC_STRIDE, (pixel*)x264_zero, 0 ) >> 1;
+            satd = abs(h->pixf.satd[size]( fdec, FDEC_STRIDE, (pixel*)x264_zero, 0 ) - dc - cached_satd( h, size, x, y ));
         }
         satd = (satd * h->mb.i_psy_rd * h->mb.i_psy_rd_lambda + 128) >> 8;
     }