[x264-devel] [Git][videolan/x264][master] 2 commits: Fix alignment of chroma buffer for weightp

Wed Jan 27 16:58:16 UTC 2021


Anton Mitrofanov pushed to branch master at VideoLAN / x264


Commits:
e32bff16 by Anton Mitrofanov at 2021-01-26T21:49:17+03:00
Fix alignment of chroma buffer for weightp

In 10-bit mode pixel_asd8 expects 16-byte alignment for pix1 and pix2.

- - - - -
b3aadb76 by Anton Mitrofanov at 2021-01-26T21:49:17+03:00
Fix PADH alignment

Make pointers to padded buffers aligned both before and after padding.

- - - - -


6 changed files:

- common/frame.c
- common/frame.h
- common/mc.c
- encoder/analyse.c
- encoder/encoder.c
- encoder/slicetype.c


Changes:

=====================================
common/frame.c
=====================================
@@ -38,7 +38,7 @@ static int align_stride( int x, int align, int disalign )
 static int align_plane_size( int x, int disalign )
 {
     if( !(x&(disalign-1)) )
-        x += 128;
+        x += X264_MAX( 128, NATIVE_ALIGN ) / SIZEOF_PIXEL;
     return x;
 }
 
@@ -63,29 +63,28 @@ static x264_frame_t *frame_new( x264_t *h, int b_fdec )
     int i_mb_count = h->mb.i_mb_count;
     int i_stride, i_width, i_lines, luma_plane_count;
     int i_padv = PADV << PARAM_INTERLACED;
-    int align = 16;
+    int align = NATIVE_ALIGN / SIZEOF_PIXEL;
 #if ARCH_X86 || ARCH_X86_64
     if( h->param.cpu&X264_CPU_CACHELINE_64 || h->param.cpu&X264_CPU_AVX512 )
-        align = 64;
+        align = 64 / SIZEOF_PIXEL;
     else if( h->param.cpu&X264_CPU_CACHELINE_32 || h->param.cpu&X264_CPU_AVX )
-        align = 32;
+        align = 32 / SIZEOF_PIXEL;
+    else
+        align = 16 / SIZEOF_PIXEL;
 #endif
 #if ARCH_PPC
-    int disalign = 1<<9;
+    int disalign = (1<<9) / SIZEOF_PIXEL;
 #else
-    int disalign = 1<<10;
+    int disalign = (1<<10) / SIZEOF_PIXEL;
 #endif
 
-    /* ensure frame alignment after PADH is added */
-    int padh_align = X264_MAX( align - PADH * SIZEOF_PIXEL, 0 ) / SIZEOF_PIXEL;
-
     CHECKED_MALLOCZERO( frame, sizeof(x264_frame_t) );
     PREALLOC_INIT
 
     /* allocate frame data (+64 for extra data for me) */
     i_width  = h->mb.i_mb_width*16;
     i_lines  = h->mb.i_mb_height*16;
-    i_stride = align_stride( i_width + 2*PADH, align, disalign );
+    i_stride = align_stride( i_width + PADH2, align, disalign );
 
     if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 )
     {
@@ -123,7 +122,7 @@ static x264_frame_t *frame_new( x264_t *h, int b_fdec )
     frame->i_csp = i_csp;
     frame->i_width_lowres = frame->i_width[0]/2;
     frame->i_lines_lowres = frame->i_lines[0]/2;
-    frame->i_stride_lowres = align_stride( frame->i_width_lowres + 2*PADH, align, disalign<<1 );
+    frame->i_stride_lowres = align_stride( frame->i_width_lowres + PADH2, align, disalign<<1 );
 
     for( int i = 0; i < h->param.i_bframe + 2; i++ )
         for( int j = 0; j < h->param.i_bframe + 2; j++ )
@@ -152,9 +151,9 @@ static x264_frame_t *frame_new( x264_t *h, int b_fdec )
     {
         int chroma_padv = i_padv >> (i_csp == X264_CSP_NV12);
         int chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + 2*chroma_padv));
-        PREALLOC( frame->buffer[1], (chroma_plane_size + padh_align) * SIZEOF_PIXEL );
+        PREALLOC( frame->buffer[1], chroma_plane_size * SIZEOF_PIXEL );
         if( PARAM_INTERLACED )
-            PREALLOC( frame->buffer_fld[1], (chroma_plane_size + padh_align) * SIZEOF_PIXEL );
+            PREALLOC( frame->buffer_fld[1], chroma_plane_size * SIZEOF_PIXEL );
     }
 
     /* all 4 luma planes allocated together, since the cacheline split code
@@ -167,9 +166,9 @@ static x264_frame_t *frame_new( x264_t *h, int b_fdec )
             luma_plane_size *= 4;
 
         /* FIXME: Don't allocate both buffers in non-adaptive MBAFF. */
-        PREALLOC( frame->buffer[p], (luma_plane_size + padh_align) * SIZEOF_PIXEL );
+        PREALLOC( frame->buffer[p], luma_plane_size * SIZEOF_PIXEL );
         if( PARAM_INTERLACED )
-            PREALLOC( frame->buffer_fld[p], (luma_plane_size + padh_align) * SIZEOF_PIXEL );
+            PREALLOC( frame->buffer_fld[p], luma_plane_size * SIZEOF_PIXEL );
     }
 
     frame->b_duplicate = 0;
@@ -207,7 +206,7 @@ static x264_frame_t *frame_new( x264_t *h, int b_fdec )
         {
             int64_t luma_plane_size = align_plane_size( frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*PADV), disalign );
 
-            PREALLOC( frame->buffer_lowres, (4 * luma_plane_size + padh_align) * SIZEOF_PIXEL );
+            PREALLOC( frame->buffer_lowres, 4 * luma_plane_size * SIZEOF_PIXEL );
 
             for( int j = 0; j <= !!h->param.i_bframe; j++ )
                 for( int i = 0; i <= h->param.i_bframe; i++ )
@@ -237,9 +236,9 @@ static x264_frame_t *frame_new( x264_t *h, int b_fdec )
     if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 )
     {
         int chroma_padv = i_padv >> (i_csp == X264_CSP_NV12);
-        frame->plane[1] = frame->buffer[1] + frame->i_stride[1] * chroma_padv + PADH + padh_align;
+        frame->plane[1] = frame->buffer[1] + frame->i_stride[1] * chroma_padv + PADH_ALIGN;
         if( PARAM_INTERLACED )
-            frame->plane_fld[1] = frame->buffer_fld[1] + frame->i_stride[1] * chroma_padv + PADH + padh_align;
+            frame->plane_fld[1] = frame->buffer_fld[1] + frame->i_stride[1] * chroma_padv + PADH_ALIGN;
     }
 
     for( int p = 0; p < luma_plane_count; p++ )
@@ -249,18 +248,18 @@ static x264_frame_t *frame_new( x264_t *h, int b_fdec )
         {
             for( int i = 0; i < 4; i++ )
             {
-                frame->filtered[p][i] = frame->buffer[p] + i*luma_plane_size + frame->i_stride[p] * i_padv + PADH + padh_align;
+                frame->filtered[p][i] = frame->buffer[p] + i*luma_plane_size + frame->i_stride[p] * i_padv + PADH_ALIGN;
                 if( PARAM_INTERLACED )
-                    frame->filtered_fld[p][i] = frame->buffer_fld[p] + i*luma_plane_size + frame->i_stride[p] * i_padv + PADH + padh_align;
+                    frame->filtered_fld[p][i] = frame->buffer_fld[p] + i*luma_plane_size + frame->i_stride[p] * i_padv + PADH_ALIGN;
             }
             frame->plane[p] = frame->filtered[p][0];
             frame->plane_fld[p] = frame->filtered_fld[p][0];
         }
         else
         {
-            frame->filtered[p][0] = frame->plane[p] = frame->buffer[p] + frame->i_stride[p] * i_padv + PADH + padh_align;
+            frame->filtered[p][0] = frame->plane[p] = frame->buffer[p] + frame->i_stride[p] * i_padv + PADH_ALIGN;
             if( PARAM_INTERLACED )
-                frame->filtered_fld[p][0] = frame->plane_fld[p] = frame->buffer_fld[p] + frame->i_stride[p] * i_padv + PADH + padh_align;
+                frame->filtered_fld[p][0] = frame->plane_fld[p] = frame->buffer_fld[p] + frame->i_stride[p] * i_padv + PADH_ALIGN;
         }
     }
 
@@ -270,7 +269,7 @@ static x264_frame_t *frame_new( x264_t *h, int b_fdec )
         frame->mv16x16++;
 
         if( h->param.analyse.i_me_method >= X264_ME_ESA )
-            frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH;
+            frame->integral = (uint16_t*)frame->buffer[3] + frame->i_stride[0] * i_padv + PADH_ALIGN;
     }
     else
     {
@@ -278,7 +277,7 @@ static x264_frame_t *frame_new( x264_t *h, int b_fdec )
         {
             int64_t luma_plane_size = align_plane_size( frame->i_stride_lowres * (frame->i_lines[0]/2 + 2*PADV), disalign );
             for( int i = 0; i < 4; i++ )
-                frame->lowres[i] = frame->buffer_lowres + frame->i_stride_lowres * PADV + PADH + padh_align + i * luma_plane_size;
+                frame->lowres[i] = frame->buffer_lowres + frame->i_stride_lowres * PADV + PADH_ALIGN + i * luma_plane_size;
 
             for( int j = 0; j <= !!h->param.i_bframe; j++ )
                 for( int i = 0; i <= h->param.i_bframe; i++ )


=====================================
common/frame.h
=====================================
@@ -31,6 +31,8 @@
 /* number of pixels past the edge of the frame, for motion estimation/compensation */
 #define PADH 32
 #define PADV 32
+#define PADH_ALIGN X264_MAX( PADH, NATIVE_ALIGN / SIZEOF_PIXEL )
+#define PADH2 (PADH_ALIGN + PADH)
 
 typedef struct x264_frame
 {


=====================================
common/mc.c
=====================================
@@ -749,15 +749,15 @@ void x264_frame_filter( x264_t *h, x264_frame_t *frame, int mb_y, int b_end )
         int stride = frame->i_stride[0];
         if( start < 0 )
         {
-            memset( frame->integral - PADV * stride - PADH, 0, stride * sizeof(uint16_t) );
+            memset( frame->integral - PADV * stride - PADH_ALIGN, 0, stride * sizeof(uint16_t) );
             start = -PADV;
         }
         if( b_end )
             height += PADV-9;
         for( int y = start; y < height; y++ )
         {
-            pixel    *pix  = frame->plane[0] + y * stride - PADH;
-            uint16_t *sum8 = frame->integral + (y+1) * stride - PADH;
+            pixel    *pix  = frame->plane[0] + y * stride - PADH_ALIGN;
+            uint16_t *sum8 = frame->integral + (y+1) * stride - PADH_ALIGN;
             uint16_t *sum4;
             if( h->frames.b_have_sub8x8_esa )
             {


=====================================
encoder/analyse.c
=====================================
@@ -223,10 +223,10 @@ void x264_analyse_weight_frame( x264_t *h, int end )
         if( h->sh.weight[j][0].weightfn )
         {
             x264_frame_t *frame = h->fref[0][j];
-            int width = frame->i_width[0] + 2*PADH;
+            int width = frame->i_width[0] + PADH2;
             int i_padv = PADV << PARAM_INTERLACED;
             int offset, height;
-            pixel *src = frame->filtered[0][0] - frame->i_stride[0]*i_padv - PADH;
+            pixel *src = frame->filtered[0][0] - frame->i_stride[0]*i_padv - PADH_ALIGN;
             height = X264_MIN( 16 + end + i_padv, h->fref[0][j]->i_lines[0] + i_padv*2 ) - h->fenc->i_lines_weighted;
             offset = h->fenc->i_lines_weighted*frame->i_stride[0];
             h->fenc->i_lines_weighted += height;
@@ -234,7 +234,7 @@ void x264_analyse_weight_frame( x264_t *h, int end )
                 for( int k = j; k < h->i_ref[0]; k++ )
                     if( h->sh.weight[k][0].weightfn )
                     {
-                        pixel *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH;
+                        pixel *dst = h->fenc->weighted[k] - h->fenc->i_stride[0]*i_padv - PADH_ALIGN;
                         x264_weight_scale_plane( h, dst + offset, frame->i_stride[0],
                                                  src + offset, frame->i_stride[0],
                                                  width, height, &h->sh.weight[k][0] );


=====================================
encoder/encoder.c
=====================================
@@ -2185,14 +2185,14 @@ static void weighted_pred_init( x264_t *h )
                     assert( h->sh.weight[j][i].i_denom == denom );
                     if( !i )
                     {
-                        h->fenc->weighted[j] = h->mb.p_weight_buf[buffer_next++] + h->fenc->i_stride[0] * i_padv + PADH;
+                        h->fenc->weighted[j] = h->mb.p_weight_buf[buffer_next++] + h->fenc->i_stride[0] * i_padv + PADH_ALIGN;
                         //scale full resolution frame
                         if( h->param.i_threads == 1 )
                         {
-                            pixel *src = h->fref[0][j]->filtered[0][0] - h->fref[0][j]->i_stride[0]*i_padv - PADH;
-                            pixel *dst = h->fenc->weighted[j] - h->fenc->i_stride[0]*i_padv - PADH;
+                            pixel *src = h->fref[0][j]->filtered[0][0] - h->fref[0][j]->i_stride[0]*i_padv - PADH_ALIGN;
+                            pixel *dst = h->fenc->weighted[j] - h->fenc->i_stride[0]*i_padv - PADH_ALIGN;
                             int stride = h->fenc->i_stride[0];
-                            int width = h->fenc->i_width[0] + PADH*2;
+                            int width = h->fenc->i_width[0] + PADH2;
                             int height = h->fenc->i_lines[0] + i_padv*2;
                             x264_weight_scale_plane( h, dst, stride, src, stride, width, height, &h->sh.weight[j][0] );
                             h->fenc->i_lines_weighted = height;


=====================================
encoder/slicetype.c
=====================================
@@ -112,7 +112,6 @@ static NOINLINE void weight_cost_init_chroma( x264_t *h, x264_frame_t *fenc, x26
 {
     int ref0_distance = fenc->i_frame - ref->i_frame - 1;
     int i_stride = fenc->i_stride[1];
-    int i_offset = i_stride / 2;
     int i_lines = fenc->i_lines[1];
     int i_width = fenc->i_width[1];
     int v_shift = CHROMA_V_SHIFT;
@@ -136,7 +135,7 @@ static NOINLINE void weight_cost_init_chroma( x264_t *h, x264_frame_t *fenc, x26
     }
     else
         h->mc.plane_copy_deinterleave( dstu, i_stride, dstv, i_stride, ref->plane[1], i_stride, cw, ch );
-    h->mc.plane_copy_deinterleave( dstu+i_offset, i_stride, dstv+i_offset, i_stride, fenc->plane[1], i_stride, cw, ch );
+    h->mc.plane_copy_deinterleave( dstu+i_width, i_stride, dstv+i_width, i_stride, fenc->plane[1], i_stride, cw, ch );
     x264_emms();
 }
 
@@ -228,7 +227,7 @@ static NOINLINE unsigned int weight_cost_chroma( x264_t *h, x264_frame_t *fenc,
     int i_stride = fenc->i_stride[1];
     int i_lines = fenc->i_lines[1];
     int i_width = fenc->i_width[1];
-    pixel *src = ref + (i_stride >> 1);
+    pixel *src = ref + i_width;
     ALIGNED_ARRAY_16( pixel, buf, [8*16] );
     int pixoff = 0;
     int height = 16 >> CHROMA_V_SHIFT;
@@ -493,11 +492,11 @@ void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int
         //scale lowres in lookahead for slicetype_frame_cost
         pixel *src = ref->buffer_lowres;
         pixel *dst = h->mb.p_weight_buf[0];
-        int width = ref->i_width_lowres + PADH*2;
+        int width = ref->i_width_lowres + PADH2;
         int height = ref->i_lines_lowres + PADV*2;
         x264_weight_scale_plane( h, dst, ref->i_stride_lowres, src, ref->i_stride_lowres,
                                  width, height, &weights[0] );
-        fenc->weighted[0] = h->mb.p_weight_buf[0] + PADH + ref->i_stride_lowres * PADV;
+        fenc->weighted[0] = h->mb.p_weight_buf[0] + PADH_ALIGN + ref->i_stride_lowres * PADV;
     }
 }
 



View it on GitLab: https://code.videolan.org/videolan/x264/-/compare/35417dcd65a57321fbadf98c9a4cff1cb741db4b...b3aadb76329d3c2aedac85142441476bbe5f002c

-- 
View it on GitLab: https://code.videolan.org/videolan/x264/-/compare/35417dcd65a57321fbadf98c9a4cff1cb741db4b...b3aadb76329d3c2aedac85142441476bbe5f002c
You're receiving this email because of your account on code.videolan.org.