[x264-devel] Faster pixel_memset

Jason Garrett-Glaser git at videolan.org
Thu May 12 08:39:12 CEST 2011


x264 | branch: master | Jason Garrett-Glaser <jason at x264.com> | Wed May  4 23:26:19 2011 -0700| [160557abe60d0671fbee282b51bd38fe764917c3] | committer: Jason Garrett-Glaser

Faster pixel_memset
~4x faster: the destination pointer is aligned first, then filled with word-sized (32/64-bit) stores.
Also inline plane_expand_border for improved constant propagation.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=160557abe60d0671fbee282b51bd38fe764917c3
---

 common/frame.c |   55 ++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 44 insertions(+), 11 deletions(-)

diff --git a/common/frame.c b/common/frame.c
index bca4f1f..2d56921 100644
--- a/common/frame.c
+++ b/common/frame.c
@@ -331,23 +331,56 @@ int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
 static void ALWAYS_INLINE pixel_memset( pixel *dst, pixel *src, int len, int size )
 {
     uint8_t *dstp = (uint8_t*)dst;
-    if( size == 1 )
-        memset(dst, *src, len);
-    else if( size == 2 )
+    uint8_t  v1 = *src;
+    uint16_t v2 = size == 1 ? v1 + (v1 <<  8) : M16( src );
+    uint32_t v4 = size <= 2 ? v2 + (v2 << 16) : M32( src );
+    int i = 0;
+    len *= size;
+
+    /* Align the input pointer if it isn't already */
+    if( (intptr_t)dstp & (WORD_SIZE - 1) )
+    {
+        if( size <= 2 && ((intptr_t)dstp & 3) )
+        {
+            if( size == 1 && ((intptr_t)dstp & 1) )
+                dstp[i++] = v1;
+            if( (intptr_t)dstp & 2 )
+            {
+                M16( dstp+i ) = v2;
+                i += 2;
+            }
+        }
+        if( WORD_SIZE == 8 && (intptr_t)dstp & 4 )
+        {
+            M32( dstp+i ) = v4;
+            i += 4;
+        }
+    }
+
+    /* Main copy loop */
+    if( WORD_SIZE == 8 )
     {
-        int v = M16( src );
-        for( int i = 0; i < len; i++ )
-            M16( dstp+i*2 ) = v;
+        uint64_t v8 = v4 + ((uint64_t)v4<<32);
+        for( ; i < len - 7; i+=8 )
+            M64( dstp+i ) = v8;
     }
-    else if( size == 4 )
+    for( ; i < len - 3; i+=4 )
+        M32( dstp+i ) = v4;
+
+    /* Finish up the last few bytes */
+    if( size <= 2 )
     {
-        int v = M32( src );
-        for( int i = 0; i < len; i++ )
-            M32( dstp+i*4 ) = v;
+        if( i < len - 1 )
+        {
+            M16( dstp+i ) = v2;
+            i += 2;
+        }
+        if( size == 1 && i != len )
+            dstp[i] = v1;
     }
 }
 
-static void plane_expand_border( pixel *pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom, int b_chroma )
+static void ALWAYS_INLINE plane_expand_border( pixel *pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom, int b_chroma )
 {
 #define PPIXEL(x, y) ( pix + (x) + (y)*i_stride )
     for( int y = 0; y < i_height; y++ )



More information about the x264-devel mailing list