[x264-devel] Fix some store forwarding stalls

Jason Garrett-Glaser git at videolan.org
Wed Feb 27 00:18:06 CET 2013


x264 | branch: master | Jason Garrett-Glaser <jason at x264.com> | Wed Feb  6 16:55:39 2013 -0800| [9d600d64194e0b2a77a8d9aa3f05b141cf473af0] | committer: Jason Garrett-Glaser

Fix some store forwarding stalls
There are quite a few others, but fixing most of them doesn't help, or there's
no easy way to avoid them.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=9d600d64194e0b2a77a8d9aa3f05b141cf473af0
---

 common/x86/pixel-a.asm |    4 ++--
 encoder/slicetype.c    |   13 +++++++------
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index 97adec5..bf0d27a 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -4410,8 +4410,8 @@ cglobal pixel_ads4, 5,7,12
     punpckhqdq xmm5, xmm5
     punpckhqdq xmm4, xmm4
 %if ARCH_X86_64
-    pshuflw xmm8, r6m, 0
-    punpcklqdq xmm8, xmm8
+    movd    xmm8, r6m
+    SPLATW  xmm8, xmm8
     ADS_START
     movdqu  xmm10, [r1]
     movdqu  xmm11, [r1+r2]
diff --git a/encoder/slicetype.c b/encoder/slicetype.c
index 99973b9..8a6c226 100644
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -633,15 +633,16 @@ lowres_intra_mb:
     if( !fenc->b_intra_calculated )
     {
         ALIGNED_ARRAY_16( pixel, edge,[36] );
-        pixel *pix = &pix1[8+FDEC_STRIDE - 1];
-        pixel *src = &fenc->lowres[0][i_pel_offset - 1];
+        pixel *pix = &pix1[8+FDEC_STRIDE];
+        pixel *src = &fenc->lowres[0][i_pel_offset];
         const int intra_penalty = 5 * a->i_lambda;
         int satds[3];
+        int pixoff = 4 / sizeof(pixel);
 
-        memcpy( pix-FDEC_STRIDE, src-i_stride, 17 * sizeof(pixel) );
-        for( int i = 0; i < 8; i++ )
-            pix[i*FDEC_STRIDE] = src[i*i_stride];
-        pix++;
+        /* Avoid store forwarding stalls by writing larger chunks */
+        memcpy( pix-FDEC_STRIDE, src-i_stride, 16 * sizeof(pixel) );
+        for( int i = -1; i < 8; i++ )
+            M32( &pix[i*FDEC_STRIDE-pixoff] ) = M32( &src[i*i_stride-pixoff] );
 
         h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[0], pix, satds );
         int i_icost = X264_MIN3( satds[0], satds[1], satds[2] );



More information about the x264-devel mailing list