[x264-devel] Missing emms in x264 (+ workaround patch)

Mon Oct 9 03:07:12 CEST 2006

After spending several hours trying to track down mysterious data
corruption while working on an x264 encoding module for transcode, I've
discovered that there's at least one case in which x264 (SVN as of ~7 hours
ago) does not execute an EMMS instruction after calling MMX/SSE assembly
routines.  I don't have the time to track down exactly which routine it is;
hopefully someone here who has more experience with the code can take care
of that.  In the meantime, the patch below (which just tacks an EMMS onto
the end of every assembly routine) seems to have resolved the problem.

     For the record, here are the symptoms (in all cases using the default
x264 parameter set, except in the "bframes=2" example):

### Works fine (image size = 1048576 pixels):
$ transcode -i source -c 0-30 -o test.avi -y x264,null,avi -Z 1024x1024

### Corruption in lower right corner of image for P-frames (image size =
### 1048580 pixels):
$ transcode -i source -c 0-30 -o test.avi -y x264,null,avi -Z 1090x962

### Corruption throughout lower half of image for P-frames:
$ transcode -i source -c 0-30 -o test.avi -y x264,null,avi -Z 1440x1080

### Corruption throughout lower half of image for P- and B-frames, and the
### progress counter's frames-per-second calculation (floating point) gives
### a NaN:
$ transcode -i source -c 0-30 -o test.avi -y x264=bframes=2,null,avi \
            -Z 1440x1080

  --Andrew Church
    achurch at achurch.org
    http://achurch.org/

---------------------------------------------------------------------------

diff -urN ../x264-svn-20061008.2019-orig/common/i386/dct-a.asm common/i386/dct-a.asm
--- ../x264-svn-20061008.2019-orig/common/i386/dct-a.asm	2006-10-08 20:19:42 +0900
+++ common/i386/dct-a.asm	2006-10-09 02:51:40 +0900
@@ -182,6 +182,7 @@
     psraw   mm4,        1
     movq    [eax+24],   mm4
     picpop  ebx
+    emms
     ret
 
 cglobal x264_idct4x4dc_mmx
@@ -209,6 +210,7 @@
     movq    [eax+ 8],   mm2
     movq    [eax+16],   mm3
     movq    [eax+24],   mm4
+    emms
     ret
 
 cglobal x264_sub4x4_dct_mmx
@@ -248,6 +250,7 @@
     movq    [eax+16],   mm3
     movq    [eax+24],   mm0
 
+    emms
     ret
 
 cglobal x264_add4x4_idct_mmx
@@ -291,6 +294,7 @@
     MMX_STORE_DIFF_4P   mm3, mm0, mm6, mm7, [eax+3*FDEC_STRIDE]
 
     picpop  ebx
+    emms
     ret
 
 
@@ -343,6 +347,7 @@
     %assign disp disp+16
     %endrep
 
+    emms
     ret
 
 ALIGN 16
@@ -428,6 +433,7 @@
     %assign disp disp+8
     %endrep
 
+    emms
     ret
 
 ALIGN 16
@@ -522,6 +528,7 @@
     %assign disp disp+8
     %endrep
 
+    emms
     ret
 
 ALIGN 16
@@ -551,6 +558,7 @@
     add         eax, FDEC_STRIDE
     %assign disp disp+16
     %endrep
+    emms
     ret
 
 ALIGN 16
@@ -600,6 +608,7 @@
     movq  [eax+ 40], mm4
     movq  [eax+ 56], mm6
 
+    emms
     ret
 
 ;-----------------------------------------------------------------------------
@@ -729,4 +738,5 @@
     movq      [ecx+56], mm7
     movq      [ecx+12], mm1
     movd      [ecx+ 8], mm2
+    emms
     ret
diff -urN ../x264-svn-20061008.2019-orig/common/i386/deblock-a.asm common/i386/deblock-a.asm
--- ../x264-svn-20061008.2019-orig/common/i386/deblock-a.asm	2006-10-08 20:19:42 +0900
+++ common/i386/deblock-a.asm	2006-10-09 02:51:22 +0900
@@ -311,6 +311,7 @@
     pop     esi
     pop     edi
     picpop  ebx
+    emms
     ret
 
 
@@ -371,6 +372,7 @@
     add    esp, 96
     pop    ebp
     pop    ebx
+    emms
     ret
 
 
@@ -409,6 +411,7 @@
 %macro CHROMA_END 0
     pop  esi
     pop  edi
+    emms
     ret
 %endmacro
 
diff -urN ../x264-svn-20061008.2019-orig/common/i386/mc-a.asm common/i386/mc-a.asm
--- ../x264-svn-20061008.2019-orig/common/i386/mc-a.asm	2006-10-08 20:19:42 +0900
+++ common/i386/mc-a.asm	2006-10-09 02:51:14 +0900
@@ -115,6 +115,7 @@
     pop         esi
     pop         ebx
     pop         ebp
+    emms
     ret
 
                           
@@ -154,6 +155,7 @@
     pop         esi
     pop         ebx
     pop         ebp
+    emms
     ret
 
 
@@ -196,6 +198,7 @@
     pop         esi
     pop         ebx
     pop         ebp
+    emms
     ret
 
 ALIGN 16
@@ -234,6 +237,7 @@
     pop         esi
     pop         ebx
     pop         ebp
+    emms
     ret
 
 
@@ -278,6 +282,7 @@
     picpop  ebx
     pop     esi
     pop     edi
+    emms
     ret
 %endmacro
 
@@ -373,6 +378,7 @@
     pop     edi
     pop     esi
     pop     ebx
+    emms
     ret
 
 ALIGN 16
@@ -411,6 +417,7 @@
     pop     edi
     pop     esi
     pop     ebx
+    emms
     ret
 
 ALIGN 16
@@ -457,6 +464,7 @@
     pop     edi
     pop     esi
     pop     ebx
+    emms
     ret
 
 
@@ -490,6 +498,7 @@
     pop     edi
     pop     esi
     pop     ebx
+    emms
     ret
 
 
@@ -594,4 +603,5 @@
 .finish
     pop     edi
     picpop  ebx
+    emms
     ret
diff -urN ../x264-svn-20061008.2019-orig/common/i386/mc-a2.asm common/i386/mc-a2.asm
--- ../x264-svn-20061008.2019-orig/common/i386/mc-a2.asm	2006-10-08 20:19:42 +0900
+++ common/i386/mc-a2.asm	2006-10-09 02:50:57 +0900
@@ -267,6 +267,7 @@
     pop         esi
     pop         ebx
     pop         ebp
+    emms
     ret
 
 
@@ -328,5 +329,6 @@
     pop    ebx
     pop    esi
     pop    edi
+    emms
     ret
 
diff -urN ../x264-svn-20061008.2019-orig/common/i386/pixel-a.asm common/i386/pixel-a.asm
--- ../x264-svn-20061008.2019-orig/common/i386/pixel-a.asm	2006-10-08 20:19:42 +0900
+++ common/i386/pixel-a.asm	2006-10-09 02:50:52 +0900
@@ -250,6 +250,7 @@
     movd    [eax+8], mm2
     pop     esi
     pop     edi
+    emms
     ret
 %endmacro
 
@@ -262,6 +263,7 @@
     pop     ebx
     pop     esi
     pop     edi
+    emms
     ret
 %endmacro
 
@@ -510,6 +512,7 @@
     movd eax,    mm0
 
     pop ebx
+    emms
     ret
 %endmacro
 
@@ -580,6 +583,7 @@
     jl   .continue
     pop  ebx
     mov  eax, 0xffff
+    emms
     ret
 ALIGN 4
 .continue:
@@ -616,6 +620,7 @@
     movd    eax,    mm0
 
     pop ebx
+    emms
     ret
 %endmacro
 
@@ -659,6 +664,7 @@
     movd        eax, mm0
     and         eax, 0xffff
     pop         ebx
+    emms
     ret
 %endmacro
 
@@ -826,6 +832,7 @@
     paddd       mm0, mm1
     movd        eax, mm0
     pop         ebx
+    emms
     ret
 
 
@@ -947,6 +954,7 @@
     shr    eax, 1
     add    esp, 0x70
     pop    ebx
+    emms
     ret
 %undef args
 %undef spill
@@ -988,6 +996,7 @@
     pop    ebp
     pop    edi
     pop    esi
+    emms
     ret
 
 
@@ -1117,6 +1126,7 @@
     pop  esi
     pop  edi
     pop  ebx
+    emms
     ret
 
 ALIGN 16
@@ -1248,6 +1258,7 @@
     pop  edi
     pop  ebp
     pop  ebx
+    emms
     ret
 
 ALIGN 16
@@ -1393,6 +1404,7 @@
     pop  edi
     pop  ebp
     pop  ebx
+    emms
     ret
 
 %macro LOAD_4x8P 1 ; dx
@@ -1570,6 +1582,7 @@
     mov      [eax+8], ecx ; i8x8_dc satd
 
     add      esp, 0x70
+    emms
     ret
 %undef args
 %undef spill
@@ -1683,6 +1696,7 @@
     sub     edx, 4
     jg      .loop
     pop     ebx
+    emms
     ret
 
 ALIGN 16
@@ -1711,6 +1725,7 @@
     sub     edx, 4
     jg      .loop
     pop     ebx
+    emms
     ret
 
 ALIGN 16
@@ -1734,4 +1749,5 @@
     sub     edx, 8
     jg      .loop
     nop
+    emms
     ret
diff -urN ../x264-svn-20061008.2019-orig/common/i386/pixel-sse2.asm common/i386/pixel-sse2.asm
--- ../x264-svn-20061008.2019-orig/common/i386/pixel-sse2.asm	2006-10-08 20:19:42 +0900
+++ common/i386/pixel-sse2.asm	2006-10-09 02:50:26 +0900
@@ -103,6 +103,7 @@
     movd    eax,  xmm0
 
     pop ebx
+    emms
     ret
 %endmacro
 
@@ -294,6 +295,7 @@
     movd    [eax+8], xmm2
     pop     esi
     pop     edi
+    emms
     ret
 %endmacro
 
@@ -314,6 +316,7 @@
     pop     ebx
     pop     esi
     pop     edi
+    emms
     ret
 %endmacro
 
@@ -397,6 +400,7 @@
     movd    eax,    xmm0
 
     pop ebx
+    emms
     ret
 %endmacro
 
@@ -540,6 +544,7 @@
     HADDW   xmm6, xmm7
     movd    eax,  xmm6
     pop     ebx
+    emms
     ret
 %endmacro
 
@@ -682,6 +687,7 @@
     movq      [eax+16], xmm1
     movq      [eax+24], xmm5
     pop       ebx
+    emms
     ret
 
 ;-----------------------------------------------------------------------------
@@ -746,5 +752,6 @@
     movd     [picesp+4], xmm0
+    emms                          ; must come before fld: emms marks every x87 register empty, which would discard the value fld leaves in st0 for the caller
     fld      dword [picesp+4]
     picpop   ebx
     ret
 
diff -urN ../x264-svn-20061008.2019-orig/common/i386/predict-a.asm common/i386/predict-a.asm
--- ../x264-svn-20061008.2019-orig/common/i386/predict-a.asm	2006-10-08 20:19:42 +0900
+++ common/i386/predict-a.asm	2006-10-09 02:50:13 +0900
@@ -108,6 +108,7 @@
     mov         edx, [esp+4]
     movq        mm0, [eax+16]
     STORE8x8    mm0, mm0
+    emms
     ret
 
 ;-----------------------------------------------------------------------------
@@ -131,6 +132,7 @@
     packuswb    mm0, mm0
     STORE8x8    mm0, mm0
     picpop      ebx
+    emms
     ret
 
 ;-----------------------------------------------------------------------------
@@ -151,6 +153,7 @@
     packuswb    mm0, mm0
     STORE8x8    mm0, mm0
     picpop      ebx
+    emms
     ret
 %endmacro
 
@@ -192,6 +195,7 @@
     movq        [edx + Y*FDEC_STRIDE], mm1
 
     picpop      ebx
+    emms
     ret
 
 ;-----------------------------------------------------------------------------
@@ -229,6 +233,7 @@
     movq        [edx + Y*FDEC_STRIDE], mm0
 
     picpop      ebx
+    emms
     ret
 
 ;-----------------------------------------------------------------------------
@@ -271,6 +276,7 @@
     movq        [edx + (Y+1)*FDEC_STRIDE], mm0
 
     picpop      ebx
+    emms
     ret
 
 ;-----------------------------------------------------------------------------
@@ -282,6 +288,7 @@
     mov         edx, [esp + 4]
     movq        mm0, [edx - FDEC_STRIDE]
     STORE8x8    mm0, mm0
+    emms
     ret
 
 ;-----------------------------------------------------------------------------
@@ -321,6 +328,7 @@
     STORE8x8    mm0, mm2
 
     picpop      ebx
+    emms
     ret
 
 ;-----------------------------------------------------------------------------
@@ -361,6 +369,7 @@
 
     nop
     picpop      ebx
+    emms
     ret
 
 ;-----------------------------------------------------------------------------
@@ -416,6 +425,7 @@
 
     nop
     picpop      ebx
+    emms
     ret
 
 ;-----------------------------------------------------------------------------
@@ -453,6 +463,7 @@
     SAVE_0_1    (edx + 2 * ecx)         ; 14
     SAVE_0_1    (edx + eax)             ; 15
 
+    emms
     ret
 
 ;-----------------------------------------------------------------------------
@@ -493,6 +504,7 @@
 ALIGN 16
 predict_16x16_dc_core_mmxext:
     PRED16x16_DC [esp+8], 5, esp
+    emms
     ret
 
 ALIGN 16
@@ -501,5 +513,6 @@
     picgetgot ebx
     PRED16x16_DC [pw_8 GOT_ebx], 4, picesp
     picpop ebx
+    emms
     ret
 
diff -urN ../x264-svn-20061008.2019-orig/common/i386/quant-a.asm common/i386/quant-a.asm
--- ../x264-svn-20061008.2019-orig/common/i386/quant-a.asm	2006-10-08 20:19:42 +0900
+++ common/i386/quant-a.asm	2006-10-09 02:49:49 +0900
@@ -115,6 +115,7 @@
 x264_quant_2x2_dc_core15_mmx:
     MMX_QUANT15_DC_START
     MMX_QUANT15_1x4 [eax], mm5, mm6, mm7
+    emms
     ret
 
 ALIGN 16
@@ -130,6 +131,7 @@
     add         eax, byte 8
 %endrep
 
+    emms
     ret
 
 ALIGN 16
@@ -148,6 +150,7 @@
     add         eax, byte 8
 %endrep
 
+    emms
     ret
 
 ALIGN 16
@@ -166,6 +169,7 @@
     add         eax, byte 8
 %endrep
 
+    emms
     ret
 
 ; ============================================================================
@@ -218,6 +222,7 @@
 x264_quant_2x2_dc_core16_mmxext:
     MMXEXT_QUANT16_DC_START
     MMXEXT_QUANT16_1x4 [eax], mm5, mm6, mm7
+    emms
     ret
 
 ALIGN 16
@@ -233,6 +238,7 @@
     add         eax, byte 8
 %endrep
 
+    emms
     ret
 
 ALIGN 16
@@ -252,6 +258,7 @@
     add         eax, byte 8
 %endrep
 
+    emms
     ret
 
 ALIGN 16
@@ -271,6 +278,7 @@
     add         eax, byte 8
 %endrep
 
+    emms
     ret
 
 
@@ -329,6 +337,7 @@
 x264_quant_2x2_dc_core32_mmxext:
     MMX_QUANT32_DC_START
     MMXEXT_QUANT32_1x4 [eax], mm5, mm5, mm6, mm7
+    emms
     ret
 
 ALIGN 16
@@ -344,6 +353,7 @@
     add         eax, byte 8
 %endrep
 
+    emms
     ret
 
 ALIGN 16
@@ -360,6 +370,7 @@
     add         ecx, byte 16
 %endrep
 
+    emms
     ret
 
 ALIGN 16
@@ -376,6 +387,7 @@
     add         ecx, byte 16
 %endrep
 
+    emms
     ret
 
 
@@ -474,6 +486,7 @@
     jge  .loopl16
 
     nop
+    emms
     ret
 
 .rshift32:
@@ -496,6 +509,7 @@
     jge  .loopr32
 
     nop
+    emms
     ret
 %endmacro
 

-- 
This is the x264-devel mailing-list
To unsubscribe, go to: http://developers.videolan.org/lists.html