[x264-devel] commit: fix an overflow in dct4x4dc_mmx (Loren Merritt)

Thu Nov 27 08:26:33 CET 2008

x264 | branch: master | Loren Merritt <pengvado at akuvian.org> | Thu Nov 27 02:37:46 2008 +0000| [2338e1301aded556be8b85c6c3b4050e562ed862] | committer: Loren Merritt 

fix an overflow in dct4x4dc_mmx
(unlikely to have occurred in any real video)

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=2338e1301aded556be8b85c6c3b4050e562ed862
---

 common/x86/dct-a.asm |   36 ++++++++++++++++----------
 tools/checkasm.c     |   68 ++++++++++++++++++++++---------------------------
 2 files changed, 53 insertions(+), 51 deletions(-)

diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index e95006d..73922fc 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -26,8 +26,8 @@
 %include "x86util.asm"
 
 SECTION_RODATA
-pw_1:  times 8 dw 1
 pw_32: times 8 dw 32
+pw_8000: times 8 dw 0x8000
 pb_sub4frame:   db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
 pb_scan4framea: db 12,13,6,7,14,15,0,1,8,9,2,3,4,5,10,11
 pb_scan4frameb: db 0,1,8,9,2,3,4,5,10,11,12,13,6,7,14,15
@@ -40,6 +40,18 @@ SECTION .text
     SWAP %1, %4, %3
 %endmacro
 
+%macro SUMSUB_17BIT 4 ; a, b, tmp, 0x8000 -- rounded half-sum and half-difference of signed words a, b using pavgw
+    movq  m%3, m%4 ; tmp = 0x8000 in every word
+    paddw m%1, m%4 ; a + 0x8000: bias signed word into unsigned range
+    psubw m%3, m%2 ; tmp = 0x8000 - b, i.e. biased (-b); NOTE(review): wraps if b == -0x8000
+    paddw m%2, m%4 ; b + 0x8000: bias signed word into unsigned range
+    pavgw m%3, m%1 ; tmp = 0x8000 + ((a - b + 1) >> 1), unsigned rounding average
+    pavgw m%2, m%1 ; 2nd operand = 0x8000 + ((a + b + 1) >> 1)
+    psubw m%3, m%4 ; remove bias: tmp = (a - b + 1) >> 1
+    psubw m%2, m%4 ; remove bias: 2nd operand = (a + b + 1) >> 1
+    SWAP %1, %2, %3 ; rename registers; presumably leaves %1 = half-sum, %2 = half-difference -- confirm against SWAP's definition
+%endmacro
+
 ;-----------------------------------------------------------------------------
 ; void x264_dct4x4dc_mmx( int16_t d[4][4] )
 ;-----------------------------------------------------------------------------
@@ -48,22 +60,18 @@ cglobal x264_dct4x4dc_mmx, 1,1
     movq   m1, [r0+ 8]
     movq   m2, [r0+16]
     movq   m3, [r0+24]
+    movq   m7, [pw_8000 GLOBAL] ; convert to unsigned and back, so that pavgw works
     HADAMARD4_1D  0,1,2,3
     TRANSPOSE4x4W 0,1,2,3,4
-    HADAMARD4_1D  0,1,2,3
-    movq   m6, [pw_1 GLOBAL]
-    paddw  m0, m6
-    paddw  m1, m6
-    paddw  m2, m6
-    paddw  m3, m6
-    psraw  m0, 1
-    psraw  m1, 1
-    psraw  m2, 1
-    psraw  m3, 1
+    SUMSUB_BADC m1, m0, m3, m2
+    SWAP 0,1
+    SWAP 2,3
+    SUMSUB_17BIT 0,2,4,7
+    SUMSUB_17BIT 1,3,5,7
     movq  [r0+0], m0
-    movq  [r0+8], m1
-    movq [r0+16], m2
-    movq [r0+24], m3
+    movq  [r0+8], m2
+    movq [r0+16], m3
+    movq [r0+24], m1
     RET
 
 ;-----------------------------------------------------------------------------
diff --git a/tools/checkasm.c b/tools/checkasm.c
index a66b4e0..12110a5 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -466,7 +466,7 @@ static int check_dct( int cpu_ref, int cpu_new )
     x264_dct_function_t dct_ref;
     x264_dct_function_t dct_asm;
     x264_quant_function_t qf;
-    int ret = 0, ok, used_asm, i, interlace;
+    int ret = 0, ok, used_asm, i, j, interlace;
     DECLARE_ALIGNED_16( int16_t dct1[16][4][4] );
     DECLARE_ALIGNED_16( int16_t dct2[16][4][4] );
     DECLARE_ALIGNED_16( int16_t dct4[16][4][4] );
@@ -560,40 +560,33 @@ static int check_dct( int cpu_ref, int cpu_new )
     report( "add_idct8 :" );
 #undef TEST_IDCT
 
-    ok = 1; used_asm = 0;
-    if( dct_asm.dct4x4dc != dct_ref.dct4x4dc )
-    {
-        DECLARE_ALIGNED_16( int16_t dct1[4][4] ) = {{-12, 42, 23, 67},{2, 90, 89,56},{67,43,-76,91},{56,-78,-54,1}};
-        DECLARE_ALIGNED_16( int16_t dct2[4][4] ) = {{-12, 42, 23, 67},{2, 90, 89,56},{67,43,-76,91},{56,-78,-54,1}};
-        set_func_name( "dct4x4dc" );
-        used_asm = 1;
-        call_c1( dct_c.dct4x4dc, dct1 );
-        call_a1( dct_asm.dct4x4dc, dct2 );
-        if( memcmp( dct1, dct2, 32 ) )
-        {
-            ok = 0;
-            fprintf( stderr, " - dct4x4dc :        [FAILED]\n" );
-        }
-        call_c2( dct_c.dct4x4dc, dct1 );
-        call_a2( dct_asm.dct4x4dc, dct2 );
-    }
-    if( dct_asm.idct4x4dc != dct_ref.idct4x4dc )
-    {
-        DECLARE_ALIGNED_16( int16_t dct1[4][4] ) = {{-12, 42, 23, 67},{2, 90, 89,56},{67,43,-76,91},{56,-78,-54,1}};
-        DECLARE_ALIGNED_16( int16_t dct2[4][4] ) = {{-12, 42, 23, 67},{2, 90, 89,56},{67,43,-76,91},{56,-78,-54,1}};
-        set_func_name( "idct4x4dc" );
-        used_asm = 1;
-        call_c1( dct_c.idct4x4dc, dct1 );
-        call_a1( dct_asm.idct4x4dc, dct2 );
-        if( memcmp( dct1, dct2, 32 ) )
-        {
-            ok = 0;
-            fprintf( stderr, " - idct4x4dc :        [FAILED]\n" );
-        }
-        call_c2( dct_c.idct4x4dc, dct1 );
-        call_a2( dct_asm.idct4x4dc, dct2 );
-    }
-    report( "(i)dct4x4dc :" );
+#define TEST_DCTDC( name ) /* compare dct_c.name against dct_asm.name on 16 generated 4x4 blocks, then report */\
+    ok = 1; used_asm = 0;\
+    if( dct_asm.name != dct_ref.name ) /* skip when asm and reference use the same function pointer */\
+    {\
+        set_func_name( #name );\
+        used_asm = 1;\
+        uint16_t *p = (uint16_t*)buf1; /* input words are consumed sequentially from buf1 */\
+        for( i=0; i<16 && ok; i++ ) /* 16 input blocks; stops after the first mismatch */\
+        {\
+            for( j=0; j<16; j++ )\
+                dct1[0][0][j] = !i ? (j^j>>1^j>>2^j>>3)&1 ? 4080 : -4080 /* max dc: i==0, fixed +-4080 sign pattern */\
+                              : i<8 ? (*p++)&1 ? 4080 : -4080 /* max elements: i in 1..7, sign from low bit of *p */\
+                              : ((*p++)&0x1fff)-0x1000; /* general case: i>=8, values in [-4096,4095] */\
+            memcpy( dct2, dct1, 32 ); /* both implementations get the same 16 coefficients (32 bytes) */\
+            call_c1( dct_c.name, dct1[0] );\
+            call_a1( dct_asm.name, dct2[0] );\
+            if( memcmp( dct1, dct2, 32 ) ) /* outputs must match exactly */\
+                ok = 0;\
+        }\
+        call_c2( dct_c.name, dct1[0] ); /* NOTE(review): call_c2/call_a2 presumably are the timing pass -- confirm */\
+        call_a2( dct_asm.name, dct2[0] );\
+    }\
+    report( #name " :" );
+
+    TEST_DCTDC(  dct4x4dc );
+    TEST_DCTDC( idct4x4dc );
+#undef TEST_DCTDC
 
     x264_zigzag_function_t zigzag_c;
     x264_zigzag_function_t zigzag_ref;
@@ -1086,14 +1079,14 @@ static int check_quant( int cpu_ref, int cpu_new )
     ok = oks[1]; used_asm = used_asms[1];
     report( "dequant :" );
 
-
+    ok = 1;
     if( qf_a.denoise_dct != qf_ref.denoise_dct )
     {
         int size;
+        used_asm = 1;
         for( size = 16; size <= 64; size += 48 )
         {
             set_func_name( "denoise_dct" );
-            used_asm = 1;
             memcpy(dct1, buf1, size*2);
             memcpy(dct2, buf1, size*2);
             memcpy(buf3+256, buf3, 256);
@@ -1133,6 +1126,7 @@ static int check_quant( int cpu_ref, int cpu_new )
         } \
     }
 
+    ok = 1;
     TEST_DECIMATE( quant_8x8, decimate_score64, CQM_8IY, 8, 0, 6 );
     TEST_DECIMATE( quant_4x4, decimate_score16, CQM_4IY, 4, 0, 6 );
     TEST_DECIMATE( quant_4x4, decimate_score15, CQM_4IY, 4, 1, 7 );