[x264-devel] Add checkasm tests for memcpy_aligned, memzero_aligned

Jason Garrett-Glaser git at videolan.org
Thu May 12 08:39:11 CEST 2011


x264 | branch: master | Jason Garrett-Glaser <jason at x264.com> | Thu May  5 00:42:43 2011 -0700| [9202f134f4afdce9c142907b99c8501bb6c70b42] | committer: Jason Garrett-Glaser

Add checkasm tests for memcpy_aligned, memzero_aligned
Also make memcpy_aligned support sizes smaller than 64.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=9202f134f4afdce9c142907b99c8501bb6c70b42
---

 common/arm/mc-a.S    |    8 ++++++--
 common/x86/mc-a2.asm |   14 +++++++++++---
 tools/checkasm.c     |   38 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 55 insertions(+), 5 deletions(-)
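
For orientation before the patches below: both the NEON and x86 paths now peel
off the mod-16 and mod-32 remainders first and enter the main loop (64 bytes
per iteration for NEON/SSE2, 32 for MMX) only if any bytes are left, which is
what makes sizes smaller than 64 safe. A rough C model of that control flow
(illustrative only, not part of the commit):

#include <stdint.h>
#include <string.h>

/* Illustrative C model of the patched copy, working top-down like the
 * x86 versions.  Before this change the main loop always ran at least
 * once, so a remaining size of 0 caused an out-of-bounds copy. */
static void memcpy_aligned_model( uint8_t *dst, const uint8_t *src, int n )
{
    if( n & 16 )
    {
        n -= 16;
        memcpy( dst + n, src + n, 16 );
    }
    if( n & 32 )
    {
        n -= 32;
        memcpy( dst + n, src + n, 32 );
    }
    while( n > 0 ) /* new: skip the loop entirely when nothing remains */
    {
        n -= 64;
        memcpy( dst + n, src + n, 64 );
    }
}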

diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S
index 62e88cc..08daa4a 100644
--- a/common/arm/mc-a.S
+++ b/common/arm/mc-a.S
@@ -106,17 +106,21 @@ function memcpy_aligned_\dstalign\()_\srcalign\()_neon
     vst1.64     {d0-d1}, [r3,:r3align]!
 32: // n is a multiple of 32
     tst         r2, #32
-    beq         64f
+    beq         640f
     sub         r2, #32
     vld1.64     {d0-d3}, [r1,:r1align]!
     vst1.64     {d0-d3}, [r3,:r3align]!
-64: // n is a multiple of 64
+640: // n is a multiple of 64
+    cmp         r2, #0
+    beq         1f
+64:
     subs        r2, #64
     vld1.64     {d0-d3}, [r1,:r1align]!
     vld1.64     {d4-d7}, [r1,:r1align]!
     vst1.64     {d0-d3}, [r3,:r3align]!
     vst1.64     {d4-d7}, [r3,:r3align]!
     bgt         64b
+1:   // end
 .if \srcalign == 8 && \dstalign == 8
     vld1.64     {d0}, [r1,:64]!
     vst1.64     {d0}, [r3,:64]!
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index d4ea8b2..69b0d6a 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -1128,7 +1128,7 @@ PLANE_DEINTERLEAVE ssse3
 %endif
 
 ; These functions are not general-use; not only do the SSE ones require aligned input,
-; but they also will fail if given a non-mod16 size or a size less than 64.
+; but they also will fail if given a non-mod16 size.
 ; memzero SSE will fail for non-mod128.
 
 ;-----------------------------------------------------------------------------
@@ -1136,12 +1136,15 @@ PLANE_DEINTERLEAVE ssse3
 ;-----------------------------------------------------------------------------
 cglobal memcpy_aligned_mmx, 3,3
     test r2d, 16
-    jz .copy32
+    jz .copy32start
     sub r2d, 16
     movq mm0, [r1 + r2 + 0]
     movq mm1, [r1 + r2 + 8]
     movq [r0 + r2 + 0], mm0
     movq [r0 + r2 + 8], mm1
+.copy32start:
+    test r2d, r2d
+    jz .ret
 .copy32:
     sub r2d, 32
     movq mm0, [r1 + r2 +  0]
@@ -1153,6 +1156,7 @@ cglobal memcpy_aligned_mmx, 3,3
     movq [r0 + r2 + 16], mm2
     movq [r0 + r2 + 24], mm3
     jg .copy32
+.ret:
     REP_RET
 
 ;-----------------------------------------------------------------------------
@@ -1166,12 +1170,15 @@ cglobal memcpy_aligned_sse2, 3,3
     movdqa [r0 + r2], xmm0
 .copy32:
     test r2d, 32
-    jz .copy64
+    jz .copy64start
     sub r2d, 32
     movdqa xmm0, [r1 + r2 +  0]
     movdqa [r0 + r2 +  0], xmm0
     movdqa xmm1, [r1 + r2 + 16]
     movdqa [r0 + r2 + 16], xmm1
+.copy64start:
+    test r2d, r2d
+    jz .ret
 .copy64:
     sub r2d, 64
     movdqa xmm0, [r1 + r2 +  0]
@@ -1183,6 +1190,7 @@ cglobal memcpy_aligned_sse2, 3,3
     movdqa xmm3, [r1 + r2 + 48]
     movdqa [r0 + r2 + 48], xmm3
     jg .copy64
+.ret:
     REP_RET
 
 ;-----------------------------------------------------------------------------
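
As the comment in mc-a2.asm above notes, these helpers are not general-purpose:
the SSE copy needs 16-byte-aligned pointers and a mod-16 size, and the SSE
memzero needs a mod-128 size. A hypothetical debug wrapper (not in x264) that
asserts those preconditions for the copy could look like:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* Hypothetical checked wrapper; "copy" stands in for any of the
 * memcpy_aligned implementations above. */
static void memcpy_aligned_checked( void *dst, const void *src, size_t n,
                                    void (*copy)( void *, const void *, size_t ) )
{
    assert( ((uintptr_t)dst & 15) == 0 && ((uintptr_t)src & 15) == 0 );
    assert( (n & 15) == 0 ); /* non-mod16 sizes still fail */
    copy( dst, src, n );
}
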
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 4f66cfc..a0ae4fc 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -1276,6 +1276,44 @@ static int check_mc( int cpu_ref, int cpu_new )
         report( "mbtree propagate :" );
     }
 
+    if( mc_a.memcpy_aligned != mc_ref.memcpy_aligned )
+    {
+        set_func_name( "memcpy_aligned" );
+        ok = 1; used_asm = 1;
+        for( int size = 16; size < 256; size += 16 )
+        {
+            memset( buf4, 0xAA, size + 1 );
+            call_c( mc_c.memcpy_aligned, buf3, buf1, size );
+            call_a( mc_a.memcpy_aligned, buf4, buf1, size );
+            if( memcmp( buf3, buf4, size ) || buf4[size] != 0xAA )
+            {
+                ok = 0;
+                fprintf( stderr, "memcpy_aligned FAILED: size=%d\n", size );
+                break;
+            }
+        }
+        report( "memcpy aligned :" );
+    }
+
+    if( mc_a.memzero_aligned != mc_ref.memzero_aligned )
+    {
+        set_func_name( "memzero_aligned" );
+        ok = 1; used_asm = 1;
+        for( int size = 128; size < 1024; size += 128 )
+        {
+            memset( buf4, 0xAA, size + 1 );
+            call_c( mc_c.memzero_aligned, buf3, size );
+            call_a( mc_a.memzero_aligned, buf4, size );
+            if( memcmp( buf3, buf4, size ) || buf4[size] != 0xAA )
+            {
+                ok = 0;
+                fprintf( stderr, "memzero_aligned FAILED: size=%d\n", size );
+                break;
+            }
+        }
+        report( "memzero aligned :" );
+    }
+
     return ret;
 }
 
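
A note on the test design above: buf4 is poisoned with 0xAA through one byte
past the requested size, so any out-of-bounds write by the asm flips that
trailing canary and the test fails even if the payload matches. The same
pattern in isolation (names hypothetical, not the checkasm code itself):

#include <stdint.h>
#include <string.h>

/* Hypothetical distillation of the canary check: poison the destination
 * plus one trailing byte, run the candidate copy, then verify both the
 * payload and that the trailing byte survived. */
static int copy_ok( void (*copy)( void *, const void *, size_t ),
                    uint8_t *dst, const uint8_t *src, int size )
{
    memset( dst, 0xAA, size + 1 ); /* dst[size] is the canary */
    copy( dst, src, size );
    return !memcmp( dst, src, size ) && dst[size] == 0xAA;
}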


