[x264-devel] Add checkasm tests for memcpy_aligned, memzero_aligned
Jason Garrett-Glaser
git at videolan.org
Thu May 12 08:39:11 CEST 2011
x264 | branch: master | Jason Garrett-Glaser <jason at x264.com> | Thu May 5 00:42:43 2011 -0700| [9202f134f4afdce9c142907b99c8501bb6c70b42] | committer: Jason Garrett-Glaser
Add checkasm tests for memcpy_aligned, memzero_aligned
Also make memcpy_aligned support sizes smaller than 64.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=9202f134f4afdce9c142907b99c8501bb6c70b42
---
common/arm/mc-a.S | 8 ++++++--
common/x86/mc-a2.asm | 14 +++++++++++---
tools/checkasm.c | 38 ++++++++++++++++++++++++++++++++++++++
3 files changed, 55 insertions(+), 5 deletions(-)
diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S
index 62e88cc..08daa4a 100644
--- a/common/arm/mc-a.S
+++ b/common/arm/mc-a.S
@@ -106,17 +106,21 @@ function memcpy_aligned_\dstalign\()_\srcalign\()_neon
vst1.64 {d0-d1}, [r3,:r3align]!
32: // n is a multiple of 32
tst r2, #32
- beq 64f
+ beq 640f
sub r2, #32
vld1.64 {d0-d3}, [r1,:r1align]!
vst1.64 {d0-d3}, [r3,:r3align]!
-64: // n is a multiple of 64
+640: // n is a multiple of 64
+ cmp r2, #0
+ beq 1f
+64:
subs r2, #64
vld1.64 {d0-d3}, [r1,:r1align]!
vld1.64 {d4-d7}, [r1,:r1align]!
vst1.64 {d0-d3}, [r3,:r3align]!
vst1.64 {d4-d7}, [r3,:r3align]!
bgt 64b
+1: // end
.if \srcalign == 8 && \dstalign == 8
vld1.64 {d0}, [r1,:64]!
vst1.64 {d0}, [r3,:64]!
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index d4ea8b2..69b0d6a 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -1128,7 +1128,7 @@ PLANE_DEINTERLEAVE ssse3
%endif
; These functions are not general-use; not only do the SSE ones require aligned input,
-; but they also will fail if given a non-mod16 size or a size less than 64.
+; but they also will fail if given a non-mod16 size.
; memzero SSE will fail for non-mod128.
;-----------------------------------------------------------------------------
@@ -1136,12 +1136,15 @@ PLANE_DEINTERLEAVE ssse3
;-----------------------------------------------------------------------------
cglobal memcpy_aligned_mmx, 3,3
test r2d, 16
- jz .copy32
+ jz .copy32start
sub r2d, 16
movq mm0, [r1 + r2 + 0]
movq mm1, [r1 + r2 + 8]
movq [r0 + r2 + 0], mm0
movq [r0 + r2 + 8], mm1
+.copy32start
+ test r2d, r2d
+ jz .ret
.copy32:
sub r2d, 32
movq mm0, [r1 + r2 + 0]
@@ -1153,6 +1156,7 @@ cglobal memcpy_aligned_mmx, 3,3
movq [r0 + r2 + 16], mm2
movq [r0 + r2 + 24], mm3
jg .copy32
+.ret
REP_RET
;-----------------------------------------------------------------------------
@@ -1166,12 +1170,15 @@ cglobal memcpy_aligned_sse2, 3,3
movdqa [r0 + r2], xmm0
.copy32:
test r2d, 32
- jz .copy64
+ jz .copy64start
sub r2d, 32
movdqa xmm0, [r1 + r2 + 0]
movdqa [r0 + r2 + 0], xmm0
movdqa xmm1, [r1 + r2 + 16]
movdqa [r0 + r2 + 16], xmm1
+.copy64start
+ test r2d, r2d
+ jz .ret
.copy64:
sub r2d, 64
movdqa xmm0, [r1 + r2 + 0]
@@ -1183,6 +1190,7 @@ cglobal memcpy_aligned_sse2, 3,3
movdqa xmm3, [r1 + r2 + 48]
movdqa [r0 + r2 + 48], xmm3
jg .copy64
+.ret:
REP_RET
;-----------------------------------------------------------------------------
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 4f66cfc..a0ae4fc 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -1276,6 +1276,44 @@ static int check_mc( int cpu_ref, int cpu_new )
report( "mbtree propagate :" );
}
+ if( mc_a.memcpy_aligned != mc_ref.memcpy_aligned )
+ {
+ set_func_name( "memcpy_aligned" );
+ ok = 1; used_asm = 1;
+ for( int size = 16; size < 256; size += 16 )
+ {
+ memset( buf4, 0xAA, size + 1 );
+ call_c( mc_c.memcpy_aligned, buf3, buf1, size );
+ call_a( mc_a.memcpy_aligned, buf4, buf1, size );
+ if( memcmp( buf3, buf4, size ) || buf4[size] != 0xAA )
+ {
+ ok = 0;
+ fprintf( stderr, "memcpy_aligned FAILED: size=%d\n", size );
+ break;
+ }
+ }
+ report( "memcpy aligned :" );
+ }
+
+ if( mc_a.memzero_aligned != mc_ref.memzero_aligned )
+ {
+ set_func_name( "memzero_aligned" );
+ ok = 1; used_asm = 1;
+ for( int size = 128; size < 1024; size += 128 )
+ {
+ memset( buf4, 0xAA, size + 1 );
+ call_c( mc_c.memzero_aligned, buf3, size );
+ call_a( mc_a.memzero_aligned, buf4, size );
+ if( memcmp( buf3, buf4, size ) || buf4[size] != 0xAA )
+ {
+ ok = 0;
+ fprintf( stderr, "memzero_aligned FAILED: size=%d\n", size );
+ break;
+ }
+ }
+ report( "memzero aligned :" );
+ }
+
return ret;
}
More information about the x264-devel mailing list