[x264-devel] commit: A few tweaks to decimate asm (Jason Garrett-Glaser)
git version control
git at videolan.org
Fri Nov 21 05:17:11 CET 2008
x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Thu Nov 20 20:11:14 2008 -0800| [cb3c213850320fb0c1b17ae8bbbbf5d687e43961] | committer: Jason Garrett-Glaser
A few tweaks to decimate asm
The result is a little bit faster on both 32-bit and 64-bit.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=cb3c213850320fb0c1b17ae8bbbbf5d687e43961
---
common/x86/quant-a.asm | 31 +++++++++++++++++--------------
tools/checkasm.c | 10 +++++-----
2 files changed, 22 insertions(+), 19 deletions(-)
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index f89eaf6..80cf5b5 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -521,16 +521,14 @@ cglobal x264_decimate_score64_%1, 1,4
DECIMATE_MASK r2d, r3d, r0+96, m7, %1, null
shl r2, 48
or r1, r2
- not r1
- test r1, r1
+ xor r1, -1
je .ret
or eax, r3d
jne .ret9
.loop:
bsf rcx, r1
shr r1, cl
- movzx ecx, byte [table + rcx]
- add eax, ecx
+ add al, byte [table + rcx]
shr r1, 1
jne .loop
.ret:
@@ -557,28 +555,33 @@ cglobal x264_decimate_score64_%1, 1,5
DECIMATE_MASK r1, r0, r0+96, m7, %1, r5
shl r1, 16
or r4, r1
- not r3
- not r4
- mov r1, r3
- or r1, r4
- je .ret
+ xor r3, -1
+ je .tryret
+ xor r4, -1
+.cont
or r0, r2
- jne .ret9 ;r2 is zero at this point, so we don't need to zero it
+ jne .ret9 ;r0 is zero at this point, so we don't need to zero it
.loop:
bsf ecx, r3
test r3, r3
je .largerun
shrd r3, r4, cl
shr r4, cl
- movzx ecx, byte [x264_decimate_table8 + ecx]
- add r0, ecx
+ add r0b, byte [x264_decimate_table8 + ecx]
shrd r3, r4, 1
shr r4, 1
- mov r2, r3
- or r2, r4
+ cmp r0, 6 ;score64's threshold is never higher than 6
+ jge .ret9 ;this early termination is only useful on 32-bit because it can be done in the latency after shrd
+ test r3, r3
+ jne .loop
+ test r4, r4
jne .loop
.ret:
REP_RET
+.tryret:
+ xor r4, -1
+ jne .cont
+ REP_RET
.ret9:
mov eax, 9
RET
diff --git a/tools/checkasm.c b/tools/checkasm.c
index f293bb0..675817d 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -1078,7 +1078,7 @@ static int check_quant( int cpu_ref, int cpu_new )
}
report( "denoise dct :" );
-#define TEST_DECIMATE( qname, decname, block, w, ac ) \
+#define TEST_DECIMATE( qname, decname, block, w, ac, thresh ) \
if( qf_a.decname != qf_ref.decname ) \
{ \
set_func_name( #decname ); \
@@ -1093,7 +1093,7 @@ static int check_quant( int cpu_ref, int cpu_new )
memcpy( dct2, dct1, w*w*2 ); \
result_c = call_c1( qf_c.decname, (void*)dct2 ); \
result_a = call_a1( qf_a.decname, (void*)dct2 ); \
- if( result_c != result_a ) \
+ if( X264_MIN(result_c,thresh) != X264_MIN(result_a,thresh) ) \
{ \
ok = 0; \
fprintf( stderr, #decname ": [FAILED]\n" ); \
@@ -1104,9 +1104,9 @@ static int check_quant( int cpu_ref, int cpu_new )
} \
}
- TEST_DECIMATE( quant_8x8, decimate_score64, CQM_8IY, 8, 0 );
- TEST_DECIMATE( quant_4x4, decimate_score16, CQM_4IY, 4, 0 );
- TEST_DECIMATE( quant_4x4, decimate_score15, CQM_4IY, 4, 1 );
+ TEST_DECIMATE( quant_8x8, decimate_score64, CQM_8IY, 8, 0, 6 );
+ TEST_DECIMATE( quant_4x4, decimate_score16, CQM_4IY, 4, 0, 6 );
+ TEST_DECIMATE( quant_4x4, decimate_score15, CQM_4IY, 4, 1, 7 );
report( "decimate_score :" );
return ret;
More information about the x264-devel mailing list