[x264-devel] commit: A few tweaks to decimate asm (Jason Garrett-Glaser )

Fri Nov 21 05:17:11 CET 2008

x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Thu Nov 20 20:11:14 2008 -0800| [cb3c213850320fb0c1b17ae8bbbbf5d687e43961] | committer: Jason Garrett-Glaser 

A few tweaks to decimate asm
As a result, the decimate asm is a little bit faster on both 32-bit and 64-bit.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=cb3c213850320fb0c1b17ae8bbbbf5d687e43961
---

 common/x86/quant-a.asm |   31 +++++++++++++++++--------------
 tools/checkasm.c       |   10 +++++-----
 2 files changed, 22 insertions(+), 19 deletions(-)

diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index f89eaf6..80cf5b5 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -521,16 +521,14 @@ cglobal x264_decimate_score64_%1, 1,4
     DECIMATE_MASK r2d, r3d, r0+96, m7, %1, null
     shl   r2, 48
     or    r1, r2
-    not   r1
-    test  r1, r1
+    xor   r1, -1
     je   .ret
     or    eax, r3d
     jne  .ret9
 .loop:
     bsf   rcx, r1
     shr   r1, cl
-    movzx ecx, byte [table + rcx]
-    add   eax, ecx
+    add   al, byte [table + rcx]
     shr   r1, 1
     jne  .loop
 .ret:
@@ -557,28 +555,33 @@ cglobal x264_decimate_score64_%1, 1,5
     DECIMATE_MASK r1, r0, r0+96, m7, %1, r5
     shl   r1, 16
     or    r4, r1
-    not   r3
-    not   r4
-    mov   r1, r3
-    or    r1, r4
-    je   .ret
+    xor   r3, -1
+    je   .tryret
+    xor   r4, -1
+.cont
     or    r0, r2
-    jne  .ret9    ;r2 is zero at this point, so we don't need to zero it
+    jne  .ret9      ;r0 is zero at this point, so we don't need to zero it
 .loop:
     bsf   ecx, r3
     test  r3, r3
     je   .largerun
     shrd  r3, r4, cl
     shr   r4, cl
-    movzx ecx, byte [x264_decimate_table8 + ecx]
-    add   r0, ecx
+    add   r0b, byte [x264_decimate_table8 + ecx]
     shrd  r3, r4, 1
     shr   r4, 1
-    mov   r2, r3
-    or    r2, r4
+    cmp   r0, 6     ;score64's threshold is never higher than 6
+    jge  .ret9      ;this early termination is only useful on 32-bit because it can be done in the latency after shrd
+    test  r3, r3
+    jne  .loop
+    test  r4, r4
     jne  .loop
 .ret:
     REP_RET
+.tryret:
+    xor   r4, -1
+    jne  .cont
+    REP_RET
 .ret9:
     mov   eax, 9
     RET
diff --git a/tools/checkasm.c b/tools/checkasm.c
index f293bb0..675817d 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -1078,7 +1078,7 @@ static int check_quant( int cpu_ref, int cpu_new )
     }
     report( "denoise dct :" );
 
-#define TEST_DECIMATE( qname, decname, block, w, ac ) \
+#define TEST_DECIMATE( qname, decname, block, w, ac, thresh ) \
     if( qf_a.decname != qf_ref.decname ) \
     { \
         set_func_name( #decname ); \
@@ -1093,7 +1093,7 @@ static int check_quant( int cpu_ref, int cpu_new )
             memcpy( dct2, dct1, w*w*2 ); \
             result_c = call_c1( qf_c.decname, (void*)dct2 ); \
             result_a = call_a1( qf_a.decname, (void*)dct2 ); \
-            if( result_c != result_a ) \
+            if( X264_MIN(result_c,thresh) != X264_MIN(result_a,thresh) ) \
             { \
                 ok = 0; \
                 fprintf( stderr, #decname ": [FAILED]\n" ); \
@@ -1104,9 +1104,9 @@ static int check_quant( int cpu_ref, int cpu_new )
         } \
     }
 
-    TEST_DECIMATE( quant_8x8, decimate_score64, CQM_8IY, 8, 0 );
-    TEST_DECIMATE( quant_4x4, decimate_score16, CQM_4IY, 4, 0 );
-    TEST_DECIMATE( quant_4x4, decimate_score15, CQM_4IY, 4, 1 );
+    TEST_DECIMATE( quant_8x8, decimate_score64, CQM_8IY, 8, 0, 6 );
+    TEST_DECIMATE( quant_4x4, decimate_score16, CQM_4IY, 4, 0, 6 );
+    TEST_DECIMATE( quant_4x4, decimate_score15, CQM_4IY, 4, 1, 7 );
     report( "decimate_score :" );
 
     return ret;