[x265] [PATCH] copy_cnt 4x4, eliminated move instructions, +1x improvement

praveen at multicorewareinc.com praveen at multicorewareinc.com
Fri Sep 5 08:04:06 CEST 2014


# HG changeset patch
# User Praveen Tiwari
# Date 1409897037 -19800
# Node ID 1847e02a9514b02690cc4b97bae022091bf33424
# Parent  0b3f68d5f1699540c71ed7b75e2b0ca965fad82f
copy_cnt 4x4, eliminated move instructions, +1x improvement

diff -r 0b3f68d5f169 -r 1847e02a9514 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm	Fri Sep 05 11:20:29 2014 +0530
+++ b/source/common/x86/blockcopy8.asm	Fri Sep 05 11:33:57 2014 +0530
@@ -3953,42 +3953,37 @@
 ; uint32_t copy_cnt(int16_t *dst, int16_t *src, intptr_t stride);
 ;--------------------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal copy_cnt_4, 3,3,5
+cglobal copy_cnt_4, 3,3,3                      ; only m0-m2 are used by the new body
     add         r2d, r2d
-    pxor        m4, m4
+    pxor        m2, m2                         ; m2 = 0, reference for pcmpeqb and psadbw below
 
      ; row 0 & 1
      movh        m0, [r1]
-     movh        m1, [r1 + r2]
-     movh        [r0], m0
-     movh        [r0 + 8], m1
-
-     mova        m2, [r0]
+     movhps      m0, [r1 + r2]                 ; m0 = row 0 (low 8 bytes) | row 1 (high 8 bytes)
+     mova        [r0], m0                      ; one 16-byte store writes both rows to dst, no reload needed
 
      ; row 2 & 3
-     movh        m0, [r1 + r2 * 2]
+     movh        m1, [r1 + r2 * 2]             ; m1 low 8 bytes = row 2
      lea         r2, [r2 * 3]
-     movh        m1, [r1 + r2]
-     movh        [r0 + 16], m0
-     movh        [r0 + 24], m1
- 
-     mova        m0, [r0 + 16]
-     packsswb    m2, m0
-     pcmpeqb     m2, m4
+     movhps      m1, [r1 + r2]                 ; m1 high 8 bytes = row 3 (r2 was tripled by the lea above)
+     mova        [r0 + 16], m1                 ; store rows 2-3 to dst
+
+     packsswb    m0, m1                        ; narrow 16 words to 16 signed-saturated bytes; nonzero stays nonzero
+     pcmpeqb     m0, m2                        ; byte = 0xFF where the coefficient is zero, 0x00 otherwise
 
      ; get count
      ; CHECK_ME: Intel documents said POPCNT is SSE4.2 instruction, but just implement after Nehalem
 %if 0
-     pmovmskb    eax, m2
+     pmovmskb    eax, m0                       ; (disabled path) bit i set where coefficient i is zero
      not         ax
      popcnt      ax, ax
 %else
-     mova        m0, [pb_1]
-     paddb       m2, m0
-     psadbw      m2, m4
-     pshufd      m0, m2, 2
-     paddw       m2, m0
-     movd        eax, m2
+     mova        m1, [pb_1]                    ; pb_1 is defined elsewhere; presumably sixteen bytes of 1 - confirm
+     paddb       m0, m1                        ; assuming pb_1 is all 1s: zero coeff 0xFF+1 -> 0, nonzero 0+1 -> 1
+     psadbw      m0, m2                        ; sum the bytes of each 8-byte half (absolute difference against zero)
+     pshufd      m1, m0, 2                     ; bring the high-half sum down to dword 0 of m1
+     paddw       m0, m1                        ; low word of m0 = sum of both halves
+     movd        eax, m0                       ; return the total in eax
  %endif
      RET
 


More information about the x265-devel mailing list