[x264-devel] x86: Faster mbtree_fix8_unpack

Mon Dec 25 20:39:55 CET 2017

x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Sat Oct  7 11:34:16 2017 +0200| [08476ab1c0a9b741198677731373b173657fa079] | committer: Anton Mitrofanov

x86: Faster mbtree_fix8_unpack

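Multiply by 2^-24 instead of 2^-8 (1/256) in order to eliminate some shifts:
the shuffle already leaves each byte-swapped 16-bit value in the upper half of
its dword, so the smaller multiplier absorbs the extra factor of 2^16 and the
sign-extending psrad instructions are no longer needed. The scalar tail
likewise replaces rol+movsx with a single bswap.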

About 25% faster than before.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=08476ab1c0a9b741198677731373b173657fa079
---

 common/x86/mc-a2.asm | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index e93cfccc..c437f5de 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -66,8 +66,8 @@ mbtree_fix8_unpack_shuf: db -1,-1, 1, 0,-1,-1, 3, 2,-1,-1, 5, 4,-1,-1, 7, 6
                          db -1,-1, 9, 8,-1,-1,11,10,-1,-1,13,12,-1,-1,15,14
 mbtree_fix8_pack_shuf:   db  1, 0, 3, 2, 5, 4, 7, 6, 9, 8,11,10,13,12,15,14
 
-pf_256:    times 4 dd 256.0
-pf_inv256: times 4 dd 0.00390625
+pf_256:         times 4 dd 256.0
+pf_inv16777216: times 4 dd 0x1p-24
 
 pd_16: times 4 dd 16
 
@@ -2589,9 +2589,9 @@ cglobal mbtree_fix8_pack, 3,4
 ;-----------------------------------------------------------------------------
 cglobal mbtree_fix8_unpack, 3,4
 %if mmsize == 32
-    vbroadcastf128 m2, [pf_inv256]
+    vbroadcastf128 m2, [pf_inv16777216]
 %else
-    movaps       m2, [pf_inv256]
+    movaps       m2, [pf_inv16777216]
     mova         m4, [mbtree_fix8_unpack_shuf+16]
 %endif
     mova         m3, [mbtree_fix8_unpack_shuf]
@@ -2612,8 +2612,6 @@ cglobal mbtree_fix8_unpack, 3,4
     pshufb       m0, m1, m3
     pshufb       m1, m4
 %endif
-    psrad        m0, 16 ; sign-extend
-    psrad        m1, 16
     cvtdq2ps     m0, m0
     cvtdq2ps     m1, m1
     mulps        m0, m2
@@ -2627,8 +2625,7 @@ cglobal mbtree_fix8_unpack, 3,4
     jz .end
 .scalar:
     movzx       r3d, word [r1+2*r2+mmsize]
-    rol         r3w, 8
-    movsx       r3d, r3w
+    bswap       r3d
     ; Use 3-arg cvtsi2ss as a workaround for the fact that the instruction has a stupid dependency on
     ; dst which causes terrible performance when used in a loop otherwise. Blame Intel for poor design.
     cvtsi2ss    xm0, xm2, r3d