[x265] asm: fix dequant_normal

Satoshi Nakagawa nakagawa424 at oki.com
Sat Aug 30 06:39:07 CEST 2014


# HG changeset patch
# User Satoshi Nakagawa <nakagawa424 at oki.com>
# Date 1409373356 -32400
#      Sat Aug 30 13:35:56 2014 +0900
# Node ID 9b5f0c75d052e963b0a413f341a74036141b3675
# Parent  4e2d9ac6d489e82e70544d626c89964ee653c452
asm: fix dequant_normal

diff -r 4e2d9ac6d489 -r 9b5f0c75d052 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Fri Aug 29 11:12:49 2014 +0200
+++ b/source/common/x86/pixel-util8.asm	Sat Aug 30 13:35:56 2014 +0900
@@ -1005,23 +1005,21 @@
 ; void dequant_normal(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal dequant_normal, 4,5,5
-    movd        m1, r3             ; m1 = word [scale]
+cglobal dequant_normal, 5,5,5
     cmp         r3d, 32767
+    movd        m1, r3              ; m1 = word [scale]
+    mova        m2, [pw_1]
     jle         .skip
-
     psrld       m1, 2
-    mov         r4d, r4m
+    sub         r4d, 2
+.skip:
     movd        m0, r4d             ; m0 = shift
     xor         r3d, r3d
     dec         r4d
     bts         r3d, r4d
-    movd        m2, r3d
-    punpcklwd   m1, m2
+    movd        m3, r3d
+    punpcklwd   m1, m3
     pshufd      m1, m1, 0           ; m1 = dword [add scale]
-    mova        m2, [pw_1]
-    mov         r2d, r2m
-
     ; m0 = shift
     ; m1 = scale
     ; m2 = word [1]
@@ -1029,45 +1027,6 @@
     movu        m3, [r0]
     movu        m4, [r0 + 16]
     packssdw    m3, m4              ; m3 = clipQCoef
-    psllw       m3, 2
-    punpckhwd   m4, m3, m2
-    punpcklwd   m3, m2
-    pmaddwd     m3, m1              ; m3 = dword (clipQCoef * scale + add)
-    pmaddwd     m4, m1
-    psrad       m3, m0
-    psrad       m4, m0
-    packssdw    m3, m3              ; OPT_ME: store must be 32 bits
-    pmovsxwd    m3, m3
-    packssdw    m4, m4
-    pmovsxwd    m4, m4
-    movu        [r1], m3
-    movu        [r1 + 16], m4
-
-    add         r0, 32
-    add         r1, 32
-
-    sub         r2d, 8
-    jnz        .loop
-    jz         .end
-
-.skip:
-    mov         r4d, r4m
-    movd        m0, r4d             ; m0 = shift
-    xor         r3d, r3d
-    dec         r4d
-    bts         r3d, r4d
-    movd        m2, r3d
-    punpcklwd   m1, m2
-    pshufd      m1, m1, 0           ; m1 = dword [add scale]
-    mova        m2, [pw_1]
-    mov         r2d, r2m
-    ; m0 = shift
-    ; m1 = scale
-    ; m2 = word [1]
-.sloop:
-    movu        m3, [r0]
-    movu        m4, [r0 + 16]
-    packssdw    m3, m4              ; m3 = clipQCoef
     punpckhwd   m4, m3, m2
     punpcklwd   m3, m2
     pmaddwd     m3, m1              ; m3 = dword (clipQCoef * scale + add)
@@ -1085,8 +1044,7 @@
     add         r1, 32
 
     sub         r2d, 8
-    jnz        .sloop
-.end:
+    jnz        .loop
     RET
 
 


More information about the x265-devel mailing list