[x265] asm: fix dequant_normal

Satoshi Nakagawa nakagawa424 at oki.com
Sat Aug 30 08:01:11 CEST 2014


> How about removing the '#if...'?
> The asm code doesn't check it.

Added '%if...' to the asm code :)


# HG changeset patch
# User Satoshi Nakagawa <nakagawa424 at oki.com>
# Date 1409378187 -32400
#      Sat Aug 30 14:56:27 2014 +0900
# Node ID c4f15840feb443f8c38ba58b52ef5ba6d518e626
# Parent  4e2d9ac6d489e82e70544d626c89964ee653c452
asm: fix dequant_normal

diff -r 4e2d9ac6d489 -r c4f15840feb4 source/common/dct.cpp
--- a/source/common/dct.cpp	Fri Aug 29 11:12:49 2014 +0200
+++ b/source/common/dct.cpp	Sat Aug 30 14:56:27 2014 +0900
@@ -720,7 +720,9 @@
 
 void dequant_normal_c(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift)
 {
-#if !HIGH_BIT_DEPTH
+#if HIGH_BIT_DEPTH
+    X265_CHECK(scale < 32768 || ((scale & 3) == 0 && shift > 2), "dequant invalid scale %d\n", scale);
+#else
     // NOTE: maximum of scale is (72 * 256)
     X265_CHECK(scale < 32768, "dequant invalid scale %d\n", scale);
 #endif
diff -r 4e2d9ac6d489 -r c4f15840feb4 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Fri Aug 29 11:12:49 2014 +0200
+++ b/source/common/x86/pixel-util8.asm	Sat Aug 30 14:56:27 2014 +0900
@@ -1005,23 +1005,23 @@
 ; void dequant_normal(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal dequant_normal, 4,5,5
-    movd        m1, r3             ; m1 = word [scale]
+cglobal dequant_normal, 5,5,5
+    movd        m1, r3              ; m1 = word [scale]
+    mova        m2, [pw_1]
+%if HIGH_BIT_DEPTH
     cmp         r3d, 32767
     jle         .skip
-
     psrld       m1, 2
-    mov         r4d, r4m
+    sub         r4d, 2
+.skip:
+%endif
     movd        m0, r4d             ; m0 = shift
     xor         r3d, r3d
     dec         r4d
     bts         r3d, r4d
-    movd        m2, r3d
-    punpcklwd   m1, m2
+    movd        m3, r3d
+    punpcklwd   m1, m3
     pshufd      m1, m1, 0           ; m1 = dword [add scale]
-    mova        m2, [pw_1]
-    mov         r2d, r2m
-
     ; m0 = shift
     ; m1 = scale
     ; m2 = word [1]
@@ -1029,45 +1029,6 @@
     movu        m3, [r0]
     movu        m4, [r0 + 16]
     packssdw    m3, m4              ; m3 = clipQCoef
-    psllw       m3, 2
-    punpckhwd   m4, m3, m2
-    punpcklwd   m3, m2
-    pmaddwd     m3, m1              ; m3 = dword (clipQCoef * scale + add)
-    pmaddwd     m4, m1
-    psrad       m3, m0
-    psrad       m4, m0
-    packssdw    m3, m3              ; OPT_ME: store must be 32 bits
-    pmovsxwd    m3, m3
-    packssdw    m4, m4
-    pmovsxwd    m4, m4
-    movu        [r1], m3
-    movu        [r1 + 16], m4
-
-    add         r0, 32
-    add         r1, 32
-
-    sub         r2d, 8
-    jnz        .loop
-    jz         .end
-
-.skip:
-    mov         r4d, r4m
-    movd        m0, r4d             ; m0 = shift
-    xor         r3d, r3d
-    dec         r4d
-    bts         r3d, r4d
-    movd        m2, r3d
-    punpcklwd   m1, m2
-    pshufd      m1, m1, 0           ; m1 = dword [add scale]
-    mova        m2, [pw_1]
-    mov         r2d, r2m
-    ; m0 = shift
-    ; m1 = scale
-    ; m2 = word [1]
-.sloop:
-    movu        m3, [r0]
-    movu        m4, [r0 + 16]
-    packssdw    m3, m4              ; m3 = clipQCoef
     punpckhwd   m4, m3, m2
     punpcklwd   m3, m2
     pmaddwd     m3, m1              ; m3 = dword (clipQCoef * scale + add)
@@ -1085,8 +1046,7 @@
     add         r1, 32
 
     sub         r2d, 8
-    jnz        .sloop
-.end:
+    jnz        .loop
     RET
 
 


More information about the x265-devel mailing list