[x265] [PATCH] asm: 16bpp code for quant and dequant_normal

murugan at multicorewareinc.com murugan at multicorewareinc.com
Mon Feb 17 13:29:55 CET 2014


# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1392639418 -19800
#      Mon Feb 17 17:46:58 2014 +0530
# Node ID 85691d6c02e6f7323194ab1b054149dbf940a0d3
# Parent  85be97320422ca9682272a81f31733b3884efa02
asm: 16bpp code for quant and dequant_normal

diff -r 85be97320422 -r 85691d6c02e6 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Feb 17 17:46:12 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp	Mon Feb 17 17:46:58 2014 +0530
@@ -825,8 +825,10 @@
     }
     if (cpuMask & X265_CPU_SSE4)
     {
+
+        p.quant = x265_quant_sse4;
+        p.dequant_normal = x265_dequant_normal_sse4;
         p.cvt16to32_shl = x265_cvt16to32_shl_sse4;
-
         p.intra_pred[BLOCK_4x4][0] = x265_intra_pred_planar4_sse4;
         p.intra_pred[BLOCK_8x8][0] = x265_intra_pred_planar8_sse4;
         p.intra_pred[BLOCK_16x16][0] = x265_intra_pred_planar16_sse4;
diff -r 85be97320422 -r 85691d6c02e6 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Mon Feb 17 17:46:12 2014 +0530
+++ b/source/common/x86/pixel-util8.asm	Mon Feb 17 17:46:58 2014 +0530
@@ -1071,20 +1071,19 @@
     mova        m4, [c_d_4]     ; m4 = [4 4 4 4]
 .loop:
     ; 4 coeff
-    movu        m0, [r0]        ; m1 = level
+    movu        m0, [r0]        ; m0 = level
     pxor        m1, m1
-    pcmpgtd     m1, m0          ; m2 = sign
-    movu        m2, [r1]        ; m3 = qcoeff
+    pcmpgtd     m1, m0          ; m1 = sign
+    movu        m2, [r1]        ; m2 = qcoeff
     pabsd       m0, m0
-    pmulld      m0, m2          ; m1 = tmpLevel1
+    pmulld      m0, m2          ; m0 = tmpLevel1
     paddd       m2, m0, addVec
-    psrad       m2, qbits       ; m3 = level1
+    psrad       m2, qbits       ; m2 = level1
     paddd       m7, m2
     pslld       m3, m2, qbits
     psubd       m0, m3
-    psrad       m0, qbits8      ; m1 = deltaU1
+    psrad       m0, qbits8      ; m0 = deltaU1
     movu        [r2], m0
-
     pxor        m0, m0
     pcmpeqd     m0, m2          ; m0 = mask4
     pand        m5, m0
@@ -1097,22 +1096,20 @@
     packssdw    m2, m2
     pmovsxwd    m2, m2
     movu        [r3], m2
-
     ; 4 coeff
-    movu        m0, [r0 + 16]   ; m1 = level
+    movu        m0, [r0 + 16]   ; m0 = level
     pxor        m1, m1
-    pcmpgtd     m1, m0          ; m2 = sign
-    movu        m2, [r1 + 16]   ; m3 = qcoeff
+    pcmpgtd     m1, m0          ; m1 = sign
+    movu        m2, [r1 + 16]   ; m2 = qcoeff
     pabsd       m0, m0
-    pmulld      m0, m2          ; m1 = tmpLevel1
+    pmulld      m0, m2          ; m0 = tmpLevel1
     paddd       m2, m0, addVec
-    psrad       m2, qbits       ; m3 = level1
+    psrad       m2, qbits       ; m2 = level1
     paddd       m7, m2
     pslld       m3, m2, qbits
     psubd       m0, m3
-    psrad       m0, qbits8      ; m1 = deltaU1
+    psrad       m0, qbits8      ; m0 = deltaU1
     movu        [r2 + 16], m0
-
     pxor        m0, m0
     pcmpeqd     m0, m2          ; m0 = mask4
     pand        m5, m0
@@ -1154,8 +1151,11 @@
 ; void dequant_normal(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal dequant_normal, 2,5,8
-    movd        m1, r3m             ; m1 = word [scale]
+cglobal dequant_normal, 4,5,5
+    movd        m1, r3             ; m1 = word [scale]
+    cmp         r3d, 255
+    jle         .skip
+    psrld       m1, 2
     mov         r4d, r4m
     movd        m0, r4d             ; m0 = shift
     xor         r3d, r3d
@@ -1174,6 +1174,45 @@
     movu        m3, [r0]
     movu        m4, [r0 + 16]
     packssdw    m3, m4              ; m3 = clipQCoef
+    psllw       m3, 2
+    punpckhwd   m4, m3, m2
+    punpcklwd   m3, m2
+    pmaddwd     m3, m1              ; m3 = dword (clipQCoef * scale + add)
+    pmaddwd     m4, m1
+    psrad       m3, m0
+    psrad       m4, m0
+    packssdw    m3, m3              ; OPT_ME: store must be 32 bits
+    pmovsxwd    m3, m3
+    packssdw    m4, m4
+    pmovsxwd    m4, m4
+    movu        [r1], m3
+    movu        [r1 + 16], m4
+
+    add         r0, 32
+    add         r1, 32
+
+    sub         r2d, 8
+    jnz        .loop
+    jz         .end
+
+.skip:
+    mov         r4d, r4m
+    movd        m0, r4d             ; m0 = shift
+    xor         r3d, r3d
+    dec         r4d                 ; r4d = shift - 1
+    bts         r3d, r4d            ; r3d = 1 << (shift - 1) = add
+    movd        m2, r3d             ; m2 = add
+    punpcklwd   m1, m2              ; interleave low words of scale (m1) and add (m2)
+    pshufd      m1, m1, 0           ; m1 = dword [add scale]
+    mova        m2, [pw_1]
+    mov         r2d, r2m
+    ; m0 = shift
+    ; m1 = word pair [add scale] replicated in every dword
+    ; m2 = word [1]
+.sloop:
+    movu        m3, [r0]
+    movu        m4, [r0 + 16]
+    packssdw    m3, m4              ; m3 = clipQCoef
     punpckhwd   m4, m3, m2
     punpcklwd   m3, m2
     pmaddwd     m3, m1              ; m3 = dword (clipQCoef * scale + add)
@@ -1191,7 +1230,8 @@
     add         r1, 32
 
     sub         r2d, 8
-    jnz        .loop
+    jnz        .sloop
+.end:
     RET
 
 ;-----------------------------------------------------------------------------------------------------------------------------------------------


More information about the x265-devel mailing list