[x265] [PATCH 1 of 2] asm: dequant_scaling asm code, improved 12668c->11097c, 12% over intrinsic

Thu Jun 18 07:03:01 CEST 2015

# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1434543335 -19800
#      Wed Jun 17 17:45:35 2015 +0530
# Node ID b977f03d9f0fb0811facc9faf926668a031b3105
# Parent  d6c32960b5df5b150569d03eb985d9772e494d13
asm: dequant_scaling asm code, improved 12668c->11097c, 12% over intrinsic

diff -r d6c32960b5df -r b977f03d9f0f source/common/x86/asm-primitives.cpp

--- a/source/common/x86/asm-primitives.cpp	Wed Jun 17 22:16:03 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Jun 17 17:45:35 2015 +0530
@@ -1104,6 +1104,7 @@
         p.quant = PFX(quant_sse4);
         p.nquant = PFX(nquant_sse4);
         p.dequant_normal = PFX(dequant_normal_sse4);
+        p.dequant_scaling = PFX(dequant_scaling_sse4);
 
         // p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_sse4); fails tests
         ALL_LUMA_PU(satd, pixel_satd, sse4);
@@ -2396,6 +2397,7 @@
         p.quant = PFX(quant_sse4);
         p.nquant = PFX(nquant_sse4);
         p.dequant_normal = PFX(dequant_normal_sse4);
+        p.dequant_scaling = PFX(dequant_scaling_sse4);
 
         p.weight_pp = PFX(weight_pp_sse4);
         p.weight_sp = PFX(weight_sp_sse4);
diff -r d6c32960b5df -r b977f03d9f0f source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h	Wed Jun 17 22:16:03 2015 +0530
+++ b/source/common/x86/pixel-util.h	Wed Jun 17 17:45:35 2015 +0530
@@ -31,6 +31,7 @@
     uint32_t PFX(quant_ ## cpu(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)); \
     uint32_t PFX(nquant_ ## cpu(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff)); \
     void PFX(dequant_normal_ ## cpu(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift)); \
+    void PFX(dequant_scaling_## cpu(const int16_t* src, const int32_t* dequantCoef, int16_t* dst, int num, int mcqp_miper, int shift)); \
     void PFX(weight_pp_ ## cpu(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)); \
     void PFX(weight_sp_ ## cpu(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)); \
     void PFX(scale1D_128to64_ ## cpu(pixel*, const pixel*)); \
diff -r d6c32960b5df -r b977f03d9f0f source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Wed Jun 17 22:16:03 2015 +0530
+++ b/source/common/x86/pixel-util8.asm	Wed Jun 17 17:45:35 2015 +0530
@@ -904,6 +904,70 @@
     jnz        .loop
     RET
 
+;----------------------------------------------------------------------------------------------------------------------
+;void dequant_scaling(const int16_t* src, const int32_t* dequantCoef, int16_t* dst, int num, int mcqp_miper, int shift)
+;----------------------------------------------------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal dequant_scaling, 6,6,6
+    add         r5d, 4          ; shift += 4
+    shr         r3d, 3          ; num/8: each loop iteration below handles 8 coefficients
+    cmp         r5d, r4d
+    jle         .skip           ; shift <= per (mcqp_miper): take the left-shift path
+    sub         r5d, r4d        ; r5d = shift - per, strictly positive on this path
+    mova        m0, [pd_1]      ; pd_1 is defined outside this hunk; presumably packed dword 1s - confirm
+    movd        m1, r5d         ; shift - per
+    dec         r5d
+    movd        m2, r5d         ; shift - per - 1
+    pslld       m0, m2          ; rounding term: 1 << (shift - per - 1)
+
+.part0:                         ; right-shift path: dst = sat16((src * dequantCoef + round) >> (shift - per))
+    pmovsxwd    m2, [r0]        ; src[0..3] sign-extended from int16 to int32
+    pmovsxwd    m4, [r0 + 8]    ; src[4..7] sign-extended from int16 to int32
+    movu        m3, [r1]        ; dequantCoef[0..3]
+    movu        m5, [r1 + 16]   ; dequantCoef[4..7]
+    pmulld      m2, m3          ; 32-bit products (low 32 bits of each multiply are kept)
+    pmulld      m4, m5
+    paddd       m2, m0          ; add the rounding term
+    paddd       m4, m0
+    psrad       m2, m1          ; arithmetic shift right by (shift - per)
+    psrad       m4, m1
+    packssdw    m2, m4          ; saturate the 8 dwords to int16
+    movu        [r2], m2        ; store 8 results to dst
+
+    add         r0, 16          ; src advances 8 * sizeof(int16_t)
+    add         r1, 32          ; dequantCoef advances 8 * sizeof(int32_t)
+    add         r2, 16          ; dst advances 8 * sizeof(int16_t)
+    dec         r3d             ; NOTE(review): count is tested after the body, so num/8 == 0 is not guarded here
+    jnz         .part0
+    jmp         .end
+
+.skip:
+    sub         r4d, r5d        ; per - shift
+    movd        m0, r4d         ; m0 = left-shift count (>= 0 on this path)
+
+.part1:                         ; left-shift path: dst = sat16(sat16(src * dequantCoef) << (per - shift))
+    pmovsxwd    m2, [r0]        ; src[0..3] sign-extended from int16 to int32
+    pmovsxwd    m4, [r0 + 8]    ; src[4..7] sign-extended from int16 to int32
+    movu        m3, [r1]        ; dequantCoef[0..3]
+    movu        m5, [r1 + 16]   ; dequantCoef[4..7]
+    pmulld      m2, m3          ; 32-bit products (low 32 bits of each multiply are kept)
+    pmulld      m4, m5
+    packssdw    m2, m4          ; first saturation: products clamped to int16 before shifting
+    pmovsxwd    m1, m2          ; widen clamped values 0..3 back to int32
+    psrldq      m2, 8           ; bring clamped values 4..7 down to the low 8 bytes
+    pmovsxwd    m2, m2          ; widen clamped values 4..7 back to int32
+    pslld       m1, m0          ; shift left by (per - shift)
+    pslld       m2, m0
+    packssdw    m1, m2          ; second saturation: shifted values clamped to int16
+    movu        [r2], m1        ; store 8 results to dst
+
+    add         r0, 16          ; src advances 8 * sizeof(int16_t)
+    add         r1, 32          ; dequantCoef advances 8 * sizeof(int32_t)
+    add         r2, 16          ; dst advances 8 * sizeof(int16_t)
+    dec         r3d             ; NOTE(review): count is tested after the body, so num/8 == 0 is not guarded here
+    jnz         .part1
+.end:
+    RET
 
 INIT_YMM avx2
 cglobal dequant_normal, 5,5,7