[x265] [PATCH 2 of 2] asm: avx2 code for dequant_scaling, improved 11097c->6860c, 38% over SSE4

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Thu Jun 18 07:03:02 CEST 2015


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1434602517 -19800
#      Thu Jun 18 10:11:57 2015 +0530
# Node ID 2b69ad2ac441d0782fef529d5c412edd43a0093e
# Parent  b977f03d9f0fb0811facc9faf926668a031b3105
asm: avx2 code for dequant_scaling, improved 11097c->6860c, 38% over SSE4

diff -r b977f03d9f0f -r 2b69ad2ac441 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Jun 17 17:45:35 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu Jun 18 10:11:57 2015 +0530
@@ -1460,6 +1460,7 @@
         p.quant = PFX(quant_avx2);
         p.nquant = PFX(nquant_avx2);
         p.dequant_normal  = PFX(dequant_normal_avx2);
+        p.dequant_scaling = PFX(dequant_scaling_avx2);
         p.dst4x4 = PFX(dst4_avx2);
         p.idst4x4 = PFX(idst4_avx2);
         p.denoiseDct = PFX(denoise_dct_avx2);
@@ -2768,6 +2769,7 @@
         p.quant = PFX(quant_avx2);
         p.nquant = PFX(nquant_avx2);
         p.dequant_normal = PFX(dequant_normal_avx2);
+        p.dequant_scaling = PFX(dequant_scaling_avx2);
 
         p.cu[BLOCK_16x16].calcresidual = PFX(getResidual16_avx2);
         p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_avx2);
diff -r b977f03d9f0f -r 2b69ad2ac441 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Wed Jun 17 17:45:35 2015 +0530
+++ b/source/common/x86/pixel-util8.asm	Thu Jun 18 10:11:57 2015 +0530
@@ -969,6 +969,72 @@
 .end:
     RET
 
+;----------------------------------------------------------------------------------------------------------------------
+;void dequant_scaling(const int16_t* src, const int32_t* dequantCoef, int16_t* dst, int num, int mcqp_miper, int shift)
+;----------------------------------------------------------------------------------------------------------------------
+INIT_YMM avx2                   ; AVX2 variant: mN are 256-bit registers, xmN their low 128 bits
+cglobal dequant_scaling, 6,6,6  ; r0=src, r1=dequantCoef, r2=dst, r3=num, r4=per, r5=shift; uses m0-m5
+    add         r5d, 4          ; shift += 4
+    shr         r3d, 4          ; r3d = num/16 = loop count (each iteration handles 16 coefficients)
+    cmp         r5d, r4d        ; signed compare of (shift + 4) against per (5th arg, "mcqp_miper" in the prototype)
+    jle         .skip           ; shift <= per: no rounding/right shift, take the left-shift path
+    sub         r5d, r4d        ; r5d = shift - per (> 0 on this path)
+    mova        m0, [pd_1]       ; pd_1 is defined outside this hunk; presumably packed dword 1s, 32-byte aligned - confirm
+    movd        xm1, r5d         ; xm1 = right-shift count (shift - per)
+    dec         r5d
+    movd        xm2, r5d         ; xm2 = shift - per - 1
+    pslld       m0, xm2          ; m0 = rounding term: pd_1 << (shift - per - 1) in every dword
+
+.part0:                         ; dst[i] = sat16((src[i] * dequantCoef[i] + round) >> (shift - per)); assumes num is a nonzero multiple of 16 - TODO confirm
+    pmovsxwd    m2, [r0]        ; m2 = src[0..7] sign-extended int16 -> int32
+    pmovsxwd    m4, [r0 + 16]   ; m4 = src[8..15] sign-extended int16 -> int32
+    movu        m3, [r1]        ; m3 = dequantCoef[0..7] (unaligned load)
+    movu        m5, [r1 + 32]   ; m5 = dequantCoef[8..15]
+    pmulld      m2, m3          ; low 32 bits of each product
+    pmulld      m4, m5
+    paddd       m2, m0          ; add rounding term
+    paddd       m4, m0
+    psrad       m2, xm1         ; arithmetic right shift by (shift - per)
+    psrad       m4, xm1
+    packssdw    m2, m4          ; signed-saturate to int16; packs per 128-bit lane -> c0-3,c8-11 | c4-7,c12-15
+    vpermq      m2, m2, 11011000b ; reorder qwords 0,2,1,3 to restore linear order c0..c15
+    movu        [r2], m2        ; store 16 int16 results
+
+    add         r0, 32          ; advance src by 16 * int16
+    add         r1, 64          ; advance dequantCoef by 16 * int32
+    add         r2, 32          ; advance dst by 16 * int16
+    dec         r3d
+    jnz         .part0          ; loop while blocks remain
+    jmp         .end
+
+.skip:                          ; reached when shift <= per
+    sub         r4d, r5d        ; per - shift
+    movd        xm0, r4d        ; xm0 = left-shift count (>= 0)
+
+.part1:                         ; dst[i] = sat16(sat16(src[i] * dequantCoef[i]) << (per - shift)); same num assumption as above
+    pmovsxwd    m2, [r0]        ; m2 = src[0..7] sign-extended int16 -> int32
+    pmovsxwd    m4, [r0 + 16]   ; m4 = src[8..15] sign-extended int16 -> int32
+    movu        m3, [r1]        ; m3 = dequantCoef[0..7]
+    movu        m5, [r1 + 32]   ; m5 = dequantCoef[8..15]
+    pmulld      m2, m3          ; low 32 bits of each product
+    pmulld      m4, m5
+    packssdw    m2, m4          ; first saturation to int16, before the shift; lanes hold c0-3,c8-11 | c4-7,c12-15
+    vextracti128 xm4, m2, 1     ; xm4 = upper lane (c4-7, c12-15)
+    pmovsxwd    m1, xm2         ; m1 = c0-3, c8-11 widened back to int32
+    pmovsxwd    m2, xm4         ; m2 = c4-7, c12-15 widened back to int32
+    pslld       m1, xm0         ; << (per - shift)
+    pslld       m2, xm0
+    packssdw    m1, m2          ; second saturation; per-lane pack yields c0..c15 in order, so no vpermq is needed
+    movu        [r2], m1        ; store 16 int16 results
+
+    add         r0, 32          ; advance src by 16 * int16
+    add         r1, 64          ; advance dequantCoef by 16 * int32
+    add         r2, 32          ; advance dst by 16 * int16
+    dec         r3d
+    jnz         .part1          ; loop while blocks remain
+.end:                           ; common exit for both paths
+    RET
+
 INIT_YMM avx2
 cglobal dequant_normal, 5,5,7
     vpbroadcastd    m2, [pw_1]          ; m2 = word [1]


More information about the x265-devel mailing list