[x265] [PATCH 3 of 3] asm: avx2 version of nquant(), improve 11k cycles -> 7k cycles

Thu Sep 4 01:37:49 CEST 2014

# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1409787455 25200
# Node ID 8d109ec524a7767925084049a77668d004a5b319
# Parent  4ca9e972f48cb4530ca7181ad7cec351568a99b3
asm: avx2 version of nquant(), improve 11k cycles -> 7k cycles

diff -r 4ca9e972f48c -r 8d109ec524a7 source/common/dct.cpp

--- a/source/common/dct.cpp	Wed Sep 03 16:36:59 2014 -0700
+++ b/source/common/dct.cpp	Wed Sep 03 16:37:35 2014 -0700
@@ -803,7 +803,7 @@
 
     X265_CHECK((numCoeff % 16) == 0, "number of quant coeff is not multiple of 4x4\n");
     X265_CHECK((uint32_t)add < ((uint32_t)1 << qBits), "2 ^ qBits less than add\n");
-    X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quantCoeff buffer not aligned\n");
+    X265_CHECK(((intptr_t)quantCoeff & 31) == 0, "quantCoeff buffer not aligned\n");
 
     for (int blockpos = 0; blockpos < numCoeff; blockpos++)
     {
diff -r 4ca9e972f48c -r 8d109ec524a7 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Sep 03 16:36:59 2014 -0700
+++ b/source/common/x86/asm-primitives.cpp	Wed Sep 03 16:37:35 2014 -0700
@@ -1432,6 +1432,7 @@
     }
     if (cpuMask & X265_CPU_AVX2)
     {
+        p.nquant = x265_nquant_avx2;
     }
 
     /* at HIGH_BIT_DEPTH, pixel == short so we can reuse a number of primitives */
@@ -1715,6 +1716,7 @@
         p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_avx2;
         p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_avx2;
         p.denoiseDct = x265_denoise_dct_avx2;
+        p.nquant = x265_nquant_avx2;
     }
 #endif // if HIGH_BIT_DEPTH
 }
diff -r 4ca9e972f48c -r 8d109ec524a7 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm	Wed Sep 03 16:36:59 2014 -0700
+++ b/source/common/x86/const-a.asm	Wed Sep 03 16:37:35 2014 -0700
@@ -86,7 +86,8 @@
 const pd_1024,     times 4 dd 1024
 const pd_2048,     times 4 dd 2048
 const pd_ffff,     times 4 dd 0xffff
-const pd_n32768,   times 4 dd 0xffff8000
+const pd_32767,    times 8 dd 32767
+const pd_n32768,   times 8 dd 0xffff8000
 const pw_ff00,     times 8 dw 0xff00
 
 const multi_2Row,  dw 1, 2, 3, 4, 1, 2, 3, 4
diff -r 4ca9e972f48c -r 8d109ec524a7 source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h	Wed Sep 03 16:36:59 2014 -0700
+++ b/source/common/x86/pixel-util.h	Wed Sep 03 16:37:35 2014 -0700
@@ -46,6 +46,7 @@
 
 uint32_t x265_quant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff);
 uint32_t x265_nquant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *qCoef, int qBits, int add, int numCoeff);
+uint32_t x265_nquant_avx2(int32_t *coef, int32_t *quantCoeff, int32_t *qCoef, int qBits, int add, int numCoeff);
 void x265_dequant_normal_sse4(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift);
 int x265_count_nonzero_ssse3(const int32_t *quantCoeff, int numCoeff);
 
diff -r 4ca9e972f48c -r 8d109ec524a7 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Wed Sep 03 16:36:59 2014 -0700
+++ b/source/common/x86/pixel-util8.asm	Wed Sep 03 16:37:35 2014 -0700
@@ -54,6 +54,8 @@
 cextern pw_00ff
 cextern pw_2000
 cextern pw_pixel_max
+cextern pd_32767
+cextern pd_n32768
 
 ;-----------------------------------------------------------------------------
 ; void calcrecon(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred)
@@ -991,6 +993,71 @@
     sub         eax, r4d        ; numSig
     RET

+
+;-----------------------------------------------------------------------------
+; uint32_t nquant(int32_t *coef, int32_t *quantCoeff, int32_t *qCoef, int qBits, int add, int numCoeff)
+;-----------------------------------------------------------------------------
+; Returns numSig = numCoeff - (number of zero levels written to qCoef).
+; Handles 16 coefficients per iteration; numCoeff must be a non-zero multiple of 16.
+INIT_YMM avx2
+cglobal nquant, 3,5,8
+    ; AVX2 vpbroadcastd takes only an xmm or memory source, and r4m is a GPR
+    ; when the ABI passes argument 4 in a register (UNIX64), so load via xm6
+    movd        xm6, r4m
+    vpbroadcastd m6, xm6            ; m6 = add
+    mov         r4d, r5m
+    movd        xm5, r3m            ; m5 = qbits
+    mov         r3d, r4d            ; r3 = numCoeff
+    shr         r4d, 4              ; r4 = loop count (16 coeffs per pass)
+    pxor        m7, m7              ; m7 = numZero
+    pxor        m4, m4              ; m4 = 0 (loop invariant)
+
+.loop:
+    movu        m0, [r0]            ; m0 = level
+    pabsd       m1, m0
+    pmulld      m1, [r1]            ; m1 = tmpLevel1
+    paddd       m1, m6
+    psrad       m1, xm5             ; m1 = level1
+    psignd      m1, m0              ; restore sign
+
+    movu        m2, [r0 + mmsize]   ; m2 = level
+    pabsd       m3, m2
+    pmulld      m3, [r1 + mmsize]   ; m3 = tmpLevel1
+    paddd       m3, m6
+    psrad       m3, xm5             ; m3 = level1
+    psignd      m3, m2              ; restore sign
+
+    add         r0, 2 * mmsize
+    add         r1, 2 * mmsize
+
+    packssdw    m0, m1, m3          ; 16 levels as words, only used for the zero test
+
+    pminsd      m2, m1, [pd_32767]  ; clamp to [-32768, 32767] before storing
+    pmaxsd      m2, m2, [pd_n32768]
+    movu        [r2], m2
+
+    pminsd      m2, m3, [pd_32767]
+    pmaxsd      m2, m2, [pd_n32768]
+    movu        [r2 + mmsize], m2
+    add         r2, 2 * mmsize
+
+    pcmpeqw     m0, m4              ; 0xFFFF (-1) where level == 0
+    psubw       m7, m0              ; numZero += 1 per zero level
+
+    dec         r4d
+    jnz         .loop
+
+    ; horizontal sum of the 16 word counters; packuswb saturates at 255, so the
+    ; result is exact only while each folded counter <= 255 (numCoeff <= 2032)
+    vextracti128 xm0, m7, 1
+    paddw       xm7, xm0
+    packuswb    xm7, xm7
+    psadbw      xm7, xm4
+    mov         eax, r3d
+    movd        r4d, xm7
+    sub         eax, r4d        ; numSig
+    RET
+
 ;-----------------------------------------------------------------------------
 ; void dequant_normal(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift)
 ;-----------------------------------------------------------------------------