[x265] [PATCH 2 of 2] asm: AVX2 version of dequant_normal, improve 9.3k Cycles -> 4.2k Cycles

Sat Sep 6 02:36:28 CEST 2014

# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1409963778 25200
# Node ID fa6c8db15be326900905d3a36e59b4411de1af11
# Parent  c4dd39c9ad0b96fbf520f399de41e1e9b4b77c72
asm: AVX2 version of dequant_normal, improve 9.3k Cycles -> 4.2k Cycles

diff -r c4dd39c9ad0b -r fa6c8db15be3 source/common/x86/asm-primitives.cpp

--- a/source/common/x86/asm-primitives.cpp	Fri Sep 05 17:36:01 2014 -0700
+++ b/source/common/x86/asm-primitives.cpp	Fri Sep 05 17:36:18 2014 -0700
@@ -1442,6 +1442,7 @@
     {
         p.dct[DCT_4x4] = x265_dct4_avx2;
         p.nquant = x265_nquant_avx2;
+        p.dequant_normal = x265_dequant_normal_avx2;
     }
     /* at HIGH_BIT_DEPTH, pixel == short so we can reuse a number of primitives */
     for (int i = 0; i < NUM_LUMA_PARTITIONS; i++)
@@ -1739,6 +1740,7 @@
 
         p.dct[DCT_4x4] = x265_dct4_avx2;
         p.nquant = x265_nquant_avx2;
+        p.dequant_normal = x265_dequant_normal_avx2;
     }
 #endif // if HIGH_BIT_DEPTH
 }
diff -r c4dd39c9ad0b -r fa6c8db15be3 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm	Fri Sep 05 17:36:01 2014 -0700
+++ b/source/common/x86/const-a.asm	Fri Sep 05 17:36:18 2014 -0700
@@ -89,6 +89,7 @@
 const pd_1024,     times 4 dd 1024
 const pd_2048,     times 4 dd 2048
 const pd_ffff,     times 4 dd 0xffff
+const pd_32767,    times 4 dd 32767
 const pd_n32768,   times 4 dd 0xffff8000
 const pw_ff00,     times 8 dw 0xff00
 
diff -r c4dd39c9ad0b -r fa6c8db15be3 source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h	Fri Sep 05 17:36:01 2014 -0700
+++ b/source/common/x86/pixel-util.h	Fri Sep 05 17:36:18 2014 -0700
@@ -48,6 +48,7 @@
 uint32_t x265_nquant_sse4(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
 uint32_t x265_nquant_avx2(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
 void x265_dequant_normal_sse4(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift);
+void x265_dequant_normal_avx2(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift);
 int x265_count_nonzero_ssse3(const int16_t *quantCoeff, int numCoeff);
 
 void x265_weight_pp_sse4(pixel *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
diff -r c4dd39c9ad0b -r fa6c8db15be3 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Fri Sep 05 17:36:01 2014 -0700
+++ b/source/common/x86/pixel-util8.asm	Fri Sep 05 17:36:18 2014 -0700
@@ -54,6 +54,8 @@
 cextern pw_00ff
 cextern pw_2000
 cextern pw_pixel_max
+cextern pd_32767
+cextern pd_n32768
 
 ;-----------------------------------------------------------------------------
 ; void calcrecon(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred)
@@ -1079,6 +1081,69 @@
     RET


+;-----------------------------------------------------------------------------
+; void dequant_normal(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift)
+;
+; coef[i] = clip3(-32768, 32767, (quantCoef[i] * scale + (1 << (shift - 1))) >> shift)
+;
+; In:  r0 = quantCoef, r1 = coef, r2d = num, r3d = scale, r4d = shift
+; Assumes (not checked here; confirm against callers):
+;   - num is a non-zero multiple of 16 (16 coefficients per iteration, no zero-trip test)
+;   - coef is 16-byte aligned (stored with mova)
+;   - after the HIGH_BIT_DEPTH adjustment, scale <= 32767 and 1 <= shift <= 15, so that
+;     scale and the rounding term both fit in the signed words consumed by pmaddwd
+;-----------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal dequant_normal, 5,5,7
+    vpbroadcastd    m2, [pw_1]          ; m2 = word [1]
+    vpbroadcastd    m5, [pd_32767]      ; m5 = dword [32767]  (upper clip bound)
+    vpbroadcastd    m6, [pd_n32768]     ; m6 = dword [-32768] (lower clip bound)
+%if HIGH_BIT_DEPTH
+    ; pmaddwd treats scale as a signed word: when it is too large, drop two
+    ; bits from scale and compensate by shifting two bits less
+    cmp             r3d, 32767
+    jle            .skip
+    shr             r3d, 2
+    sub             r4d, 2
+.skip:
+%endif
+    movd            xm0, r4d            ; m0 = shift
+    add             r4d, -1+16          ; bit index of (1 << (shift - 1)) in the high word
+    bts             r3d, r4d            ; r3d = [add : scale], add = 1 << (shift - 1)
+    movd            xm1, r3d            ; AVX2 vpbroadcastd cannot take a GPR source
+    vpbroadcastd    m1, xm1             ; m1 = dword [add scale]
+
+    ; m0 = shift
+    ; m1 = word [scale, add] pairs
+    ; m2 = word [1]
+    shr             r2d, 4              ; r2d = number of 16-coefficient iterations
+.loop:
+    movu            m3, [r0]            ; 16 x int16 quantCoef
+    punpckhwd       m4, m3, m2          ; interleave with 1 -> word pairs [coef, 1]
+    punpcklwd       m3, m2
+    pmaddwd         m3, m1              ; m3 = dword (clipQCoef * scale + add)
+    pmaddwd         m4, m1
+    psrad           m3, xm0
+    psrad           m4, xm0
+    pminsd          m3, m5              ; clip to [-32768, 32767]
+    pmaxsd          m3, m6
+    pminsd          m4, m5
+    pmaxsd          m4, m6
+    ; punpck{l,h}wd work per 128-bit lane, so m3 holds coefficients 0-3 / 8-11
+    ; and m4 holds 4-7 / 12-15: store lane by lane to restore linear order
+    mova            [r1 + 0 * mmsize/2], xm3
+    mova            [r1 + 1 * mmsize/2], xm4
+    vextracti128    [r1 + 2 * mmsize/2], m3, 1
+    vextracti128    [r1 + 3 * mmsize/2], m4, 1
+
+    add             r0, mmsize
+    add             r1, mmsize * 2
+
+    dec             r2d
+    jnz            .loop
+    RET
+
+
 ;-----------------------------------------------------------------------------
 ; int count_nonzero(const int16_t *quantCoeff, int numCoeff);
 ;-----------------------------------------------------------------------------