[x265] [PATCH 4 of 4] asm: avx2 version of quant, improve 16.6k cycles -> 8.4k cycles

Min Chen chenm003 at 163.com
Tue Sep 9 04:39:45 CEST 2014


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1410230374 25200
# Node ID 2a36ebe3ee8c48f6ef3d87f3770a64c5bae70953
# Parent  88dd9f5b8b82cc6cc6689979034c4a0635fef473
asm: avx2 version of quant, improve 16.6k cycles -> 8.4k cycles

diff -r 88dd9f5b8b82 -r 2a36ebe3ee8c source/common/dct.cpp
--- a/source/common/dct.cpp	Mon Sep 08 19:39:14 2014 -0700
+++ b/source/common/dct.cpp	Mon Sep 08 19:39:34 2014 -0700
@@ -773,6 +773,7 @@
 uint32_t quant_c(int32_t* coef, int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)
 {
     X265_CHECK(qBits >= 8, "qBits less than 8\n");
+    X265_CHECK((numCoeff % 16) == 0, "numCoeff must be multiple of 16\n");
     int qBits8 = qBits - 8;
     uint32_t numSig = 0;
 
diff -r 88dd9f5b8b82 -r 2a36ebe3ee8c source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Sep 08 19:39:14 2014 -0700
+++ b/source/common/x86/asm-primitives.cpp	Mon Sep 08 19:39:34 2014 -0700
@@ -1441,6 +1441,7 @@
     if (cpuMask & X265_CPU_AVX2)
     {
         p.dct[DCT_4x4] = x265_dct4_avx2;
+        p.quant = x265_quant_avx2;
         p.nquant = x265_nquant_avx2;
         p.dequant_normal = x265_dequant_normal_avx2;
     }
@@ -1739,6 +1740,7 @@
         p.denoiseDct = x265_denoise_dct_avx2;
 
         p.dct[DCT_4x4] = x265_dct4_avx2;
+        p.quant = x265_quant_avx2;
         p.nquant = x265_nquant_avx2;
         p.dequant_normal = x265_dequant_normal_avx2;
     }
diff -r 88dd9f5b8b82 -r 2a36ebe3ee8c source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm	Mon Sep 08 19:39:14 2014 -0700
+++ b/source/common/x86/const-a.asm	Mon Sep 08 19:39:34 2014 -0700
@@ -76,7 +76,7 @@
 const pw_ppmmppmm, dw 1,1,-1,-1,1,1,-1,-1
 const pw_pmpmpmpm, dw 1,-1,1,-1,1,-1,1,-1
 const pw_pmmpzzzz, dw 1,-1,-1,1,0,0,0,0
-const pd_1,        times 4 dd 1
+const pd_1,        times 8 dd 1
 const pd_2,        times 4 dd 2
 const pd_4,        times 4 dd 4
 const pd_8,        times 4 dd 8
diff -r 88dd9f5b8b82 -r 2a36ebe3ee8c source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h	Mon Sep 08 19:39:14 2014 -0700
+++ b/source/common/x86/pixel-util.h	Mon Sep 08 19:39:34 2014 -0700
@@ -45,6 +45,7 @@
 void x265_transpose64_sse2(pixel *dest, pixel *src, intptr_t stride);
 
 uint32_t x265_quant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
+uint32_t x265_quant_avx2(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
 uint32_t x265_nquant_sse4(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
 uint32_t x265_nquant_avx2(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
 void x265_dequant_normal_sse4(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift);
diff -r 88dd9f5b8b82 -r 2a36ebe3ee8c source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Mon Sep 08 19:39:14 2014 -0700
+++ b/source/common/x86/pixel-util8.asm	Mon Sep 08 19:39:34 2014 -0700
@@ -929,6 +929,157 @@
     RET
 
 
+;-----------------------------------------------------------------------------
+; uint32_t quant(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
+;-----------------------------------------------------------------------------
+%if ARCH_X86_64 == 1
+INIT_YMM avx2
+cglobal quant, 5,5,10
+    ; fill qbits
+    movd            xm4, r4d            ; m4 = qbits
+
+    ; fill qbits-8
+    sub             r4d, 8
+    movd            xm6, r4d            ; m6 = qbits8
+
+    ; fill offset
+    vpbroadcastd    m5, r5m             ; m5 = add
+
+    vpbroadcastw    m9, [pw_1]          ; m9 = word [1]
+
+    mov             r4d, r6m
+    shr             r4d, 4              ; each iteration handles 16 coeff (numCoeff is a multiple of 16)
+    pxor            m7, m7              ; m7 = numZero
+.loop:
+    ; 8 coeff
+    movu            m0, [r0]            ; m0 = level
+    pabsd           m1, m0
+    pmulld          m1, [r1]            ; m1 = tmpLevel1
+    paddd           m2, m1, m5
+    psrad           m2, xm4             ; m2 = level1
+
+    pslld           m3, m2, 8
+    psrad           m1, xm6
+    psubd           m1, m3              ; m1 = deltaU1
+    movu            [r2], m1
+    psignd          m2, m0              ; restore the sign of level
+
+    ; 8 coeff
+    movu            m0, [r0 + mmsize]   ; m0 = level
+    pabsd           m1, m0
+    pmulld          m1, [r1 + mmsize]   ; m1 = tmpLevel1
+    paddd           m3, m1, m5
+    psrad           m3, xm4             ; m3 = level1
+
+    pslld           m8, m3, 8
+    psrad           m1, xm6
+    psubd           m1, m8              ; m1 = deltaU1
+    movu            [r2 + mmsize], m1
+    psignd          m3, m0              ; restore the sign of level
+
+    packssdw        m2, m3
+    vpermq          m2, m2, q3120       ; fix lane order after in-lane packssdw
+    movu            [r3], m2
+
+    ; count non-zero coeff
+    ; TODO: popcnt is faster, but some CPUs don't support it
+    pminuw          m2, m9              ; clamp each word to 0/1
+    paddw           m7, m2
+
+    add             r0, mmsize*2
+    add             r1, mmsize*2
+    add             r2, mmsize*2
+    add             r3, mmsize
+
+    dec             r4d
+    jnz            .loop
+
+    ; sum count
+    pxor            m0, m0              ; integer-domain zero (avoid FP-domain xorpd before psadbw)
+    psadbw          m7, m0
+    vextracti128    xm1, m7, 1
+    paddd           xm7, xm1
+    movhlps         xm0, xm7
+    paddd           xm7, xm0
+    movd            eax, xm7
+    RET
+
+%else ; ARCH_X86_64 == 1
+INIT_YMM avx2
+cglobal quant, 5,6,8
+    ; fill qbits
+    movd            xm4, r4d        ; m4 = qbits
+
+    ; fill qbits-8
+    sub             r4d, 8
+    movd            xm6, r4d        ; m6 = qbits8
+
+    ; fill offset
+    vpbroadcastd    m5, r5m         ; m5 = add
+
+    lea             r5, [pd_1]      ; r5 -> dd [1] (keep constant address in a register on x86-32)
+
+    mov             r4d, r6m
+    shr             r4d, 4          ; each iteration handles 16 coeff (numCoeff is a multiple of 16)
+    pxor            m7, m7          ; m7 = numZero
+.loop:
+    ; 8 coeff
+    movu            m0, [r0]        ; m0 = level
+    pabsd           m1, m0
+    pmulld          m1, [r1]        ; m1 = tmpLevel1
+    paddd           m2, m1, m5
+    psrad           m2, xm4         ; m2 = level1
+
+    pslld           m3, m2, 8
+    psrad           m1, xm6
+    psubd           m1, m3          ; m1 = deltaU1
+
+    movu            [r2], m1
+    psignd          m3, m2, m0      ; restore the sign of level
+    pminud          m2, [r5]        ; clamp each dword to 0/1
+    paddd           m7, m2
+    packssdw        m3, m3
+    vpermq          m3, m3, q0020
+    movu            [r3], xm3
+
+    ; 8 coeff
+    movu            m0, [r0 + mmsize]        ; m0 = level
+    pabsd           m1, m0
+    pmulld          m1, [r1 + mmsize]        ; m1 = tmpLevel1
+    paddd           m2, m1, m5
+    psrad           m2, xm4         ; m2 = level1
+
+    pslld           m3, m2, 8
+    psrad           m1, xm6
+    psubd           m1, m3          ; m1 = deltaU1
+
+    movu            [r2 + mmsize], m1
+    psignd          m3, m2, m0      ; restore the sign of level
+    pminud          m2, [r5]        ; clamp each dword to 0/1
+    paddd           m7, m2
+    packssdw        m3, m3
+    vpermq          m3, m3, q0020
+    movu            [r3 + mmsize/2], xm3
+
+    add             r0, mmsize*2
+    add             r1, mmsize*2
+    add             r2, mmsize*2
+    add             r3, mmsize
+
+    dec             r4d
+    jnz            .loop
+
+    ; sum count
+    pxor            m0, m0          ; integer-domain zero (avoid FP-domain xorpd before psadbw)
+    psadbw          m7, m0
+    vextracti128    xm1, m7, 1
+    paddd           xm7, xm1
+    movhlps         xm0, xm7
+    paddd           xm7, xm0
+    movd            eax, xm7
+    RET
+%endif ; ARCH_X86_64 == 1
+
+
 ;-----------------------------------------------------------------------------
 ; uint32_t nquant(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
 ;-----------------------------------------------------------------------------



More information about the x265-devel mailing list