[x265] [PATCH 213 of 307] [x265-avx512]x86: AVX512 Quant

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:33:31 CEST 2018


# HG changeset patch
# User Gopi Satykrishna Akisetty <gopi.satykrishna at multicorewareinc.com>
# Date 1511857266 -19800
#      Tue Nov 28 13:51:06 2017 +0530
# Node ID 0c4c63916d6e5af144bad8280706da6f3a613e75
# Parent  8eeff916ebe608526f167177a19c4516266ba513
[x265-avx512]x86: AVX512 Quant

AVX2 Performance     :   18.77x
AVX512 Performance   :   23.83x

diff -r 8eeff916ebe6 -r 0c4c63916d6e source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Nov 29 12:01:16 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Nov 28 13:51:06 2017 +0530
@@ -2886,6 +2886,7 @@
         p.cu[BLOCK_8x8].idct = PFX(idct8_avx512);
         p.cu[BLOCK_16x16].idct = PFX(idct16_avx512);
         p.cu[BLOCK_32x32].idct = PFX(idct32_avx512);
+        p.quant = PFX(quant_avx512);
 
 
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_avx512);
@@ -5007,6 +5008,7 @@
         p.cu[BLOCK_8x8].idct = PFX(idct8_avx512);
         p.cu[BLOCK_16x16].idct = PFX(idct16_avx512);
         p.cu[BLOCK_32x32].idct = PFX(idct32_avx512);
+        p.quant = PFX(quant_avx512);
 
     }
 #endif
diff -r 8eeff916ebe6 -r 0c4c63916d6e source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Wed Nov 29 12:01:16 2017 +0530
+++ b/source/common/x86/pixel-util8.asm	Tue Nov 28 13:51:06 2017 +0530
@@ -1044,6 +1044,133 @@
 %endif ; ARCH_X86_64 == 1
 
 
+%if ARCH_X86_64 == 1
+INIT_ZMM avx512                         ; quant (installed as p.quant): level = sign(coef) * ((|coef| * mult + add) >> qbits)
+cglobal quant, 5, 6, 22                 ; r0 = int16 coef (in), r1 = int32 multipliers (in), r2 = int32 deltaU (out), r3 = int16 levels (out)
+    ; r4d = qbits, r5m = add (rounding term), r6m = coefficient count; eax returns the nonzero-level count. Fill qbits:
+    movd            xm4, r4d            ; xm4 = qbits (shift count that produces level1)
+
+    ; fill qbits-8 (shift count used for deltaU)
+    sub             r4d, 8
+    movd            xm6, r4d            ; xm6 = qbits - 8
+
+    ; fill offset: broadcast the rounding term 'add' (r5m) into every dword of m5
+%if UNIX64 == 0
+    vpbroadcastd    m5, r5m             ; m5 = add
+%else ; UNIX64 != 0 (originally labelled "Mac"): go through xm5 first
+    movd           xm5, r5m             ; xm5 dword 0 = add
+    vpbroadcastd    m5, xm5             ; m5 = add
+%endif
+
+    vbroadcasti32x8  m9, [pw_1]         ; m9 = 32 bytes at pw_1 in both halves; presumably words of 1 (pw_1 is defined elsewhere - confirm)
+
+    mov             r4d, r6m            ; r4d = coefficient count
+    pxor             m7, m7             ; m7 = 0: per-word running count of nonzero levels
+    sub             r4d, 32
+    jl              .coeff16            ; count < 32 (signed): take the single 16-coefficient pass
+    add             r4d, 32
+    shr             r4d, 5              ; r4d = count / 32 = loop iterations (32 coefficients each)
+    jmp             .loop
+    ; NOTE(review): psignd is used on m16+ in both paths; presumably x86inc's AVX-512 register mapping keeps it encodable - confirm before renumbering
+.coeff16:
+    ; 16 coeff: one straight-line pass, always exactly 16 coefficients, no loop
+    pxor             m7,  m7              ; redundant: m7 was already cleared before the branch
+    pmovsxwd        m16,  [r0]            ; m16 = level (16 x int16 sign-extended to int32)
+    pabsd            m1,  m16             ; m1 = |level|
+    pmulld           m1,  [r1]            ; m1 = tmplevel = |level| * multiplier
+    paddd           m17,   m1, m5         ; m17 = tmplevel + add
+    psrad           m17,  xm4             ; m17 = level1 = (tmplevel + add) >> qbits
+
+    pslld            m3,    m17, 8        ; m3 = level1 << 8
+    psrad            m1,    xm6           ; m1 = tmplevel >> (qbits - 8)
+    psubd            m1,     m3           ; m1 = deltaU1 = (tmplevel >> (qbits - 8)) - (level1 << 8)
+    movu             [r2],   m1           ; store 16 deltaU dwords
+    vextracti64x4    ym19,  m17, 1        ; ym19 = upper 8 level1 dwords
+    vextracti64x4    ym20,  m16, 1        ; ym20 = upper 8 input levels
+    psignd           ym17, ym16           ; lower 8: give level1 the input's sign (zero input -> zero)
+    psignd          ym19,  ym20           ; upper 8: same
+    packssdw        ym17,  ym19           ; saturate to int16; packs per 128-bit lane, so quads land as 0-3, 8-11, 4-7, 12-15
+    vpermq          ym17,  ym17, q3120    ; swap the two middle qwords back into coefficient order
+    movu            [r3],  ym17           ; store 16 int16 levels
+
+    pminuw          ym17,   ym9           ; unsigned min with m9: zero words stay 0, every other word becomes the m9 word
+    paddw           ym7,   ym17           ; accumulate into the per-word counters
+
+    ; sum count: horizontally add the counters in ym7 into eax
+    xorpd            m0,  m0
+    psadbw          ym7, ym0              ; byte sums against zero: one partial sum per qword (4 of them)
+    vextracti128    xm1, ym7, 1           ; xm1 = upper 128 bits
+    paddd           xm7, xm1              ; 4 partial sums -> 2
+    movhlps         xm0, xm7              ; xm0 low qword = xm7 high qword
+    paddd           xm7, xm0              ; 2 partial sums -> 1, in dword 0
+    movd            eax, xm7              ; return value
+    RET
+
+.loop:
+    ; 16 coeff: first half of a 32-coefficient iteration (same steps as .coeff16, minus the final vpermq)
+    pmovsxwd        m16,   [r0]            ; m16 = level
+    pabsd            m1,   m16             ; m1 = |level|
+    pmulld           m1,   [r1]            ; m1 = tmplevel = |level| * multiplier
+    paddd           m17,   m1,  m5         ; m17 = tmplevel + add
+    psrad           m17,   xm4             ; m17 = level1
+
+    pslld            m3,   m17, 8          ; m3 = level1 << 8
+    psrad            m1,   xm6             ; m1 = tmplevel >> (qbits - 8)
+    psubd            m1,    m3             ; m1 = deltaU1
+    movu            [r2],   m1             ; store deltaU for coefficients 0-15
+    vextracti64x4   ym19,  m17, 1          ; ym19 = upper 8 level1 dwords
+    vextracti64x4   ym20,  m16, 1          ; ym20 = upper 8 input levels
+    psignd          ym17, ym16             ; restore sign, lower 8
+    psignd          ym19, ym20             ; restore sign, upper 8
+    packssdw        ym17, ym19             ; ym17 = 16 int16 levels, still lane-interleaved (fixed after the merge below)
+
+    ; 16 coeff: second half, at +mmsize/2 bytes of int16 input and +mmsize bytes of int32 data
+    pmovsxwd        m16,  [r0 + mmsize/2]  ; m16 = level
+    pabsd            m1,  m16              ; m1 = |level|
+    pmulld           m1,  [r1 + mmsize]    ; m1 = tmplevel
+    paddd           m18,   m1, m5          ; m18 = tmplevel + add
+    psrad           m18,  xm4              ; m18 = level1
+
+    pslld            m8,  m18, 8           ; m8 = level1 << 8
+    psrad            m1,  xm6              ; m1 = tmplevel >> (qbits - 8)
+    psubd            m1,  m8               ; m1 = deltaU1
+    movu             [r2 + mmsize], m1     ; store deltaU for coefficients 16-31
+    vextracti64x4   ym21,  m18, 1          ; ym21 = upper 8 level1 dwords
+    vextracti64x4   ym20,  m16, 1          ; ym20 = upper 8 input levels
+    psignd          ym18, ym16             ; restore sign, lower 8
+    psignd          ym21, ym20             ; restore sign, upper 8
+    packssdw        ym18, ym21             ; ym18 = 16 int16 levels, lane-interleaved
+    vinserti64x4     m17,  m17, ym18, 1    ; m17 = [first 16 levels | second 16 levels]
+    vpermq           m17,  m17, q3120      ; reorder qwords within each 256-bit half -> coefficient order
+
+    movu            [r3],  m17             ; store 32 int16 levels
+
+    pminuw          m17,   m9              ; zero words stay 0, every other word becomes the m9 word
+    paddw            m7,  m17              ; accumulate into the per-word counters
+
+    add              r0,  mmsize           ; 32 int16 inputs consumed
+    add              r1,  mmsize * 2       ; 32 int32 multipliers consumed
+    add              r2,  mmsize * 2       ; 32 int32 deltaU values written
+    add              r3,  mmsize           ; 32 int16 levels written
+
+    dec             r4d
+    jnz            .loop
+
+    ; sum count: horizontally add the counters in m7 into eax (byte sums are exact only while each word counter is < 256)
+    xorpd            m0,  m0
+    psadbw           m7,  m0               ; one partial sum per qword (8 of them)
+    vextracti32x8   ym1,  m7, 1            ; ym1 = upper 256 bits
+    paddd           ym7, ym1               ; 8 partial sums -> 4
+    vextracti64x2   xm1,  m7, 1            ; xm1 = bits 255:128
+    paddd           xm7, xm1               ; 4 partial sums -> 2
+    pshufd          xm1, xm7, 2            ; xm1 dword 0 = xm7 dword 2 (low dword of the upper qword)
+    paddd           xm7, xm1               ; 2 partial sums -> 1, in dword 0
+    movd            eax, xm7               ; return value
+    RET
+%endif ; ARCH_X86_64 == 1
+
+
+
 ;-----------------------------------------------------------------------------
 ; uint32_t nquant(int16_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
 ;-----------------------------------------------------------------------------


More information about the x265-devel mailing list