[x265] [PATCH 219 of 307] [x265-avx512]x86: AVX512 nquant

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:33:37 CEST 2018


# HG changeset patch
# User Gopi Satykrishna Akisetty <gopi.satykrishna at multicorewareinc.com>
# Date 1511947290 -19800
#      Wed Nov 29 14:51:30 2017 +0530
# Node ID 3e2058cec6c6f4ad49d92f9df7fbc110a54f4b4b
# Parent  d7af8d747bffacafa5dfe8f4d513bbd09314ad63
[x265-avx512]x86: AVX512 nquant

AVX2 Performance    :   21.42x
AVX512 Performance  :   25.60x

diff -r d7af8d747bff -r 3e2058cec6c6 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Nov 30 15:29:18 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Nov 29 14:51:30 2017 +0530
@@ -2887,8 +2887,7 @@
         p.cu[BLOCK_16x16].idct = PFX(idct16_avx512);
         p.cu[BLOCK_32x32].idct = PFX(idct32_avx512);
         p.quant = PFX(quant_avx512);
-
-
+        p.nquant = PFX(nquant_avx512);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_avx512);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_avx512);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hps = PFX(interp_4tap_horiz_ps_32x24_avx512);
@@ -5015,7 +5014,7 @@
         p.cu[BLOCK_16x16].idct = PFX(idct16_avx512);
         p.cu[BLOCK_32x32].idct = PFX(idct32_avx512);
         p.quant = PFX(quant_avx512);
-
+        p.nquant = PFX(nquant_avx512);
     }
 #endif
 }
diff -r d7af8d747bff -r 3e2058cec6c6 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Thu Nov 30 15:29:18 2017 +0530
+++ b/source/common/x86/pixel-util8.asm	Wed Nov 29 14:51:30 2017 +0530
@@ -1277,7 +1277,101 @@
     paddd       xm5, xm0
     movd        eax, xm5
     RET
-
+%if ARCH_X86_64 == 1
+INIT_ZMM avx512                      ; AVX-512 nquant: per element, out = abs(sat16(sign(in) * ((abs(in) * mul + add) >> shift))); eax returns a tally of the outputs (see pminuw below)
+cglobal nquant, 3,5,22               ; r0 = int16 input, r1 = int32 multipliers, r2 = int16 output, r3m = shift, r4m = add, r5m = element count (presumably coef, quantCoeff, qCoef, qBits, add, numCoeff - confirm against the C prototype)
+%if UNIX64 == 0
+    vpbroadcastd m4, r4m             ; m4 = add (r4m) replicated into every dword
+%else ; UNIX64 != 0 (every UNIX64 target takes this branch, not only Mac)
+    movd         xm4, r4m            ; stage add in xm4 first; presumably r4m is not a memory operand under this ABI - confirm
+    vpbroadcastd  m4, xm4            ; m4 = add replicated into every dword
+%endif
+    ; --- common setup ---
+    vbroadcasti32x8  m6, [pw_1]      ; m6 = the 32 bytes at pw_1 copied into both 256-bit halves (presumably words of 1; pw_1 is defined outside this hunk)
+    mov         r4d, r5m             ; r4d = element count
+    pxor         m5, m5              ; m5 = per-word running tally, cleared
+    movd        xm3, r3m             ; xm3 = shift count used by psrad
+    sub         r4d, 16
+    je          .coeff16             ; count == 16: single 16-element pass
+    add         r4d, 16              ; restore the count
+    shr         r4d, 5               ; iterations = count / 32 (.loop consumes 32 elements per pass)
+    jmp         .loop
+    ; NOTE(review): .loop tests its counter only after the body, so a count below 32 other than 16 yields 0 here and dec would wrap - assumes count is 16 or a multiple of 32; confirm with callers
+.coeff16:
+    pmovsxwd         m16, [r0]       ; 16 int16 inputs sign-extended to dwords
+    pabsd            m17, m16        ; magnitudes
+    pmulld           m17, [r1]       ; * multipliers (low 32 bits of each product)
+    paddd            m17, m4         ; + add
+    psrad            m17, xm3        ; >> shift (arithmetic)
+    ; split result and source into 256-bit halves, reapply the source sign, then narrow to int16 with signed saturation
+    vextracti64x4   ym19,  m17, 1    ; upper 8 results
+    vextracti64x4   ym20,  m16, 1    ; upper 8 source values (for their signs)
+    psignd          ym17, ym16       ; NOTE(review): psignd appears to have only legacy/VEX encodings, which cannot address registers 16-31 - confirm this assembles as intended with ym16+ operands
+    psignd          ym19, ym20
+    packssdw        ym17, ym19       ; 16 dwords -> 16 words, interleaved per 128-bit lane
+    vpermq          ym17, ym17, q3120 ; swap the middle 64-bit groups so the words are back in source order
+    pabsw           ym17, ym17       ; output is stored as a magnitude (the sign applied above is dropped again)
+    movu            [r2], ym17       ; write 16 int16 outputs
+    pminuw          ym17, ym6        ; cap each word at the pw_1 word (0 or 1 per output if pw_1 is all ones)
+    paddw           ym5,  ym17       ; add to the tally
+    pxor            m0,    m0
+    psadbw          ym5,  ym0        ; sum the tally bytes within each 64-bit group
+    vextracti128    xm0,  ym5, 1     ; fold upper 128 bits onto lower
+    paddd           xm5,  xm0
+    pshufd          xm0,  xm5, 2     ; bring dword 2 (low dword of the upper qword) down to dword 0
+    paddd           xm5,  xm0
+    movd            eax,  xm5        ; return the total
+    RET
+    ; --- main path: 32 elements per iteration, processed as two 16-element halves ---
+.loop:
+    pmovsxwd         m16,  [r0]      ; first 16 int16 inputs -> dwords
+    pabsd            m17,  m16
+    pmulld           m17,  [r1]
+    paddd            m17,  m4
+    psrad            m17,  xm3
+    vextracti64x4   ym19,  m17, 1
+    vextracti64x4   ym20,  m16, 1
+    psignd          ym17, ym16       ; NOTE(review): same psignd-on-ym16+ encoding concern as in .coeff16
+    psignd          ym19, ym20
+    packssdw        ym17, ym19       ; first 16 words (lane-interleaved)
+    ; second 16 elements: inputs at r0 + mmsize/2 bytes, multipliers at r1 + mmsize bytes
+    pmovsxwd         m16, [r0 + mmsize/2]
+    pabsd            m18, m16
+    pmulld           m18, [r1 + mmsize]
+    paddd            m18,  m4
+    psrad            m18, xm3
+    vextracti64x4   ym21,  m18, 1
+    vextracti64x4   ym20,  m16, 1
+    psignd          ym18, ym16
+    psignd          ym21, ym20
+    packssdw        ym18, ym21       ; second 16 words (lane-interleaved)
+    vinserti64x4     m17,  m17, ym18, 1 ; m17 = [first 16 words | second 16 words]
+    vpermq           m17,  m17, q3120 ; restore source order inside each 256-bit half
+    ; store magnitudes, as in .coeff16
+    pabsw            m17, m17
+    movu            [r2], m17        ; write 32 int16 outputs
+    ; advance all three pointers by 32 elements
+    add               r0, mmsize     ; 32 int16 inputs
+    add               r1, mmsize * 2 ; 32 int32 multipliers
+    add               r2, mmsize     ; 32 int16 outputs
+    ; tally: cap each stored word at the pw_1 word and accumulate per word lane
+    pminuw           m17,  m6
+    paddw             m5, m17
+    ; loop control: r4d holds the remaining iteration count
+    dec         r4d
+    jnz         .loop
+    ; horizontal reduction of the per-word tally into eax
+    pxor             m0,  m0
+    psadbw           m5,  m0         ; byte sums per 64-bit group (eight partial sums)
+    vextracti32x8   ym1,  m5, 1      ; fold upper 256 bits onto lower
+    paddd           ym5, ym1
+    vextracti64x2   xm1,  m5, 1      ; fold bits 128..255 onto the low 128
+    paddd           xm5, xm1
+    pshufd          xm1, xm5, 2      ; bring dword 2 down to dword 0
+    paddd           xm5, xm1
+    movd            eax, xm5         ; return the total
+    RET
+%endif ; ARCH_X86_64 == 1
 
 ;-----------------------------------------------------------------------------
 ; void dequant_normal(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift)
diff -r d7af8d747bff -r 3e2058cec6c6 source/test/mbdstharness.cpp
--- a/source/test/mbdstharness.cpp	Thu Nov 30 15:29:18 2017 +0530
+++ b/source/test/mbdstharness.cpp	Wed Nov 29 14:51:30 2017 +0530
@@ -252,12 +252,10 @@
 bool MBDstHarness::check_nquant_primitive(nquant_t ref, nquant_t opt)
 {
     int j = 0;
-
     for (int i = 0; i < ITERS; i++)
     {
-        int width = (rand() % 4 + 1) * 4;
+        int width = 1 << (rand() % 4 + 2);
         int height = width;
-
         uint32_t optReturnValue = 0;
         uint32_t refReturnValue = 0;
 


More information about the x265-devel mailing list