[x265] [PATCH 083 of 307] [x265-avx512]x86: AVX512 weight_pp

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:31:21 CEST 2018


# HG changeset patch
# User Gopi Satykrishna Akisetty <gopi.satykrishna at multicorewareinc.com>
# Date 1502442378 -19800
#      Fri Aug 11 14:36:18 2017 +0530
# Node ID 3d8c45642752803c560891fdfbe0a8b5c03ca76a
# Parent  b30539ebe5c9b2d9412d3a39458a90a7574ac744
[x265-avx512]x86: AVX512 weight_pp

BitDepth | AVX2 performance | AVX512 performance
------------------------------------------------
  8      |     6.23x        |       10.60x
  10     |     9.43x        |       14.59x

diff -r b30539ebe5c9 -r 3d8c45642752 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Aug 14 17:19:48 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Fri Aug 11 14:36:18 2017 +0530
@@ -2322,6 +2322,7 @@
         p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16_avx512);
         p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512);
         p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx512);
+        p.weight_pp = PFX(weight_pp_avx512);
 
     }
 }
@@ -4026,6 +4027,7 @@
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_hpp = PFX(interp_4tap_horiz_pp_32x64_avx512);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hpp = PFX(interp_4tap_horiz_pp_32x48_avx512);
+        p.weight_pp = PFX(weight_pp_avx512);
 
     }
 #endif
diff -r b30539ebe5c9 -r 3d8c45642752 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Mon Aug 14 17:19:48 2017 +0530
+++ b/source/common/x86/pixel-util8.asm	Fri Aug 11 14:36:18 2017 +0530
@@ -1662,6 +1662,116 @@
     jnz         .loopH
     RET
 %endif
+
+%if HIGH_BIT_DEPTH
+INIT_ZMM avx512                                 ; AVX-512 weight_pp, HIGH_BIT_DEPTH build: per 16-bit pixel, ((pix * w + r) >> s) + offset, clamped to pw_pixel_max; 32 pixels per inner iteration
+cglobal weight_pp, 6, 7, 7
+%define correction      (14 - BIT_DEPTH)
+    mov          r6d, r6m                       ; r6d = 7th argument (x86inc rNm numbering, presumably); "round" at the C call sites
+    shl          r6d, 16 - correction           ; place it in the upper word, `correction` bits short of a full 16-bit shift
+    or           r6d, r5d                       ; merge r5d (6th argument, the weight at the C call sites) into the low bits
+
+    movd         xm0, r6d
+    vpbroadcastd  m0, xm0                       ; m0 = that packed dword replicated to every dword lane
+    mov          r5d, r7m                       ; r5d = 8th argument ("shift" at the C call sites)
+    sub          r5d, correction                ; reduce the shift count by `correction`
+    movd         xm1, r5d                       ; xm1 = shift count consumed by psrad below
+
+    vpbroadcastd    m2, r8m                     ; m2 = 9th argument ("offset" at the C call sites) in every dword
+    vbroadcasti32x8 m5, [pw_1]                  ; pw_1 is defined outside this hunk; presumably words of 1 - confirm
+    vbroadcasti32x8 m6, [pw_pixel_max]          ; upper clamp used by pminuw (constant defined outside this hunk)
+
+    add         r2d, r2d                        ; r2 *= 2: stride from 16-bit elements to bytes
+    add         r3d, r3d                        ; r3 *= 2: width from 16-bit elements to bytes
+    sub         r2d, r3d                        ; r2 = bytes from the end of one row to the start of the next
+    shr         r3d, 6                          ; r3 = number of 64-byte chunks per row; NOTE(review): must be nonzero or the dec/jnz counter wraps
+
+.loopH:                                         ; one iteration per row
+    mov          r5d, r3d                       ; r5d = chunks left in this row
+
+.loopW:                                         ; one iteration per 64-byte chunk
+    movu        m4, [r0]                        ; load 64 source bytes (unaligned)
+    punpcklwd   m3, m4, m5                      ; low words of each 128-bit lane interleaved with m5 words
+    pmaddwd     m3, m0                          ; multiply word pairs by m0 and add each adjacent pair into a dword
+    psrad       m3, xm1                         ; arithmetic right shift by the adjusted shift count
+    paddd       m3, m2                          ; add the broadcast offset
+
+    punpckhwd   m4, m5                          ; same steps for the high words of each lane
+    pmaddwd     m4, m0
+    psrad       m4, xm1
+    paddd       m4, m2
+
+    packusdw    m3,   m4                        ; dwords -> words with unsigned saturation; in-lane, so it undoes the in-lane unpack order
+    pminuw      m3,   m6                        ; clamp to the m6 maximum
+    movu        [r1], m3                        ; store 64 destination bytes (unaligned)
+
+    add         r0, 64                          ; advance source by one chunk
+    add         r1, 64                          ; advance destination by one chunk
+
+    dec         r5d
+    jnz         .loopW
+
+    lea         r0, [r0 + r2]                   ; skip the row remainder; the same gap is applied to source and destination
+    lea         r1, [r1 + r2]
+
+    dec         r4d                             ; r4d = rows remaining (5th argument, "height" at the C call sites)
+    jnz         .loopH
+%undef correction
+    RET
+%else
+INIT_ZMM avx512                                 ; AVX-512 weight_pp, 8-bit build: per byte pixel, ((pix * w + r) >> shift) + offset, saturated back to bytes; 32 pixels per inner iteration
+cglobal weight_pp, 6, 7, 6
+
+    shl          r5d, 6                         ; scale r5d (6th argument, the weight at the C call sites) by 64; presumably the 14 - 8 depth correction - confirm
+    mov          r6d, r6m                       ; r6d = 7th argument (x86inc rNm numbering, presumably); "round" at the C call sites
+    shl          r6d, 16                        ; move it into the upper word
+    or           r6d, r5d                       ; low word = scaled weight
+
+    movd         xm0, r6d
+    vpbroadcastd  m0, xm0                       ; m0 = that packed dword replicated to every dword lane
+    movd         xm1, r7m                       ; xm1 = 8th argument ("shift"), the psrad count
+    vpbroadcastd  m2, r8m                       ; m2 = 9th argument ("offset") in every dword
+
+    vbroadcasti32x8 m5, [pw_1]                  ; pw_1 is defined outside this hunk; presumably words of 1 - confirm
+
+    sub          r2d, r3d                       ; r2 = stride - width: bytes from the end of one row to the start of the next
+    shr          r3d, 5                         ; r3 = number of 32-pixel chunks per row; NOTE(review): must be nonzero or the dec/jnz counter wraps
+
+.loopH:                                         ; one iteration per row
+    mov          r5d, r3d                       ; r5d = chunks left in this row
+
+.loopW:                                         ; one iteration per 32-byte chunk
+    pmovzxbw    m4, [r0]                        ; load 32 source bytes and zero-extend them to 32 words
+    punpcklwd   m3, m4, m5                      ; low words of each 128-bit lane interleaved with m5 words
+    pmaddwd     m3, m0                          ; multiply word pairs by m0 and add each adjacent pair into a dword
+    psrad       m3, xm1                         ; arithmetic right shift by the shift count
+    paddd       m3, m2                          ; add the broadcast offset
+
+    punpckhwd   m4, m5                          ; same steps for the high words of each lane
+    pmaddwd     m4, m0
+    psrad       m4, xm1
+    paddd       m4, m2
+
+    packssdw       m3,  m4                      ; dwords -> words with signed saturation; in-lane, so word order is linear again
+    vextracti64x4 ym4,  m3, 1                   ; ym4 = upper 256 bits of m3
+    packuswb      ym3,  ym4                     ; words -> bytes with unsigned saturation; in-lane, so the qwords come out interleaved
+    vpermq        ym3,  ym3, q3120              ; reorder qwords to restore pixel order (q3120 presumably selects 0,2,1,3 - confirm against x86inc)
+    movu          [r1], ym3                     ; store 32 destination bytes (unaligned)
+
+    add         r0, 32                          ; advance source by one chunk
+    add         r1, 32                          ; advance destination by one chunk
+
+    dec         r5d
+    jnz         .loopW
+
+    lea         r0, [r0 + r2]                   ; skip the row remainder; the same gap is applied to source and destination
+    lea         r1, [r1 + r2]
+
+    dec         r4d                             ; r4d = rows remaining (5th argument, "height" at the C call sites)
+    jnz         .loopH
+    RET
+%endif
+
 ;-------------------------------------------------------------------------------------------------------------------------------------------------
 ;void weight_sp(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
 ;-------------------------------------------------------------------------------------------------------------------------------------------------
diff -r b30539ebe5c9 -r 3d8c45642752 source/encoder/reference.cpp
--- a/source/encoder/reference.cpp	Mon Aug 14 17:19:48 2017 +0530
+++ b/source/encoder/reference.cpp	Fri Aug 11 14:36:18 2017 +0530
@@ -155,12 +155,10 @@
 
         const pixel* src = reconPic->m_picOrg[c] + numWeightedRows * cuHeight * stride;
         pixel* dst = fpelPlane[c] + numWeightedRows * cuHeight * stride;
-
         // Computing weighted CU rows
         int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth
-        int padwidth = (width + 15) & ~15;              // weightp assembly needs even 16 byte widths
+        int padwidth = (width + 31) & ~31;              // weightp assembly needs the width to be a multiple of 32 pixels
         primitives.weight_pp(src, dst, stride, padwidth, height, w[c].weight, w[c].round << correction, w[c].shift + correction, w[c].offset);
-
         // Extending Left & Right
         primitives.extendRowBorder(dst, stride, width, height, marginX);
 
diff -r b30539ebe5c9 -r 3d8c45642752 source/encoder/weightPrediction.cpp
--- a/source/encoder/weightPrediction.cpp	Mon Aug 14 17:19:48 2017 +0530
+++ b/source/encoder/weightPrediction.cpp	Fri Aug 11 14:36:18 2017 +0530
@@ -184,8 +184,7 @@
         int denom = w->log2WeightDenom;
         int round = denom ? 1 << (denom - 1) : 0;
         int correction = IF_INTERNAL_PREC - X265_DEPTH; /* intermediate interpolation depth */
-        int pwidth = ((width + 15) >> 4) << 4;
-
+        int pwidth = ((width + 31) >> 5) << 5;
         primitives.weight_pp(ref, weightTemp, stride, pwidth, height,
                              weight, round << correction, denom + correction, offset);
         ref = weightTemp;
diff -r b30539ebe5c9 -r 3d8c45642752 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Mon Aug 14 17:19:48 2017 +0530
+++ b/source/test/pixelharness.cpp	Fri Aug 11 14:36:18 2017 +0530
@@ -291,6 +291,9 @@
     memset(opt_dest, 0, 64 * 64 * sizeof(pixel));
     int j = 0;
     int width = 16 * (rand() % 4 + 1);
+    int cpuid = X265_NS::cpu_detect();
+    if (cpuid & X265_CPU_AVX512)
+        width = 32 * (rand() % 2 + 1);
     int height = 8;
     int w0 = rand() % 128;
     int shift = rand() % 8; // maximum is 7, see setFromWeightAndOffset()


More information about the x265-devel mailing list