[x265] [PATCH 096 of 307] x86: AVX512 copy_cnt_32 and copy_cnt_16

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:31:34 CEST 2018


# HG changeset patch
# User Vignesh Vijayakumar
# Date 1503557407 -19800
#      Thu Aug 24 12:20:07 2017 +0530
# Node ID 0355f0128b7d713c4a21c91d3cc5bed1e8b43c47
# Parent  31a180bcef33fae436ad7e3aa4378b283a86d56a
x86: AVX512 copy_cnt_32 and copy_cnt_16

Size | BitDepth | AVX2 performance | AVX512 performance
-------------------------------------------------------
16x16|    8     |     6.92x        |       8.07x
16x16|    10    |     6.72x        |       7.75x
32x32|    8     |     6.08x        |      10.33x
32x32|    10    |     6.04x        |      10.16x

diff -r 31a180bcef33 -r 0355f0128b7d source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Aug 23 10:06:01 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu Aug 24 12:20:07 2017 +0530
@@ -2342,6 +2342,9 @@
         p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx512);
         p.weight_pp = PFX(weight_pp_avx512);
 
+        p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx512);
+        p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx512);
+
     }
 }
 #else // if HIGH_BIT_DEPTH
@@ -4054,6 +4057,9 @@
         p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512);
         p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx512);
 
+        p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx512);
+        p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx512);
+
         //i444 chroma_hpp
         p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_hpp = PFX(interp_4tap_horiz_pp_64x64_avx512);
         p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hpp = PFX(interp_4tap_horiz_pp_64x32_avx512);
diff -r 31a180bcef33 -r 0355f0128b7d source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm	Wed Aug 23 10:06:01 2017 +0530
+++ b/source/common/x86/blockcopy8.asm	Thu Aug 24 12:20:07 2017 +0530
@@ -5958,7 +5958,91 @@
     movd         eax, xm4
     RET
 
-
+;--------------------------------------------------------------------------------------
+; copy_cnt avx512 code start
+;--------------------------------------------------------------------------------------
+%macro PROCESS_COPY_CNT_32x4_AVX512 0 ; copy 4 rows of mmsize bytes from r1 to r0, add their non-zero flags to m4; writes m0-m2
+    movu        m0,                  [r1]                ; row 0
+    movu        m1,                  [r1 + r2]           ; row 1 (r2 = source row stride in bytes)
+    movu        [r0],                m0                  ; destination rows are stored back to back
+    movu        [r0 + mmsize],       m1
+    packsswb    m0,                  m1                  ; words -> bytes with signed saturation, so a non-zero word stays non-zero
+    pminub      m0,                  m3                  ; clamp each byte to the value in m3 (callers broadcast pb_1 into m3)
+    ; rows 2 and 3 get the same treatment; r3 must hold 3 * r2 (set up by the callers)
+    movu        m1,                  [r1 + 2 * r2]
+    movu        m2,                  [r1 + r3]
+    movu        [r0 + 2 * mmsize],   m1
+    movu        [r0 + 3 * mmsize],   m2
+    packsswb    m1,                  m2
+    pminub      m1,                  m3
+    ; byte order after the packs does not matter: only the total over all bytes is used later
+    paddb       m0,                  m1                  ; merge the flags of rows 0-1 with rows 2-3
+    paddb       m4,                  m0                  ; accumulate into the running per-byte counters
+%endmacro
+
+%macro PROCESS_COPY_CNT_16x4_AVX512 0 ; copy 4 rows of 32 bytes from r1 to r0, add their non-zero flags to m4; writes m0-m1
+    movu          ym0,               [r1]                ; row 0 -> low 256 bits of m0
+    vinserti32x8   m0,               [r1 + r2],    1     ; row 1 -> high 256 bits of m0 (r2 = source row stride in bytes)
+    movu          ym1,               [r1 + 2 * r2]       ; row 2 -> low 256 bits of m1
+    vinserti32x8   m1,               [r1 + r3],    1     ; row 3 -> high 256 bits of m1 (r3 must hold 3 * r2)
+    movu         [r0],               m0                  ; rows 0-1 stored back to back in the destination
+    movu         [r0 + mmsize],      m1                  ; rows 2-3 follow immediately
+    packsswb       m0,               m1                  ; words -> bytes with signed saturation, so a non-zero word stays non-zero
+    pminub         m0,               m3                  ; clamp each byte to the value in m3 (callers broadcast pb_1 into m3)
+    paddb          m4,               m0                  ; accumulate into the running per-byte counters
+%endmacro
+
+%macro PROCESS_COPY_CNT_END_AVX512 0 ; sum all 64 byte counters of m4 into eax; writes m0, m1, m4
+    pxor           m0,  m0                ; zero operand for the psadbw below
+    vextracti32x8  ym1, m4, 1             ; upper 256 bits of the counters
+    paddb          ym4, ym1               ; fold 512 -> 256 bits
+    vextracti32x4  xm1, ym4, 1            ; upper 128 bits of what is left
+    paddb          xm4, xm1               ; fold 256 -> 128 bits
+    psadbw         xm4, xm0               ; |byte - 0| summed per 8-byte half: one partial sum in each qword
+    movhlps        xm1, xm4               ; bring the high-qword partial sum down to the low qword of xm1
+    paddd          xm4, xm1               ; low dword now holds the total
+    movd           eax, xm4               ; leave the total in eax for the caller's RET
+%endmacro
+
+;--------------------------------------------------------------------------------------
+; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride);
+;--------------------------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal copy_cnt_32, 3, 4, 5
+    add              r2,   r2              ; srcStride: int16 units -> bytes; full-width add so the intptr_t stride is not truncated
+    lea              r3,   [3 * r2]        ; byte offset of the 4th row of each 4-row group
+; 32x32: copies src rows (one zmm each) to a contiguous dst and counts non-zero coefficients, assuming pb_1 is bytes of 1
+    vbroadcasti32x8  m3,   [pb_1]          ; clamp value for pminub; pb_1 is defined outside this hunk
+    pxor             m4,   m4              ; per-byte counters; each group adds at most 2 per byte if pb_1 is bytes of 1
+; 8 groups of 4 rows; the pointers are not advanced after the final group
+%rep 7
+    PROCESS_COPY_CNT_32x4_AVX512
+    add              r0,  4 * mmsize       ; dst is contiguous: 4 rows of mmsize bytes were written
+    lea              r1,  [r1 + 4 * r2]    ; src advances by 4 strided rows
+%endrep
+    PROCESS_COPY_CNT_32x4_AVX512
+    PROCESS_COPY_CNT_END_AVX512
+    RET
+
+INIT_ZMM avx512
+cglobal copy_cnt_16, 3, 4, 5
+    add              r2,   r2              ; srcStride: int16 units -> bytes; full-width add so the intptr_t stride is not truncated
+    lea              r3,   [3 * r2]        ; byte offset of the 4th row of each 4-row group
+; 16x16: r0 = dst (contiguous), r1 = src, r2 = srcStride; counts non-zero coefficients, assuming pb_1 is bytes of 1
+    vbroadcasti32x8  m3,   [pb_1]          ; clamp value for pminub; pb_1 is defined outside this hunk
+    pxor             m4,   m4              ; per-byte counters; each group adds at most 1 per byte if pb_1 is bytes of 1
+; 4 groups of 4 rows (two 32-byte rows per zmm); the pointers are not advanced after the final group
+%rep 3
+    PROCESS_COPY_CNT_16x4_AVX512
+    add              r0,  2 * mmsize       ; dst is contiguous: 4 rows of 32 bytes were written
+    lea              r1,  [r1 + 4 * r2]    ; src advances by 4 strided rows
+%endrep
+    PROCESS_COPY_CNT_16x4_AVX512
+    PROCESS_COPY_CNT_END_AVX512
+    RET
+;--------------------------------------------------------------------------------------
+; copy_cnt avx512 code end
+;--------------------------------------------------------------------------------------
 ;--------------------------------------------------------------------------------------
 ; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 ;--------------------------------------------------------------------------------------
diff -r 31a180bcef33 -r 0355f0128b7d source/common/x86/blockcopy8.h
--- a/source/common/x86/blockcopy8.h	Wed Aug 23 10:06:01 2017 +0530
+++ b/source/common/x86/blockcopy8.h	Thu Aug 24 12:20:07 2017 +0530
@@ -46,6 +46,7 @@
 FUNCDEF_TU_S(uint32_t, copy_cnt, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride);
 FUNCDEF_TU_S(uint32_t, copy_cnt, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride);
 FUNCDEF_TU_S(uint32_t, copy_cnt, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride);
+FUNCDEF_TU_S(uint32_t, copy_cnt, avx512, int16_t* dst, const int16_t* src, intptr_t srcStride);
 
 FUNCDEF_TU(void, blockfill_s, sse2, int16_t* dst, intptr_t dstride, int16_t val);
 FUNCDEF_TU(void, blockfill_s, avx2, int16_t* dst, intptr_t dstride, int16_t val);


More information about the x265-devel mailing list