[x265] [PATCH 037 of 307] x86: AVX512 blockcopy_ss_32xN

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:30:35 CEST 2018


# HG changeset patch
# User Jayashri Murugan <jayashri at multicorewareinc.com>
# Date 1499171579 -19800
#      Tue Jul 04 18:02:59 2017 +0530
# Node ID ef8989f43083cd5195ff3ba360959fe3900399e5
# Parent  3e3a44c6d77c0c0a7b3a084127a0dc6c835ff392
x86: AVX512 blockcopy_ss_32xN

AVX2 performance over C code   : 1.82x
AVX512 performance over C code : 4.56x

diff -r 3e3a44c6d77c -r ef8989f43083 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Jul 04 15:23:31 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Jul 04 18:02:59 2017 +0530
@@ -3854,6 +3854,9 @@
         p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_avx512);
         p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_32x32_avx512);
 
+        p.cu[BLOCK_32x32].copy_ss = PFX(blockcopy_ss_32x32_avx512);
+        p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ss = PFX(blockcopy_ss_32x32_avx512);
+        p.chroma[X265_CSP_I422].cu[CHROMA_422_32x64].copy_ss = PFX(blockcopy_ss_32x64_avx512);
         p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_avx512);
 
     }
diff -r 3e3a44c6d77c -r ef8989f43083 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm	Tue Jul 04 15:23:31 2017 +0530
+++ b/source/common/x86/blockcopy8.asm	Tue Jul 04 18:02:59 2017 +0530
@@ -4164,6 +4164,143 @@
 BLOCKCOPY_SS_W32_H4_avx 32, 48
 BLOCKCOPY_SS_W32_H4_avx 32, 64
 
+%macro PROCESS_BLOCKCOPY_SS_W32_H8_avx512 0 ; copy 8 rows from [r2] to [r0], then leave both pointers 8 rows further on
+    movu    m0, [r2]               ; load source rows 0-3, one vector register per row
+    movu    m1, [r2 + r3]          ; r3 = source row stride as used for addressing
+    movu    m2, [r2 + 2 * r3]
+    movu    m3, [r2 + r6]          ; r6 must hold 3 * r3 (every caller in this patch sets it up)
+    lea     r2, [r2 + 4 * r3]      ; source pointer += 4 rows
+
+    movu    [r0],          m0      ; store destination rows 0-3
+    movu    [r0 + r1],     m1      ; r1 = destination row stride as used for addressing
+    movu    [r0 + 2 * r1], m2
+    movu    [r0 + r5],     m3      ; r5 must hold 3 * r1 (every caller in this patch sets it up)
+    lea     r0, [r0 + 4 * r1]      ; destination pointer += 4 rows
+
+    movu    m0, [r2]               ; load source rows 4-7, reusing m0-m3
+    movu    m1, [r2 + r3]
+    movu    m2, [r2 + 2 * r3]
+    movu    m3, [r2 + r6]
+    lea     r2, [r2 + 4 * r3]      ; source pointer now addresses the next 8-row group
+
+    movu    [r0],          m0      ; store destination rows 4-7
+    movu    [r0 + r1],     m1
+    movu    [r0 + 2 * r1], m2
+    movu    [r0 + r5],     m3
+    lea     r0, [r0 + 4 * r1]      ; destination pointer now addresses the next 8-row group
+%endmacro ; modifies m0-m3, r0 and r2; reads r1, r3, r5, r6
+
+%macro PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512 0 ; copy 8 rows from [r2] to [r0]; pointers are NOT advanced past the last 4 rows
+    movu    m0, [r2]               ; load source rows 0-3, one vector register per row
+    movu    m1, [r2 + r3]          ; r3 = source row stride as used for addressing
+    movu    m2, [r2 + 2 * r3]
+    movu    m3, [r2 + r6]          ; r6 must hold 3 * r3 (every caller in this patch sets it up)
+    lea     r2, [r2 + 4 * r3]      ; source pointer += 4 rows
+
+    movu    [r0],          m0      ; store destination rows 0-3
+    movu    [r0 + r1],     m1      ; r1 = destination row stride as used for addressing
+    movu    [r0 + 2 * r1], m2
+    movu    [r0 + r5],     m3      ; r5 must hold 3 * r1 (every caller in this patch sets it up)
+    lea     r0, [r0 + 4 * r1]      ; destination pointer += 4 rows
+
+    movu    m0, [r2]               ; load source rows 4-7, reusing m0-m3
+    movu    m1, [r2 + r3]
+    movu    m2, [r2 + 2 * r3]
+    movu    m3, [r2 + r6]          ; no lea follows: nothing reads r2 after this group
+
+    movu    [r0],          m0      ; store destination rows 4-7
+    movu    [r0 + r1],     m1
+    movu    [r0 + 2 * r1], m2
+    movu    [r0 + r5],     m3      ; no lea follows: nothing writes through r0 after this group
+%endmacro ; modifies m0-m3, r0 and r2 (each advanced by 4 rows only); reads r1, r3, r5, r6
+
+;-----------------------------------------------------------------------------
+; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
+;-----------------------------------------------------------------------------
+INIT_ZMM avx512                    ; x86inc: m0..mN below refer to ZMM registers, AVX-512 instruction set selected
+cglobal blockcopy_ss_32x8, 4, 7, 4 ; x86inc operands: 4 arguments (r0=dst, r1=dstStride, r2=src, r3=srcStride), 7 GPRs, 4 vector registers
+
+    add    r1, r1                  ; dstStride *= 2; NOTE(review): presumably int16_t element stride -> byte stride, confirm against callers
+    add    r3, r3                  ; srcStride *= 2, same conversion for the source
+    lea    r5, [3 * r1]            ; r5 = 3 * doubled dstStride, used to address the 4th row of each group
+    lea    r6, [3 * r3]            ; r6 = 3 * doubled srcStride, used to address the 4th row of each group
+
+    PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512 ; rows 0-7: the whole 8-row block in one expansion
+    RET                            ; x86inc return macro; the function has no return value
+
+INIT_ZMM avx512                    ; x86inc: m0..mN below refer to ZMM registers, AVX-512 instruction set selected
+cglobal blockcopy_ss_32x16, 4, 7, 4 ; x86inc operands: 4 arguments (r0=dst, r1=dstStride, r2=src, r3=srcStride), 7 GPRs, 4 vector registers
+
+    add    r1, r1                  ; dstStride *= 2; NOTE(review): presumably int16_t element stride -> byte stride, confirm against callers
+    add    r3, r3                  ; srcStride *= 2, same conversion for the source
+    lea    r5, [3 * r1]            ; r5 = 3 * doubled dstStride, used to address the 4th row of each group
+    lea    r6, [3 * r3]            ; r6 = 3 * doubled srcStride, used to address the 4th row of each group
+
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512      ; rows 0-7, pointers advanced by 8 rows
+    PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512 ; rows 8-15, final group skips the last pointer advance
+    RET                            ; x86inc return macro; the function has no return value
+
+INIT_ZMM avx512                    ; x86inc: m0..mN below refer to ZMM registers, AVX-512 instruction set selected
+cglobal blockcopy_ss_32x24, 4, 7, 4 ; x86inc operands: 4 arguments (r0=dst, r1=dstStride, r2=src, r3=srcStride), 7 GPRs, 4 vector registers
+
+    add    r1, r1                  ; dstStride *= 2; NOTE(review): presumably int16_t element stride -> byte stride, confirm against callers
+    add    r3, r3                  ; srcStride *= 2, same conversion for the source
+    lea    r5, [3 * r1]            ; r5 = 3 * doubled dstStride, used to address the 4th row of each group
+    lea    r6, [3 * r3]            ; r6 = 3 * doubled srcStride, used to address the 4th row of each group
+
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512      ; rows 0-7
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512      ; rows 8-15
+    PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512 ; rows 16-23, final group skips the last pointer advance
+    RET                            ; x86inc return macro; the function has no return value
+
+INIT_ZMM avx512                    ; x86inc: m0..mN below refer to ZMM registers, AVX-512 instruction set selected
+cglobal blockcopy_ss_32x32, 4, 7, 4 ; x86inc operands: 4 arguments (r0=dst, r1=dstStride, r2=src, r3=srcStride), 7 GPRs, 4 vector registers
+
+    add    r1, r1                  ; dstStride *= 2; NOTE(review): presumably int16_t element stride -> byte stride, confirm against callers
+    add    r3, r3                  ; srcStride *= 2, same conversion for the source
+    lea    r5, [3 * r1]            ; r5 = 3 * doubled dstStride, used to address the 4th row of each group
+    lea    r6, [3 * r3]            ; r6 = 3 * doubled srcStride, used to address the 4th row of each group
+
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512      ; rows 0-7
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512      ; rows 8-15
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512      ; rows 16-23
+    PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512 ; rows 24-31, final group skips the last pointer advance
+    RET                            ; x86inc return macro; the function has no return value
+
+INIT_ZMM avx512                    ; x86inc: m0..mN below refer to ZMM registers, AVX-512 instruction set selected
+cglobal blockcopy_ss_32x48, 4, 7, 4 ; x86inc operands: 4 arguments (r0=dst, r1=dstStride, r2=src, r3=srcStride), 7 GPRs, 4 vector registers
+
+    add    r1, r1                  ; dstStride *= 2; NOTE(review): presumably int16_t element stride -> byte stride, confirm against callers
+    add    r3, r3                  ; srcStride *= 2, same conversion for the source
+    lea    r5, [3 * r1]            ; r5 = 3 * doubled dstStride, used to address the 4th row of each group
+    lea    r6, [3 * r3]            ; r6 = 3 * doubled srcStride, used to address the 4th row of each group
+
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512      ; rows 0-7
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512      ; rows 8-15
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512      ; rows 16-23
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512      ; rows 24-31
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512      ; rows 32-39
+    PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512 ; rows 40-47, final group skips the last pointer advance
+    RET                            ; x86inc return macro; the function has no return value
+
+INIT_ZMM avx512                    ; x86inc: m0..mN below refer to ZMM registers, AVX-512 instruction set selected
+cglobal blockcopy_ss_32x64, 4, 7, 4 ; x86inc operands: 4 arguments (r0=dst, r1=dstStride, r2=src, r3=srcStride), 7 GPRs, 4 vector registers
+
+    add    r1, r1                  ; dstStride *= 2; NOTE(review): presumably int16_t element stride -> byte stride, confirm against callers
+    add    r3, r3                  ; srcStride *= 2, same conversion for the source
+    lea    r5, [3 * r1]            ; r5 = 3 * doubled dstStride, used to address the 4th row of each group
+    lea    r6, [3 * r3]            ; r6 = 3 * doubled srcStride, used to address the 4th row of each group
+
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512      ; rows 0-7
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512      ; rows 8-15
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512      ; rows 16-23
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512      ; rows 24-31
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512      ; rows 32-39
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512      ; rows 40-47
+    PROCESS_BLOCKCOPY_SS_W32_H8_avx512      ; rows 48-55
+    PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512 ; rows 56-63, final group skips the last pointer advance
+    RET                            ; x86inc return macro; the function has no return value
+
 ;-----------------------------------------------------------------------------
 ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride)
 ;-----------------------------------------------------------------------------


More information about the x265-devel mailing list