[x265] [PATCH 044 of 307] x86: AVX512 blockcopy_ss_32x32, blockcopy_pp_32xN, blockcopy_ps_32x32 and

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:30:42 CEST 2018


# HG changeset patch
# User Jayashri Murugan <jayashri at multicorewareinc.com>
# Date 1500964406 -19800
#      Tue Jul 25 12:03:26 2017 +0530
# Node ID 4978d583e2e82aec1f09d94ecdf52191eac7ceb5
# Parent  2ad06d32a8465ce20e673c819b917a7524ecf8e9
x86: AVX512 blockcopy_ss_32x32, blockcopy_pp_32xN, blockcopy_ps_32x32 and
blockcopy_sp_32x32 for HIGH_BIT_DEPTH

HIGH_BIT_DEPTH:

Primitive      | AVX2 performance  | AVX512 performance
-------------------------------------------------------
copy_ss[32x32] |    1.83x          |    3.91x
copy_pp[32x64] |    1.98x          |    3.22x
copy_pp[32x32] |    1.91x          |    4.11x
copy_pp[32x24] |    2.06x          |    4.20x
copy_pp[32x16] |    1.80x          |    4.60x
copy_ps[32x32] |    1.90x          |    4.60x
copy_sp[32x32] |    2.02x          |    4.78x

diff -r 2ad06d32a846 -r 4978d583e2e8 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Jul 25 12:02:13 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Jul 25 12:03:26 2017 +0530
@@ -2205,6 +2205,31 @@
         p.pu[LUMA_64x16].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x16_avx512);
         p.cu[BLOCK_64x64].copy_ps = (copy_ps_t)PFX(blockcopy_ss_64x64_avx512);
         p.cu[BLOCK_64x64].copy_sp = (copy_sp_t)PFX(blockcopy_ss_64x64_avx512);
+
+        // 32 X N
+        p.cu[BLOCK_32x32].copy_ss = PFX(blockcopy_ss_32x32_avx512);
+        p.pu[LUMA_32x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x64_avx512);
+        p.pu[LUMA_32x32].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x32_avx512);
+        p.pu[LUMA_32x24].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x24_avx512);
+        p.pu[LUMA_32x16].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x16_avx512);
+        p.pu[LUMA_32x8].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x8_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x8_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x16_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x24_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x32_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x16_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x32_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x48_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x64_avx512);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_ss = PFX(blockcopy_ss_32x32_avx512);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_ss = PFX(blockcopy_ss_32x64_avx512);
+        p.cu[BLOCK_32x32].copy_ps = (copy_ps_t)PFX(blockcopy_ss_32x32_avx512);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_ps = (copy_ps_t)PFX(blockcopy_ss_32x32_avx512);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_ps = (copy_ps_t)PFX(blockcopy_ss_32x64_avx512);
+        p.cu[BLOCK_32x32].copy_sp = (copy_sp_t)PFX(blockcopy_ss_32x32_avx512);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_sp = (copy_sp_t)PFX(blockcopy_ss_32x32_avx512);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_sp = (copy_sp_t)PFX(blockcopy_ss_32x64_avx512);
+
     }
 }
 #else // if HIGH_BIT_DEPTH


More information about the x265-devel mailing list