[x265] [PATCH 043 of 307] x86: AVX512 blockcopy_ss_64x64, blockcopy_pp_64xN, blockcopy_ps_64x64 and blockcopy_sp_64x64 for HIGH_BIT_DEPTH

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:30:41 CEST 2018


# HG changeset patch
# User Jayashri Murugan <jayashri at multicorewareinc.com>
# Date 1500964333 -19800
#      Tue Jul 25 12:02:13 2017 +0530
# Node ID 2ad06d32a8465ce20e673c819b917a7524ecf8e9
# Parent  6b3b8ef0f37e0f7860f4f43c99e581674b19f9e3
x86: AVX512 blockcopy_ss_64x64, blockcopy_pp_64xN, blockcopy_ps_64x64 and
blockcopy_sp_64x64 for HIGH_BIT_DEPTH

HIGH_BIT_DEPTH:

Primitive      | AVX2 performance  | AVX512 performance
-------------------------------------------------------
copy_ss[64x64] |    1.38x          |    2.85x
copy_pp[64x64] |    1.91x          |    3.03x
copy_pp[64x48] |    1.90x          |    3.21x
copy_pp[64x32] |    1.99x          |    3.26x
copy_pp[64x16] |    2.01x          |    3.56x
copy_ps[64x64] |    1.78x          |    3.46x
copy_sp[64x64] |    1.80x          |    3.25x

diff -r 6b3b8ef0f37e -r 2ad06d32a846 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Jul 21 14:55:49 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Jul 25 12:02:13 2017 +0530
@@ -2191,10 +2191,20 @@
     if (cpuMask & X265_CPU_AVX512)
     {
         p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_avx512);
+        p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_avx512);
         p.cu[BLOCK_64x64].sub_ps = PFX(pixel_sub_ps_64x64_avx512);
         p.cu[BLOCK_32x32].sub_ps = PFX(pixel_sub_ps_32x32_avx512);
         p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sub_ps = PFX(pixel_sub_ps_32x32_avx512);
         p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = PFX(pixel_sub_ps_32x64_avx512);
+
+        // 64 X N
+        p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_avx512);
+        p.pu[LUMA_64x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x64_avx512);
+        p.pu[LUMA_64x48].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x48_avx512);
+        p.pu[LUMA_64x32].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x32_avx512);
+        p.pu[LUMA_64x16].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x16_avx512);
+        p.cu[BLOCK_64x64].copy_ps = (copy_ps_t)PFX(blockcopy_ss_64x64_avx512);
+        p.cu[BLOCK_64x64].copy_sp = (copy_sp_t)PFX(blockcopy_ss_64x64_avx512);
     }
 }
 #else // if HIGH_BIT_DEPTH
@@ -3727,7 +3737,6 @@
         p.integral_inith[INTEGRAL_24] = PFX(integral24h_avx2);
         p.integral_inith[INTEGRAL_32] = PFX(integral32h_avx2);
 
-        p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_avx512);
     }
     if (cpuMask & X265_CPU_AVX512)
     {


More information about the x265-devel mailing list