[x265] [PATCH] blockfill_s_32x32 avx2 asm code, performance improved 1354.05 cycles -> 705.81 cycles

Steve Borho steve at borho.org
Mon Sep 29 19:52:18 CEST 2014


On 09/29, praveen at multicorewareinc.com wrote:
> # HG changeset patch
> # User Praveen Tiwari
> # Date 1411982102 -19800
> # Node ID 543a79749978dc4aae2956788bb16e50d2ceca14
> # Parent  9a8552ea378500baa21b89b24d8aec99acf7cce2
> blockfill_s_32x32 avx2 asm code, performance improved 1354.05 cycles -> 705.81 cycles

This patch doesn't apply cleanly on the default branch.

> diff -r 9a8552ea3785 -r 543a79749978 source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp	Mon Sep 29 14:17:25 2014 +0530
> +++ b/source/common/x86/asm-primitives.cpp	Mon Sep 29 14:45:02 2014 +0530
> @@ -1761,6 +1761,7 @@
>          p.copy_cnt[BLOCK_32x32] = x265_copy_cnt_32_avx2;
>  
>          p.blockfill_s[BLOCK_16x16] = x265_blockfill_s_16x16_avx2;
> +        p.blockfill_s[BLOCK_32x32] = x265_blockfill_s_32x32_avx2;
>  
>          p.cvt32to16_shl[BLOCK_4x4] = x265_cvt32to16_shl_4_avx2;
>          p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_avx2;
> diff -r 9a8552ea3785 -r 543a79749978 source/common/x86/blockcopy8.asm
> --- a/source/common/x86/blockcopy8.asm	Mon Sep 29 14:17:25 2014 +0530
> +++ b/source/common/x86/blockcopy8.asm	Mon Sep 29 14:45:02 2014 +0530
> @@ -1906,6 +1906,89 @@
>  
>  BLOCKFILL_S_W32_H4 32, 32
>  
> +INIT_YMM avx2
> +cglobal blockfill_s_32x32, 3, 4, 1
> +add        r1, r1
> +lea        r3, [3 * r1]
> +
> +movd       xm0, r2d
> +pshuflw    xm0, xm0, 0
> +pshufd     xm0, xm0, 0
> +
> +vinserti128 m0, m0, xm0, 1
> +
> +movu       [r0], m0
> +movu       [r0 + 32], m0
> +movu       [r0 + r1], m0
> +movu       [r0 + r1 + 32], m0
> +movu       [r0 + 2 * r1], m0
> +movu       [r0 + 2 * r1 + 32], m0
> +movu       [r0 + r3], m0
> +movu       [r0 + r3 + 32], m0
> +lea        r0, [r0 + 4 * r1]
> +movu       [r0], m0
> +movu       [r0 + 32], m0
> +movu       [r0 + r1], m0
> +movu       [r0 + r1 + 32], m0
> +movu       [r0 + 2 * r1], m0
> +movu       [r0 + 2 * r1 + 32], m0
> +movu       [r0 + r3], m0
> +movu       [r0 + r3 + 32], m0
> +lea        r0, [r0 + 4 * r1]
> +movu       [r0], m0
> +movu       [r0 + 32], m0
> +movu       [r0 + r1], m0
> +movu       [r0 + r1 + 32], m0
> +movu       [r0 + 2 * r1], m0
> +movu       [r0 + 2 * r1 + 32], m0
> +movu       [r0 + r3], m0
> +movu       [r0 + r3 + 32], m0
> +lea        r0, [r0 + 4 * r1]
> +movu       [r0], m0
> +movu       [r0 + 32], m0
> +movu       [r0 + r1], m0
> +movu       [r0 + r1 + 32], m0
> +movu       [r0 + 2 * r1], m0
> +movu       [r0 + 2 * r1 + 32], m0
> +movu       [r0 + r3], m0
> +movu       [r0 + r3 + 32], m0
> +lea        r0, [r0 + 4 * r1]
> +movu       [r0], m0
> +movu       [r0 + 32], m0
> +movu       [r0 + r1], m0
> +movu       [r0 + r1 + 32], m0
> +movu       [r0 + 2 * r1], m0
> +movu       [r0 + 2 * r1 + 32], m0
> +movu       [r0 + r3], m0
> +movu       [r0 + r3 + 32], m0
> +lea        r0, [r0 + 4 * r1]
> +movu       [r0], m0
> +movu       [r0 + 32], m0
> +movu       [r0 + r1], m0
> +movu       [r0 + r1 + 32], m0
> +movu       [r0 + 2 * r1], m0
> +movu       [r0 + 2 * r1 + 32], m0
> +movu       [r0 + r3], m0
> +movu       [r0 + r3 + 32], m0
> +lea        r0, [r0 + 4 * r1]
> +movu       [r0], m0
> +movu       [r0 + 32], m0
> +movu       [r0 + r1], m0
> +movu       [r0 + r1 + 32], m0
> +movu       [r0 + 2 * r1], m0
> +movu       [r0 + 2 * r1 + 32], m0
> +movu       [r0 + r3], m0
> +movu       [r0 + r3 + 32], m0
> +lea        r0, [r0 + 4 * r1]
> +movu       [r0], m0
> +movu       [r0 + 32], m0
> +movu       [r0 + r1], m0
> +movu       [r0 + r1 + 32], m0
> +movu       [r0 + 2 * r1], m0
> +movu       [r0 + 2 * r1 + 32], m0
> +movu       [r0 + r3], m0
> +movu       [r0 + r3 + 32], m0
> +RET
>  
>  
>  ;-----------------------------------------------------------------------------
> diff -r 9a8552ea3785 -r 543a79749978 source/common/x86/blockcopy8.h
> --- a/source/common/x86/blockcopy8.h	Mon Sep 29 14:17:25 2014 +0530
> +++ b/source/common/x86/blockcopy8.h	Mon Sep 29 14:45:02 2014 +0530
> @@ -202,6 +202,7 @@
>  void x265_blockcopy_pp_32x64_avx(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb);
>  
>  void x265_blockfill_s_16x16_avx2(int16_t *dst, intptr_t dstride, int16_t val);
> +void x265_blockfill_s_32x32_avx2(int16_t *dst, intptr_t dstride, int16_t val);
>  
>  #undef BLOCKCOPY_COMMON
>  #undef BLOCKCOPY_SS_PP
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel

-- 
Steve Borho


More information about the x265-devel mailing list