[x265] [PATCH] blockfill_s_16x16 avx2 asm code: performance improved from 389.21 cycles to 204.38 cycles, over sse version of asm code

chen chenm003 at 163.com
Tue Sep 30 17:57:45 CEST 2014


right now.

At 2014-09-30 13:40:37,praveen at multicorewareinc.com wrote:
># HG changeset patch
># User Praveen Tiwari
># Date 1412054832 -19800
># Node ID 975078c41433328fa35913ad46a95cd9c78a8bb2
># Parent  5a6845566d1492d29af29ecc0cf75d644994735c
>blockfill_s_16x16 avx2 asm code: performance improved from 389.21 cycles to 204.38 cycles, over sse version of asm code
>
>diff -r 5a6845566d14 -r 975078c41433 source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp	Mon Sep 29 17:37:47 2014 -0500
>+++ b/source/common/x86/asm-primitives.cpp	Tue Sep 30 10:57:12 2014 +0530
>@@ -1760,6 +1760,7 @@
>         p.copy_cnt[BLOCK_16x16] = x265_copy_cnt_16_avx2;
>         p.copy_cnt[BLOCK_32x32] = x265_copy_cnt_32_avx2;
> 
>+        p.blockfill_s[BLOCK_16x16] = x265_blockfill_s_16x16_avx2;
> 
>         p.cvt32to16_shl[BLOCK_4x4] = x265_cvt32to16_shl_4_avx2;
>         p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_avx2;
>diff -r 5a6845566d14 -r 975078c41433 source/common/x86/blockcopy8.asm
>--- a/source/common/x86/blockcopy8.asm	Mon Sep 29 17:37:47 2014 -0500
>+++ b/source/common/x86/blockcopy8.asm	Tue Sep 30 10:57:12 2014 +0530
>@@ -1826,6 +1826,34 @@
> 
> BLOCKFILL_S_W16_H8 16, 16
> 
>+INIT_YMM avx2                      ; m0 below is a 256-bit ymm register, xm0 its low 128 bits
>+cglobal blockfill_s_16x16, 3, 4, 1 ; fill a 16x16 int16_t block with val; r0 = dest, r1 = stride, r2 = val (3 args, 4 GPRs, 1 vector reg)
>+add          r1, r1                ; double the stride: int16_t element count -> byte offset
>+lea          r3, [3 * r1]          ; r3 = 3 * byte stride, used to address every fourth row
>+movd         xm0, r2d              ; move val into the low dword of xm0
>+vpbroadcastw m0, xm0               ; replicate the low 16-bit word across all 16 word lanes of m0
>+
>+movu       [r0], m0                ; row 0: one 32-byte store covers a full row of 16 int16_t
>+movu       [r0 + r1], m0           ; row 1
>+movu       [r0 + 2 * r1], m0       ; row 2
>+movu       [r0 + r3], m0           ; row 3
>+lea        r0, [r0 + 4 * r1]       ; advance dest by 4 rows (lea leaves flags untouched)
>+movu       [r0], m0                ; row 4
>+movu       [r0 + r1], m0           ; row 5
>+movu       [r0 + 2 * r1], m0       ; row 6
>+movu       [r0 + r3], m0           ; row 7
>+lea        r0, [r0 + 4 * r1]       ; advance dest by 4 rows
>+movu       [r0], m0                ; row 8
>+movu       [r0 + r1], m0           ; row 9
>+movu       [r0 + 2 * r1], m0       ; row 10
>+movu       [r0 + r3], m0           ; row 11
>+lea        r0, [r0 + 4 * r1]       ; advance dest by 4 rows
>+movu       [r0], m0                ; row 12
>+movu       [r0 + r1], m0           ; row 13
>+movu       [r0 + 2 * r1], m0       ; row 14
>+movu       [r0 + r3], m0           ; row 15 (last row; dest pointer is not advanced further)
>+RET
>+
> ;-----------------------------------------------------------------------------
> ; void blockfill_s_%1x%2(int16_t *dest, intptr_t destride, int16_t val)
> ;-----------------------------------------------------------------------------
>diff -r 5a6845566d14 -r 975078c41433 source/common/x86/blockcopy8.h
>--- a/source/common/x86/blockcopy8.h	Mon Sep 29 17:37:47 2014 -0500
>+++ b/source/common/x86/blockcopy8.h	Tue Sep 30 10:57:12 2014 +0530
>@@ -201,6 +201,8 @@
> void x265_blockcopy_pp_32x48_avx(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb);
> void x265_blockcopy_pp_32x64_avx(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb);
> 
>+void x265_blockfill_s_16x16_avx2(int16_t *dst, intptr_t dstride, int16_t val);
>+
> #undef BLOCKCOPY_COMMON
> #undef BLOCKCOPY_SS_PP
> #undef BLOCKCOPY_SP
>_______________________________________________
>x265-devel mailing list
>x265-devel at videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20140930/511af275/attachment.html>


More information about the x265-devel mailing list