[x265] [PATCH] blockfill_s_16x16 avx2 asm code, performance improved 389.21 cycles -> 204.38 cycles

praveen at multicorewareinc.com praveen at multicorewareinc.com
Mon Sep 29 10:47:45 CEST 2014


# HG changeset patch
# User Praveen Tiwari
# Date 1411980445 -19800
# Node ID 9a8552ea378500baa21b89b24d8aec99acf7cce2
# Parent  32f50df7fa7672f4c1818ddf3165b4bd243e0b10
blockfill_s_16x16 avx2 asm code, performance improved 389.21 cycles -> 204.38 cycles

diff -r 32f50df7fa76 -r 9a8552ea3785 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Sep 26 17:33:09 2014 -0500
+++ b/source/common/x86/asm-primitives.cpp	Mon Sep 29 14:17:25 2014 +0530
@@ -1760,6 +1760,7 @@
         p.copy_cnt[BLOCK_16x16] = x265_copy_cnt_16_avx2;
         p.copy_cnt[BLOCK_32x32] = x265_copy_cnt_32_avx2;
 
+        p.blockfill_s[BLOCK_16x16] = x265_blockfill_s_16x16_avx2;
 
         p.cvt32to16_shl[BLOCK_4x4] = x265_cvt32to16_shl_4_avx2;
         p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_avx2;
diff -r 32f50df7fa76 -r 9a8552ea3785 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm	Fri Sep 26 17:33:09 2014 -0500
+++ b/source/common/x86/blockcopy8.asm	Mon Sep 29 14:17:25 2014 +0530
@@ -1826,6 +1826,38 @@
 
 BLOCKFILL_S_W16_H8 16, 16
 
+;-----------------------------------------------------------------------------
+; void blockfill_s_16x16(int16_t *dest, intptr_t dstride, int16_t val)
+;-----------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal blockfill_s_16x16, 3, 4, 1  ; r0 = dest, r1 = dstride, r2 = val
+add        r1, r1                   ; stride: int16_t elements -> bytes
+lea        r3, [3 * r1]             ; r3 = 3 * stride, addresses the 4th row of each group
+
+movd       xm0, r2d                 ; only the low 16 bits of r2d are used
+vpbroadcastw m0, xm0                ; m0 = val in all 16 words (one full 16-sample row)
+
+movu       [r0], m0                 ; rows 0-3 (unaligned stores: no alignment assumed)
+movu       [r0 + r1], m0
+movu       [r0 + 2 * r1], m0
+movu       [r0 + r3], m0
+lea        r0, [r0 + 4 * r1]        ; rows 4-7
+movu       [r0], m0
+movu       [r0 + r1], m0
+movu       [r0 + 2 * r1], m0
+movu       [r0 + r3], m0
+lea        r0, [r0 + 4 * r1]        ; rows 8-11
+movu       [r0], m0
+movu       [r0 + r1], m0
+movu       [r0 + 2 * r1], m0
+movu       [r0 + r3], m0
+lea        r0, [r0 + 4 * r1]        ; rows 12-15
+movu       [r0], m0
+movu       [r0 + r1], m0
+movu       [r0 + 2 * r1], m0
+movu       [r0 + r3], m0
+RET
+
 ;-----------------------------------------------------------------------------
 ; void blockfill_s_%1x%2(int16_t *dest, intptr_t destride, int16_t val)
 ;-----------------------------------------------------------------------------
diff -r 32f50df7fa76 -r 9a8552ea3785 source/common/x86/blockcopy8.h
--- a/source/common/x86/blockcopy8.h	Fri Sep 26 17:33:09 2014 -0500
+++ b/source/common/x86/blockcopy8.h	Mon Sep 29 14:17:25 2014 +0530
@@ -201,6 +201,8 @@
 void x265_blockcopy_pp_32x48_avx(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb);
 void x265_blockcopy_pp_32x64_avx(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb);
 
+void x265_blockfill_s_16x16_avx2(int16_t *dst, intptr_t dstride, int16_t val);
+
 #undef BLOCKCOPY_COMMON
 #undef BLOCKCOPY_SS_PP
 #undef BLOCKCOPY_SP


More information about the x265-devel mailing list