[x265] [PATCH 114 of 307] x86: Aligned routine implementation for blockfill_s primitive
mythreyi at multicorewareinc.com
Sat Apr 7 04:31:52 CEST 2018
# HG changeset patch
# User Jayashri Murugan <jayashri at multicorewareinc.com>
# Date 1507112703 -19800
# Wed Oct 04 15:55:03 2017 +0530
# Node ID 14c93ddbd598128b43a96ff21221e2dbb189d275
# Parent ddc227597df3335e30cec9a50489f3fd87391274
x86: Aligned routine implementation for blockfill_s primitive
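blockfill_s fills a square block of int16_t coefficients with a single value; it is used when a transform block reduces to a pure DC term. The generic reference in source/common/pixel.cpp is essentially the following (a sketch reconstructed from the signatures in this patch, not a verbatim copy); the new blockfill_s_aligned entry point keeps the same signature but may additionally assume a 64-byte-aligned destination:

    template<int size>
    static void blockfill_s_c(int16_t* dst, intptr_t dstride, int16_t val)
    {
        // Write 'val' into every coefficient of a size x size block.
        for (int y = 0; y < size; y++)
            for (int x = 0; x < size; x++)
                dst[y * dstride + x] = val;
    }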
diff -r ddc227597df3 -r 14c93ddbd598 source/common/pixel.cpp
--- a/source/common/pixel.cpp Wed Oct 04 14:03:32 2017 +0530
+++ b/source/common/pixel.cpp Wed Oct 04 15:55:03 2017 +0530
@@ -1000,6 +1000,7 @@
p.cu[BLOCK_ ## W ## x ## H].copy_ps = blockcopy_ps_c<W, H>; \
p.cu[BLOCK_ ## W ## x ## H].copy_ss = blockcopy_ss_c<W, H>; \
p.cu[BLOCK_ ## W ## x ## H].blockfill_s = blockfill_s_c<W>; \
+ p.cu[BLOCK_ ## W ## x ## H].blockfill_s_aligned = blockfill_s_c<W>; \
p.cu[BLOCK_ ## W ## x ## H].cpy2Dto1D_shl = cpy2Dto1D_shl<W>; \
p.cu[BLOCK_ ## W ## x ## H].cpy2Dto1D_shr = cpy2Dto1D_shr<W>; \
p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl = cpy1Dto2D_shl<W>; \
diff -r ddc227597df3 -r 14c93ddbd598 source/common/primitives.h
--- a/source/common/primitives.h Wed Oct 04 14:03:32 2017 +0530
+++ b/source/common/primitives.h Wed Oct 04 15:55:03 2017 +0530
@@ -271,6 +271,7 @@
pixel_sub_ps_t sub_ps;
pixel_add_ps_t add_ps;
blockfill_s_t blockfill_s; // block fill, for DC transforms
+ blockfill_s_t blockfill_s_aligned; // block fill, for DC transforms
copy_cnt_t copy_cnt; // copy coeff while counting non-zero
count_nonzero_t count_nonzero;
cpy2Dto1D_shl_t cpy2Dto1D_shl;
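Both members share the existing blockfill_s_t function-pointer type which, judging from the declarations later in this patch, has the form shown below; the aligned variant simply carries the caller's promise of a 64-byte-aligned destination:

    // Shared function-pointer type for blockfill_s and blockfill_s_aligned
    // (the stride is in int16_t elements, not bytes).
    typedef void (*blockfill_s_t)(int16_t* dst, intptr_t dstride, int16_t val);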
diff -r ddc227597df3 -r 14c93ddbd598 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Oct 04 14:03:32 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Oct 04 15:55:03 2017 +0530
@@ -2569,6 +2569,9 @@
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg_aligned = PFX(addAvg_aligned_32x48_avx512);
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg_aligned = PFX(addAvg_aligned_32x64_avx512);
+ p.cu[BLOCK_16x16].blockfill_s_aligned = PFX(blockfill_s_16x16_avx2);
+ p.cu[BLOCK_32x32].blockfill_s_aligned = PFX(blockfill_s_aligned_32x32_avx512);
+
}
}
#else // if HIGH_BIT_DEPTH
@@ -4294,6 +4297,8 @@
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg_aligned = PFX(addAvg_aligned_32x32_avx512);
p.cu[BLOCK_32x32].blockfill_s = PFX(blockfill_s_32x32_avx512);
+ p.cu[BLOCK_16x16].blockfill_s_aligned = PFX(blockfill_s_16x16_avx2);
+ p.cu[BLOCK_32x32].blockfill_s_aligned = PFX(blockfill_s_aligned_32x32_avx512);
p.cu[BLOCK_64x64].add_ps = PFX(pixel_add_ps_64x64_avx512);
p.cu[BLOCK_32x32].add_ps = PFX(pixel_add_ps_32x32_avx512);
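Only BLOCK_32x32 receives a dedicated AVX-512 aligned kernel; BLOCK_16x16 reuses the existing AVX2 blockfill_s_16x16 routine, presumably because a 16-element int16_t row spans only 32 bytes and cannot benefit from 512-bit aligned stores. A call site is then expected to select the aligned entry point only when its destination really is 64-byte aligned, along these lines (hypothetical sketch, names illustrative, not code from this patch):

    #include <stdint.h>
    #include "primitives.h"   // EncoderPrimitives, BLOCK_* indices

    // Hypothetical helper: use the aligned kernel only when both the
    // destination pointer and the row stride (in bytes) are multiples of 64,
    // otherwise fall back to the unaligned primitive.
    static inline void fillDC(const EncoderPrimitives& p, int sizeIdx,
                              int16_t* dst, intptr_t dstride, int16_t dcVal)
    {
        bool aligned = ((uintptr_t)dst % 64 == 0) &&
                       ((dstride * sizeof(int16_t)) % 64 == 0);
        if (aligned)
            p.cu[sizeIdx].blockfill_s_aligned(dst, dstride, dcVal);
        else
            p.cu[sizeIdx].blockfill_s(dst, dstride, dcVal);
    }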
diff -r ddc227597df3 -r 14c93ddbd598 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm Wed Oct 04 14:03:32 2017 +0530
+++ b/source/common/x86/blockcopy8.asm Wed Oct 04 15:55:03 2017 +0530
@@ -2574,6 +2574,24 @@
%endrep
RET
+;--------------------------------------------------------------------
+; void blockfill_s_aligned_32x32(int16_t* dst, intptr_t dstride, int16_t val)
+;--------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal blockfill_s_aligned_32x32, 3, 4, 1
+add r1, r1
+lea r3, [3 * r1]
+movd xm0, r2d
+vpbroadcastw m0, xm0
+
+%rep 8
+mova [r0], m0
+mova [r0 + r1], m0
+mova [r0 + 2 * r1], m0
+mova [r0 + r3], m0
+lea r0, [r0 + 4 * r1]
+%endrep
+RET
;-----------------------------------------------------------------------------
; void blockcopy_ps_2x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride);
;-----------------------------------------------------------------------------
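For readers more at home with intrinsics than with assembler syntax, the new AVX-512 routine above is roughly equivalent to the sketch below: broadcast val into one 512-bit register (a full 32 x int16_t row is exactly 64 bytes), then issue one aligned store per row, four rows per unrolled iteration, eight iterations in total:

    #include <immintrin.h>
    #include <stdint.h>

    // Rough intrinsics equivalent of blockfill_s_aligned_32x32 (illustrative
    // only, not the code that ships); dst and the byte stride are assumed to
    // be 64-byte aligned, matching the 'mova' stores in the asm.
    void blockfill_s_aligned_32x32_sketch(int16_t* dst, intptr_t dstride, int16_t val)
    {
        __m512i v = _mm512_set1_epi16(val);                   // vpbroadcastw m0, xm0
        for (int i = 0; i < 8; i++)                           // %rep 8
        {
            _mm512_store_si512((void*)(dst + 0 * dstride), v);
            _mm512_store_si512((void*)(dst + 1 * dstride), v);
            _mm512_store_si512((void*)(dst + 2 * dstride), v);
            _mm512_store_si512((void*)(dst + 3 * dstride), v);
            dst += 4 * dstride;                               // lea r0, [r0 + 4 * r1]
        }
    }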
diff -r ddc227597df3 -r 14c93ddbd598 source/common/x86/blockcopy8.h
--- a/source/common/x86/blockcopy8.h Wed Oct 04 14:03:32 2017 +0530
+++ b/source/common/x86/blockcopy8.h Wed Oct 04 15:55:03 2017 +0530
@@ -51,6 +51,7 @@
FUNCDEF_TU(void, blockfill_s, sse2, int16_t* dst, intptr_t dstride, int16_t val);
FUNCDEF_TU(void, blockfill_s, avx2, int16_t* dst, intptr_t dstride, int16_t val);
FUNCDEF_TU(void, blockfill_s, avx512, int16_t* dst, intptr_t dstride, int16_t val);
+FUNCDEF_TU(void, blockfill_s_aligned, avx512, int16_t* dst, intptr_t dstride, int16_t val);
FUNCDEF_CHROMA_PU(void, blockcopy_ss, sse2, int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
FUNCDEF_CHROMA_PU(void, blockcopy_ss, avx, int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride);
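FUNCDEF_TU emits one prototype per block size for the named kernel, with PFX adding the configured x265 symbol prefix. For the 32x32 case wired up in asm-primitives.cpp above, the declaration it produces is roughly:

    // Approximate result of the FUNCDEF_TU expansion for the one size that
    // actually has an AVX-512 body in this patch; the remaining sizes keep
    // the C reference assigned in pixel.cpp (or the AVX2 kernel for 16x16).
    void PFX(blockfill_s_aligned_32x32_avx512)(int16_t* dst, intptr_t dstride, int16_t val);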
diff -r ddc227597df3 -r 14c93ddbd598 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Wed Oct 04 14:03:32 2017 +0530
+++ b/source/test/pixelharness.cpp Wed Oct 04 15:55:03 2017 +0530
@@ -645,8 +645,33 @@
bool PixelHarness::check_blockfill_s(blockfill_s_t ref, blockfill_s_t opt)
{
- ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
- ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
+ ALIGN_VAR_64(int16_t, ref_dest[64 * 64]);
+ ALIGN_VAR_64(int16_t, opt_dest[64 * 64]);
+
+ memset(ref_dest, 0xCD, sizeof(ref_dest));
+ memset(opt_dest, 0xCD, sizeof(opt_dest));
+
+ intptr_t stride = 64;
+ for (int i = 0; i < ITERS; i++)
+ {
+ int16_t value = (rand() % SHORT_MAX) + 1;
+
+ checked(opt, opt_dest, stride, value);
+ ref(ref_dest, stride, value);
+
+ if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))
+ return false;
+
+ reportfail();
+ }
+
+ return true;
+}
+
+bool PixelHarness::check_blockfill_s_aligned(blockfill_s_t ref, blockfill_s_t opt)
+{
+ ALIGN_VAR_64(int16_t, ref_dest[64 * 64]);
+ ALIGN_VAR_64(int16_t, opt_dest[64 * 64]);
memset(ref_dest, 0xCD, sizeof(ref_dest));
memset(opt_dest, 0xCD, sizeof(opt_dest));
@@ -2388,6 +2413,14 @@
}
}
+ if (opt.cu[i].blockfill_s_aligned)
+ {
+ if (!check_blockfill_s_aligned(ref.cu[i].blockfill_s_aligned, opt.cu[i].blockfill_s_aligned))
+ {
+ printf("blockfill_s_aligned[%dx%d]: failed!\n", 4 << i, 4 << i);
+ return false;
+ }
+ }
if (opt.cu[i].var)
{
if (!check_pixel_var(ref.cu[i].var, opt.cu[i].var))
@@ -3081,6 +3114,12 @@
REPORT_SPEEDUP(opt.cu[i].blockfill_s, ref.cu[i].blockfill_s, sbuf1, 64, SHORT_MAX);
}
+ if (opt.cu[i].blockfill_s_aligned)
+ {
+ HEADER("blkfill[%dx%d]", 4 << i, 4 << i);
+ REPORT_SPEEDUP(opt.cu[i].blockfill_s_aligned, ref.cu[i].blockfill_s_aligned, sbuf1, 64, SHORT_MAX);
+ }
+
if (opt.cu[i].transpose)
{
HEADER("transpose[%dx%d]", 4 << i, 4 << i);
diff -r ddc227597df3 -r 14c93ddbd598 source/test/pixelharness.h
--- a/source/test/pixelharness.h Wed Oct 04 14:03:32 2017 +0530
+++ b/source/test/pixelharness.h Wed Oct 04 15:55:03 2017 +0530
@@ -85,6 +85,7 @@
bool check_scale2D_pp(scale2D_t ref, scale2D_t opt);
bool check_ssd_s(pixel_ssd_s_t ref, pixel_ssd_s_t opt);
bool check_blockfill_s(blockfill_s_t ref, blockfill_s_t opt);
+ bool check_blockfill_s_aligned(blockfill_s_t ref, blockfill_s_t opt);
bool check_calresidual(calcresidual_t ref, calcresidual_t opt);
bool check_transpose(transpose_t ref, transpose_t opt);
bool check_weightp(weightp_pp_t ref, weightp_pp_t opt);