[x265] [PATCH 118 of 307] x86: Aligned routine implementation for scale1D_128to64 primitive
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:31:56 CEST 2018
# HG changeset patch
# User Jayashri Murugan <jayashri at multicorewareinc.com>
# Date 1507116779 -19800
# Wed Oct 04 17:02:59 2017 +0530
# Node ID ba20a08181382a2fb18a0d1aff7637d66fa41ac7
# Parent 1748c9a5c9b16c380f926cd5d07a69c4f13a6fab
x86: Aligned routine implementation for scale1D_128to64 primitive
diff -r 1748c9a5c9b1 -r ba20a0818138 source/common/pixel.cpp
--- a/source/common/pixel.cpp Thu Oct 05 11:26:37 2017 +0530
+++ b/source/common/pixel.cpp Wed Oct 04 17:02:59 2017 +0530
@@ -1264,6 +1264,7 @@
p.weight_sp = weight_sp_c;
p.scale1D_128to64 = scale1D_128to64;
+ p.scale1D_128to64_aligned = scale1D_128to64;
p.scale2D_64to32 = scale2D_64to32;
p.frameInitLowres = frame_init_lowres_core;
p.ssim_4x4x2_core = ssim_4x4x2_core;
diff -r 1748c9a5c9b1 -r ba20a0818138 source/common/primitives.h
--- a/source/common/primitives.h Thu Oct 05 11:26:37 2017 +0530
+++ b/source/common/primitives.h Wed Oct 04 17:02:59 2017 +0530
@@ -312,6 +312,7 @@
dequant_normal_t dequant_normal;
denoiseDct_t denoiseDct;
scale1D_t scale1D_128to64;
+ scale1D_t scale1D_128to64_aligned;
scale2D_t scale2D_64to32;
ssim_4x4x2_core_t ssim_4x4x2_core;
diff -r 1748c9a5c9b1 -r ba20a0818138 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Oct 05 11:26:37 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Oct 04 17:02:59 2017 +0530
@@ -4219,6 +4219,7 @@
p.cu[BLOCK_64x64].copy_ps = PFX(blockcopy_ps_64x64_avx512);
p.scale1D_128to64 = PFX(scale1D_128to64_avx512);
+ p.scale1D_128to64_aligned = PFX(scale1D_128to64_aligned_avx512);
p.pu[LUMA_64x16].addAvg = PFX(addAvg_64x16_avx512);
p.pu[LUMA_64x32].addAvg = PFX(addAvg_64x32_avx512);
diff -r 1748c9a5c9b1 -r ba20a0818138 source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h Thu Oct 05 11:26:37 2017 +0530
+++ b/source/common/x86/pixel-util.h Wed Oct 04 17:02:59 2017 +0530
@@ -37,6 +37,7 @@
void PFX(weight_pp_ ## cpu(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)); \
void PFX(weight_sp_ ## cpu(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)); \
void PFX(scale1D_128to64_ ## cpu(pixel*, const pixel*)); \
+ void PFX(scale1D_128to64_aligned_ ## cpu(pixel*, const pixel*)); \
void PFX(scale2D_64to32_ ## cpu(pixel*, const pixel*, intptr_t)); \
uint32_t PFX(costCoeffRemain_ ## cpu(uint16_t *absCoeff, int numNonZero, int idx)); \
uint32_t PFX(costC1C2Flag_sse2(uint16_t *absCoeff, intptr_t numNonZero, uint8_t *baseCtxMod, intptr_t ctxOffset)); \
diff -r 1748c9a5c9b1 -r ba20a0818138 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Thu Oct 05 11:26:37 2017 +0530
+++ b/source/common/x86/pixel-util8.asm Wed Oct 04 17:02:59 2017 +0530
@@ -4599,6 +4599,37 @@
vshufi64x2 m2, m2, q3120
movu [r0 + 64], m2
RET
+
+INIT_ZMM avx512
+cglobal scale1D_128to64_aligned, 2, 2, 6
+ pxor m4, m4
+ vbroadcasti32x8 m5, [pb_1]
+
+ ;Top pixel
+ mova m0, [r1]
+ mova m1, [r1 + 64]
+ mova m2, [r1 + 128]
+ mova m3, [r1 + 192]
+
+ pmaddubsw m0, m0, m5
+ pavgw m0, m4
+ pmaddubsw m1, m1, m5
+ pavgw m1, m4
+ packuswb m0, m1
+ vpermq m0, m0, q3120
+ vshufi64x2 m0, m0, q3120
+ mova [r0], m0
+
+ ;Left pixel
+ pmaddubsw m2, m2, m5
+ pavgw m2, m4
+ pmaddubsw m3, m3, m5
+ pavgw m3, m4
+ packuswb m2, m3
+ vpermq m2, m2, q3120
+ vshufi64x2 m2, m2, q3120
+ mova [r0 + 64], m2
+ RET
%endif
;-----------------------------------------------------------------
diff -r 1748c9a5c9b1 -r ba20a0818138 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Thu Oct 05 11:26:37 2017 +0530
+++ b/source/test/pixelharness.cpp Wed Oct 04 17:02:59 2017 +0530
@@ -749,8 +749,8 @@
bool PixelHarness::check_scale1D_pp(scale1D_t ref, scale1D_t opt)
{
- ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
- ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
+ ALIGN_VAR_64(pixel, ref_dest[64 * 64]);
+ ALIGN_VAR_64(pixel, opt_dest[64 * 64]);
memset(ref_dest, 0, sizeof(ref_dest));
memset(opt_dest, 0, sizeof(opt_dest));
@@ -772,6 +772,31 @@
return true;
}
+bool PixelHarness::check_scale1D_pp_aligned(scale1D_t ref, scale1D_t opt)
+{
+ ALIGN_VAR_64(pixel, ref_dest[64 * 64]);
+ ALIGN_VAR_64(pixel, opt_dest[64 * 64]);
+
+ memset(ref_dest, 0, sizeof(ref_dest));
+ memset(opt_dest, 0, sizeof(opt_dest));
+
+ int j = 0;
+ for (int i = 0; i < ITERS; i++)
+ {
+ int index = i % TEST_CASES;
+ checked(opt, opt_dest, pixel_test_buff[index] + j);
+ ref(ref_dest, pixel_test_buff[index] + j);
+
+ if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
+ return false;
+
+ reportfail();
+ j += INCR * 2;
+ }
+
+ return true;
+}
+
bool PixelHarness::check_scale2D_pp(scale2D_t ref, scale2D_t opt)
{
ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
@@ -2587,6 +2612,15 @@
}
}
+ if (opt.scale1D_128to64_aligned)
+ {
+ if (!check_scale1D_pp_aligned(ref.scale1D_128to64_aligned, opt.scale1D_128to64_aligned))
+ {
+ printf("scale1D_128to64_aligned failed!\n");
+ return false;
+ }
+ }
+
if (opt.scale2D_64to32)
{
if (!check_scale2D_pp(ref.scale2D_64to32, opt.scale2D_64to32))
@@ -3234,6 +3268,12 @@
REPORT_SPEEDUP(opt.scale1D_128to64, ref.scale1D_128to64, pbuf2, pbuf1);
}
+ if (opt.scale1D_128to64_aligned)
+ {
+ HEADER0("scale1D_128to64_aligned");
+ REPORT_SPEEDUP(opt.scale1D_128to64_aligned, ref.scale1D_128to64_aligned, pbuf2, pbuf1);
+ }
+
if (opt.scale2D_64to32)
{
HEADER0("scale2D_64to32");
diff -r 1748c9a5c9b1 -r ba20a0818138 source/test/pixelharness.h
--- a/source/test/pixelharness.h Thu Oct 05 11:26:37 2017 +0530
+++ b/source/test/pixelharness.h Wed Oct 04 17:02:59 2017 +0530
@@ -82,6 +82,7 @@
bool check_pixel_sub_ps(pixel_sub_ps_t ref, pixel_sub_ps_t opt);
bool check_pixel_add_ps(pixel_add_ps_t ref, pixel_add_ps_t opt);
bool check_scale1D_pp(scale1D_t ref, scale1D_t opt);
+ bool check_scale1D_pp_aligned(scale1D_t ref, scale1D_t opt);
bool check_scale2D_pp(scale2D_t ref, scale2D_t opt);
bool check_ssd_s(pixel_ssd_s_t ref, pixel_ssd_s_t opt);
bool check_blockfill_s(blockfill_s_t ref, blockfill_s_t opt);
More information about the x265-devel mailing list