[x265] [PATCH 03 of 29] scale1D_128to64_new primitive: ASM code and unit test code
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Tue Jan 13 08:11:11 CET 2015
# HG changeset patch
# User Praveen Tiwari
# Date 1421044413 -19800
# Mon Jan 12 12:03:33 2015 +0530
# Node ID f3fd0d075bba2320ed8cc93df055b144b5a1b88e
# Parent cf29bf7824491d35e20df5249810ff9a1520d3e3
scale1D_128to64_new primitive: ASM code and unit test code
diff -r cf29bf782449 -r f3fd0d075bba source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Jan 12 20:37:35 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Jan 12 12:03:33 2015 +0530
@@ -1385,6 +1385,7 @@
if (cpuMask & X265_CPU_SSSE3)
{
p.scale1D_128to64 = x265_scale1D_128to64_ssse3;
+ p.scale1D_128to64_new = x265_scale1D_128to64_new_ssse3;
p.scale2D_64to32 = x265_scale2D_64to32_ssse3;
INTRA_ANG_SSSE3(ssse3);
@@ -1446,6 +1447,7 @@
p.nquant = x265_nquant_avx2;
p.dequant_normal = x265_dequant_normal_avx2;
p.scale1D_128to64 = x265_scale1D_128to64_avx2;
+ p.scale1D_128to64_new = x265_scale1D_128to64_new_avx2;
p.cu[BLOCK_4x4].cpy1Dto2D_shl = x265_cpy1Dto2D_shl_4_avx2;
p.cu[BLOCK_8x8].cpy1Dto2D_shl = x265_cpy1Dto2D_shl_8_avx2;
p.cu[BLOCK_16x16].cpy1Dto2D_shl = x265_cpy1Dto2D_shl_16_avx2;
@@ -1603,6 +1605,7 @@
INTRA_ANG_SSSE3(ssse3);
p.scale1D_128to64 = x265_scale1D_128to64_ssse3;
+ p.scale1D_128to64_new = x265_scale1D_128to64_new_ssse3;
p.scale2D_64to32 = x265_scale2D_64to32_ssse3;
SAD_X3(ssse3);
SAD_X4(ssse3);
@@ -1813,6 +1816,7 @@
p.chroma[X265_CSP_I422].pu[CHROMA422_16x32].copy_ss = x265_blockcopy_ss_16x32_avx;
p.chroma[X265_CSP_I422].pu[CHROMA422_16x64].copy_ss = x265_blockcopy_ss_16x64_avx;
p.scale1D_128to64 = x265_scale1D_128to64_avx2;
+ p.scale1D_128to64_new = x265_scale1D_128to64_new_avx2;
p.weight_pp = x265_weight_pp_avx2;
diff -r cf29bf782449 -r f3fd0d075bba source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h Mon Jan 12 20:37:35 2015 +0530
+++ b/source/common/x86/pixel-util.h Mon Jan 12 12:03:33 2015 +0530
@@ -65,6 +65,8 @@
void x265_scale1D_128to64_ssse3(pixel*, const pixel*, intptr_t);
void x265_scale1D_128to64_avx2(pixel*, const pixel*, intptr_t);
+void x265_scale1D_128to64_new_ssse3(pixel*, const pixel*, intptr_t);
+void x265_scale1D_128to64_new_avx2(pixel*, const pixel*, intptr_t);
void x265_scale2D_64to32_ssse3(pixel*, const pixel*, intptr_t);
#define SETUP_CHROMA_PIXELSUB_PS_FUNC(W, H, cpu) \
diff -r cf29bf782449 -r f3fd0d075bba source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Mon Jan 12 20:37:35 2015 +0530
+++ b/source/common/x86/pixel-util8.asm Mon Jan 12 12:03:33 2015 +0530
@@ -2892,6 +2892,335 @@
INIT_XMM avx
SSIM
+%macro SCALE1D_128to64_HBD 0 ; 16-bit path: averages adjacent words of 256 source bytes at r1 into 128 bytes at r0; the pshufb mask must already be in m7
+ movu m0, [r1] ; source bytes 0-15 (unaligned load)
+ palignr m1, m0, 2 ; m1 = m0 shifted down by one word; the top word is stale m1 data
+ movu m2, [r1 + 16]
+ palignr m3, m2, 2
+ movu m4, [r1 + 32]
+ palignr m5, m4, 2
+ movu m6, [r1 + 48]
+ pavgw m0, m1 ; rounded average of each word with its right-hand neighbour
+ palignr m1, m6, 2 ; m1 was consumed by the pavgw above, reuse it as the shifted copy of m6
+ pavgw m2, m3
+ pavgw m4, m5
+ pavgw m6, m1
+ pshufb m0, m0, m7 ; reorder with the mask in m7; presumably gathers the even-word averages into the low qword -- confirm against deinterleave_word_shuf
+ pshufb m2, m2, m7
+ pshufb m4, m4, m7
+ pshufb m6, m6, m7
+ punpcklqdq m0, m2 ; join the low qwords of m0 and m2 into one 16-byte result
+ movu [r0], m0
+ punpcklqdq m4, m6
+ movu [r0 + 16], m4
+ ; source bytes 64-127 -> destination bytes 32-63, same sequence as above
+ movu m0, [r1 + 64]
+ palignr m1, m0, 2
+ movu m2, [r1 + 80]
+ palignr m3, m2, 2
+ movu m4, [r1 + 96]
+ palignr m5, m4, 2
+ movu m6, [r1 + 112]
+ pavgw m0, m1
+ palignr m1, m6, 2
+ pavgw m2, m3
+ pavgw m4, m5
+ pavgw m6, m1
+ pshufb m0, m0, m7
+ pshufb m2, m2, m7
+ pshufb m4, m4, m7
+ pshufb m6, m6, m7
+ punpcklqdq m0, m2
+ movu [r0 + 32], m0
+ punpcklqdq m4, m6
+ movu [r0 + 48], m4
+ ; source bytes 128-191 -> destination bytes 64-95
+ movu m0, [r1 + 128]
+ palignr m1, m0, 2
+ movu m2, [r1 + 144]
+ palignr m3, m2, 2
+ movu m4, [r1 + 160]
+ palignr m5, m4, 2
+ movu m6, [r1 + 176]
+ pavgw m0, m1
+ palignr m1, m6, 2
+ pavgw m2, m3
+ pavgw m4, m5
+ pavgw m6, m1
+ pshufb m0, m0, m7
+ pshufb m2, m2, m7
+ pshufb m4, m4, m7
+ pshufb m6, m6, m7
+
+ punpcklqdq m0, m2
+ movu [r0 + 64], m0
+ punpcklqdq m4, m6
+ movu [r0 + 80], m4
+ ; source bytes 192-255 -> destination bytes 96-127
+ movu m0, [r1 + 192]
+ palignr m1, m0, 2
+ movu m2, [r1 + 208]
+ palignr m3, m2, 2
+ movu m4, [r1 + 224]
+ palignr m5, m4, 2
+ movu m6, [r1 + 240]
+ pavgw m0, m1
+ palignr m1, m6, 2
+ pavgw m2, m3
+ pavgw m4, m5
+ pavgw m6, m1
+ pshufb m0, m0, m7
+ pshufb m2, m2, m7
+ pshufb m4, m4, m7
+ pshufb m6, m6, m7
+
+ punpcklqdq m0, m2
+ movu [r0 + 96], m0
+ punpcklqdq m4, m6
+ movu [r0 + 112], m4
+%endmacro
+
+;-----------------------------------------------------------------
+; void scale1D_128to64_new(pixel *dst, const pixel *src, intptr_t /*stride*/)
+;-----------------------------------------------------------------
+INIT_XMM ssse3
+cglobal scale1D_128to64_new, 2, 2, 8, dest, src1, stride ; x86inc: 2 args loaded (r0 = dest, r1 = src1), 2 GPRs, 8 vector regs; stride is named but never read
+%if HIGH_BIT_DEPTH
+ mova m7, [deinterleave_word_shuf] ; pshufb mask consumed by SCALE1D_128to64_HBD (table defined outside this patch)
+ ; 16-bit build: two macro expansions, 512 source bytes in, 256 bytes out
+ ;Top pixel
+ SCALE1D_128to64_HBD ; source bytes 0-255 -> destination bytes 0-127
+
+ ;Left pixel
+ add r1, 256 ; step past the source bytes the first expansion consumed
+ add r0, 128 ; step past the destination bytes the first expansion wrote
+ SCALE1D_128to64_HBD ; source bytes 256-511 -> destination bytes 128-255
+
+%else
+ mova m7, [deinterleave_shuf] ; byte-granular pshufb mask (table defined outside this patch)
+ ; 8-bit build: 256 source bytes in, 128 bytes out, handled in four 64-byte groups
+ ;Top pixel
+ movu m0, [r1] ; source bytes 0-15 (unaligned load)
+ palignr m1, m0, 1 ; m1 = m0 shifted down by one byte; the top byte is stale m1 data
+ movu m2, [r1 + 16]
+ palignr m3, m2, 1
+ movu m4, [r1 + 32]
+ palignr m5, m4, 1
+ movu m6, [r1 + 48]
+
+ pavgb m0, m1 ; rounded average of each byte with its right-hand neighbour
+
+ palignr m1, m6, 1 ; m1 was consumed by the pavgb above, reuse it as the shifted copy of m6
+
+ pavgb m2, m3
+ pavgb m4, m5
+ pavgb m6, m1
+
+ pshufb m0, m0, m7 ; reorder with the mask in m7; presumably gathers the even-byte averages into the low qword -- confirm against deinterleave_shuf
+ pshufb m2, m2, m7
+ pshufb m4, m4, m7
+ pshufb m6, m6, m7
+
+ punpcklqdq m0, m2 ; join the low qwords of m0 and m2 into one 16-byte result
+ movu [r0], m0
+ punpcklqdq m4, m6
+ movu [r0 + 16], m4
+ ; source bytes 64-127 -> destination bytes 32-63, same sequence as above
+ movu m0, [r1 + 64]
+ palignr m1, m0, 1
+ movu m2, [r1 + 80]
+ palignr m3, m2, 1
+ movu m4, [r1 + 96]
+ palignr m5, m4, 1
+ movu m6, [r1 + 112]
+
+ pavgb m0, m1
+
+ palignr m1, m6, 1
+
+ pavgb m2, m3
+ pavgb m4, m5
+ pavgb m6, m1
+
+ pshufb m0, m0, m7
+ pshufb m2, m2, m7
+ pshufb m4, m4, m7
+ pshufb m6, m6, m7
+
+ punpcklqdq m0, m2
+ movu [r0 + 32], m0
+ punpcklqdq m4, m6
+ movu [r0 + 48], m4
+
+ ;Left pixel
+ movu m0, [r1 + 128] ; second half: source bytes 128-191 -> destination bytes 64-95
+ palignr m1, m0, 1
+ movu m2, [r1 + 144]
+ palignr m3, m2, 1
+ movu m4, [r1 + 160]
+ palignr m5, m4, 1
+ movu m6, [r1 + 176]
+
+ pavgb m0, m1
+
+ palignr m1, m6, 1
+
+ pavgb m2, m3
+ pavgb m4, m5
+ pavgb m6, m1
+
+ pshufb m0, m0, m7
+ pshufb m2, m2, m7
+ pshufb m4, m4, m7
+ pshufb m6, m6, m7
+
+ punpcklqdq m0, m2
+ movu [r0 + 64], m0
+ punpcklqdq m4, m6
+ movu [r0 + 80], m4
+ ; source bytes 192-255 -> destination bytes 96-127
+ movu m0, [r1 + 192]
+ palignr m1, m0, 1
+ movu m2, [r1 + 208]
+ palignr m3, m2, 1
+ movu m4, [r1 + 224]
+ palignr m5, m4, 1
+ movu m6, [r1 + 240]
+
+ pavgb m0, m1
+
+ palignr m1, m6, 1
+
+ pavgb m2, m3
+ pavgb m4, m5
+ pavgb m6, m1
+
+ pshufb m0, m0, m7
+ pshufb m2, m2, m7
+ pshufb m4, m4, m7
+ pshufb m6, m6, m7
+
+ punpcklqdq m0, m2
+ movu [r0 + 96], m0
+ punpcklqdq m4, m6
+ movu [r0 + 112], m4
+%endif
+RET
+
+%if HIGH_BIT_DEPTH == 1
+INIT_YMM avx2
+cglobal scale1D_128to64_new, 2, 2, 3 ; x86inc: 2 args loaded (r0 written, r1 read), 2 GPRs, 3 vector regs
+ pxor m2, m2 ; m2 = 0, used as the second pavgw operand below
+ ; 16-bit build: 512 source bytes in, 256 bytes out, 64 source bytes per group
+ ;Top pixel
+ movu m0, [r1] ; source bytes 0-31
+ movu m1, [r1 + 32] ; source bytes 32-63
+ phaddw m0, m1 ; sums of adjacent word pairs; per 128-bit lane: m0 sums, then m1 sums
+ pavgw m0, m2 ; (sum + 0 + 1) >> 1 = rounded average of each pair
+ vpermq m0, m0, 0xD8 ; qword order 0,2,1,3 undoes the per-lane grouping left by phaddw
+ movu [r0], m0
+
+ movu m0, [r1 + 64]
+ movu m1, [r1 + 96]
+ phaddw m0, m1
+ pavgw m0, m2
+ vpermq m0, m0, 0xD8
+ movu [r0 + 32], m0
+
+ movu m0, [r1 + 128]
+ movu m1, [r1 + 160]
+ phaddw m0, m1
+ pavgw m0, m2
+ vpermq m0, m0, 0xD8
+ movu [r0 + 64], m0
+
+ movu m0, [r1 + 192]
+ movu m1, [r1 + 224]
+ phaddw m0, m1
+ pavgw m0, m2
+ vpermq m0, m0, 0xD8
+ movu [r0 + 96], m0
+
+ ;Left pixel
+ movu m0, [r1 + 256] ; second half: source bytes 256-511 -> destination bytes 128-255
+ movu m1, [r1 + 288]
+ phaddw m0, m1
+ pavgw m0, m2
+ vpermq m0, m0, 0xD8
+ movu [r0 + 128], m0
+
+ movu m0, [r1 + 320]
+ movu m1, [r1 + 352]
+ phaddw m0, m1
+ pavgw m0, m2
+ vpermq m0, m0, 0xD8
+ movu [r0 + 160], m0
+
+ movu m0, [r1 + 384]
+ movu m1, [r1 + 416]
+ phaddw m0, m1
+ pavgw m0, m2
+ vpermq m0, m0, 0xD8
+ movu [r0 + 192], m0
+
+ movu m0, [r1 + 448]
+ movu m1, [r1 + 480]
+ phaddw m0, m1
+ pavgw m0, m2
+ vpermq m0, m0, 0xD8
+ movu [r0 + 224], m0
+
+ RET
+%else ; HIGH_BIT_DEPTH == 0
+INIT_YMM avx2
+cglobal scale1D_128to64_new, 2, 2, 4 ; x86inc: 2 args loaded (r0 written, r1 read), 2 GPRs, 4 vector regs
+ pxor m2, m2 ; m2 = 0, used as the second pavgw operand below
+ mova m3, [pb_1] ; pmaddubsw multiplier; presumably every byte is 1 -- confirm against the pb_1 definition
+ ; 8-bit build: 256 source bytes in, 128 bytes out, 64 source bytes per group
+ ;Top pixel
+ movu m0, [r1] ; source bytes 0-31
+ pmaddubsw m0, m0, m3 ; multiply each byte by the matching m3 byte and add adjacent products into words
+ pavgw m0, m2 ; (sum + 0 + 1) >> 1 = rounded halving of each word
+ movu m1, [r1 + 32] ; source bytes 32-63
+ pmaddubsw m1, m1, m3
+ pavgw m1, m2
+ packuswb m0, m1 ; narrow words to bytes; per 128-bit lane: m0 results, then m1 results
+ vpermq m0, m0, 0xD8 ; qword order 0,2,1,3 undoes the per-lane grouping left by packuswb
+ movu [r0], m0
+
+ movu m0, [r1 + 64]
+ pmaddubsw m0, m0, m3
+ pavgw m0, m2
+ movu m1, [r1 + 96]
+ pmaddubsw m1, m1, m3
+ pavgw m1, m2
+ packuswb m0, m1
+ vpermq m0, m0, 0xD8
+ movu [r0 + 32], m0
+
+ ;Left pixel
+ movu m0, [r1 + 128] ; second half: source bytes 128-255 -> destination bytes 64-127
+ pmaddubsw m0, m0, m3
+ pavgw m0, m2
+ movu m1, [r1 + 160]
+ pmaddubsw m1, m1, m3
+ pavgw m1, m2
+ packuswb m0, m1
+ vpermq m0, m0, 0xD8
+ movu [r0 + 64], m0
+
+ movu m0, [r1 + 192]
+ pmaddubsw m0, m0, m3
+ pavgw m0, m2
+ movu m1, [r1 + 224]
+ pmaddubsw m1, m1, m3
+ pavgw m1, m2
+ packuswb m0, m1
+ vpermq m0, m0, 0xD8
+ movu [r0 + 96], m0
+ RET
+%endif
+
;-----------------------------------------------------------------
; void scale1D_128to64(pixel *dst, pixel *src, intptr_t /*stride*/)
;-----------------------------------------------------------------
diff -r cf29bf782449 -r f3fd0d075bba source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Mon Jan 12 20:37:35 2015 +0530
+++ b/source/test/pixelharness.cpp Mon Jan 12 12:03:33 2015 +0530
@@ -708,6 +708,33 @@
return true;
}
+bool PixelHarness::check_scale_pp_new(scale_t ref, scale_t opt) // returns true only if opt and ref produce identical output on every iteration
+{
+ ALIGN_VAR_16(pixel, ref_dest[64 * 64]); // output buffer written by the reference primitive
+ ALIGN_VAR_16(pixel, opt_dest[64 * 64]); // output buffer written by the primitive under test
+ // Zero both buffers so that bytes neither primitive writes still compare equal.
+ memset(ref_dest, 0, sizeof(ref_dest));
+ memset(opt_dest, 0, sizeof(opt_dest));
+
+ int j = 0; // offset into the selected test buffer, advanced by INCR each iteration
+ intptr_t stride = STRIDE; // handed to both primitives unchanged
+ for (int i = 0; i < ITERS; i++)
+ {
+ int index = i % TEST_CASES; // cycle through the available test buffers
+ checked(opt, opt_dest, pixel_test_buff[index] + j, stride); // opt is invoked through checked(), a harness helper defined outside this patch
+ ref(ref_dest, pixel_test_buff[index] + j, stride); // ref is called directly on the same input
+ // Compare the complete 64 * 64 buffers; any difference fails the check.
+ if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
+ return false;
+
+ reportfail(); // harness hook defined elsewhere; only reached when the outputs matched
+ j += INCR;
+ }
+
+ return true;
+}
+
+
bool PixelHarness::check_transpose(transpose_t ref, transpose_t opt)
{
ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
@@ -1539,6 +1566,15 @@
}
}
+ if (opt.scale1D_128to64_new)
+ {
+ if (!check_scale_pp_new(ref.scale1D_128to64_new, opt.scale1D_128to64_new))
+ {
+ printf("scale1D_128to64_new failed!\n");
+ return false;
+ }
+ }
+
if (opt.scale2D_64to32)
{
if (!check_scale_pp(ref.scale2D_64to32, opt.scale2D_64to32))
@@ -1925,6 +1961,12 @@
REPORT_SPEEDUP(opt.scale1D_128to64, ref.scale1D_128to64, pbuf2, pbuf1, 64);
}
+ if (opt.scale1D_128to64_new)
+ {
+ HEADER0("scale1D_128to64_new");
+ REPORT_SPEEDUP(opt.scale1D_128to64_new, ref.scale1D_128to64_new, pbuf2, pbuf1, 64);
+ }
+
if (opt.scale2D_64to32)
{
HEADER0("scale2D_64to32");
diff -r cf29bf782449 -r f3fd0d075bba source/test/pixelharness.h
--- a/source/test/pixelharness.h Mon Jan 12 20:37:35 2015 +0530
+++ b/source/test/pixelharness.h Mon Jan 12 12:03:33 2015 +0530
@@ -76,6 +76,7 @@
bool check_pixelavg_pp(pixelavg_pp_t ref, pixelavg_pp_t opt);
bool check_pixel_sub_ps(pixel_sub_ps_t ref, pixel_sub_ps_t opt);
bool check_pixel_add_ps(pixel_add_ps_t ref, pixel_add_ps_t opt);
+ bool check_scale_pp_new(scale_t ref, scale_t opt);
bool check_scale_pp(scale_t ref, scale_t opt);
bool check_ssd_s(pixel_ssd_s_t ref, pixel_ssd_s_t opt);
bool check_blockfill_s(blockfill_s_t ref, blockfill_s_t opt);
More information about the x265-devel
mailing list