[x265] [PATCH] added copy_shl primitive
praveen at multicorewareinc.com
Tue Sep 2 16:13:31 CEST 2014
# HG changeset patch
# User Praveen Tiwari
# Date 1409660231 -19800
# Node ID 61f7c056cd6e01e5a24a51b40c20c53bf4593ec7
# Parent 2667a0e3afdc2b95ff73c962b3e25366162d8e8d
added copy_shl primitive
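
copy_shl copies a packed NxN block of int16_t coefficients into a strided
destination, left-shifting every value by 'shift'; it is the left-shift
counterpart of the existing copy_shr primitive. For orientation, a minimal
caller sketch (buffer names are hypothetical; dispatch goes through the
primitives function table wired up below):

    // scale a packed 8x8 coefficient block by 1 << 2 into a strided buffer
    ALIGN_VAR_16(int16_t, coeff[8 * 8]);   // packed source, filled elsewhere
    ALIGN_VAR_16(int16_t, resi[64 * 64]);  // strided destination
    primitives.copy_shl[BLOCK_8x8](resi, coeff, 64 /* stride */, 2 /* shift */);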
diff -r 2667a0e3afdc -r 61f7c056cd6e source/common/pixel.cpp
--- a/source/common/pixel.cpp Tue Sep 02 15:31:10 2014 +0530
+++ b/source/common/pixel.cpp Tue Sep 02 17:47:11 2014 +0530
@@ -501,6 +501,21 @@
}
}
+template<int size>
+void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift)
+{
+ for (int i = 0; i < size; i++)
+ {
+ for (int j = 0; j < size; j++)
+ {
+ dst[j] = (src[j] << shift);
+ }
+
+ src += size;
+ dst += stride;
+ }
+}
+
template<int blockSize>
void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride)
{
@@ -1230,6 +1245,10 @@
p.cvt32to16_shl[BLOCK_32x32] = convert32to16_shl<32>;
p.copy_shr = copy_shr;
+ p.copy_shl[BLOCK_4x4] = copy_shl<4>;
+ p.copy_shl[BLOCK_8x8] = copy_shl<8>;
+ p.copy_shl[BLOCK_16x16] = copy_shl<16>;
+ p.copy_shl[BLOCK_32x32] = copy_shl<32>;
p.sa8d[BLOCK_4x4] = satd_4x4;
p.sa8d[BLOCK_8x8] = sa8d_8x8;
diff -r 2667a0e3afdc -r 61f7c056cd6e source/common/primitives.h
--- a/source/common/primitives.h Tue Sep 02 15:31:10 2014 +0530
+++ b/source/common/primitives.h Tue Sep 02 17:47:11 2014 +0530
@@ -155,6 +155,7 @@
typedef void (*cvt32to16_shl_t)(int16_t *dst, int32_t *src, intptr_t, int);
typedef uint32_t (*copy_cnt_t)(int16_t* coeff, int16_t* residual, intptr_t stride);
typedef void (*copy_shr_t)(int16_t *dst, int16_t *src, intptr_t stride, int shift, int size);
+typedef void (*copy_shl_t)(int16_t *dst, int16_t *src, intptr_t stride, int shift);
typedef void (*dct_t)(int16_t *src, int32_t *dst, intptr_t stride);
typedef void (*idct_t)(int32_t *src, int16_t *dst, intptr_t stride);
@@ -229,6 +230,7 @@
cvt32to16_shl_t cvt32to16_shl[NUM_SQUARE_BLOCKS - 1];
copy_cnt_t copy_cnt[NUM_SQUARE_BLOCKS - 1];
copy_shr_t copy_shr;
+ copy_shl_t copy_shl[NUM_SQUARE_BLOCKS - 1];
copy_pp_t luma_copy_pp[NUM_LUMA_PARTITIONS];
copy_sp_t luma_copy_sp[NUM_LUMA_PARTITIONS];
diff -r 2667a0e3afdc -r 61f7c056cd6e source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Sep 02 15:31:10 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Sep 02 17:47:11 2014 +0530
@@ -1550,6 +1550,10 @@
p.idct[IDST_4x4] = x265_idst4_sse2;
p.planecopy_sp = x265_downShift_16_sse2;
p.denoiseDct = x265_denoise_dct_sse2;
+ p.copy_shl[BLOCK_4x4] = x265_copy_shl_4_sse2;
+ p.copy_shl[BLOCK_8x8] = x265_copy_shl_8_sse2;
+ p.copy_shl[BLOCK_16x16] = x265_copy_shl_16_sse2;
+ p.copy_shl[BLOCK_32x32] = x265_copy_shl_32_sse2;
}
if (cpuMask & X265_CPU_SSSE3)
{
diff -r 2667a0e3afdc -r 61f7c056cd6e source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm Tue Sep 02 15:31:10 2014 +0530
+++ b/source/common/x86/blockcopy8.asm Tue Sep 02 17:47:11 2014 +0530
@@ -4476,3 +4476,152 @@
jg .loop_row
RET
+
+;--------------------------------------------------------------------------------------
+; void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift)
+;--------------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal copy_shl_4, 3,3,3
+ add r2d, r2d ; stride is in int16_t units; convert to bytes
+ movd m0, r3m
+
+ ; Row 0-3
+ movu m1, [r1 + 0 * mmsize]
+ movu m2, [r1 + 1 * mmsize]
+ psllw m1, m0
+ psllw m2, m0
+ movh [r0], m1
+ movhps [r0 + r2], m1
+ movh [r0 + r2 * 2], m2
+ lea r2, [r2 * 3]
+ movhps [r0 + r2], m2
+ RET
+
+;--------------------------------------------------------------------------------------
+; void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift)
+;--------------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal copy_shl_8, 3,4,5
+ add r2d, r2d
+ movd m0, r3m
+
+ ; Row 0-3
+ movu m1, [r1 + 0 * mmsize]
+ movu m2, [r1 + 1 * mmsize]
+ movu m3, [r1 + 2 * mmsize]
+ movu m4, [r1 + 3 * mmsize]
+ psllw m1, m0
+ psllw m2, m0
+ psllw m3, m0
+ psllw m4, m0
+ movu [r0], m1
+ movu [r0 + r2], m2
+ movu [r0 + 2 * r2], m3
+ lea r0, [r0 + 2 * r2]
+ movu [r0 + r2], m4
+
+ ; Row 4-7
+ movu m1, [r1 + 4 * mmsize]
+ movu m2, [r1 + 5 * mmsize]
+ movu m3, [r1 + 6 * mmsize]
+ movu m4, [r1 + 7 * mmsize]
+ psllw m1, m0
+ psllw m2, m0
+ psllw m3, m0
+ psllw m4, m0
+ movu [r0 + r2 * 2], m1
+ lea r0, [r0 + 2 * r2]
+ movu [r0 + r2], m2
+ movu [r0 + 2 * r2], m3
+ lea r0, [r0 + 2 * r2]
+ movu [r0 + r2], m4
+ RET
+
+;--------------------------------------------------------------------------------------
+; void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift)
+;--------------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal copy_shl_16, 3,4,5
+ add r2d, r2d
+ movd m0, r3m
+ mov r3d, 256/64 ; 16x16 = 256 coeffs, 64 per iteration
+
+.loop:
+ ; Rows 0-1 of this iteration
+ movu m1, [r1 + 0 * mmsize]
+ movu m2, [r1 + 1 * mmsize]
+ movu m3, [r1 + 2 * mmsize]
+ movu m4, [r1 + 3 * mmsize]
+ psllw m1, m0
+ psllw m2, m0
+ psllw m3, m0
+ psllw m4, m0
+ movu [r0], m1
+ movu [r0 + 16], m2
+ movu [r0 + r2], m3
+ movu [r0 + r2 + 16], m4
+
+ ; Rows 2-3 of this iteration
+ movu m1, [r1 + 4 * mmsize]
+ movu m2, [r1 + 5 * mmsize]
+ movu m3, [r1 + 6 * mmsize]
+ movu m4, [r1 + 7 * mmsize]
+ psllw m1, m0
+ psllw m2, m0
+ psllw m3, m0
+ psllw m4, m0
+ movu [r0 + r2 * 2], m1
+ movu [r0 + r2 * 2 + 16], m2
+ lea r0, [r0 + r2 * 2]
+ movu [r0 + r2], m3
+ movu [r0 + r2 + 16], m4
+
+ add r1, 8 * mmsize
+ lea r0, [r0 + r2 * 2]
+ dec r3d
+ jnz .loop
+ RET
+
+;--------------------------------------------------------------------------------------
+; void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift)
+;--------------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal copy_shl_32, 3,4,5
+ add r2d, r2d
+ movd m0, r3m
+ mov r3d, 1024/64 ; 32x32 = 1024 coeffs, 64 per iteration
+
+.loop:
+ ; Row 0 of this iteration
+ movu m1, [r1 + 0 * mmsize]
+ movu m2, [r1 + 1 * mmsize]
+ movu m3, [r1 + 2 * mmsize]
+ movu m4, [r1 + 3 * mmsize]
+ psllw m1, m0
+ psllw m2, m0
+ psllw m3, m0
+ psllw m4, m0
+ movu [r0], m1
+ movu [r0 + 16], m2
+ movu [r0 + 32], m3
+ movu [r0 + 48], m4
+
+ ; Row 1 of this iteration
+ movu m1, [r1 + 4 * mmsize]
+ movu m2, [r1 + 5 * mmsize]
+ movu m3, [r1 + 6 * mmsize]
+ movu m4, [r1 + 7 * mmsize]
+ psllw m1, m0
+ psllw m2, m0
+ psllw m3, m0
+ psllw m4, m0
+ movu [r0 + r2], m1
+ movu [r0 + r2 + 16], m2
+ movu [r0 + r2 + 32], m3
+ movu [r0 + r2 + 48], m4
+
+ add r1, 8 * mmsize
+ lea r0, [r0 + r2 * 2]
+ dec r3d
+ jnz .loop
+ RET
diff -r 2667a0e3afdc -r 61f7c056cd6e source/common/x86/blockcopy8.h
--- a/source/common/x86/blockcopy8.h Tue Sep 02 15:31:10 2014 +0530
+++ b/source/common/x86/blockcopy8.h Tue Sep 02 17:47:11 2014 +0530
@@ -39,6 +39,10 @@
void x265_cvt16to32_shr_16_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
void x265_cvt16to32_shr_32_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
void x265_copy_shr_sse4(int16_t * dst, int16_t *src, intptr_t, int, int);
+void x265_copy_shl_4_sse2(int16_t * dst, int16_t *src, intptr_t, int);
+void x265_copy_shl_8_sse2(int16_t * dst, int16_t *src, intptr_t, int);
+void x265_copy_shl_16_sse2(int16_t * dst, int16_t *src, intptr_t, int);
+void x265_copy_shl_32_sse2(int16_t * dst, int16_t *src, intptr_t, int);
uint32_t x265_copy_cnt_4_sse4(int16_t * dst, int16_t * src, intptr_t);
uint32_t x265_copy_cnt_8_sse4(int16_t * dst, int16_t * src, intptr_t);
uint32_t x265_copy_cnt_16_sse4(int16_t * dst, int16_t * src, intptr_t);
diff -r 2667a0e3afdc -r 61f7c056cd6e source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Tue Sep 02 15:31:10 2014 +0530
+++ b/source/test/pixelharness.cpp Tue Sep 02 17:47:11 2014 +0530
@@ -634,6 +634,34 @@
return true;
}
+bool PixelHarness::check_copy_shl_t(copy_shl_t ref, copy_shl_t opt)
+{
+ ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
+ ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
+
+ memset(ref_dest, 0xCD, sizeof(ref_dest));
+ memset(opt_dest, 0xCD, sizeof(opt_dest));
+
+ int j = 0;
+ intptr_t stride = STRIDE;
+ for (int i = 0; i < ITERS; i++)
+ {
+ int shift = (rand() % 7 + 1);
+
+ int index = i % TEST_CASES;
+ checked(opt, opt_dest, short_test_buff[index] + j, stride, shift);
+ ref(ref_dest, short_test_buff[index] + j, stride, shift);
+
+ if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t)))
+ return false;
+
+ reportfail();
+ j += INCR;
+ }
+
+ return true;
+}
+
bool PixelHarness::check_pixelavg_pp(pixelavg_pp_t ref, pixelavg_pp_t opt)
{
ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
@@ -1432,6 +1460,16 @@
return false;
}
}
+
+ if ((i < BLOCK_64x64) && opt.copy_shl[i])
+ {
+ if (!check_copy_shl_t(ref.copy_shl[i], opt.copy_shl[i]))
+ {
+ printf("copy_shl[%dx%d] failed!\n", 4 << i, 4 << i);
+ return false;
+ }
+ }
+
}
if (opt.cvt32to16_shr)
@@ -1821,6 +1859,13 @@
HEADER("copy_cnt[%dx%d]", 4 << i, 4 << i);
REPORT_SPEEDUP(opt.copy_cnt[i], ref.copy_cnt[i], sbuf1, sbuf2, STRIDE);
}
+
+ if ((i < BLOCK_64x64) && opt.copy_shl[i])
+ {
+ HEADER("copy_shl[%dx%d]", 4 << i, 4 << i);
+ REPORT_SPEEDUP(opt.copy_shl[i], ref.copy_shl[i], sbuf1, sbuf2, STRIDE, 6); // shift must be < 16 for int16_t
+ }
+
}
if (opt.cvt32to16_shr)
diff -r 2667a0e3afdc -r 61f7c056cd6e source/test/pixelharness.h
--- a/source/test/pixelharness.h Tue Sep 02 15:31:10 2014 +0530
+++ b/source/test/pixelharness.h Tue Sep 02 17:47:11 2014 +0530
@@ -90,6 +90,7 @@
bool check_cvt32to16_shl_t(cvt32to16_shl_t ref, cvt32to16_shl_t opt);
bool check_copy_cnt_t(copy_cnt_t ref, copy_cnt_t opt);
bool check_copy_shr_t(copy_shr_t ref, copy_shr_t opt);
+ bool check_copy_shl_t(copy_shl_t ref, copy_shl_t opt);
bool check_pixel_var(var_t ref, var_t opt);
bool check_ssim_4x4x2_core(ssim_4x4x2_core_t ref, ssim_4x4x2_core_t opt);
bool check_ssim_end(ssim_end4_t ref, ssim_end4_t opt);
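
For anyone who wants to poke at the semantics outside the test harness, a
standalone sanity check (a minimal sketch; the template mirrors the C
reference added to pixel.cpp above):

    #include <cstdint>
    #include <cstdio>

    // mirror of the C reference: copy a packed size x size block into a
    // strided buffer, left-shifting every coefficient by 'shift'
    template<int size>
    static void copy_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift)
    {
        for (int i = 0; i < size; i++)
        {
            for (int j = 0; j < size; j++)
                dst[j] = (int16_t)(src[j] << shift);

            src += size;   // source rows are packed back to back
            dst += stride; // destination rows are 'stride' coefficients apart
        }
    }

    int main()
    {
        int16_t src[4 * 4], dst[8 * 8] = { 0 };

        for (int i = 0; i < 16; i++)
            src[i] = (int16_t)(i + 1);

        copy_shl<4>(dst, src, 8, 2); // shift by 2 == multiply by 4

        // row 0 starts at dst[0], row 1 at dst[8]
        printf("%d %d\n", dst[0], dst[8]); // expect 4 (1 << 2) and 20 (5 << 2)
        return 0;
    }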