[x265] [PATCH] added copy_shr primitive
praveen at multicorewareinc.com
praveen at multicorewareinc.com
Tue Sep 2 16:13:08 CEST 2014
# HG changeset patch
# User Praveen Tiwari
# Date 1409652070 -19800
# Node ID 2667a0e3afdc2b95ff73c962b3e25366162d8e8d
# Parent 16de8fd2837c853c974f83f9aba9e8ef09c2fe2b
added copy_shr primitive
diff -r 16de8fd2837c -r 2667a0e3afdc source/common/pixel.cpp
--- a/source/common/pixel.cpp Tue Sep 02 14:38:41 2014 +0530
+++ b/source/common/pixel.cpp Tue Sep 02 15:31:10 2014 +0530
@@ -470,6 +470,22 @@
}
}
+void copy_shr(int16_t *dst, int16_t *src, intptr_t stride, int shift, int size) /* copy a size x size coefficient block, right-shifting each value by 'shift' with round-to-nearest; src rows are packed (pitch == size), dst rows are 'stride' elements apart */
+{
+ int round = 1 << (shift - 1); /* rounding offset so the arithmetic shift rounds to nearest */
+
+ for (int i = 0; i < size; i++)
+ {
+ for (int j = 0; j < size; j++)
+ {
+ dst[j] = (int16_t)((src[j] + round) >> shift); /* (src + round) >> shift, truncated back to int16_t */
+ }
+
+ src += size; /* source is packed: the next row starts immediately */
+ dst += stride; /* destination rows are 'stride' int16_t apart */
+ }
+}
+
template<int size>
void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift)
{
@@ -1213,6 +1229,8 @@
p.cvt32to16_shl[BLOCK_16x16] = convert32to16_shl<16>;
p.cvt32to16_shl[BLOCK_32x32] = convert32to16_shl<32>;
+ p.copy_shr = copy_shr;
+
p.sa8d[BLOCK_4x4] = satd_4x4;
p.sa8d[BLOCK_8x8] = sa8d_8x8;
p.sa8d[BLOCK_16x16] = sa8d_16x16;
diff -r 16de8fd2837c -r 2667a0e3afdc source/common/primitives.h
--- a/source/common/primitives.h Tue Sep 02 14:38:41 2014 +0530
+++ b/source/common/primitives.h Tue Sep 02 15:31:10 2014 +0530
@@ -154,6 +154,7 @@
typedef void (*cvt32to16_shr_t)(int16_t *dst, int32_t *src, intptr_t, int, int);
typedef void (*cvt32to16_shl_t)(int16_t *dst, int32_t *src, intptr_t, int);
typedef uint32_t (*copy_cnt_t)(int16_t* coeff, int16_t* residual, intptr_t stride);
+typedef void (*copy_shr_t)(int16_t *dst, int16_t *src, intptr_t stride, int shift, int size);
typedef void (*dct_t)(int16_t *src, int32_t *dst, intptr_t stride);
typedef void (*idct_t)(int32_t *src, int16_t *dst, intptr_t stride);
@@ -227,6 +228,7 @@
cvt32to16_shr_t cvt32to16_shr;
cvt32to16_shl_t cvt32to16_shl[NUM_SQUARE_BLOCKS - 1];
copy_cnt_t copy_cnt[NUM_SQUARE_BLOCKS - 1];
+ copy_shr_t copy_shr;
copy_pp_t luma_copy_pp[NUM_LUMA_PARTITIONS];
copy_sp_t luma_copy_sp[NUM_LUMA_PARTITIONS];
diff -r 16de8fd2837c -r 2667a0e3afdc source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Sep 02 14:38:41 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Sep 02 15:31:10 2014 +0530
@@ -1670,6 +1670,7 @@
INTRA_ANG_SSE4(sse4);
p.dct[DCT_8x8] = x265_dct8_sse4;
+ p.copy_shr = x265_copy_shr_sse4;
}
if (cpuMask & X265_CPU_AVX)
{
diff -r 16de8fd2837c -r 2667a0e3afdc source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm Tue Sep 02 14:38:41 2014 +0530
+++ b/source/common/x86/blockcopy8.asm Tue Sep 02 15:31:10 2014 +0530
@@ -4400,3 +4400,79 @@
psadbw xm0, xm4
movd eax, xm0
RET
+
+;-----------------------------------------------------------------------------
+; void copy_shr(int16_t *dst, int16_t *src, intptr_t stride, int shift, int size)
+;-----------------------------------------------------------------------------
+
+INIT_XMM sse4
+cglobal copy_shr, 4, 7, 4, dst, src, stride
+%define rnd m2
+%define shift m1
+
+ ; load the shift count into an xmm register so psrad can use it
+ mov r5d, r3m
+ movd shift, r5d
+
+ ; build round = 1 << (shift - 1), the round-to-nearest offset
+ dec r5
+ xor r6, r6
+ bts r6, r5 ; set bit (shift - 1) in a zeroed register
+
+ movd rnd, r6d
+ pshufd rnd, rnd, 0 ; broadcast round to all four dwords
+
+ ; register alloc
+ ; r0 - dst
+ ; r1 - src
+ ; r2 - dst stride in bytes (stride * sizeof(int16_t))
+ ; r3 - lx (remaining 4-sample column groups in the current row)
+ ; r4 - size (block dimension, in samples)
+ ; r5 - ly (remaining row pairs: two rows handled per iteration)
+ ; r6 - diff (dst byte advance from end of one row pair to start of the next)
+ add r2d, r2d
+
+ mov r4d, r4m
+ mov r5, r4 ; size
+ mov r6, r2 ; stride in bytes
+ sub r6, r4
+ add r6, r6 ; r6 = 2 * (stride_bytes - size)
+
+ shr r5, 1 ; process rows in pairs (assumes size is even)
+.loop_row:
+
+ mov r3, r4
+ shr r3, 2 ; 4 samples per iteration (assumes size % 4 == 0)
+.loop_col:
+ ; row 0: widen to dwords, add round, arithmetic shift, narrow back
+ movh m3, [r1]
+ pmovsxwd m0, m3
+ paddd m0, rnd
+ psrad m0, shift
+ packssdw m0, m0
+ movh [r0], m0
+
+ ; row 1: src pitch is size samples (packed), dst pitch is r2 bytes
+ movh m3, [r1 + r4 * 2]
+ pmovsxwd m0, m3
+ paddd m0, rnd
+ psrad m0, shift
+ packssdw m0, m0
+ movh [r0 + r2], m0
+
+ ; move col pointer (4 int16_t = 8 bytes)
+ add r1, 8
+ add r0, 8
+
+ dec r3
+ jg .loop_col
+
+ ; update pointers: skip src row 1 (already consumed), step dst to next pair
+ lea r1, [r1 + r4 * 2]
+ add r0, r6
+
+ ; end of loop_row
+ dec r5
+ jg .loop_row
+
+ RET
diff -r 16de8fd2837c -r 2667a0e3afdc source/common/x86/blockcopy8.h
--- a/source/common/x86/blockcopy8.h Tue Sep 02 14:38:41 2014 +0530
+++ b/source/common/x86/blockcopy8.h Tue Sep 02 15:31:10 2014 +0530
@@ -38,6 +38,7 @@
void x265_cvt16to32_shr_8_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
void x265_cvt16to32_shr_16_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
void x265_cvt16to32_shr_32_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
+void x265_copy_shr_sse4(int16_t * dst, int16_t *src, intptr_t, int, int);
uint32_t x265_copy_cnt_4_sse4(int16_t * dst, int16_t * src, intptr_t);
uint32_t x265_copy_cnt_8_sse4(int16_t * dst, int16_t * src, intptr_t);
uint32_t x265_copy_cnt_16_sse4(int16_t * dst, int16_t * src, intptr_t);
diff -r 16de8fd2837c -r 2667a0e3afdc source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Tue Sep 02 14:38:41 2014 +0530
+++ b/source/test/pixelharness.cpp Tue Sep 02 15:31:10 2014 +0530
@@ -606,6 +606,34 @@
return true;
}
+bool PixelHarness::check_copy_shr_t(copy_shr_t ref, copy_shr_t opt) /* verify the optimized copy_shr against the C reference over random shifts and inputs */
+{
+ ALIGN_VAR_16(int16_t, ref_dest[64 * 64]);
+ ALIGN_VAR_16(int16_t, opt_dest[64 * 64]);
+
+ memset(ref_dest, 0xCD, sizeof(ref_dest)); /* sentinel fill so stray out-of-block writes are caught by the full-buffer compare */
+ memset(opt_dest, 0xCD, sizeof(opt_dest));
+
+ int j = 0;
+ intptr_t stride = STRIDE;
+ for (int i = 0; i < ITERS; i++)
+ {
+ int shift = (rand() % 7 + 1); /* shift in [1, 7]; shift >= 1 keeps round = 1 << (shift - 1) well-defined */
+
+ int index = i % TEST_CASES;
+ checked(opt, opt_dest, short_test_buff[index] + j, stride, shift, (int)STRIDE);
+ ref(ref_dest, short_test_buff[index] + j, stride, shift, (int)STRIDE);
+
+ if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int16_t))) /* compare whole buffers, sentinel area included */
+ return false;
+
+ reportfail();
+ j += INCR; /* advance into the test buffer so each iteration sees different data */
+ }
+
+ return true;
+}
+
bool PixelHarness::check_pixelavg_pp(pixelavg_pp_t ref, pixelavg_pp_t opt)
{
ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
@@ -1541,6 +1569,15 @@
}
}
+ if (opt.copy_shr)
+ {
+ if (!check_copy_shr_t(ref.copy_shr, opt.copy_shr))
+ {
+ printf("copy_shr failed!\n");
+ return false;
+ }
+ }
+
return true;
}
@@ -1875,4 +1912,11 @@
HEADER0("planecopy_cp");
REPORT_SPEEDUP(opt.planecopy_cp, ref.planecopy_cp, uchar_test_buff[0], 64, pbuf1, 64, 64, 64, 2);
}
+
+ if (opt.copy_shr)
+ {
+ HEADER0("copy_shr");
+ REPORT_SPEEDUP(opt.copy_shr, ref.copy_shr, sbuf1, sbuf2, 64, 5, 64);
+ }
+
}
diff -r 16de8fd2837c -r 2667a0e3afdc source/test/pixelharness.h
--- a/source/test/pixelharness.h Tue Sep 02 14:38:41 2014 +0530
+++ b/source/test/pixelharness.h Tue Sep 02 15:31:10 2014 +0530
@@ -89,6 +89,7 @@
bool check_cvt16to32_shr_t(cvt16to32_shr_t ref, cvt16to32_shr_t opt);
bool check_cvt32to16_shl_t(cvt32to16_shl_t ref, cvt32to16_shl_t opt);
bool check_copy_cnt_t(copy_cnt_t ref, copy_cnt_t opt);
+ bool check_copy_shr_t(copy_shr_t ref, copy_shr_t opt);
bool check_pixel_var(var_t ref, var_t opt);
bool check_ssim_4x4x2_core(ssim_4x4x2_core_t ref, ssim_4x4x2_core_t opt);
bool check_ssim_end(ssim_end4_t ref, ssim_end4_t opt);
More information about the x265-devel
mailing list