[x265] [PATCH] asm: assembly code for cvt16to32_shl
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Thu Nov 28 14:22:31 CET 2013
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1385644066 -19800
# Thu Nov 28 18:37:46 2013 +0530
# Node ID c11165c61c98ad7d5353be480ba6a5f9e3d0df46
# Parent 04cf7a0fbdae38b011447c2b63c4911d0d10b6ba
asm: assembly code for cvt16to32_shl
diff -r 04cf7a0fbdae -r c11165c61c98 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Nov 28 16:55:11 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Nov 28 18:37:46 2013 +0530
@@ -558,6 +558,7 @@
SA8D_INTER_FROM_BLOCK(sse2);
p.cvt32to16_shr = x265_cvt32to16_shr_sse2;
+ p.cvt16to32_shl = x265_cvt16to32_shl_sse2;
p.ipfilter_ss[FILTER_V_S_S_8] = x265_interp_8tap_v_ss_sse2;
p.calcrecon[BLOCK_4x4] = x265_calcRecons4_sse2;
p.calcrecon[BLOCK_8x8] = x265_calcRecons8_sse2;
diff -r 04cf7a0fbdae -r c11165c61c98 source/common/x86/blockcopy8.h
--- a/source/common/x86/blockcopy8.h Thu Nov 28 16:55:11 2013 +0530
+++ b/source/common/x86/blockcopy8.h Thu Nov 28 18:37:46 2013 +0530
@@ -25,6 +25,7 @@
#define X265_BLOCKCOPY8_H
void x265_cvt32to16_shr_sse2(int16_t *dst, int *src, intptr_t, int, int);
+void x265_cvt16to32_shl_sse2(int32_t* dst, int16_t *src, intptr_t, int32_t, int32_t);
#define SETUP_CHROMA_BLOCKCOPY_FUNC(W, H, cpu) \
void x265_blockcopy_pp_ ## W ## x ## H ## cpu(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb); \
diff -r 04cf7a0fbdae -r c11165c61c98 source/common/x86/pixel-util.asm
--- a/source/common/x86/pixel-util.asm Thu Nov 28 16:55:11 2013 +0530
+++ b/source/common/x86/pixel-util.asm Thu Nov 28 18:37:46 2013 +0530
@@ -110,6 +110,45 @@
RET
+;--------------------------------------------------------------------------------------
+; void cvt16to32_shl(int32_t *dst, int16_t *src, intptr_t stride, int shift, int size);
+; dst[i*size + j] = (int32_t)src[i*stride + j] << shift; size must be a non-zero multiple of 4.
+; NOTE(review): stride is treated as the src row pitch in int16_t units, >= size - confirm.
+;--------------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal cvt16to32_shl, 5, 7, 2, dst, src, stride, shift, size
+; only m0/m1 are used: 2 XMM regs are declared above, and xmm6+ is callee-saved on Win64
+%define shift m1
+
+    ; register alloc
+    ; r0 - dst
+    ; r1 - src
+    ; r2 - stride, then byte gap between the end of one src row and the next
+    ; r3 - shift
+    ; r4 - size, then number of 4-column groups per row
+    ; r5 - rows left, r6 - column groups left in the current row
+    movd        shift, r3d              ; movd zero-extends; pslld reads its count from the low qword
+    sub         r2d, r4d                ; stride - size, in elements
+    add         r2d, r2d                ; ... in bytes (sizeof(int16_t) == 2)
+    mov         r5d, r4d
+    shr         r4d, 2
+.loop_row:
+    mov         r6d, r4d
+.loop_col:
+    movh        m0, [r1]                ; load 4 x int16
+    punpcklwd   m0, m0                  ; duplicate each word so it fills the high half of a dword
+    psrad       m0, 16                  ; arithmetic shift = sign extension (pmovsxwd is SSE4.1-only)
+    pslld       m0, shift
+    movu        [r0], m0
+    add         r1, 8
+    add         r0, 16
+    dec         r6d
+    jnz         .loop_col
+    add         r1, r2                  ; skip the part of the src row beyond size columns
+    dec         r5d
+    jnz         .loop_row
+    RET
+
;-----------------------------------------------------------------------------
; void calcrecon(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred)
;-----------------------------------------------------------------------------
More information about the x265-devel
mailing list