[x265] fix copy16to16_shl
Satoshi Nakagawa
nakagawa424 at oki.com
Fri Nov 21 01:51:34 CET 2014
# HG changeset patch
# User Satoshi Nakagawa <nakagawa424 at oki.com>
# Date 1416530954 -32400
# Fri Nov 21 09:49:14 2014 +0900
# Node ID d330d286791b70b00436cee62a1956634635a638
# Parent 2abf89f5c4f2b797705f3b6e8d6670962daa38b9
fix copy16to16_shl
diff -r 2abf89f5c4f2 -r d330d286791b source/common/pixel.cpp
--- a/source/common/pixel.cpp Thu Nov 20 14:31:04 2014 -0800
+++ b/source/common/pixel.cpp Fri Nov 21 09:49:14 2014 +0900
@@ -493,11 +493,12 @@
void copy16to16_shl(int16_t *dst, const int16_t *src, intptr_t stride, int shift, int size)
{
+ X265_CHECK(!(size & 3), "invalid size\n");
for (int i = 0; i < size; i++)
{
for (int j = 0; j < size; j++)
{
- dst[i * size + j] = (src[i * stride + j]) << shift;
+ dst[i * size + j] = src[i * stride + j] << shift;
}
}
}
diff -r 2abf89f5c4f2 -r d330d286791b source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Nov 20 14:31:04 2014 -0800
+++ b/source/common/x86/asm-primitives.cpp Fri Nov 21 09:49:14 2014 +0900
@@ -1548,6 +1548,7 @@
p.frame_init_lowres_core = x265_frame_init_lowres_core_sse2;
SA8D_INTER_FROM_BLOCK(sse2);
+ p.cpy16to16_shl = x265_copy16to16_shl_sse2;
p.cvt32to16_shl[BLOCK_4x4] = x265_cvt32to16_shl_4_sse2;
p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_sse2;
p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_sse2;
@@ -1614,7 +1615,6 @@
LUMA_ADDAVG(_sse4);
CHROMA_ADDAVG(_sse4);
CHROMA_ADDAVG_422(_sse4);
- p.cpy16to16_shl = x265_copy16to16_shl_sse4;
p.cvt16to32_shr[BLOCK_4x4] = x265_cvt16to32_shr_4_sse4;
p.cvt16to32_shr[BLOCK_8x8] = x265_cvt16to32_shr_8_sse4;
p.cvt16to32_shr[BLOCK_16x16] = x265_cvt16to32_shr_16_sse4;
diff -r 2abf89f5c4f2 -r d330d286791b source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm Thu Nov 20 14:31:04 2014 -0800
+++ b/source/common/x86/blockcopy8.asm Fri Nov 21 09:49:14 2014 +0900
@@ -3672,37 +3672,35 @@
;--------------------------------------------------------------------------------------
; void copy16to16_shl(int16_t *dst, int16_t *src, intptr_t stride, int shift, int size);
;--------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal copy16to16_shl, 5, 7, 2, dst, src, stride, shift, size
+INIT_XMM sse2
+cglobal copy16to16_shl, 5, 6, 2, dst, src, stride, shift, size
%define shift m1
; make shift
- mov r5d, r3m
- movd shift, r5d
+ movd shift, r3d
; register alloc
; r0 - dst
; r1 - src
; r2 - stride
- ; r3 - shift
; r4 - size
sub r2d, r4d
add r2d, r2d
mov r5d, r4d
- shr r4d, 3
+ shr r4d, 2
.loop_row:
- mov r6d, r4d
+ mov r3d, r4d
.loop_col:
- movu m0, [r1]
+ movh m0, [r1]
psllw m0, shift
- movu [r0], m0
-
- add r1, 16
- add r0, 16
-
- dec r6d
+ movh [r0], m0
+
+ add r1, 8
+ add r0, 8
+
+ dec r3d
jnz .loop_col
add r1, r2
diff -r 2abf89f5c4f2 -r d330d286791b source/common/x86/blockcopy8.h
--- a/source/common/x86/blockcopy8.h Thu Nov 20 14:31:04 2014 -0800
+++ b/source/common/x86/blockcopy8.h Fri Nov 21 09:49:14 2014 +0900
@@ -32,7 +32,7 @@
void x265_cvt32to16_shl_8_avx2(int16_t* dst, const int* src, intptr_t, int);
void x265_cvt32to16_shl_16_avx2(int16_t* dst, const int* src, intptr_t, int);
void x265_cvt32to16_shl_32_avx2(int16_t* dst, const int* src, intptr_t, int);
-void x265_copy16to16_shl_sse4(int16_t* dst, const int16_t* src, intptr_t, int32_t, int32_t);
+void x265_copy16to16_shl_sse2(int16_t* dst, const int16_t* src, intptr_t, int32_t, int32_t);
void x265_cvt16to32_shr_4_sse4(int32_t* dst, const int16_t* src, intptr_t, int32_t, int32_t);
void x265_cvt16to32_shr_8_sse4(int32_t* dst, const int16_t* src, intptr_t, int32_t, int32_t);
void x265_cvt16to32_shr_16_sse4(int32_t* dst, const int16_t* src, intptr_t, int32_t, int32_t);
More information about the x265-devel mailing list