[x265] [PATCH] asm: fix memory access violation due to scale2D_64to32
murugan at multicorewareinc.com
Tue Jan 7 14:15:45 CET 2014
# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1389099977 -19800
# Tue Jan 07 18:36:17 2014 +0530
# Node ID a7e28153287ea69e6fc2f68ae181b366315cddf2
# Parent 4811da38078cd02434f7da1dcc1b0af4dcf5adb8
asm: fix memory access violation due to scale2D_64to32
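
The violation comes from how the kernel fetched the "odd" pixels: a second
unaligned load placed one pixel past the "even" load (movu m1, [r1 + 2],
and so on up to [r1 + 112 + 2]). For the last 16-byte chunk of a 64-pixel
row that load reads past the end of the row, which can fault when the block
sits at the end of an allocation or page. The fix derives the odd pixels
from the register that is already loaded (psrld m1, m0, 16 at 16 bpp,
psrlw m1, m0, 8 at 8 bpp), so every access stays inside the row, and
re-enables the SSSE3 function pointer that had been commented out.

A quick sketch of the bounds arithmetic for the HIGH_BIT_DEPTH path; the
offsets come from the diff below, and the 16-byte XMM load width is the
only assumption:

#include <cstdio>

int main()
{
    // One source row for scale2D_64to32 at 16 bpp: 64 pixels * 2 bytes.
    const int rowBytes  = 64 * 2;             // bytes 0..127 are valid
    // Old code: movu m2, [r1 + 112 + 2] -- the last "j" load in a row.
    const int loadStart = 112 + 2;
    const int loadEnd   = loadStart + 16 - 1; // an XMM load touches 16 bytes
    printf("row ends at byte %d, load touches %d..%d, overrun %d bytes\n",
           rowBytes - 1, loadStart, loadEnd, loadEnd - (rowBytes - 1));
    // Prints: row ends at byte 127, load touches 114..129, overrun 2 bytes
    return 0;
}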
diff -r 4811da38078c -r a7e28153287e source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Jan 06 23:15:58 2014 -0600
+++ b/source/common/x86/asm-primitives.cpp Tue Jan 07 18:36:17 2014 +0530
@@ -685,7 +685,7 @@
if (cpuMask & X265_CPU_SSSE3)
{
p.scale1D_128to64 = x265_scale1D_128to64_ssse3;
- //p.scale2D_64to32 = x265_scale2D_64to32_ssse3;
+ p.scale2D_64to32 = x265_scale2D_64to32_ssse3;
SETUP_INTRA_ANG4(2, 2, ssse3);
SETUP_INTRA_ANG4(34, 2, ssse3);
@@ -891,7 +891,7 @@
SETUP_INTRA_ANG4(34, 2, ssse3);
p.scale1D_128to64 = x265_scale1D_128to64_ssse3;
- //p.scale2D_64to32 = x265_scale2D_64to32_ssse3;
+ p.scale2D_64to32 = x265_scale2D_64to32_ssse3;
SAD_X3(ssse3);
SAD_X4(ssse3);
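
The asm-primitives.cpp hunks above only flip the dispatch-table entries
back on now that the kernel is safe. A minimal, self-contained model of
that pattern follows; the struct, flag value, and fallback are illustrative
stand-ins, not x265's real definitions:

#include <cstdio>

// The primitives table starts out holding C fallbacks; the setup code
// overwrites any entry the host CPU can accelerate.
typedef void (*scale2D_t)(void* dst, const void* src, long stride);

static void scale2D_64to32_c(void*, const void*, long)     { puts("C kernel"); }
static void scale2D_64to32_ssse3(void*, const void*, long) { puts("SSSE3 kernel"); }

struct Primitives { scale2D_t scale2D_64to32; };

enum { X265_CPU_SSSE3 = 1 << 0 }; // illustrative value

static void setupAsmPrimitives(Primitives& p, int cpuMask)
{
    if (cpuMask & X265_CPU_SSSE3)
        p.scale2D_64to32 = scale2D_64to32_ssse3; // the line the patch re-enables
}

int main()
{
    Primitives p = { scale2D_64to32_c };
    setupAsmPrimitives(p, X265_CPU_SSSE3);
    p.scale2D_64to32(nullptr, nullptr, 0); // prints "SSSE3 kernel"
    return 0;
}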
diff -r 4811da38078c -r a7e28153287e source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Mon Jan 06 23:15:58 2014 -0600
+++ b/source/common/x86/pixel-util8.asm Tue Jan 07 18:36:17 2014 +0530
@@ -2325,17 +2325,17 @@
;-----------------------------------------------------------------
; void scale2D_64to32(pixel *dst, pixel *src, intptr_t stride)
;-----------------------------------------------------------------
+%if HIGH_BIT_DEPTH
INIT_XMM ssse3
cglobal scale2D_64to32, 3, 4, 8, dest, src, stride
mov r3d, 32
-%if HIGH_BIT_DEPTH
mova m7, [deinterleave_word_shuf]
add r2, r2
.loop
movu m0, [r1] ;i
- movu m1, [r1 + 2] ;j
+ psrld m1, m0, 16 ;j
movu m2, [r1 + r2] ;k
- movu m3, [r1 + r2 + 2] ;l
+ psrld m3, m2, 16 ;l
movu m4, m0
movu m5, m2
pxor m4, m1 ;i^j
@@ -2350,9 +2350,9 @@
pand m4, [hmulw_16p]
psubw m0, m4 ;Result
movu m1, [r1 + 16] ;i
- movu m2, [r1 + 16 + 2] ;j
+ psrld m2, m1, 16 ;j
movu m3, [r1 + r2 + 16] ;k
- movu m4, [r1 + r2 + 16 + 2] ;l
+ psrld m4, m3, 16 ;l
movu m5, m1
movu m6, m3
pxor m5, m2 ;i^j
@@ -2373,9 +2373,9 @@
movu [r0], m0
movu m0, [r1 + 32] ;i
- movu m1, [r1 + 32 + 2] ;j
+ psrld m1, m0, 16 ;j
movu m2, [r1 + r2 + 32] ;k
- movu m3, [r1 + r2 + 32 + 2] ;l
+ psrld m3, m2, 16 ;l
movu m4, m0
movu m5, m2
pxor m4, m1 ;i^j
@@ -2390,9 +2390,9 @@
pand m4, [hmulw_16p]
psubw m0, m4 ;Result
movu m1, [r1 + 48] ;i
- movu m2, [r1 + 48 + 2] ;j
+ psrld m2, m1, 16 ;j
movu m3, [r1 + r2 + 48] ;k
- movu m4, [r1 + r2 + 48 + 2] ;l
+ psrld m4, m3, 16 ;l
movu m5, m1
movu m6, m3
pxor m5, m2 ;i^j
@@ -2413,9 +2413,9 @@
movu [r0 + 16], m0
movu m0, [r1 + 64] ;i
- movu m1, [r1 + 64 + 2] ;j
+ psrld m1, m0, 16 ;j
movu m2, [r1 + r2 + 64] ;k
- movu m3, [r1 + r2 + 64 + 2] ;l
+ psrld m3, m2, 16 ;l
movu m4, m0
movu m5, m2
pxor m4, m1 ;i^j
@@ -2430,9 +2430,9 @@
pand m4, [hmulw_16p]
psubw m0, m4 ;Result
movu m1, [r1 + 80] ;i
- movu m2, [r1 + 80 + 2] ;j
+ psrld m2, m1, 16 ;j
movu m3, [r1 + r2 + 80] ;k
- movu m4, [r1 + r2 + 80 + 2] ;l
+ psrld m4, m3, 16 ;l
movu m5, m1
movu m6, m3
pxor m5, m2 ;i^j
@@ -2453,9 +2453,9 @@
movu [r0 + 32], m0
movu m0, [r1 + 96] ;i
- movu m1, [r1 + 96 + 2] ;j
+ psrld m1, m0, 16 ;j
movu m2, [r1 + r2 + 96] ;k
- movu m3, [r1 + r2 + 96 + 2] ;l
+ psrld m3, m2, 16 ;l
movu m4, m0
movu m5, m2
pxor m4, m1 ;i^j
@@ -2469,10 +2469,10 @@
pand m4, m5 ;(ij|kl)&st
pand m4, [hmulw_16p]
psubw m0, m4 ;Result
- movu m1, [r1 + 112] ;i
- movu m2, [r1 + 112 + 2] ;j
- movu m3, [r1 + r2 + 112] ;k
- movu m4, [r1 + r2 + 112 + 2] ;l
+ movu m1, [r1 + 112] ;i
+ psrld m2, m1, 16 ;j
+ movu m3, [r1 + r2 + 112] ;k
+ psrld m4, m3, 16 ;l
movu m5, m1
movu m6, m3
pxor m5, m2 ;i^j
@@ -2492,14 +2492,22 @@
punpcklqdq m0, m1
movu [r0 + 48], m0
lea r0, [r0 + 64]
+ lea r1, [r1 + 2 * r2]
+ dec r3d
+ jnz .loop
+ RET
%else
+
+INIT_XMM ssse3
+cglobal scale2D_64to32, 3, 4, 8, dest, src, stride
+ mov r3d, 32
mova m7, [deinterleave_shuf]
.loop
movu m0, [r1] ;i
- movu m1, [r1 + 1] ;j
+ psrlw m1, m0, 8 ;j
movu m2, [r1 + r2] ;k
- movu m3, [r1 + r2 + 1] ;l
+ psrlw m3, m2, 8 ;l
movu m4, m0
movu m5, m2
@@ -2517,9 +2525,9 @@
psubb m0, m4 ;Result
movu m1, [r1 + 16] ;i
- movu m2, [r1 + 16 + 1] ;j
+ psrlw m2, m1, 8 ;j
movu m3, [r1 + r2 + 16] ;k
- movu m4, [r1 + r2 + 16 + 1] ;l
+ psrlw m4, m3, 8 ;l
movu m5, m1
movu m6, m3
@@ -2543,9 +2551,9 @@
movu [r0], m0
movu m0, [r1 + 32] ;i
- movu m1, [r1 + 32 + 1] ;j
+ psrlw m1, m0, 8 ;j
movu m2, [r1 + r2 + 32] ;k
- movu m3, [r1 + r2 + 32 + 1] ;l
+ psrlw m3, m2, 8 ;l
movu m4, m0
movu m5, m2
@@ -2563,9 +2571,9 @@
psubb m0, m4 ;Result
movu m1, [r1 + 48] ;i
- movu m2, [r1 + 48 + 1] ;j
+ psrlw m2, m1, 8 ;j
movu m3, [r1 + r2 + 48] ;k
- movu m4, [r1 + r2 + 48 + 1] ;l
+ psrlw m4, m3, 8 ;l
movu m5, m1
movu m6, m3
@@ -2589,12 +2597,11 @@
movu [r0 + 16], m0
lea r0, [r0 + 32]
-%endif
lea r1, [r1 + 2 * r2]
dec r3d
jnz .loop
-
-RET
+ RET
+%endif
;-----------------------------------------------------------------------------
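
For reference, the operation being vectorized is a 2x2 box downscale of a
64x64 block into 32x32: the i/j/k/l comments in the asm mark the four
neighbours, and the pavg-plus-correction sequence produces their rounded
average without widening to 32 bits. A plain C++ sketch of that intent,
assuming the usual (i + j + k + l + 2) >> 2 rounding; this mirrors the
kernel's behaviour as read from the diff, not x265's exact C primitive:

#include <cstdint>

typedef uint16_t pixel; // uint8_t in the 8-bit build; 16-bit assumed here

// Average each 2x2 neighbourhood (i, j, k, l as in the asm's comments)
// of a 64x64 source block into a 32x32 destination, with rounding.
static void scale2D_64to32_ref(pixel* dst, const pixel* src, intptr_t stride)
{
    for (int y = 0; y < 64; y += 2)
    {
        for (int x = 0; x < 64; x += 2)
        {
            int i = src[y * stride + x];           // even col, even row
            int j = src[y * stride + x + 1];       // odd col,  even row
            int k = src[(y + 1) * stride + x];     // even col, odd row
            int l = src[(y + 1) * stride + x + 1]; // odd col,  odd row
            dst[(y / 2) * 32 + (x / 2)] = (pixel)((i + j + k + l + 2) >> 2);
        }
    }
}

int main()
{
    static pixel src[64 * 64], dst[32 * 32];
    for (int n = 0; n < 64 * 64; n++)
        src[n] = (pixel)(n & 0x3FF); // arbitrary 10-bit test pattern
    scale2D_64to32_ref(dst, src, 64); // stride 64: rows are contiguous
    // First output must equal the rounded average of the first 2x2 block.
    return dst[0] == ((src[0] + src[1] + src[64] + src[65] + 2) >> 2) ? 0 : 1;
}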