[x265] [PATCH 2 of 2] asm: fix buffer overwrite bug in upShift_8_avx2
Min Chen
chenm003 at 163.com
Thu Jul 2 02:32:16 CEST 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1435795552 25200
# Node ID f57693f3e4b7a548336eee93422b4b76c68a372e
# Parent 42547f20aca0aeafb0305789728b936cb07107ec
asm: fix buffer overwrite bug in upShift_8_avx2
---
source/common/x86/pixel-a.asm | 103 ++++++++++++++---------------------------
1 files changed, 35 insertions(+), 68 deletions(-)
diff -r 42547f20aca0 -r f57693f3e4b7 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Wed Jul 01 17:05:48 2015 -0700
+++ b/source/common/x86/pixel-a.asm Wed Jul 01 17:05:52 2015 -0700
@@ -69,6 +69,7 @@
cextern popcnt_table
cextern pd_2
cextern hmul_16p
+cextern pb_movemask
;=============================================================================
; SATD
@@ -7299,10 +7300,9 @@
;void planecopy_cp(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift)
;---------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal upShift_8, 7,7,3
-
- movd m2, r6d ; m0 = shift
- add r3, r3
+cglobal upShift_8, 6,7,3
+ movd m2, r6m ; m2 = shift
+ add r3d, r3d
dec r5d
.loopH:
@@ -7393,88 +7393,55 @@
;---------------------------------------------------------------------------------------------------------------------
%if ARCH_X86_64
INIT_YMM avx2
-cglobal upShift_8, 7,8,3
- movd xm2, r6d
- add r3, r3
+cglobal upShift_8, 6,7,4
+ movd xm2, r6m
+ add r3d, r3d
+ dec r5d
.loopH:
- xor r7, r7
- mov r6d, r4d
+ xor r6, r6
.loopW:
- pmovzxbw m0,[r0 + r7]
- pmovzxbw m1,[r0 + r7 + 16]
+ pmovzxbw m0,[r0 + r6]
+ pmovzxbw m1,[r0 + r6 + mmsize/2]
psllw m0, xm2
psllw m1, xm2
- movu [r2 + r7 * 2], m0
- movu [r2 + r7 * 2 + 32], m1
-
- add r7d, 32
- sub r6d, 32
- jg .loopW
+ movu [r2 + r6 * 2], m0
+ movu [r2 + r6 * 2 + mmsize], m1
+
+ add r6d, mmsize
+ cmp r6d, r4d
+ jl .loopW
; move to next row
add r0, r1
add r2, r3
dec r5d
- jnz .loopH
-
-;processing last row of every frame [To handle width which not a multiple of 16]
-
-.loop16:
+ jg .loopH
+
+ ; process the last row of every frame separately [to handle a width which is not a multiple of 32]
+ lea r3, [pb_movemask + 16]
+ mov r5d, 15
+ and r5d, r4d
+ sub r3, r5
+ pmovsxbw m3, [r3]
+
+ ; NOTE: Width MUST BE more than or equal to 16
+ shr r4d, 4
+.loopW2:
pmovzxbw m0,[r0]
psllw m0, xm2
movu [r2], m0
-
- add r0, mmsize
- add r2, 2 * mmsize
- sub r4d, 16
- jg .loop16
- jz .end
-
- cmp r4d, 8
- jl .process4
+ add r0, mmsize/2
+ add r2, mmsize
+ dec r4d
+ jg .loopW2
+
+.nextW2:
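+ ; process the remaining partial group (width & 15 pixels); lanes selected by the m3 mask keep the existing destination data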
pmovzxbw m0,[r0]
psllw m0, xm2
+ vpblendvb m0, m0, [r2], m3
movu [r2], m0
-
- add r0, 8
- add r2, mmsize
- sub r4d, 8
- jz .end
-
-.process4:
- cmp r4d, 4
- jl .process2
- movq xm0,[r0]
- pmovzxbw m0,xm0
- psllw xm0, xm2
- movq [r2], xm0
-
- add r0, 4
- add r2, 8
- sub r4d, 4
- jz .end
-
-.process2:
- cmp r4d, 2
- jl .process1
- movzx r3d, byte [r0]
- shl r3d, 2
- mov [r2], r3w
- movzx r3d, byte [r0 + 1]
- shl r3d, 2
- mov [r2 + 2], r3w
-
- add r0, 2
- add r2, 4
- sub r4d, 2
- jz .end
-
-.process1:
- movzx r3d, byte [r0]
- shl r3d, 2
- mov [r2], r3w
-.end:
RET
%endif
More information about the x265-devel
mailing list