[x265] [PATCH 2 of 2] asm: fix buffer overwrite bug in upShift_8_avx2

Min Chen chenm003 at 163.com
Thu Jul 2 02:32:16 CEST 2015


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1435795552 25200
# Node ID f57693f3e4b7a548336eee93422b4b76c68a372e
# Parent  42547f20aca0aeafb0305789728b936cb07107ec
asm: fix buffer overwrite bug in upShift_8_avx2
---
 source/common/x86/pixel-a.asm |  103 ++++++++++++++---------------------------
 1 files changed, 35 insertions(+), 68 deletions(-)

diff -r 42547f20aca0 -r f57693f3e4b7 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Wed Jul 01 17:05:48 2015 -0700
+++ b/source/common/x86/pixel-a.asm	Wed Jul 01 17:05:52 2015 -0700
@@ -69,6 +69,7 @@
 cextern popcnt_table
 cextern pd_2
 cextern hmul_16p
+cextern pb_movemask
 
 ;=============================================================================
 ; SATD
@@ -7299,10 +7300,9 @@
 ;void planecopy_cp(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift)
 ;---------------------------------------------------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal upShift_8, 7,7,3
-
-    movd        m2, r6d        ; m0 = shift
-    add         r3, r3
+cglobal upShift_8, 6,7,3
+    movd        m2, r6m        ; m2 = shift
+    add         r3d, r3d
     dec         r5d
 
 .loopH:
@@ -7393,88 +7393,55 @@
 ;---------------------------------------------------------------------------------------------------------------------
 %if ARCH_X86_64
 INIT_YMM avx2
-cglobal upShift_8, 7,8,3
-    movd        xm2, r6d
-    add         r3, r3
+cglobal upShift_8, 6,7,4       ; x86inc: 6 args loaded into r0-r5, 7 GPRs used, 4 vector regs (m0-m3)
+    movd        xm2, r6m       ; xm2 = 7th argument (read from its argument slot), used as the psllw shift count
+    add         r3d, r3d       ; r3 = 2 * r3, the dst row step; NOTE(review): 32-bit add zero-extends, a negative or >2^31 stride would be truncated - confirm callers
+    dec         r5d            ; r5 = rows left for .loopH; the final row is handled separately after the loop
 
 .loopH:
-    xor         r7, r7
-    mov         r6d, r4d
+    xor         r6, r6         ; r6 = column offset inside the current row
 .loopW:
-    pmovzxbw    m0,[r0 + r7]
-    pmovzxbw    m1,[r0 + r7 + 16]
+    pmovzxbw    m0,[r0 + r6]   ; zero-extend 16 source bytes to 16 words
+    pmovzxbw    m1,[r0 + r6 + mmsize/2] ; same for the following 16 source bytes
     psllw       m0, xm2
     psllw       m1, xm2
-    movu        [r2 + r7 * 2], m0
-    movu        [r2 + r7 * 2 + 32], m1
-
-    add         r7d, 32
-    sub         r6d, 32
-    jg          .loopW
+    movu        [r2 + r6 * 2], m0       ; store 16 shifted words
+    movu        [r2 + r6 * 2 + mmsize], m1 ; store the next 16 shifted words
+
+    add         r6d, mmsize    ; 32 source bytes consumed per iteration
+    cmp         r6d, r4d
+    jl         .loopW          ; repeat while r6 < r4 (width); the last iteration may run past width, up to the next multiple of 32
 
     ; move to next row
     add         r0, r1
     add         r2, r3
     dec         r5d
-    jnz         .loopH
-
-;processing last row of every frame [To handle width which not a multiple of 16]
-
-.loop16:
+    jg         .loopH          ; NOTE(review): loop is bottom-tested, so with height < 2 a row is still processed here before the tail row below
+
+    ; process the last row of every frame without the 32-wide overshoot of .loopW [handles a width that is not a multiple of 32]
+    lea         r3, [pb_movemask + 16] ; r3 is free to reuse here: no further row stepping happens
+    mov         r5d, 15
+    and         r5d, r4d       ; r5 = width & 15, the size of the partial block
+    sub         r3, r5         ; slide the 16-byte mask window back by r5 bytes
+    pmovsxbw    m3, [r3]       ; m3 = 16 word masks; presumably 0 for the first r5 words and -1 after (depends on pb_movemask layout, not visible here - confirm)
+
+    ; NOTE: width MUST be greater than or equal to 16 (.loopW2 is bottom-tested and always stores one full block)
+    shr         r4d, 4         ; r4 = width / 16, the number of full 16-pixel blocks
+.loopW2:
     pmovzxbw    m0,[r0]
     psllw       m0, xm2
     movu        [r2], m0
-
-    add         r0, mmsize
-    add         r2, 2 * mmsize
-    sub         r4d, 16
-    jg          .loop16
-    jz          .end
-
-    cmp         r4d, 8
-    jl          .process4
+    add         r0, mmsize/2   ; advance src by 16 bytes
+    add         r2, mmsize     ; advance dst by 16 words (32 bytes)
+    dec         r4d
+    jg         .loopW2
+
+.nextW2:
+    ; process the partial block of 16 (always executed, even when width & 15 == 0)
     pmovzxbw    m0,[r0]
     psllw       m0, xm2
+    vpblendvb   m0, m0, [r2], m3 ; where the m3 mask byte has its top bit set, keep the word already in dst, so the store below rewrites it unchanged
     movu        [r2], m0
-
-    add         r0, 8
-    add         r2, mmsize
-    sub         r4d, 8
-    jz          .end
-
-.process4:
-    cmp         r4d, 4
-    jl          .process2
-    movq        xm0,[r0]
-    pmovzxbw    m0,xm0
-    psllw       xm0, xm2
-    movq        [r2], xm0
-
-    add         r0, 4
-    add         r2, 8
-    sub         r4d, 4
-    jz          .end
-
-.process2:
-    cmp         r4d, 2
-    jl          .process1
-    movzx       r3d, byte [r0]
-    shl         r3d, 2
-    mov         [r2], r3w
-    movzx       r3d, byte [r0 + 1]
-    shl         r3d, 2
-    mov         [r2 + 2], r3w
-
-    add         r0, 2
-    add         r2, 4
-    sub         r4d, 2
-    jz          .end
-
-.process1:
-    movzx       r3d, byte [r0]
-    shl         r3d, 2
-    mov         [r2], r3w
-.end:
     RET
 %endif
 



More information about the x265-devel mailing list