[x265] [PATCH] asm: replace mova by movu to avoid AVX2 testbench crash in dct16, dct32, denoise_dct; it's the same speed on Haswell

Deepthi Nandakumar deepthi at multicorewareinc.com
Tue Sep 23 19:34:34 CEST 2014


Thanks, Min. Pushed. However, I still get the testbench error message
"quantcoeff/dequantcoeff buffer not aligned". Does this change also need to
be applied to quant/dequant?

Thanks,
Deepthi

On Wed, Sep 24, 2014 at 12:50 AM, Min Chen <chenm003 at 163.com> wrote:

> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1411499911 25200
> # Node ID 439637e2e34800ba31dbfe28946946264af39380
> # Parent  ee76b64fd051b529cc57c4fae7d8b7e0b6f8463e
> asm: replace mova by movu to avoid AVX2 testbench crash in dct16, dct32,
> denoise_dct, its same speed on Haswell
>
> diff -r ee76b64fd051 -r 439637e2e348 source/common/x86/dct8.asm
> --- a/source/common/x86/dct8.asm        Mon Sep 22 21:28:59 2014 +0900
> +++ b/source/common/x86/dct8.asm        Tue Sep 23 12:18:31 2014 -0700
> @@ -1108,17 +1108,17 @@
>      pxor     m5,  m5
>      shr      r3d, 3
>  .loop:
> -    mova     m0, [r0]
> +    movu     m0, [r0]
>      pabsd    m1, m0
> -    mova     m2, [r1]
> +    movu     m2, [r1]
>      paddd    m2, m1
> -    mova     [r1], m2
> +    movu     [r1], m2
>      pmovzxwd m3, [r2]
>      psubd    m1, m3
>      pcmpgtd  m4, m1, m5
>      pand     m1, m4
>      psignd   m1, m0
> -    mova     [r0], m1
> +    movu     [r0], m1
>      add      r0, 32
>      add      r1, 32
>      add      r2, 16
> @@ -1197,10 +1197,10 @@
>  cglobal dct16, 3, 9, 15, 0-16*mmsize
>  %if BIT_DEPTH == 10
>      %define         DCT_SHIFT          5
> -    vpbroadcastd    m9,                [pd_16]
> +    vbroadcasti128  m9,                [pd_16]
>  %elif BIT_DEPTH == 8
>      %define         DCT_SHIFT          3
> -    vpbroadcastd    m9,                [pd_4]
> +    vbroadcasti128  m9,                [pd_4]
>  %else
>      %error Unsupported BIT_DEPTH!
>  %endif
> @@ -1219,23 +1219,23 @@
>  .pass1:
>      lea             r6,                [r0 + r2 * 4]
>
> -    mova            m2,                [r0]
> -    mova            m1,                [r6]
> +    movu            m2,                [r0]
> +    movu            m1,                [r6]
>      vperm2i128      m0,                m2, m1, 0x20        ; [row0lo
> row4lo]
>      vperm2i128      m1,                m2, m1, 0x31        ; [row0hi
> row4hi]
>
> -    mova            m4,                [r0 + r2]
> -    mova            m3,                [r6 + r2]
> +    movu            m4,                [r0 + r2]
> +    movu            m3,                [r6 + r2]
>      vperm2i128      m2,                m4, m3, 0x20        ; [row1lo
> row5lo]
>      vperm2i128      m3,                m4, m3, 0x31        ; [row1hi
> row5hi]
>
> -    mova            m6,                [r0 + r2 * 2]
> -    mova            m5,                [r6 + r2 * 2]
> +    movu            m6,                [r0 + r2 * 2]
> +    movu            m5,                [r6 + r2 * 2]
>      vperm2i128      m4,                m6, m5, 0x20        ; [row2lo
> row6lo]
>      vperm2i128      m5,                m6, m5, 0x31        ; [row2hi
> row6hi]
>
> -    mova            m8,                [r0 + r3]
> -    mova            m7,                [r6 + r3]
> +    movu            m8,                [r0 + r3]
> +    movu            m7,                [r6 + r3]
>      vperm2i128      m6,                m8, m7, 0x20        ; [row3lo
> row7lo]
>      vperm2i128      m7,                m8, m7, 0x31        ; [row3hi
> row7hi]
>
> @@ -1296,7 +1296,7 @@
>      mov             r4d,               2
>      mov             r2d,               64
>      lea             r3,                [r2 * 3]
> -    vpbroadcastd    m9,                [pd_512]
> +    vbroadcasti128  m9,                [pd_512]
>
>  .pass2:
>      mova            m0,                [r5 + 0 * 32]        ; [row0lo
> row4lo]
> @@ -1312,43 +1312,43 @@
>      mova            m7,                [r5 + 11 * 32]       ; [row3hi
> row7hi]
>
>      DCT16_PASS_2    -8 * 16
> -    mova            [r1],              m10
> +    movu            [r1],              m10
>      DCT16_PASS_2    -7 * 16
> -    mova            [r1 + r2],         m10
> +    movu            [r1 + r2],         m10
>      DCT16_PASS_2    -6 * 16
> -    mova            [r1 + r2 * 2],     m10
> +    movu            [r1 + r2 * 2],     m10
>      DCT16_PASS_2    -5 * 16
> -    mova            [r1 + r3],         m10
> +    movu            [r1 + r3],         m10
>
>      lea             r6,                [r1 + r2 * 4]
>      DCT16_PASS_2    -4 * 16
> -    mova            [r6],              m10
> +    movu            [r6],              m10
>      DCT16_PASS_2    -3 * 16
> -    mova            [r6 + r2],         m10
> +    movu            [r6 + r2],         m10
>      DCT16_PASS_2    -2 * 16
> -    mova            [r6 + r2 * 2],     m10
> +    movu            [r6 + r2 * 2],     m10
>      DCT16_PASS_2    -1 * 16
> -    mova            [r6 + r3],         m10
> +    movu            [r6 + r3],         m10
>
>      lea             r6,                [r6 + r2 * 4]
>      DCT16_PASS_2    0 * 16
> -    mova            [r6],              m10
> +    movu            [r6],              m10
>      DCT16_PASS_2    1 * 16
> -    mova            [r6 + r2],         m10
> +    movu            [r6 + r2],         m10
>      DCT16_PASS_2    2 * 16
> -    mova            [r6 + r2 * 2],     m10
> +    movu            [r6 + r2 * 2],     m10
>      DCT16_PASS_2    3 * 16
> -    mova            [r6 + r3],         m10
> +    movu            [r6 + r3],         m10
>
>      lea             r6,                [r6 + r2 * 4]
>      DCT16_PASS_2    4 * 16
> -    mova            [r6],              m10
> +    movu            [r6],              m10
>      DCT16_PASS_2    5 * 16
> -    mova            [r6 + r2],         m10
> +    movu            [r6 + r2],         m10
>      DCT16_PASS_2    6 * 16
> -    mova            [r6 + r2 * 2],     m10
> +    movu            [r6 + r2 * 2],     m10
>      DCT16_PASS_2    7 * 16
> -    mova            [r6 + r3],         m10
> +    movu            [r6 + r3],         m10
>
>      add             r1,                32
>      add             r5,                128
> @@ -1442,15 +1442,15 @@
>      mova            m15,               [dct16_shuf1]
>
>  .pass1:
> -    mova            m2,                [r0]
> -    mova            m1,                [r0 + 32]
> +    movu            m2,                [r0]
> +    movu            m1,                [r0 + 32]
>      pshufb          m1,                m15
>      vpermq          m1,                m1, 0x4E
>      psubw           m7,                m2, m1
>      paddw           m2,                m1
>
> -    mova            m1,                [r0 + r2 * 2]
> -    mova            m0,                [r0 + r2 * 2 + 32]
> +    movu            m1,                [r0 + r2 * 2]
> +    movu            m0,                [r0 + r2 * 2 + 32]
>      pshufb          m0,                m15
>      vpermq          m0,                m0, 0x4E
>      psubw           m8,                m1, m0
> @@ -1465,15 +1465,15 @@
>      vperm2i128      m6,                m7, m8, 0x31        ; [row0hi
> row2hi] for O
>
>
> -    mova            m4,                [r0 + r2]
> -    mova            m2,                [r0 + r2 + 32]
> +    movu            m4,                [r0 + r2]
> +    movu            m2,                [r0 + r2 + 32]
>      pshufb          m2,                m15
>      vpermq          m2,                m2, 0x4E
>      psubw           m10,               m4, m2
>      paddw           m4,                m2
>
> -    mova            m3,                [r0 + r3]
> -    mova            m2,                [r0 + r3 + 32]
> +    movu            m3,                [r0 + r3]
> +    movu            m2,                [r0 + r3 + 32]
>      pshufb          m2,                m15
>      vpermq          m2,                m2, 0x4E
>      psubw           m11,               m3, m2
> @@ -1531,83 +1531,83 @@
>      mova            m7,                [r5 + 3 * 64 + 32]
>
>      DCT32_PASS_2    0 * 32
> -    mova            [r1],              xm11
> +    movu            [r1],              xm11
>      DCT32_PASS_2    1 * 32
> -    mova            [r1 + r2],         xm11
> +    movu            [r1 + r2],         xm11
>      DCT32_PASS_2    2 * 32
> -    mova            [r1 + r2 * 2],     xm11
> +    movu            [r1 + r2 * 2],     xm11
>      DCT32_PASS_2    3 * 32
> -    mova            [r1 + r3],         xm11
> +    movu            [r1 + r3],         xm11
>
>      lea             r6,                [r1 + r2 * 4]
>      DCT32_PASS_2    4 * 32
> -    mova            [r6],              xm11
> +    movu            [r6],              xm11
>      DCT32_PASS_2    5 * 32
> -    mova            [r6 + r2],         xm11
> +    movu            [r6 + r2],         xm11
>      DCT32_PASS_2    6 * 32
> -    mova            [r6 + r2 * 2],     xm11
> +    movu            [r6 + r2 * 2],     xm11
>      DCT32_PASS_2    7 * 32
> -    mova            [r6 + r3],         xm11
> +    movu            [r6 + r3],         xm11
>
>      lea             r6,                [r6 + r2 * 4]
>      DCT32_PASS_2    8 * 32
> -    mova            [r6],              xm11
> +    movu            [r6],              xm11
>      DCT32_PASS_2    9 * 32
> -    mova            [r6 + r2],         xm11
> +    movu            [r6 + r2],         xm11
>      DCT32_PASS_2    10 * 32
> -    mova            [r6 + r2 * 2],     xm11
> +    movu            [r6 + r2 * 2],     xm11
>      DCT32_PASS_2    11 * 32
> -    mova            [r6 + r3],         xm11
> +    movu            [r6 + r3],         xm11
>
>      lea             r6,                [r6 + r2 * 4]
>      DCT32_PASS_2    12 * 32
> -    mova            [r6],              xm11
> +    movu            [r6],              xm11
>      DCT32_PASS_2    13 * 32
> -    mova            [r6 + r2],         xm11
> +    movu            [r6 + r2],         xm11
>      DCT32_PASS_2    14 * 32
> -    mova            [r6 + r2 * 2],     xm11
> +    movu            [r6 + r2 * 2],     xm11
>      DCT32_PASS_2    15 * 32
> -    mova            [r6 + r3],         xm11
> +    movu            [r6 + r3],         xm11
>
>      lea             r6,                [r6 + r2 * 4]
>      DCT32_PASS_2    16 * 32
> -    mova            [r6],              xm11
> +    movu            [r6],              xm11
>      DCT32_PASS_2    17 * 32
> -    mova            [r6 + r2],         xm11
> +    movu            [r6 + r2],         xm11
>      DCT32_PASS_2    18 * 32
> -    mova            [r6 + r2 * 2],     xm11
> +    movu            [r6 + r2 * 2],     xm11
>      DCT32_PASS_2    19 * 32
> -    mova            [r6 + r3],         xm11
> +    movu            [r6 + r3],         xm11
>
>      lea             r6,                [r6 + r2 * 4]
>      DCT32_PASS_2    20 * 32
> -    mova            [r6],              xm11
> +    movu            [r6],              xm11
>      DCT32_PASS_2    21 * 32
> -    mova            [r6 + r2],         xm11
> +    movu            [r6 + r2],         xm11
>      DCT32_PASS_2    22 * 32
> -    mova            [r6 + r2 * 2],     xm11
> +    movu            [r6 + r2 * 2],     xm11
>      DCT32_PASS_2    23 * 32
> -    mova            [r6 + r3],         xm11
> +    movu            [r6 + r3],         xm11
>
>      lea             r6,                [r6 + r2 * 4]
>      DCT32_PASS_2    24 * 32
> -    mova            [r6],              xm11
> +    movu            [r6],              xm11
>      DCT32_PASS_2    25 * 32
> -    mova            [r6 + r2],         xm11
> +    movu            [r6 + r2],         xm11
>      DCT32_PASS_2    26 * 32
> -    mova            [r6 + r2 * 2],     xm11
> +    movu            [r6 + r2 * 2],     xm11
>      DCT32_PASS_2    27 * 32
> -    mova            [r6 + r3],         xm11
> +    movu            [r6 + r3],         xm11
>
>      lea             r6,                [r6 + r2 * 4]
>      DCT32_PASS_2    28 * 32
> -    mova            [r6],              xm11
> +    movu            [r6],              xm11
>      DCT32_PASS_2    29 * 32
> -    mova            [r6 + r2],         xm11
> +    movu            [r6 + r2],         xm11
>      DCT32_PASS_2    30 * 32
> -    mova            [r6 + r2 * 2],     xm11
> +    movu            [r6 + r2 * 2],     xm11
>      DCT32_PASS_2    31 * 32
> -    mova            [r6 + r3],         xm11
> +    movu            [r6 + r3],         xm11
>
>      add             r5,                256
>      add             r1,                16
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20140923/01b02578/attachment-0001.html>


More information about the x265-devel mailing list