[x265] [PATCH] Bug fix for luma vpp asm routines. Also incorporated review comment changes

Steve Borho steve at borho.org
Thu Nov 7 23:11:19 CET 2013


On Thu, Nov 7, 2013 at 9:41 AM, <nabajit at multicorewareinc.com> wrote:

> # HG changeset patch
> # User Nabajit Deka
> # Date 1383838838 -19800
> #      Thu Nov 07 21:10:38 2013 +0530
> # Node ID a56c53581344df95e54f9cda919419f1d1ad0850
> # Parent  85002898f5b4308547af6ce464bbdff5f360fa13
> Bug fix for luma vpp asm routines. Also incorporated review comment changes.
>

Great, now the luma vpp assembly functions are enabled for motion
compensation in the encoder.

It would be really helpful if we could catch issues like this in our
testbench.  Perhaps we need to brainstorm a bit on how to do that.


> diff -r 85002898f5b4 -r a56c53581344 source/common/x86/ipfilter8.asm
> --- a/source/common/x86/ipfilter8.asm   Thu Nov 07 14:31:05 2013 +0530
> +++ b/source/common/x86/ipfilter8.asm   Thu Nov 07 21:10:38 2013 +0530
> @@ -2188,17 +2188,17 @@
>      movd        m0, [r0 + 2 * r1]
>      punpcklbw   m1, m0                         ; m1=[1 2]
>      punpcklqdq  m2, m1                         ; m2=[0 1 1 2]
> -    pmaddubsw   m7, m2, [r6 + 0 * 16]          ; m7 = [0+1 1+2]
> +    pmaddubsw   m4, m2, [r6 + 0 * 16]          ; m4=[0+1 1+2]
>
>      lea         r0, [r0 + 2 * r1]
>      movd        m1, [r0 + r1]
> -    punpcklbw   m6, m0, m1                     ; m2=[2 3]
> +    punpcklbw   m5, m0, m1                     ; m2=[2 3]
>      movd        m0, [r0 + 2 * r1]
>      punpcklbw   m1, m0                         ; m1=[3 4]
> -    punpcklqdq  m6, m1                         ; m6=[2 3 3 4]
> -    pmaddubsw   m2, m6, [r6 + 1 * 16]          ; m2 = [2+3 3+4]
> -    paddw       m7, m2                         ; m7=[0+1+2+3 1+2+3+4]
>  Row1-2
> -    pmaddubsw   m6, [r6 + 0 * 16]              ; m6 = [2+3 3+4]
>  Row3-4
> +    punpcklqdq  m5, m1                         ; m5=[2 3 3 4]
> +    pmaddubsw   m2, m5, [r6 + 1 * 16]          ; m2=[2+3 3+4]
> +    paddw       m4, m2                         ; m4=[0+1+2+3 1+2+3+4]
>  Row1-2
> +    pmaddubsw   m5, [r6 + 0 * 16]              ; m5=[2+3 3+4]
>  Row3-4
>
>      lea         r0, [r0 + 2 * r1]
>      movd        m1, [r0 + r1]
> @@ -2206,10 +2206,10 @@
>      movd        m0, [r0 + 2 * r1]
>      punpcklbw   m1, m0                         ; m1=[5 6]
>      punpcklqdq  m2, m1                         ; m2=[4 5 5 6]
> -    pmaddubsw   m1, m2, [r6 + 2 * 16]          ; m1 = [4+5 5+6]
> -    paddw       m7, m1                         ; m7=[0+1+2+3+4+5
> 1+2+3+4+5+6]      Row1-2
> -    pmaddubsw   m2, [r6 + 1 * 16]              ; m2 = [4+5 5+6]
> -    paddw       m6, m2                         ; m6=[2+3+4+5 3+4+5+6]
>          Row3-4
> +    pmaddubsw   m1, m2, [r6 + 2 * 16]          ; m1=[4+5 5+6]
> +    paddw       m4, m1                         ; m4=[0+1+2+3+4+5
> 1+2+3+4+5+6]      Row1-2
> +    pmaddubsw   m2, [r6 + 1 * 16]              ; m2=[4+5 5+6]
> +    paddw       m5, m2                         ; m5=[2+3+4+5 3+4+5+6]
>          Row3-4
>
>      lea         r0, [r0 + 2 * r1]
>      movd        m1, [r0 + r1]
> @@ -2217,10 +2217,10 @@
>      movd        m0, [r0 + 2 * r1]
>      punpcklbw   m1, m0                         ; m1=[7 8]
>      punpcklqdq  m2, m1                         ; m2=[6 7 7 8]
> -    pmaddubsw   m1, m2, [r6 + 3 * 16]          ; m1 = [6+7 7+8]
> -    paddw       m7, m1                         ; m7=[0+1+2+3+4+5+6+7
> 1+2+3+4+5+6+7+8]   Row1-2 end
> +    pmaddubsw   m1, m2, [r6 + 3 * 16]          ; m1=[6+7 7+8]
> +    paddw       m4, m1                         ; m4=[0+1+2+3+4+5+6+7
> 1+2+3+4+5+6+7+8]   Row1-2 end
>      pmaddubsw   m2, [r6 + 2 * 16]              ; m2=[6+7 7+8]
> -    paddw       m6, m2                         ; m6=[2+3+4+5+6+7
> 3+4+5+6+7+8]           Row3-4
> +    paddw       m5, m2                         ; m5=[2+3+4+5+6+7
> 3+4+5+6+7+8]           Row3-4
>
>      lea         r0, [r0 + 2 * r1]
>      movd        m1, [r0 + r1]
> @@ -2228,30 +2228,30 @@
>      movd        m0, [r0 + 2 * r1]
>      punpcklbw   m1, m0                         ; m1=[9 10]
>      punpcklqdq  m2, m1                         ; m2=[8 9 9 10]
> -    pmaddubsw   m2, [r6 + 3 * 16]              ; m2 = [8+9 9+10]
> -    paddw       m6, m2                         ; m6=[2+3+4+5+6+7+8+9
> 3+4+5+6+7+8+9+10]  Row3-4 end
> +    pmaddubsw   m2, [r6 + 3 * 16]              ; m2=[8+9 9+10]
> +    paddw       m5, m2                         ; m5=[2+3+4+5+6+7+8+9
> 3+4+5+6+7+8+9+10]  Row3-4 end
>  %endmacro
>
>  %macro PROCESS_LUMA_W8_4R 0
>      movq       m0, [r0]
>      movq       m1, [r0 + r1]
>      punpcklbw  m0, m1
> -    pmaddubsw  m7, m0, [r6 + 0 *16]            ;m7 = [0+1]  Row1
> +    pmaddubsw  m7, m0, [r6 + 0 *16]            ;m7=[0+1]  Row1
>
>      movq       m0, [r0 + 2 * r1]
>      punpcklbw  m1, m0
> -    pmaddubsw  m6, m1, [r6 + 0 *16]            ;m6 = [1+2]  Row2
> +    pmaddubsw  m6, m1, [r6 + 0 *16]            ;m6=[1+2]  Row2
>
>      lea        r0, [r0 + 2 * r1]
>      movq       m1, [r0 + r1]
>      punpcklbw  m0, m1
> -    pmaddubsw  m5, m0, [r6 + 0 *16]            ;m5 = [2+3]  Row3
> +    pmaddubsw  m5, m0, [r6 + 0 *16]            ;m5=[2+3]  Row3
>      pmaddubsw  m0, [r6 + 1 * 16]
> -    paddw      m7, m0                          ;m7 = [0+1+2+3]  Row1
> +    paddw      m7, m0                          ;m7=[0+1+2+3]  Row1
>
>      movq       m0, [r0 + 2 * r1]
>      punpcklbw  m1, m0
> -    pmaddubsw  m4, m1, [r6 + 0 *16]            ;m4 = [3+4]  Row4
> +    pmaddubsw  m4, m1, [r6 + 0 *16]            ;m4=[3+4]  Row4
>      pmaddubsw  m1, [r6 + 1 * 16]
>      paddw      m6, m1                          ;m6 = [1+2+3+4]  Row2
>
> @@ -2260,41 +2260,41 @@
>      punpcklbw  m0, m1
>      pmaddubsw  m2, m0, [r6 + 1 * 16]
>      pmaddubsw  m0, [r6 + 2 * 16]
> -    paddw      m7, m0                          ;m7 = [0+1+2+3+4+5]  Row1
> -    paddw      m5, m2                          ;m5 = [2+3+4+5]  Row3
> +    paddw      m7, m0                          ;m7=[0+1+2+3+4+5]  Row1
> +    paddw      m5, m2                          ;m5=[2+3+4+5]  Row3
>
>      movq       m0, [r0 + 2 * r1]
>      punpcklbw  m1, m0
>      pmaddubsw  m2, m1, [r6 + 1 * 16]
>      pmaddubsw  m1, [r6 + 2 * 16]
> -    paddw      m6, m1                          ;m6 = [1+2+3+4+5+6]  Row2
> -    paddw      m4, m2                          ;m4 = [3+4+5+6]  Row4
> +    paddw      m6, m1                          ;m6=[1+2+3+4+5+6]  Row2
> +    paddw      m4, m2                          ;m4=[3+4+5+6]  Row4
>
>      lea        r0, [r0 + 2 * r1]
>      movq       m1, [r0 + r1]
>      punpcklbw  m0, m1
>      pmaddubsw  m2, m0, [r6 + 2 * 16]
>      pmaddubsw  m0, [r6 + 3 * 16]
> -    paddw      m7, m0                          ;m7 = [0+1+2+3+4+5+6+7]
>  Row1 end
> -    paddw      m5, m2                          ;m5 = [2+3+4+5+6+7]  Row3
> +    paddw      m7, m0                          ;m7=[0+1+2+3+4+5+6+7]
>  Row1 end
> +    paddw      m5, m2                          ;m5=[2+3+4+5+6+7]  Row3
>
>      movq       m0, [r0 + 2 * r1]
>      punpcklbw  m1, m0
>      pmaddubsw  m2, m1, [r6 + 2 * 16]
>      pmaddubsw  m1, [r6 + 3 * 16]
> -    paddw      m6, m1                          ;m6 = [1+2+3+4+5+6+7+8]
>  Row2 end
> -    paddw      m4, m2                          ;m4 = [3+4+5+6+7+8]  Row4
> +    paddw      m6, m1                          ;m6=[1+2+3+4+5+6+7+8]
>  Row2 end
> +    paddw      m4, m2                          ;m4=[3+4+5+6+7+8]  Row4
>
>      lea        r0, [r0 + 2 * r1]
>      movq       m1, [r0 + r1]
>      punpcklbw  m0, m1
>      pmaddubsw  m0, [r6 + 3 * 16]
> -    paddw      m5, m0                          ;m5 = [2+3+4+5+6+7+8+9]
>  Row3 end
> +    paddw      m5, m0                          ;m5=[2+3+4+5+6+7+8+9]
>  Row3 end
>
>      movq       m0, [r0 + 2 * r1]
>      punpcklbw  m1, m0
>      pmaddubsw  m1, [r6 + 3 * 16]
> -    paddw      m4, m1                          ;m4 = [3+4+5+6+7+8+9+10]
>  Row4 end
> +    paddw      m4, m1                          ;m4=[3+4+5+6+7+8+9+10]
>  Row4 end
>  %endmacro
>
>
>  ;-------------------------------------------------------------------------------------------------------------
> @@ -2306,7 +2306,7 @@
>      lea       r5, [r1 + 2 * r1]
>      sub       r0, r5
>      shl       r4d, 6
> -%ifidn %3, ps
> +%ifidn %3,ps
>      add       r3d, r3d
>  %endif
>
> @@ -2317,7 +2317,7 @@
>      lea       r6, [tab_LumaCoeffVer + r4]
>  %endif
>
> -%ifidn %3, pp
> +%ifidn %3,pp
>      mova      m3, [tab_c_512]
>  %else
>      mova      m3, [tab_c_8192]
> @@ -2328,29 +2328,29 @@
>  .loopH
>      PROCESS_LUMA_W4_4R
>
> -%ifidn %3, pp
> -    pmulhrsw  m7, m3
> -    pmulhrsw  m6, m3
> -
> -    packuswb  m7, m7
> -    packuswb  m6, m6
> -
> -    movd      [r2], m7
> -    pshufd    m7, m7, 1
> -    movd      [r2 + r3], m7
> -    movd      [r2 + 2 * r3], m6
> -    pshufd    m6, m6, 1
> +%ifidn %3,pp
> +    pmulhrsw  m4, m3
> +    pmulhrsw  m5, m3
> +
> +    packuswb  m4, m4
> +    packuswb  m5, m5
> +
> +    movd      [r2], m4
> +    pshufd    m4, m4, 1
> +    movd      [r2 + r3], m4
> +    movd      [r2 + 2 * r3], m5
> +    pshufd    m5, m5, 1
>      lea       r5, [r3 + 2 * r3]
> -    movd      [r2 + r5], m6
> +    movd      [r2 + r5], m5
>  %else
> -    psubw  m7, m3
> -    psubw  m6, m3
> -
> -    movlps    [r2], m7
> -    movhps    [r2 + r3], m7
> -    movlps    [r2 + 2 * r3], m6
> +    psubw     m4, m3
> +    psubw     m5, m3
> +
> +    movlps    [r2], m4
> +    movhps    [r2 + r3], m4
> +    movlps    [r2 + 2 * r3], m5
>      lea       r5, [r3 + 2 * r3]
> -    movhps    [r2 + r5], m6
> +    movhps    [r2 + r5], m5
>  %endif
>
>      lea       r5, [4 * r1]
> @@ -2403,7 +2403,7 @@
>      sub       r0, r5
>      shl       r4d, 6
>
> -%ifidn %3, ps
> +%ifidn %3,ps
>      add       r3d, r3d
>  %endif
>
> @@ -2414,7 +2414,7 @@
>      lea       r6, [tab_LumaCoeffVer + r4]
>  %endif
>
> - %ifidn %3, pp
> + %ifidn %3,pp
>      mova      m3, [tab_c_512]
>  %else
>      mova      m3, [tab_c_8192]
> @@ -2425,7 +2425,7 @@
>  .loopH
>      PROCESS_LUMA_W8_4R
>
> -%ifidn %3, pp
> +%ifidn %3,pp
>      pmulhrsw  m7, m3
>      pmulhrsw  m6, m3
>      pmulhrsw  m5, m3
> @@ -2440,16 +2440,16 @@
>      lea       r5, [r3 + 2 * r3]
>      movhps    [r2 + r5], m5
>  %else
> -    psubw  m7, m3
> -    psubw  m6, m3
> -    psubw  m5, m3
> -    psubw  m4, m3
> -
> -    movu    [r2], m7
> -    movu    [r2 + r3], m6
> -    movu    [r2 + 2 * r3], m5
> -    lea     r5, [r3 + 2 * r3]
> -    movu    [r2 + r5], m4
> +    psubw     m7, m3
> +    psubw     m6, m3
> +    psubw     m5, m3
> +    psubw     m4, m3
> +
> +    movu      [r2], m7
> +    movu      [r2 + r3], m6
> +    movu      [r2 + 2 * r3], m5
> +    lea       r5, [r3 + 2 * r3]
> +    movu      [r2 + r5], m4
>  %endif
>
>      lea       r5, [4 * r1]
> @@ -2511,7 +2511,7 @@
>      lea       r5, [r1 + 2 * r1]
>      sub       r0, r5
>      shl       r4d, 6
> -%ifidn %3, ps
> +%ifidn %3,ps
>      add       r3d, r3d
>  %endif
>
> @@ -2522,7 +2522,7 @@
>      lea       r6, [tab_LumaCoeffVer + r4]
>  %endif
>
> - %ifidn %3, pp
> + %ifidn %3,pp
>      mova      m3, [tab_c_512]
>  %else
>      mova      m3, [tab_c_8192]
> @@ -2533,7 +2533,7 @@
>  .loopH
>      PROCESS_LUMA_W8_4R
>
> -%ifidn %3, pp
> +%ifidn %3,pp
>      pmulhrsw  m7, m3
>      pmulhrsw  m6, m3
>      pmulhrsw  m5, m3
> @@ -2548,21 +2548,21 @@
>      lea       r5, [r3 + 2 * r3]
>      movhps    [r2 + r5], m5
>  %else
> -    psubw  m7, m3
> -    psubw  m6, m3
> -    psubw  m5, m3
> -    psubw  m4, m3
> -
> -    movu    [r2], m7
> -    movu    [r2 + r3], m6
> -    movu    [r2 + 2 * r3], m5
> -    lea     r5, [r3 + 2 * r3]
> -    movu    [r2 + r5], m4
> +    psubw     m7, m3
> +    psubw     m6, m3
> +    psubw     m5, m3
> +    psubw     m4, m3
> +
> +    movu      [r2], m7
> +    movu      [r2 + r3], m6
> +    movu      [r2 + 2 * r3], m5
> +    lea       r5, [r3 + 2 * r3]
> +    movu      [r2 + r5], m4
>  %endif
>
>      lea       r5, [8 * r1 - 8]
>      sub       r0, r5
> -%ifidn %3, pp
> +%ifidn %3,pp
>      add       r2, 8
>  %else
>      add       r2, 16
> @@ -2570,34 +2570,34 @@
>
>      PROCESS_LUMA_W4_4R
>
> -%ifidn %3, pp
> -    pmulhrsw  m7, m3
> -    pmulhrsw  m6, m3
> -
> -    packuswb  m7, m7
> -    packuswb  m6, m6
> -
> -    movd      [r2], m7
> -    pshufd    m7, m7, 1
> -    movd      [r2 + r3], m7
> -    movd      [r2 + 2 * r3], m6
> -    pshufd    m6, m6, 1
> +%ifidn %3,pp
> +    pmulhrsw  m4, m3
> +    pmulhrsw  m5, m3
> +
> +    packuswb  m4, m4
> +    packuswb  m5, m5
> +
> +    movd      [r2], m4
> +    pshufd    m4, m4, 1
> +    movd      [r2 + r3], m4
> +    movd      [r2 + 2 * r3], m5
> +    pshufd    m5, m5, 1
>      lea       r5, [r3 + 2 * r3]
> -    movd      [r2 + r5], m6
> +    movd      [r2 + r5], m5
>  %else
> -    psubw  m7, m3
> -    psubw  m6, m3
> -
> -    movlps    [r2], m7
> -    movhps    [r2 + r3], m7
> -    movlps    [r2 + 2 * r3], m6
> +    psubw     m4, m3
> +    psubw     m5, m3
> +
> +    movlps    [r2], m4
> +    movhps    [r2 + r3], m4
> +    movlps    [r2 + 2 * r3], m5
>      lea       r5, [r3 + 2 * r3]
> -    movhps    [r2 + r5], m6
> +    movhps    [r2 + r5], m5
>  %endif
>
>      lea       r5, [4 * r1 + 8]
>      sub       r0, r5
> -%ifidn %3, pp
> +%ifidn %3,pp
>      lea       r2, [r2 + 4 * r3 - 8]
>  %else
>      lea       r2, [r2 + 4 * r3 - 16]
> @@ -2628,7 +2628,7 @@
>      lea       r5, [r1 + 2 * r1]
>      sub       r0, r5
>      shl       r4d, 6
> -%ifidn %3, ps
> +%ifidn %3,ps
>      add       r3d, r3d
>  %endif
>
> @@ -2639,7 +2639,7 @@
>      lea       r6, [tab_LumaCoeffVer + r4]
>  %endif
>
> -%ifidn %3, pp
> +%ifidn %3,pp
>      mova      m3, [tab_c_512]
>  %else
>      mova      m3, [tab_c_8192]
> @@ -2650,7 +2650,7 @@
>      mov       r4d, (%1/8)
>  .loopW
>      PROCESS_LUMA_W8_4R
> -%ifidn %3, pp
> +%ifidn %3,pp
>      pmulhrsw  m7, m3
>      pmulhrsw  m6, m3
>      pmulhrsw  m5, m3
> @@ -2665,30 +2665,30 @@
>      lea       r5, [r3 + 2 * r3]
>      movhps    [r2 + r5], m5
>  %else
> -    psubw  m7, m3
> -    psubw  m6, m3
> -    psubw  m5, m3
> -    psubw  m4, m3
> -
> -    movu    [r2], m7
> -    movu    [r2 + r3], m6
> -    movu    [r2 + 2 * r3], m5
> -    lea     r5, [r3 + 2 * r3]
> -    movu    [r2 + r5], m4
> +    psubw     m7, m3
> +    psubw     m6, m3
> +    psubw     m5, m3
> +    psubw     m4, m3
> +
> +    movu      [r2], m7
> +    movu      [r2 + r3], m6
> +    movu      [r2 + 2 * r3], m5
> +    lea       r5, [r3 + 2 * r3]
> +    movu      [r2 + r5], m4
>  %endif
>
>      lea       r5, [8 * r1 - 8]
>      sub       r0, r5
> -%ifidn %3, pp
> +%ifidn %3,pp
>      add       r2, 8
>  %else
>      add       r2, 16
>  %endif
>      dec       r4d
> -    jnz      .loopW
> +    jnz       .loopW
>
>      lea       r0, [r0 + 4 * r1 - %1]
> -%ifidn %3, pp
> +%ifidn %3,pp
>      lea       r2, [r2 + 4 * r3 - %1]
>  %else
>      lea       r2, [r2 + 4 * r3 - 2 * %1]
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>



-- 
Steve Borho
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131107/fa5ee55e/attachment-0001.html>


More information about the x265-devel mailing list