[x265] [PATCH] asm: removed some duplicate constants and moved others into const-a.asm

Steve Borho steve at borho.org
Tue May 19 17:17:54 CEST 2015


On 05/19, dnyaneshwar at multicorewareinc.com wrote:
> # HG changeset patch
> # User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
> # Date 1432028888 -19800
> #      Tue May 19 15:18:08 2015 +0530
> # Node ID b44cdf8dc08c77e84b8707992cd0006bbf23d864
> # Parent  ac32faec79be9c6a60d267086b4563bd884537c0
> asm: removed some duplicate constants and moved others into const-a.asm

looks fine, but it does not apply on the tip

> diff -r ac32faec79be -r b44cdf8dc08c source/common/x86/const-a.asm
> --- a/source/common/x86/const-a.asm	Mon May 18 18:03:19 2015 +0530
> +++ b/source/common/x86/const-a.asm	Tue May 19 15:18:08 2015 +0530
> @@ -63,6 +63,8 @@
>  
>  const pw_1,                 times 16 dw 1
>  const pw_2,                 times 16 dw 2
> +const pw_3,                 times 16 dw 3
> +const pw_7,                 times 16 dw 7
>  const pw_m2,                times  8 dw -2
>  const pw_4,                 times  8 dw 4
>  const pw_8,                 times  8 dw 8
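
pw_3 and pw_7 follow the 'times 16 dw' convention of the entries around
them, while the pw_planar*_1 tables they replace below were only eight
words wide; since the SSE planar kernels never load more than mmsize
bytes of them, e.g.

    pmullw  m0, m1, [pw_7]   ; xmm load reads only the first 8 words, all 7s

the substitution is bit-exact.
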
> @@ -110,6 +112,7 @@
>  const pd_4,                 times  4 dd 4
>  const pd_8,                 times  4 dd 8
>  const pd_16,                times  4 dd 16
> +const pd_31,                times  4 dd 31
>  const pd_32,                times  4 dd 32
>  const pd_64,                times  4 dd 64
>  const pd_128,               times  4 dd 128
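
For reference, everything added here feeds the HEVC planar interpolation
that these kernels implement:

    ; P(x,y) = ( (N-1-x)*left[y]  + (x+1)*topRight
    ;          + (N-1-y)*above[x] + (y+1)*bottomLeft + N ) >> (log2(N)+1)

so the 3/7/15/31 word constants and the new dword pd_31 are the (N-1)
factors for N = 4, 8, 16 and 32, and the matching rounding/shift pairs
show up below as paddw [pw_8] / psraw 4 for 8x8 and paddw [pw_16] /
psraw 5 for 16x16.
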
> diff -r ac32faec79be -r b44cdf8dc08c source/common/x86/intrapred16.asm
> --- a/source/common/x86/intrapred16.asm	Mon May 18 18:03:19 2015 +0530
> +++ b/source/common/x86/intrapred16.asm	Tue May 19 15:18:08 2015 +0530
> @@ -44,7 +44,6 @@
>  const pw_punpcklwd,         db  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9
>  const c_mode32_10_0,        db  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1
>  
> -const pw_unpackwdq, times 8 db 0,1
>  const pw_ang8_12,   db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 13, 0, 1
>  const pw_ang8_13,   db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 15, 8, 9, 0, 1
>  const pw_ang8_14,   db 0, 0, 0, 0, 0, 0, 0, 0, 14, 15, 10, 11, 4, 5, 0, 1
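
pw_unpackwdq was only ever a pshufb byte mask; assuming pb_01 in
const-a.asm carries the same 'times 8 db 0, 1' pattern (which is how I
read it), the two are interchangeable:

    pshufb  m2, m1, [pb_01]    ; bytes {0,1} into every word lane,
                               ; i.e. broadcast word 0 of m1
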
> @@ -58,16 +57,6 @@
>  
>  ;; (blkSize - 1 - x)
>  pw_planar4_0:         dw 3,  2,  1,  0,  3,  2,  1,  0
> -pw_planar4_1:         dw 3,  3,  3,  3,  3,  3,  3,  3
> -pw_planar8_0:         dw 7,  6,  5,  4,  3,  2,  1,  0
> -pw_planar8_1:         dw 7,  7,  7,  7,  7,  7,  7,  7
> -pw_planar16_0:        dw 15, 14, 13, 12, 11, 10,  9, 8
> -pw_planar16_1:        dw 15, 15, 15, 15, 15, 15, 15, 15
> -pd_planar32_1:        dd 31, 31, 31, 31
> -
> -pw_planar32_1:        dw 31, 31, 31, 31, 31, 31, 31, 31
> -pw_planar32_L:        dw 31, 30, 29, 28, 27, 26, 25, 24
> -pw_planar32_H:        dw 23, 22, 21, 20, 19, 18, 17, 16
>  
>  const planar32_table
>  %assign x 31
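
The pw_planar16_mul / pw_planar32_mul tables used below are not part of
this hunk; I'm assuming they are (or end up in const-a.asm as) something
like

    const pw_planar16_mul, dw 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1,  0
    const pw_planar32_mul, dw 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16

which would make [pw_planar16_mul] the old pw_planar16_0,
[pw_planar16_mul + mmsize] the old pw_planar8_0, and the two halves of
pw_planar32_mul the old pw_planar32_L / pw_planar32_H.
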
> @@ -85,8 +74,11 @@
>  
>  SECTION .text
>  
> +cextern pb_01
>  cextern pw_1
>  cextern pw_2
> +cextern pw_3
> +cextern pw_7
>  cextern pw_4
>  cextern pw_8
>  cextern pw_15
> @@ -95,6 +87,7 @@
>  cextern pw_32
>  cextern pw_1023
>  cextern pd_16
> +cextern pd_31
>  cextern pd_32
>  cextern pw_4096
>  cextern multiL
> @@ -681,7 +674,7 @@
>      pshufd          m4, m4, 0               ; v_bottomLeft
>  
>      pmullw          m3, [multiL]            ; (x + 1) * topRight
> -    pmullw          m0, m1, [pw_planar8_1]  ; (blkSize - 1 - y) * above[x]
> +    pmullw          m0, m1, [pw_7]          ; (blkSize - 1 - y) * above[x]
>      paddw           m3, [pw_8]
>      paddw           m3, m4
>      paddw           m3, m0
> @@ -695,7 +688,7 @@
>      pshufhw         m1, m2, 0x55 * (%1 - 4)
>      pshufd          m1, m1, 0xAA
>  %endif
> -    pmullw          m1, [pw_planar8_0]
> +    pmullw          m1, [pw_planar16_mul + mmsize]
>      paddw           m1, m3
>      psraw           m1, 4
>      movu            [r0], m1
> @@ -733,8 +726,8 @@
>  
>      pmullw          m4, m3, [multiH]            ; (x + 1) * topRight
>      pmullw          m3, [multiL]                ; (x + 1) * topRight
> -    pmullw          m1, m2, [pw_planar16_1]     ; (blkSize - 1 - y) * above[x]
> -    pmullw          m5, m7, [pw_planar16_1]     ; (blkSize - 1 - y) * above[x]
> +    pmullw          m1, m2, [pw_15]             ; (blkSize - 1 - y) * above[x]
> +    pmullw          m5, m7, [pw_15]             ; (blkSize - 1 - y) * above[x]
>      paddw           m4, [pw_16]
>      paddw           m3, [pw_16]
>      paddw           m4, m6
> @@ -770,8 +763,8 @@
>      paddw           m4, m1
>      lea             r0, [r0 + r1 * 2]
>  %endif
> -    pmullw          m0, m5, [pw_planar8_0]
> -    pmullw          m5, [pw_planar16_0]
> +    pmullw          m0, m5, [pw_planar16_mul + mmsize]
> +    pmullw          m5, [pw_planar16_mul]
>      paddw           m0, m4
>      paddw           m5, m3
>      psraw           m5, 5
> @@ -827,7 +820,7 @@
>      mova            m9, m6
>      mova            m10, m6
>  
> -    mova            m12, [pw_planar32_1]
> +    mova            m12, [pw_31]
>      movu            m4, [r2 + 2]
>      psubw           m8, m4
>      pmullw          m4, m12
> @@ -848,10 +841,10 @@
>      pmullw          m5, m12
>      paddw           m3, m5
>  
> -    mova            m12, [pw_planar32_L]
> -    mova            m13, [pw_planar32_H]
> -    mova            m14, [pw_planar16_0]
> -    mova            m15, [pw_planar8_0]
> +    mova            m12, [pw_planar32_mul]
> +    mova            m13, [pw_planar32_mul + mmsize]
> +    mova            m14, [pw_planar16_mul]
> +    mova            m15, [pw_planar16_mul + mmsize]
>      add             r1, r1
>  
>  %macro PROCESS 1
> @@ -1596,7 +1589,7 @@
>      pshufd          m4, m4, 0xAA
>  
>      pmullw          m3, [multi_2Row]        ; (x + 1) * topRight
> -    pmullw          m0, m1, [pw_planar4_1]  ; (blkSize - 1 - y) * above[x]
> +    pmullw          m0, m1, [pw_3]          ; (blkSize - 1 - y) * above[x]
>  
>      paddw           m3, [pw_4]
>      paddw           m3, m4
> @@ -1934,7 +1927,7 @@
>      pshufd          m4, m4, 0xAA
>  
>      pmullw          m3, [multi_2Row]        ; (x + 1) * topRight
> -    pmullw          m0, m1, [pw_planar4_1]  ; (blkSize - 1 - y) * above[x]
> +    pmullw          m0, m1, [pw_3]          ; (blkSize - 1 - y) * above[x]
>  
>      paddw           m3, [pw_4]
>      paddw           m3, m4
> @@ -1990,12 +1983,12 @@
>      pshufd          m4, m4, 0               ; v_bottomLeft
>  
>      pmullw          m3, [multiL]            ; (x + 1) * topRight
> -    pmullw          m0, m1, [pw_planar8_1]  ; (blkSize - 1 - y) * above[x]
> +    pmullw          m0, m1, [pw_7]          ; (blkSize - 1 - y) * above[x]
>      paddw           m3, [pw_8]
>      paddw           m3, m4
>      paddw           m3, m0
>      psubw           m4, m1
> -    mova            m0, [pw_planar8_0]
> +    mova            m0, [pw_planar16_mul + mmsize]
>  
>  %macro INTRA_PRED_PLANAR8 1
>  %if (%1 < 4)
> @@ -2042,8 +2035,8 @@
>  
>      pmullw          m4, m3, [multiH]            ; (x + 1) * topRight
>      pmullw          m3, [multiL]                ; (x + 1) * topRight
> -    pmullw          m1, m2, [pw_planar16_1]     ; (blkSize - 1 - y) * above[x]
> -    pmullw          m5, m7, [pw_planar16_1]     ; (blkSize - 1 - y) * above[x]
> +    pmullw          m1, m2, [pw_15]             ; (blkSize - 1 - y) * above[x]
> +    pmullw          m5, m7, [pw_15]             ; (blkSize - 1 - y) * above[x]
>      paddw           m4, [pw_16]
>      paddw           m3, [pw_16]
>      paddw           m4, m6
> @@ -2074,8 +2067,8 @@
>  %endif
>  %endif
>  %endif
> -    pmullw          m0, m5, [pw_planar8_0]
> -    pmullw          m5, [pw_planar16_0]
> +    pmullw          m0, m5, [pw_planar16_mul + mmsize]
> +    pmullw          m5, [pw_planar16_mul]
>      paddw           m0, m4
>      paddw           m5, m3
>      paddw           m3, m6
> @@ -2192,28 +2185,28 @@
>  
>      ; above[0-3] * (blkSize - 1 - y)
>      pmovzxwd        m4, [r2 + 2]
> -    pmulld          m5, m4, [pd_planar32_1]
> +    pmulld          m5, m4, [pd_31]
>      paddd           m0, m5
>      psubd           m5, m6, m4
>      mova            m8, m5
>  
>      ; above[4-7] * (blkSize - 1 - y)
>      pmovzxwd        m4, [r2 + 10]
> -    pmulld          m5, m4, [pd_planar32_1]
> +    pmulld          m5, m4, [pd_31]
>      paddd           m1, m5
>      psubd           m5, m6, m4
>      mova            m9, m5
>  
>      ; above[8-11] * (blkSize - 1 - y)
>      pmovzxwd        m4, [r2 + 18]
> -    pmulld          m5, m4, [pd_planar32_1]
> +    pmulld          m5, m4, [pd_31]
>      paddd           m2, m5
>      psubd           m5, m6, m4
>      mova            m10, m5
>  
>      ; above[12-15] * (blkSize - 1 - y)
>      pmovzxwd        m4, [r2 + 26]
> -    pmulld          m5, m4, [pd_planar32_1]
> +    pmulld          m5, m4, [pd_31]
>      paddd           m3, m5
>      psubd           m5, m6, m4
>      mova            m11, m5
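
(The 32x32 path widens to dwords with pmovzxwd before multiplying,
presumably because the summed planar terms no longer fit in signed
16-bit words at this block size, so the 31 factor needs a dword flavour
too:

    pmovzxwd  m4, [r2 + 2]        ; widen above[0-3] to 32-bit lanes
    pmulld    m5, m4, [pd_31]     ; (blkSize - 1 - y) * above[x] in dwords

hence the new pd_31 next to the existing pw_31.)
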
> @@ -2221,7 +2214,7 @@
>      ; above[16-19] * (blkSize - 1 - y)
>      pmovzxwd        m4, [r2 + 34]
>      mova            m7, m12
> -    pmulld          m5, m4, [pd_planar32_1]
> +    pmulld          m5, m4, [pd_31]
>      paddd           m7, m5
>      mova            m12, m7
>      psubd           m5, m6, m4
> @@ -2230,7 +2223,7 @@
>      ; above[20-23] * (blkSize - 1 - y)
>      pmovzxwd        m4, [r2 + 42]
>      mova            m7, m13
> -    pmulld          m5, m4, [pd_planar32_1]
> +    pmulld          m5, m4, [pd_31]
>      paddd           m7, m5
>      mova            m13, m7
>      psubd           m5, m6, m4
> @@ -2239,7 +2232,7 @@
>      ; above[24-27] * (blkSize - 1 - y)
>      pmovzxwd        m4, [r2 + 50]
>      mova            m7, m14
> -    pmulld          m5, m4, [pd_planar32_1]
> +    pmulld          m5, m4, [pd_31]
>      paddd           m7, m5
>      mova            m14, m7
>      psubd           m5, m6, m4
> @@ -2248,7 +2241,7 @@
>      ; above[28-31] * (blkSize - 1 - y)
>      pmovzxwd        m4, [r2 + 58]
>      mova            m7, m15
> -    pmulld          m5, m4, [pd_planar32_1]
> +    pmulld          m5, m4, [pd_31]
>      paddd           m7, m5
>      mova            m15, m7
>      psubd           m5, m6, m4
> @@ -3766,33 +3759,33 @@
>      RET
>  
>  cglobal intra_pred_ang8_10, 3,6,3
> -    movu        m1,             [r2 + 34]           ; [8 7 6 5 4 3 2 1]
> -    pshufb      m0,             m1, [pw_unpackwdq]  ; [1 1 1 1 1 1 1 1]
> +    movu        m1,             [r2 + 34]    ; [8 7 6 5 4 3 2 1]
> +    pshufb      m0,             m1, [pb_01]  ; [1 1 1 1 1 1 1 1]
>      add         r1,             r1
>      lea         r3,             [r1 * 3]
>  
>      psrldq      m1,             2
> -    pshufb      m2,             m1, [pw_unpackwdq]  ; [2 2 2 2 2 2 2 2]
> +    pshufb      m2,             m1, [pb_01]  ; [2 2 2 2 2 2 2 2]
>      movu        [r0 + r1],      m2
>      psrldq      m1,             2
> -    pshufb      m2,             m1, [pw_unpackwdq]  ; [3 3 3 3 3 3 3 3]
> +    pshufb      m2,             m1, [pb_01]  ; [3 3 3 3 3 3 3 3]
>      movu        [r0 + r1 * 2],  m2
>      psrldq      m1,             2
> -    pshufb      m2,             m1, [pw_unpackwdq]  ; [4 4 4 4 4 4 4 4]
> +    pshufb      m2,             m1, [pb_01]  ; [4 4 4 4 4 4 4 4]
>      movu        [r0 + r3],      m2
>  
>      lea         r5,             [r0 + r1 *4]
>      psrldq      m1,             2
> -    pshufb      m2,             m1, [pw_unpackwdq]  ; [5 5 5 5 5 5 5 5]
> +    pshufb      m2,             m1, [pb_01]  ; [5 5 5 5 5 5 5 5]
>      movu        [r5],           m2
>      psrldq      m1,             2
> -    pshufb      m2,             m1, [pw_unpackwdq]  ; [6 6 6 6 6 6 6 6]
> +    pshufb      m2,             m1, [pb_01]  ; [6 6 6 6 6 6 6 6]
>      movu        [r5 + r1],      m2
>      psrldq      m1,             2
> -    pshufb      m2,             m1, [pw_unpackwdq]  ; [7 7 7 7 7 7 7 7]
> +    pshufb      m2,             m1, [pb_01]  ; [7 7 7 7 7 7 7 7]
>      movu        [r5 + r1 * 2],  m2
>      psrldq      m1,             2
> -    pshufb      m2,             m1, [pw_unpackwdq]  ; [8 8 8 8 8 8 8 8]
> +    pshufb      m2,             m1, [pb_01]  ; [8 8 8 8 8 8 8 8]
>      movu        [r5 + r3],      m2
>  
>      cmp         r4m,            byte 0
> @@ -3801,7 +3794,7 @@
>      ; filter
>  
>      movh        m1,             [r2]                ; [3 2 1 0]
> -    pshufb      m2,             m1, [pw_unpackwdq]  ; [0 0 0 0 0 0 0 0]
> +    pshufb      m2,             m1, [pb_01]  ; [0 0 0 0 0 0 0 0]
>      movu        m1,             [r2 + 2]            ; [8 7 6 5 4 3 2 1]
>      psubw       m1,             m2
>      psraw       m1,             1
> @@ -5671,9 +5664,9 @@
>      jz         .quit
>  
>      ; filter
> -    pshufb      m0,             [pw_unpackwdq]
> +    pshufb      m0,             [pb_01]
>      pinsrw      m1,             [r2], 0             ; [3 2 1 0]
> -    pshufb      m2,             m1, [pw_unpackwdq]  ; [0 0 0 0 0 0 0 0]
> +    pshufb      m2,             m1, [pb_01]         ; [0 0 0 0 0 0 0 0]
>      movu        m1,             [r2 + 2 + 32]       ; [8 7 6 5 4 3 2 1]
>      psubw       m1,             m2
>      psraw       m1,             1
> @@ -10006,73 +9999,73 @@
>      mov         r5d,                    r4m
>      movu        m1,                     [r2 + 2 + 64]       ; [8 7 6 5 4 3 2 1]
>      movu        m3,                     [r2 + 18 + 64]      ; [16 15 14 13 12 11 10 9]
> -    pshufb      m0,                     m1, [pw_unpackwdq]  ; [1 1 1 1 1 1 1 1]
> +    pshufb      m0,                     m1, [pb_01]  ; [1 1 1 1 1 1 1 1]
>      add         r1,                     r1
>      lea         r4,                     [r1 * 3]
>  
>      psrldq      m1,                     2
> -    pshufb      m2,                     m1, [pw_unpackwdq]  ; [2 2 2 2 2 2 2 2]
> +    pshufb      m2,                     m1, [pb_01]  ; [2 2 2 2 2 2 2 2]
>      movu        [r0 + r1],              m2
>      movu        [r0 + r1 + 16],         m2
>      psrldq      m1,                     2
> -    pshufb      m2,                     m1, [pw_unpackwdq]  ; [3 3 3 3 3 3 3 3]
> +    pshufb      m2,                     m1, [pb_01]  ; [3 3 3 3 3 3 3 3]
>      movu        [r0 + r1 * 2],          m2
>      movu        [r0 + r1 * 2 + 16],     m2
>      psrldq      m1,                     2
> -    pshufb      m2,                     m1, [pw_unpackwdq]  ; [4 4 4 4 4 4 4 4]
> +    pshufb      m2,                     m1, [pb_01]  ; [4 4 4 4 4 4 4 4]
>      movu        [r0 + r4],              m2
>      movu        [r0 + r4 + 16],         m2
>  
>      lea         r3,                     [r0 + r1 *4]
>      psrldq      m1,                     2
> -    pshufb      m2,                     m1, [pw_unpackwdq]  ; [5 5 5 5 5 5 5 5]
> +    pshufb      m2,                     m1, [pb_01]  ; [5 5 5 5 5 5 5 5]
>      movu        [r3],                   m2
>      movu        [r3 + 16],              m2
>      psrldq      m1,                     2
> -    pshufb      m2,                     m1, [pw_unpackwdq]  ; [6 6 6 6 6 6 6 6]
> +    pshufb      m2,                     m1, [pb_01]  ; [6 6 6 6 6 6 6 6]
>      movu        [r3 + r1],              m2
>      movu        [r3 + r1 + 16],         m2
>      psrldq      m1,                     2
> -    pshufb      m2,                     m1, [pw_unpackwdq]  ; [7 7 7 7 7 7 7 7]
> +    pshufb      m2,                     m1, [pb_01]  ; [7 7 7 7 7 7 7 7]
>      movu        [r3 + r1 * 2],          m2
>      movu        [r3 + r1 * 2 + 16],     m2
>      psrldq      m1,                     2
> -    pshufb      m2,                     m1, [pw_unpackwdq]  ; [8 8 8 8 8 8 8 8]
> +    pshufb      m2,                     m1, [pb_01]  ; [8 8 8 8 8 8 8 8]
>      movu        [r3 + r4],              m2
>      movu        [r3 + r4 + 16],         m2
>  
>      lea         r3,                     [r3 + r1 *4]
> -    pshufb      m2,                     m3, [pw_unpackwdq]  ; [9 9 9 9 9 9 9 9]
> +    pshufb      m2,                     m3, [pb_01]  ; [9 9 9 9 9 9 9 9]
>      movu        [r3],                   m2
>      movu        [r3 + 16],              m2
>      psrldq      m3,                     2
> -    pshufb      m2,                     m3, [pw_unpackwdq]  ; [10 10 10 10 10 10 10 10]
> +    pshufb      m2,                     m3, [pb_01]  ; [10 10 10 10 10 10 10 10]
>      movu        [r3 + r1],              m2
>      movu        [r3 + r1 + 16],         m2
>      psrldq      m3,                     2
> -    pshufb      m2,                     m3, [pw_unpackwdq]  ; [11 11 11 11 11 11 11 11]
> +    pshufb      m2,                     m3, [pb_01]  ; [11 11 11 11 11 11 11 11]
>      movu        [r3 + r1 * 2],          m2
>      movu        [r3 + r1 * 2 + 16],     m2
>      psrldq      m3,                     2
> -    pshufb      m2,                     m3, [pw_unpackwdq]  ; [12 12 12 12 12 12 12 12]
> +    pshufb      m2,                     m3, [pb_01]  ; [12 12 12 12 12 12 12 12]
>      movu        [r3 + r4],              m2
>      movu        [r3 + r4 + 16],         m2
>  
>      lea         r3,                     [r3 + r1 *4]
>      psrldq      m3,                     2
> -    pshufb      m2,                     m3, [pw_unpackwdq]  ; [13 13 13 13 13 13 13 13]
> +    pshufb      m2,                     m3, [pb_01]  ; [13 13 13 13 13 13 13 13]
>      movu        [r3],                   m2
>      movu        [r3 + 16],              m2
>      psrldq      m3,                     2
> -    pshufb      m2,                     m3, [pw_unpackwdq]  ; [14 14 14 14 14 14 14 14]
> +    pshufb      m2,                     m3, [pb_01]  ; [14 14 14 14 14 14 14 14]
>      movu        [r3 + r1],              m2
>      movu        [r3 + r1 + 16],         m2
>      psrldq      m3,                     2
> -    pshufb      m2,                     m3, [pw_unpackwdq]  ; [15 15 15 15 15 15 15 15]
> +    pshufb      m2,                     m3, [pb_01]  ; [15 15 15 15 15 15 15 15]
>      movu        [r3 + r1 * 2],          m2
>      movu        [r3 + r1 * 2 + 16],     m2
>      psrldq      m3,                     2
> -    pshufb      m2,                     m3, [pw_unpackwdq]  ; [16 16 16 16 16 16 16 16]
> +    pshufb      m2,                     m3, [pb_01]  ; [16 16 16 16 16 16 16 16]
>      movu        [r3 + r4],              m2
>      movu        [r3 + r4 + 16],         m2
>      mova        m3,                     m0
> @@ -10082,7 +10075,7 @@
>  
>      ; filter
>      pinsrw      m1,                     [r2], 0             ; [3 2 1 0]
> -    pshufb      m2,                     m1, [pw_unpackwdq]  ; [0 0 0 0 0 0 0 0]
> +    pshufb      m2,                     m1, [pb_01]  ; [0 0 0 0 0 0 0 0]
>      movu        m1,                     [r2 + 2]            ; [8 7 6 5 4 3 2 1]
>      movu        m3,                     [r2 + 18]           ; [16 15 14 13 12 11 10 9]
>      psubw       m1,                     m2
> @@ -10152,9 +10145,9 @@
>  
>      ; filter
>  
> -    pshufb      m0,                 [pw_unpackwdq]
> +    pshufb      m0,                 [pb_01]
>      pinsrw      m1,                 [r2], 0             ; [3 2 1 0]
> -    pshufb      m2,                 m1, [pw_unpackwdq]  ; [0 0 0 0 0 0 0 0]
> +    pshufb      m2,                 m1, [pb_01]         ; [0 0 0 0 0 0 0 0]
>      movu        m1,                 [r2 + 2 + 64]       ; [8 7 6 5 4 3 2 1]
>      movu        m3,                 [r2 + 18 + 64]      ; [16 15 14 13 12 11 10 9]
>      psubw       m1,                 m2

-- 
Steve Borho
