[x265] [PATCH] asm: removed some duplicate constants and moved others into const-a.asm
Dnyaneshwar Gorade
dnyaneshwar at multicorewareinc.com
Wed May 20 07:16:56 CEST 2015
OK, I will rebase this patch onto the latest tip and resend it.
On Tue, May 19, 2015 at 8:47 PM, Steve Borho <steve at borho.org> wrote:
> On 05/19, dnyaneshwar at multicorewareinc.com wrote:
> > # HG changeset patch
> > # User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
> > # Date 1432028888 -19800
> > # Tue May 19 15:18:08 2015 +0530
> > # Node ID b44cdf8dc08c77e84b8707992cd0006bbf23d864
> > # Parent ac32faec79be9c6a60d267086b4563bd884537c0
> > asm: removed some duplicate constants and moved others into const-a.asm
>
> Looks fine, but it does not apply on the tip.
>
> > diff -r ac32faec79be -r b44cdf8dc08c source/common/x86/const-a.asm
> > --- a/source/common/x86/const-a.asm Mon May 18 18:03:19 2015 +0530
> > +++ b/source/common/x86/const-a.asm Tue May 19 15:18:08 2015 +0530
> > @@ -63,6 +63,8 @@
> >
> > const pw_1, times 16 dw 1
> > const pw_2, times 16 dw 2
> > +const pw_3, times 16 dw 3
> > +const pw_7, times 16 dw 7
> > const pw_m2, times 8 dw -2
> > const pw_4, times 8 dw 4
> > const pw_8, times 8 dw 8
> > @@ -110,6 +112,7 @@
> > const pd_4, times 4 dd 4
> > const pd_8, times 4 dd 8
> > const pd_16, times 4 dd 16
> > +const pd_31, times 4 dd 31
> > const pd_32, times 4 dd 32
> > const pd_64, times 4 dd 64
> > const pd_128, times 4 dd 128
> > diff -r ac32faec79be -r b44cdf8dc08c source/common/x86/intrapred16.asm
> > --- a/source/common/x86/intrapred16.asm Mon May 18 18:03:19 2015 +0530
> > +++ b/source/common/x86/intrapred16.asm Tue May 19 15:18:08 2015 +0530
> > @@ -44,7 +44,6 @@
> > const pw_punpcklwd, db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
> > const c_mode32_10_0, db 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
> >
> > -const pw_unpackwdq, times 8 db 0,1
> > const pw_ang8_12, db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 13, 0, 1
> > const pw_ang8_13, db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 15, 8, 9, 0, 1
> > const pw_ang8_14, db 0, 0, 0, 0, 0, 0, 0, 0, 14, 15, 10, 11, 4, 5, 0, 1
> > @@ -58,16 +57,6 @@
> >
> > ;; (blkSize - 1 - x)
> > pw_planar4_0: dw 3, 2, 1, 0, 3, 2, 1, 0
> > -pw_planar4_1: dw 3, 3, 3, 3, 3, 3, 3, 3
> > -pw_planar8_0: dw 7, 6, 5, 4, 3, 2, 1, 0
> > -pw_planar8_1: dw 7, 7, 7, 7, 7, 7, 7, 7
> > -pw_planar16_0: dw 15, 14, 13, 12, 11, 10, 9, 8
> > -pw_planar16_1: dw 15, 15, 15, 15, 15, 15, 15, 15
> > -pd_planar32_1: dd 31, 31, 31, 31
> > -
> > -pw_planar32_1: dw 31, 31, 31, 31, 31, 31, 31, 31
> > -pw_planar32_L: dw 31, 30, 29, 28, 27, 26, 25, 24
> > -pw_planar32_H: dw 23, 22, 21, 20, 19, 18, 17, 16
> >
> > const planar32_table
> > %assign x 31
> > @@ -85,8 +74,11 @@
> >
> > SECTION .text
> >
> > +cextern pb_01
> > cextern pw_1
> > cextern pw_2
> > +cextern pw_3
> > +cextern pw_7
> > cextern pw_4
> > cextern pw_8
> > cextern pw_15
> > @@ -95,6 +87,7 @@
> > cextern pw_32
> > cextern pw_1023
> > cextern pd_16
> > +cextern pd_31
> > cextern pd_32
> > cextern pw_4096
> > cextern multiL
> > @@ -681,7 +674,7 @@
> > pshufd m4, m4, 0 ; v_bottomLeft
> >
> > pmullw m3, [multiL] ; (x + 1) * topRight
> > - pmullw m0, m1, [pw_planar8_1] ; (blkSize - 1 - y) *
> above[x]
> > + pmullw m0, m1, [pw_7] ; (blkSize - 1 - y) *
> above[x]
> > paddw m3, [pw_8]
> > paddw m3, m4
> > paddw m3, m0
> > @@ -695,7 +688,7 @@
> > pshufhw m1, m2, 0x55 * (%1 - 4)
> > pshufd m1, m1, 0xAA
> > %endif
> > - pmullw m1, [pw_planar8_0]
> > + pmullw m1, [pw_planar16_mul + mmsize]
> > paddw m1, m3
> > psraw m1, 4
> > movu [r0], m1
> > @@ -733,8 +726,8 @@
> >
> > pmullw m4, m3, [multiH] ; (x + 1) * topRight
> > pmullw m3, [multiL] ; (x + 1) * topRight
> > - pmullw m1, m2, [pw_planar16_1] ; (blkSize - 1 - y) *
> above[x]
> > - pmullw m5, m7, [pw_planar16_1] ; (blkSize - 1 - y) *
> above[x]
> > + pmullw m1, m2, [pw_15] ; (blkSize - 1 - y) *
> above[x]
> > + pmullw m5, m7, [pw_15] ; (blkSize - 1 - y) *
> above[x]
> > paddw m4, [pw_16]
> > paddw m3, [pw_16]
> > paddw m4, m6
> > @@ -770,8 +763,8 @@
> > paddw m4, m1
> > lea r0, [r0 + r1 * 2]
> > %endif
> > - pmullw m0, m5, [pw_planar8_0]
> > - pmullw m5, [pw_planar16_0]
> > + pmullw m0, m5, [pw_planar16_mul + mmsize]
> > + pmullw m5, [pw_planar16_mul]
> > paddw m0, m4
> > paddw m5, m3
> > psraw m5, 5
> > @@ -827,7 +820,7 @@
> > mova m9, m6
> > mova m10, m6
> >
> > - mova m12, [pw_planar32_1]
> > + mova m12, [pw_31]
> > movu m4, [r2 + 2]
> > psubw m8, m4
> > pmullw m4, m12
> > @@ -848,10 +841,10 @@
> > pmullw m5, m12
> > paddw m3, m5
> >
> > - mova m12, [pw_planar32_L]
> > - mova m13, [pw_planar32_H]
> > - mova m14, [pw_planar16_0]
> > - mova m15, [pw_planar8_0]
> > + mova m12, [pw_planar32_mul]
> > + mova m13, [pw_planar32_mul + mmsize]
> > + mova m14, [pw_planar16_mul]
> > + mova m15, [pw_planar16_mul + mmsize]
> > add r1, r1
> >
> > %macro PROCESS 1
> > @@ -1596,7 +1589,7 @@
> > pshufd m4, m4, 0xAA
> >
> > pmullw m3, [multi_2Row] ; (x + 1) * topRight
> > - pmullw m0, m1, [pw_planar4_1] ; (blkSize - 1 - y) *
> above[x]
> > + pmullw m0, m1, [pw_3] ; (blkSize - 1 - y) *
> above[x]
> >
> > paddw m3, [pw_4]
> > paddw m3, m4
> > @@ -1934,7 +1927,7 @@
> > pshufd m4, m4, 0xAA
> >
> > pmullw m3, [multi_2Row] ; (x + 1) * topRight
> > - pmullw m0, m1, [pw_planar4_1] ; (blkSize - 1 - y) *
> above[x]
> > + pmullw m0, m1, [pw_3] ; (blkSize - 1 - y) *
> above[x]
> >
> > paddw m3, [pw_4]
> > paddw m3, m4
> > @@ -1990,12 +1983,12 @@
> > pshufd m4, m4, 0 ; v_bottomLeft
> >
> > pmullw m3, [multiL] ; (x + 1) * topRight
> > - pmullw m0, m1, [pw_planar8_1] ; (blkSize - 1 - y) *
> above[x]
> > + pmullw m0, m1, [pw_7] ; (blkSize - 1 - y) *
> above[x]
> > paddw m3, [pw_8]
> > paddw m3, m4
> > paddw m3, m0
> > psubw m4, m1
> > - mova m0, [pw_planar8_0]
> > + mova m0, [pw_planar16_mul + mmsize]
> >
> > %macro INTRA_PRED_PLANAR8 1
> > %if (%1 < 4)
> > @@ -2042,8 +2035,8 @@
> >
> > pmullw m4, m3, [multiH] ; (x + 1) * topRight
> > pmullw m3, [multiL] ; (x + 1) * topRight
> > - pmullw m1, m2, [pw_planar16_1] ; (blkSize - 1 - y) *
> above[x]
> > - pmullw m5, m7, [pw_planar16_1] ; (blkSize - 1 - y) *
> above[x]
> > + pmullw m1, m2, [pw_15] ; (blkSize - 1 - y) *
> above[x]
> > + pmullw m5, m7, [pw_15] ; (blkSize - 1 - y) *
> above[x]
> > paddw m4, [pw_16]
> > paddw m3, [pw_16]
> > paddw m4, m6
> > @@ -2074,8 +2067,8 @@
> > %endif
> > %endif
> > %endif
> > - pmullw m0, m5, [pw_planar8_0]
> > - pmullw m5, [pw_planar16_0]
> > + pmullw m0, m5, [pw_planar16_mul + mmsize]
> > + pmullw m5, [pw_planar16_mul]
> > paddw m0, m4
> > paddw m5, m3
> > paddw m3, m6
> > @@ -2192,28 +2185,28 @@
> >
> > ; above[0-3] * (blkSize - 1 - y)
> > pmovzxwd m4, [r2 + 2]
> > - pmulld m5, m4, [pd_planar32_1]
> > + pmulld m5, m4, [pd_31]
> > paddd m0, m5
> > psubd m5, m6, m4
> > mova m8, m5
> >
> > ; above[4-7] * (blkSize - 1 - y)
> > pmovzxwd m4, [r2 + 10]
> > - pmulld m5, m4, [pd_planar32_1]
> > + pmulld m5, m4, [pd_31]
> > paddd m1, m5
> > psubd m5, m6, m4
> > mova m9, m5
> >
> > ; above[8-11] * (blkSize - 1 - y)
> > pmovzxwd m4, [r2 + 18]
> > - pmulld m5, m4, [pd_planar32_1]
> > + pmulld m5, m4, [pd_31]
> > paddd m2, m5
> > psubd m5, m6, m4
> > mova m10, m5
> >
> > ; above[12-15] * (blkSize - 1 - y)
> > pmovzxwd m4, [r2 + 26]
> > - pmulld m5, m4, [pd_planar32_1]
> > + pmulld m5, m4, [pd_31]
> > paddd m3, m5
> > psubd m5, m6, m4
> > mova m11, m5
> > @@ -2221,7 +2214,7 @@
> > ; above[16-19] * (blkSize - 1 - y)
> > pmovzxwd m4, [r2 + 34]
> > mova m7, m12
> > - pmulld m5, m4, [pd_planar32_1]
> > + pmulld m5, m4, [pd_31]
> > paddd m7, m5
> > mova m12, m7
> > psubd m5, m6, m4
> > @@ -2230,7 +2223,7 @@
> > ; above[20-23] * (blkSize - 1 - y)
> > pmovzxwd m4, [r2 + 42]
> > mova m7, m13
> > - pmulld m5, m4, [pd_planar32_1]
> > + pmulld m5, m4, [pd_31]
> > paddd m7, m5
> > mova m13, m7
> > psubd m5, m6, m4
> > @@ -2239,7 +2232,7 @@
> > ; above[24-27] * (blkSize - 1 - y)
> > pmovzxwd m4, [r2 + 50]
> > mova m7, m14
> > - pmulld m5, m4, [pd_planar32_1]
> > + pmulld m5, m4, [pd_31]
> > paddd m7, m5
> > mova m14, m7
> > psubd m5, m6, m4
> > @@ -2248,7 +2241,7 @@
> > ; above[28-31] * (blkSize - 1 - y)
> > pmovzxwd m4, [r2 + 58]
> > mova m7, m15
> > - pmulld m5, m4, [pd_planar32_1]
> > + pmulld m5, m4, [pd_31]
> > paddd m7, m5
> > mova m15, m7
> > psubd m5, m6, m4
> > @@ -3766,33 +3759,33 @@
> > RET
> >
> > cglobal intra_pred_ang8_10, 3,6,3
> > - movu m1, [r2 + 34] ; [8 7 6 5 4 3 2 1]
> > - pshufb m0, m1, [pw_unpackwdq] ; [1 1 1 1 1 1 1 1]
> > + movu m1, [r2 + 34] ; [8 7 6 5 4 3 2 1]
> > + pshufb m0, m1, [pb_01] ; [1 1 1 1 1 1 1 1]
> > add r1, r1
> > lea r3, [r1 * 3]
> >
> > psrldq m1, 2
> > - pshufb m2, m1, [pw_unpackwdq] ; [2 2 2 2 2 2 2 2]
> > + pshufb m2, m1, [pb_01] ; [2 2 2 2 2 2 2 2]
> > movu [r0 + r1], m2
> > psrldq m1, 2
> > - pshufb m2, m1, [pw_unpackwdq] ; [3 3 3 3 3 3 3 3]
> > + pshufb m2, m1, [pb_01] ; [3 3 3 3 3 3 3 3]
> > movu [r0 + r1 * 2], m2
> > psrldq m1, 2
> > - pshufb m2, m1, [pw_unpackwdq] ; [4 4 4 4 4 4 4 4]
> > + pshufb m2, m1, [pb_01] ; [4 4 4 4 4 4 4 4]
> > movu [r0 + r3], m2
> >
> > lea r5, [r0 + r1 *4]
> > psrldq m1, 2
> > - pshufb m2, m1, [pw_unpackwdq] ; [5 5 5 5 5 5 5 5]
> > + pshufb m2, m1, [pb_01] ; [5 5 5 5 5 5 5 5]
> > movu [r5], m2
> > psrldq m1, 2
> > - pshufb m2, m1, [pw_unpackwdq] ; [6 6 6 6 6 6 6 6]
> > + pshufb m2, m1, [pb_01] ; [6 6 6 6 6 6 6 6]
> > movu [r5 + r1], m2
> > psrldq m1, 2
> > - pshufb m2, m1, [pw_unpackwdq] ; [7 7 7 7 7 7 7 7]
> > + pshufb m2, m1, [pb_01] ; [7 7 7 7 7 7 7 7]
> > movu [r5 + r1 * 2], m2
> > psrldq m1, 2
> > - pshufb m2, m1, [pw_unpackwdq] ; [8 8 8 8 8 8 8 8]
> > + pshufb m2, m1, [pb_01] ; [8 8 8 8 8 8 8 8]
> > movu [r5 + r3], m2
> >
> > cmp r4m, byte 0
> > @@ -3801,7 +3794,7 @@
> > ; filter
> >
> > movh m1, [r2] ; [3 2 1 0]
> > - pshufb m2, m1, [pw_unpackwdq] ; [0 0 0 0 0 0 0 0]
> > + pshufb m2, m1, [pb_01] ; [0 0 0 0 0 0 0 0]
> > movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
> > psubw m1, m2
> > psraw m1, 1
> > @@ -5671,9 +5664,9 @@
> > jz .quit
> >
> > ; filter
> > - pshufb m0, [pw_unpackwdq]
> > + pshufb m0, [pb_01]
> > pinsrw m1, [r2], 0 ; [3 2 1 0]
> > - pshufb m2, m1, [pw_unpackwdq] ; [0 0 0 0 0 0 0 0]
> > + pshufb m2, m1, [pb_01] ; [0 0 0 0 0 0 0 0]
> > movu m1, [r2 + 2 + 32] ; [8 7 6 5 4 3 2 1]
> > psubw m1, m2
> > psraw m1, 1
> > @@ -10006,73 +9999,73 @@
> > mov r5d, r4m
> > movu m1, [r2 + 2 + 64] ; [8 7 6 5
> 4 3 2 1]
> > movu m3, [r2 + 18 + 64] ; [16 15 14
> 13 12 11 10 9]
> > - pshufb m0, m1, [pw_unpackwdq] ; [1 1 1 1
> 1 1 1 1]
> > + pshufb m0, m1, [pb_01] ; [1 1 1 1 1 1 1 1]
> > add r1, r1
> > lea r4, [r1 * 3]
> >
> > psrldq m1, 2
> > - pshufb m2, m1, [pw_unpackwdq] ; [2 2 2 2
> 2 2 2 2]
> > + pshufb m2, m1, [pb_01] ; [2 2 2 2 2 2 2 2]
> > movu [r0 + r1], m2
> > movu [r0 + r1 + 16], m2
> > psrldq m1, 2
> > - pshufb m2, m1, [pw_unpackwdq] ; [3 3 3 3
> 3 3 3 3]
> > + pshufb m2, m1, [pb_01] ; [3 3 3 3 3 3 3 3]
> > movu [r0 + r1 * 2], m2
> > movu [r0 + r1 * 2 + 16], m2
> > psrldq m1, 2
> > - pshufb m2, m1, [pw_unpackwdq] ; [4 4 4 4
> 4 4 4 4]
> > + pshufb m2, m1, [pb_01] ; [4 4 4 4 4 4 4 4]
> > movu [r0 + r4], m2
> > movu [r0 + r4 + 16], m2
> >
> > lea r3, [r0 + r1 *4]
> > psrldq m1, 2
> > - pshufb m2, m1, [pw_unpackwdq] ; [5 5 5 5
> 5 5 5 5]
> > + pshufb m2, m1, [pb_01] ; [5 5 5 5 5 5 5 5]
> > movu [r3], m2
> > movu [r3 + 16], m2
> > psrldq m1, 2
> > - pshufb m2, m1, [pw_unpackwdq] ; [6 6 6 6
> 6 6 6 6]
> > + pshufb m2, m1, [pb_01] ; [6 6 6 6 6 6 6 6]
> > movu [r3 + r1], m2
> > movu [r3 + r1 + 16], m2
> > psrldq m1, 2
> > - pshufb m2, m1, [pw_unpackwdq] ; [7 7 7 7
> 7 7 7 7]
> > + pshufb m2, m1, [pb_01] ; [7 7 7 7 7 7 7 7]
> > movu [r3 + r1 * 2], m2
> > movu [r3 + r1 * 2 + 16], m2
> > psrldq m1, 2
> > - pshufb m2, m1, [pw_unpackwdq] ; [8 8 8 8
> 8 8 8 8]
> > + pshufb m2, m1, [pb_01] ; [8 8 8 8 8 8 8 8]
> > movu [r3 + r4], m2
> > movu [r3 + r4 + 16], m2
> >
> > lea r3, [r3 + r1 *4]
> > - pshufb m2, m3, [pw_unpackwdq] ; [9 9 9 9
> 9 9 9 9]
> > + pshufb m2, m3, [pb_01] ; [9 9 9 9 9 9 9 9]
> > movu [r3], m2
> > movu [r3 + 16], m2
> > psrldq m3, 2
> > - pshufb m2, m3, [pw_unpackwdq] ; [10 10 10
> 10 10 10 10 10]
> > + pshufb m2, m3, [pb_01] ; [10 10 10 10 10
> 10 10 10]
> > movu [r3 + r1], m2
> > movu [r3 + r1 + 16], m2
> > psrldq m3, 2
> > - pshufb m2, m3, [pw_unpackwdq] ; [11 11 11
> 11 11 11 11 11]
> > + pshufb m2, m3, [pb_01] ; [11 11 11 11 11
> 11 11 11]
> > movu [r3 + r1 * 2], m2
> > movu [r3 + r1 * 2 + 16], m2
> > psrldq m3, 2
> > - pshufb m2, m3, [pw_unpackwdq] ; [12 12 12
> 12 12 12 12 12]
> > + pshufb m2, m3, [pb_01] ; [12 12 12 12 12
> 12 12 12]
> > movu [r3 + r4], m2
> > movu [r3 + r4 + 16], m2
> >
> > lea r3, [r3 + r1 *4]
> > psrldq m3, 2
> > - pshufb m2, m3, [pw_unpackwdq] ; [13 13 13
> 13 13 13 13 13]
> > + pshufb m2, m3, [pb_01] ; [13 13 13 13 13
> 13 13 13]
> > movu [r3], m2
> > movu [r3 + 16], m2
> > psrldq m3, 2
> > - pshufb m2, m3, [pw_unpackwdq] ; [14 14 14
> 14 14 14 14 14]
> > + pshufb m2, m3, [pb_01] ; [14 14 14 14 14
> 14 14 14]
> > movu [r3 + r1], m2
> > movu [r3 + r1 + 16], m2
> > psrldq m3, 2
> > - pshufb m2, m3, [pw_unpackwdq] ; [15 15 15
> 15 15 15 15 15]
> > + pshufb m2, m3, [pb_01] ; [15 15 15 15 15
> 15 15 15]
> > movu [r3 + r1 * 2], m2
> > movu [r3 + r1 * 2 + 16], m2
> > psrldq m3, 2
> > - pshufb m2, m3, [pw_unpackwdq] ; [16 16 16
> 16 16 16 16 16]
> > + pshufb m2, m3, [pb_01] ; [16 16 16 16 16
> 16 16 16]
> > movu [r3 + r4], m2
> > movu [r3 + r4 + 16], m2
> > mova m3, m0
> > @@ -10082,7 +10075,7 @@
> >
> > ; filter
> > pinsrw m1, [r2], 0 ; [3 2 1 0]
> > - pshufb m2, m1, [pw_unpackwdq] ; [0 0 0 0
> 0 0 0 0]
> > + pshufb m2, m1, [pb_01] ; [0 0 0 0 0 0 0 0]
> > movu m1, [r2 + 2] ; [8 7 6 5
> 4 3 2 1]
> > movu m3, [r2 + 18] ; [16 15 14
> 13 12 11 10 9]
> > psubw m1, m2
> > @@ -10152,9 +10145,9 @@
> >
> > ; filter
> >
> > - pshufb m0, [pw_unpackwdq]
> > + pshufb m0, [pb_01]
> > pinsrw m1, [r2], 0 ; [3 2 1 0]
> > - pshufb m2, m1, [pw_unpackwdq] ; [0 0 0 0 0 0
> 0 0]
> > + pshufb m2, m1, [pb_01] ; [0 0 0 0 0 0
> 0 0]
> > movu m1, [r2 + 2 + 64] ; [8 7 6 5 4 3
> 2 1]
> > movu m3, [r2 + 18 + 64] ; [16 15 14 13
> 12 11 10 9]
> > psubw m1, m2
> > _______________________________________________
> > x265-devel mailing list
> > x265-devel at videolan.org
> > https://mailman.videolan.org/listinfo/x265-devel
>
> --
> Steve Borho
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150520/9e756525/attachment-0001.html>
More information about the x265-devel mailing list