[x265] [PATCH Review Only] asm code for intra_allangs4x4 [all 34 modes]

Praveen Tiwari praveen at multicorewareinc.com
Tue Dec 3 07:17:33 CET 2013


Row values are also reused (same row, only a different frac); perhaps this is
what you were referring to.

Regards,
Praveen Tiwari


On Tue, Dec 3, 2013 at 11:17 AM, Praveen Tiwari <
praveen at multicorewareinc.com> wrote:

> I have given preference to direct results, in spite of using only half of
> each register, because I think it can give better performance: for example,
> mode 6 [row 1, row 3], mode 22 [row 1] and mode 33 [row 0, row 1] are not
> calculated; they are stored directly (just a movd is enough).
>
> Regards,
> Praveen Tiwari
>
>
> On Mon, Dec 2, 2013 at 8:30 PM, chen <chenm003 at 163.com> wrote:
>
>> The code is right.
>> Some register copy operations could be reduced; the biggest problem is
>> that half of each register is wasted, e.g. mode 4 and mode 32 can be
>> generated in the same register.
>>
>> At 2013-12-02 22:47:06,praveen at multicorewareinc.com wrote:
>>
>> ># HG changeset patch
>> ># User Praveen Tiwari
>> ># Date 1385995608 -19800
>> ># Node ID 1fccf1a770233907fefb0f5a47ed4e7c17223d4a
>> ># Parent  df0b4f81609e611989c5b1743e7729adeb51cb01
>> >asm code for intra_allangs4x4 [all 34 modes]
>> >
>> >diff -r df0b4f81609e -r 1fccf1a77023 source/common/x86/allangspred.asm
>> >--- /dev/null Thu Jan 01 00:00:00 1970 +0000
>> >+++ b/source/common/x86/allangspred.asm Mon Dec 02 20:16:48 2013 +0530
>> >@@ -0,0 +1,920 @@
>>
>> >+;*****************************************************************************
>> >+;* Copyright (C) 2013 x265 project
>> >+;*
>> >+;* Authors: Praveen Kumar Tiwari <praveen at multicorewareinc.com>
>> >+;*
>> >+;* This program is free software; you can redistribute it and/or modify
>> >+;* it under the terms of the GNU General Public License as published by
>> >+;* the Free Software Foundation; either version 2 of the License, or
>> >+;* (at your option) any later version.
>> >+;*
>> >+;* This program is distributed in the hope that it will be useful,
>> >+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
>> >+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>> >+;* GNU General Public License for more details.
>> >+;*
>> >+;* You should have received a copy of the GNU General Public License
>> >+;* along with this program; if not, write to the Free Software
>>
>> >+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
>> >+;*
>>
>> >+;* This program is also available under a commercial proprietary license.
>> >+;* For more information, contact us at licensing at multicorewareinc.com.
>>
>> >+;*****************************************************************************/
>> >+
>> >+%include "x86inc.asm"
>> >+
>> >+SECTION_RODATA 32
>> >+
>> >+tab_6_26:  db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 0, 0
>>
>> >+tab_12_20: db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
>>
>> >+tab_18_14: db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
>> >+tab_24_8:  db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
>>
>> >+tab_11_21: db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
>>
>> >+tab_22_10: db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
>> >+tab_1_31:  db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
>>
>> >+tab_15_17: db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
>> >+tab_30_2:  db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
>>
>> >+tab_13_19: db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
>> >+tab_28_4:  db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
>>
>> >+tab_19_13: db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13
>> >+tab_25_7:  db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7
>> >+tab_23_9:  db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9
>>
>> >+tab_14_18: db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
>> >+tab_5_27:  db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
>> >+tab_27_5:  db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5
>>
>> >+tab_17_15: db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
>> >+tab_26_6:  db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
>> >+tab_2_30:  db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
>> >+tab_4_28:  db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
>> >+tab_8_24:  db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
>>
>> >+tab_10_22: db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
>>
>> >+tab_20_12: db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
>> >+tab_9_23:  db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23
>> >+tab_7_25:  db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
>>
>> >+tab_21_11: db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11
>> >+tab_31_1:  db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1
>> >+
>> >+pw_1024:   dw 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024
>> >+
>> >+tab_Si0:  db 4, 2, 1, 0, 4, 2, 1, 0, 4, 2, 1, 0, 4, 2, 1, 0
>> >+tab_Si1:  db 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
>> >+tab_Zero: db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
>> >+
>> >+SECTION .text
>> >+
>>
>> >+;-----------------------------------------------------------------------------
>>
>> >+; void all_angs_pred_4x4(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma)
>>
>> >+;-----------------------------------------------------------------------------
>> >+INIT_XMM sse4
>>
>> >+cglobal all_angs_pred_4x4, 6, 6, 8 dest, above0, left0, above1, left1, bLuma
>> >+
>> >+; mode 2
>> >+
>> >+movd     m0,        [r2 + 2]
>> >+movd     m1,        [r2 + 3]
>> >+movd     m2,        [r2 + 4]
>> >+movd     m3,        [r2 + 5]
>> >+
>> >+movd     [r0],      m0
>> >+movd     [r0 + 4],  m1
>> >+movd     [r0 + 8],  m2
>> >+movd     [r0 + 12], m3
>> >+
>> >+; mode 3
>> >+
>> >+mova          m0,        [pw_1024]
>> >+
>> >+movu          m1,        [r2 + 1]
>> >+
>> >+palignr       m2,        m1,        1
>> >+punpcklbw     m1,        m2
>> >+
>> >+pmaddubsw     m7,        m1,        [tab_6_26]
>> >+pmulhrsw      m7,        m0
>> >+packuswb      m7,        m7
>> >+movd          [r0 + 16], m7
>> >+
>> >+movu          m2,        [r2 + 2]
>> >+
>> >+palignr       m3,        m2,        1
>> >+punpcklbw     m2,        m3
>> >+
>> >+pmaddubsw     m6,        m2,        [tab_12_20]
>> >+pmulhrsw      m6,        m0
>> >+packuswb      m6,        m6
>> >+movd          [r0 + 20], m6
>> >+
>> >+movu          m3,        [r2 + 3]
>> >+
>> >+palignr       m4,        m3,        1
>> >+punpcklbw     m3,        m4
>> >+
>> >+pmaddubsw     m4,        m3,        [tab_18_14]
>> >+pmulhrsw      m4,        m0
>> >+packuswb      m4,        m4
>> >+movd          [r0 + 24], m4
>> >+
>> >+movu          m4,        [r2 + 4]
>> >+
>> >+palignr       m5,        m4,        1
>> >+punpcklbw     m4,        m5
>> >+
>> >+pmaddubsw     m4,        [tab_24_8]
>> >+pmulhrsw      m4,        m0
>> >+packuswb      m4,        m4
>> >+movd          [r0 + 28], m4
>> >+
>> >+; mode 4
>> >+
>> >+pmaddubsw     m4,        m1,        [tab_11_21]
>> >+pmulhrsw      m4,        m0
>> >+packuswb      m4,        m4
>> >+movd          [r0 + 32], m4
>> >+
>> >+pmaddubsw     m4,        m2,        [tab_22_10]
>> >+pmulhrsw      m4,        m0
>> >+packuswb      m4,        m4
>> >+movd          [r0 + 36], m4
>> >+
>> >+pmaddubsw     m4,        m2,        [tab_1_31]
>> >+pmulhrsw      m4,        m0
>> >+packuswb      m4,        m4
>> >+movd          [r0 + 40], m4
>> >+
>> >+pmaddubsw     m4,        m3,        [tab_12_20]
>> >+pmulhrsw      m4,        m0
>> >+packuswb      m4,        m4
>> >+movd          [r0 + 44], m4
>> >+
>> >+; mode 5
>> >+
>> >+pmaddubsw     m4,        m1,        [tab_15_17]
>> >+pmulhrsw      m4,        m0
>> >+packuswb      m4,        m4
>> >+movd          [r0 + 48], m4
>> >+
>> >+pmaddubsw     m4,        m2,        [tab_30_2]
>> >+pmulhrsw      m4,        m0
>> >+packuswb      m4,        m4
>> >+movd          [r0 + 52], m4
>> >+
>> >+pmaddubsw     m4,        m2,        [tab_13_19]
>> >+pmulhrsw      m4,        m0
>> >+packuswb      m4,        m4
>> >+movd          [r0 + 56], m4
>> >+
>> >+pmaddubsw     m3,        [tab_28_4]
>> >+pmulhrsw      m3,        m0
>> >+packuswb      m3,        m3
>> >+movd          [r0 + 60], m3
>> >+
>> >+; mode 6
>> >+
>> >+pmaddubsw     m3,        m1,        [tab_19_13]
>> >+pmulhrsw      m3,        m0
>> >+packuswb      m3,        m3
>> >+movd          [r0 + 64], m3
>> >+
>> >+movd          [r0 + 68], m7
>> >+
>> >+pmaddubsw     m3,        m2,        [tab_25_7]
>> >+pmulhrsw      m3,        m0
>> >+packuswb      m3,        m3
>> >+movd          [r0 + 72], m3
>> >+
>> >+movd          [r0 + 76], m6
>> >+
>> >+; mode 7
>> >+
>> >+pmaddubsw     m3,        m1,        [tab_23_9]
>> >+pmulhrsw      m3,        m0
>> >+packuswb      m3,        m3
>> >+movd          [r0 + 80], m3
>> >+
>> >+pmaddubsw     m3,        m1,        [tab_14_18]
>> >+pmulhrsw      m3,        m0
>> >+packuswb      m3,        m3
>> >+movd          [r0 + 84], m3
>> >+
>> >+pmaddubsw     m3,        m1,        [tab_5_27]
>> >+pmulhrsw      m3,        m0
>> >+packuswb      m3,        m3
>> >+movd          [r0 + 88], m3
>> >+
>> >+pmaddubsw     m2,        [tab_28_4]
>> >+pmulhrsw      m2,        m0
>> >+packuswb      m2,        m2
>> >+movd          [r0 + 92], m2
>> >+
>> >+; mode 8
>> >+
>> >+pmaddubsw     m2,         m1,       [tab_27_5]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 96],  m2
>> >+
>> >+pmaddubsw     m2,         m1,       [tab_22_10]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 100], m2
>> >+
>> >+pmaddubsw     m2,         m1,       [tab_17_15]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 104], m2
>> >+
>> >+pmaddubsw     m2,         m1,       [tab_12_20]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 108], m2
>> >+
>> >+; mode 9
>> >+
>> >+pmaddubsw     m2,         m1,       [tab_30_2]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 112], m2
>> >+
>> >+pmaddubsw     m2,         m1,       [tab_28_4]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 116], m2
>> >+
>> >+pmaddubsw     m2,         m1,       [tab_26_6]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 120], m2
>> >+
>> >+pmaddubsw     m1,         [tab_24_8]
>> >+pmulhrsw      m1,         m0
>> >+packuswb      m1,         m1
>> >+movd          [r0 + 124], m1
>> >+
>> >+; mode 10
>> >+
>> >+movd          m1,         [r2 + 1]
>> >+pshufd        m2,         m1,        0
>> >+movu          [r0 + 128], m2
>> >+
>> >+mova         m2,          [tab_Zero]
>> >+
>> >+pshufb       m3,          m1,       m2
>> >+punpcklbw    m3,          m2
>> >+
>> >+movd         m1,          [r1]
>> >+
>> >+pshufb       m1,          m2
>> >+punpcklbw    m1,          m2
>> >+
>> >+movd         m4,          [r1 + 1]
>> >+punpcklbw    m4,          m2
>> >+
>> >+psubw        m4,          m1
>> >+psraw        m4,          1
>> >+
>> >+paddw        m3,          m4
>> >+
>> >+packuswb     m3,          m2
>> >+
>> >+pextrb       [r0 + 128],  m3,    0
>> >+pextrb       [r0 + 132],  m3,    1
>> >+pextrb       [r0 + 136],  m3,    2
>> >+pextrb       [r0 + 140],  m3,    3
>> >+
>> >+; mode 11
>> >+
>> >+movu          m1,         [r2]
>> >+
>> >+palignr       m2,         m1,        1
>> >+punpcklbw     m1,         m2
>> >+
>> >+pmaddubsw     m2,         m1,        [tab_2_30]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 144], m2
>> >+
>> >+pmaddubsw     m2,         m1,        [tab_4_28]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 148], m2
>> >+
>> >+pmaddubsw     m2,         m1,        [tab_6_26]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 152], m2
>> >+
>> >+pmaddubsw     m2,         m1,        [tab_8_24]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 156], m2
>> >+
>> >+; mode 12
>> >+
>> >+pmaddubsw     m2,         m1,        [tab_5_27]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 160], m2
>> >+
>> >+pmaddubsw     m2,         m1,        [tab_10_22]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 164], m2
>> >+
>> >+pmaddubsw     m2,         m1,        [tab_15_17]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 168], m2
>> >+
>> >+pmaddubsw     m2,         m1,        [tab_20_12]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 172], m2
>> >+
>> >+; mode 13
>> >+
>> >+pmaddubsw     m2,         m1,        [tab_9_23]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 176], m2
>> >+
>> >+pmaddubsw     m2,         m1,        [tab_18_14]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 180], m2
>> >+
>> >+pmaddubsw     m2,         m1,        [tab_27_5]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 184], m2
>> >+
>> >+movh          m2,         [r2 - 1]
>> >+pinsrb        m2,         [r1 + 4],    0
>> >+
>> >+palignr       m3,         m2,        1
>> >+punpcklbw     m2,         m3
>> >+
>> >+pmaddubsw     m2,         [tab_4_28]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 188], m2
>> >+
>> >+; mode 14
>> >+
>> >+pmaddubsw     m2,         m1,        [tab_13_19]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 192], m2
>> >+
>> >+pmaddubsw     m5,         m1,        [tab_26_6]
>> >+pmulhrsw      m5,         m0
>> >+packuswb      m5,         m5
>> >+movd          [r0 + 196], m5
>> >+
>> >+movh          m2,         [r2 - 1]
>> >+pinsrb        m2,         [r1 + 2],    0
>> >+
>> >+palignr       m3,         m2,        1
>> >+punpcklbw     m2,         m3
>> >+
>> >+pmaddubsw     m3,         m2,        [tab_7_25]
>> >+pmulhrsw      m3,         m0
>> >+packuswb      m3,         m3
>> >+movd          [r0 + 200], m3
>> >+
>> >+pmaddubsw     m3,         m2,        [tab_20_12]
>> >+pmulhrsw      m3,         m0
>> >+packuswb      m3,         m3
>> >+movd          [r0 + 204], m3
>> >+
>> >+; mode 15
>> >+
>> >+pmaddubsw     m3,         m1,        [tab_17_15]
>> >+pmulhrsw      m3,         m0
>> >+packuswb      m3,         m3
>> >+movd          [r0 + 208], m3
>> >+
>> >+pmaddubsw     m3,         m2,        [tab_2_30]
>> >+pmulhrsw      m3,         m0
>> >+packuswb      m3,         m3
>> >+movd          [r0 + 212], m3
>> >+
>> >+pmaddubsw     m3,         m2,        [tab_19_13]
>> >+pmulhrsw      m3,         m0
>> >+packuswb      m3,         m3
>> >+movd          [r0 + 216], m3
>> >+
>> >+movh          m3,         [r2 - 2]
>> >+pinsrb        m3,         [r1 + 4],    0
>> >+pinsrb        m3,         [r1 + 2],    1
>> >+
>> >+palignr       m4,         m3,        1
>> >+punpcklbw     m3,         m4
>> >+
>> >+pmaddubsw     m3,         [tab_4_28]
>> >+pmulhrsw      m3,         m0
>> >+packuswb      m3,         m3
>> >+movd          [r0 + 220], m3
>> >+
>> >+; mode 16
>> >+
>> >+pmaddubsw     m3,         m1,        [tab_21_11]
>> >+pmulhrsw      m3,         m0
>> >+packuswb      m3,         m3
>> >+movd          [r0 + 224], m3
>> >+
>> >+pmaddubsw     m3,         m2,        [tab_10_22]
>> >+pmulhrsw      m3,         m0
>> >+packuswb      m3,         m3
>> >+movd          [r0 + 228], m3
>> >+
>> >+pmaddubsw     m3,         m2,        [tab_31_1]
>> >+pmulhrsw      m3,         m0
>> >+packuswb      m3,         m3
>> >+movd          [r0 + 232], m3
>> >+
>> >+movh          m3,         [r2 - 2]
>> >+pinsrb        m3,         [r1 + 3],  0
>> >+pinsrb        m3,         [r1 + 2],  1
>> >+
>> >+palignr       m4,         m3,        1
>> >+punpcklbw     m3,         m4
>> >+
>> >+pmaddubsw     m3,         [tab_20_12]
>> >+pmulhrsw      m3,         m0
>> >+packuswb      m3,         m3
>> >+movd          [r0 + 236], m3
>> >+
>> >+; mode 17
>> >+
>> >+movd          [r0 + 240], m5
>> >+
>> >+movh          m3,         [r2 - 1]
>> >+pinsrb        m3,         [r1 + 1],  0
>> >+
>> >+palignr       m4,         m3,        1
>> >+punpcklbw     m3,         m4
>> >+
>> >+pmaddubsw     m3,         [tab_20_12]
>> >+pmulhrsw      m3,         m0
>> >+packuswb      m3,         m3
>> >+movd          [r0 + 244], m3
>> >+
>> >+movh          m3,         [r2 - 2]
>> >+pinsrb        m3,         [r1 + 2],  0
>> >+pinsrb        m3,         [r1 + 1],  1
>> >+
>> >+palignr       m4,         m3,        1
>> >+punpcklbw     m3,         m4
>> >+
>> >+pmaddubsw     m3,         [tab_14_18]
>> >+pmulhrsw      m3,         m0
>> >+packuswb      m3,         m3
>> >+movd          [r0 + 248], m3
>> >+
>> >+movh          m3,         [r1]
>> >+pshufb        m3,         [tab_Si0]
>> >+pinsrb        m3,         [r2 + 1],   4
>> >+
>> >+palignr       m4,         m3,        1
>> >+punpcklbw     m3,         m4
>> >+
>> >+pmaddubsw     m3,         [tab_8_24]
>> >+pmulhrsw      m3,         m0
>> >+packuswb      m3,         m3
>> >+movd          [r0 + 252], m3
>> >+
>> >+; mode 18
>> >+
>> >+movd          m3,         [r1]
>> >+movd          [r0 + 256], m3
>> >+
>> >+movh          m3,         [r1 - 1]
>> >+pinsrb        m3,         [r2 + 1],  0
>> >+movd          [r0 + 260], m3
>> >+
>> >+movh          m3,         [r1 - 2]
>> >+pinsrb        m3,         [r2 + 2],  0
>> >+pinsrb        m3,         [r2 + 1],  1
>> >+movd          [r0 + 264], m3
>> >+
>> >+movh          m3,         [r2]
>> >+pshufb        m3,         [tab_Si1]
>> >+pinsrb        m3,         [r1],       3
>> >+movd          [r0 + 268], m3
>> >+
>> >+; mode 19
>> >+
>> >+movh          m1,         [r1]
>> >+
>> >+palignr       m2,         m1,        1
>> >+punpcklbw     m1,         m2
>> >+
>> >+pmaddubsw     m5,         m1,        [tab_26_6]
>> >+pmulhrsw      m5,         m0
>> >+packuswb      m5,         m5
>> >+movd          [r0 + 272], m5
>> >+
>> >+movh          m2,         [r1 - 1]
>> >+pinsrb        m2,         [r2 + 1],  0
>> >+
>> >+palignr       m3,         m2,        1
>> >+punpcklbw     m2,         m3
>> >+
>> >+pmaddubsw     m2,         [tab_20_12]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 276], m2
>> >+
>> >+movh          m2,         [r1 - 2]
>> >+pinsrb        m2,         [r2 + 2],  0
>> >+pinsrb        m2,         [r2 + 1],  1
>> >+
>> >+palignr       m3,         m2,        1
>> >+punpcklbw     m2,         m3
>> >+
>> >+pmaddubsw     m2,         [tab_14_18]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 280], m2
>> >+
>> >+movh          m2,         [r2]
>> >+pshufb        m2,         [tab_Si0]
>> >+pinsrb        m2,         [r1 + 1],  4
>> >+
>> >+palignr       m3,         m2,        1
>> >+punpcklbw     m2,         m3
>> >+
>> >+pmaddubsw     m2,         [tab_8_24]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 284], m2
>> >+
>> >+; mode 20
>> >+
>> >+pmaddubsw     m2,         m1,        [tab_21_11]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 288], m2
>> >+
>> >+movh          m2,         [r1 - 1]
>> >+pinsrb        m2,         [r2 + 2],  0
>> >+
>> >+palignr       m3,         m2,        1
>> >+punpcklbw     m2,         m3
>> >+
>> >+pmaddubsw     m3,         m2,         [tab_10_22]
>> >+pmulhrsw      m3,         m0
>> >+packuswb      m3,         m3
>> >+movd          [r0 + 292], m3
>> >+
>> >+pmaddubsw     m3,         m2,         [tab_31_1]
>> >+pmulhrsw      m3,         m0
>> >+packuswb      m3,         m3
>> >+movd          [r0 + 296], m3
>> >+
>> >+movh          m3,         [r1 - 2]
>> >+pinsrb        m3,         [r2 + 3],  0
>> >+pinsrb        m3,         [r2 + 2],  1
>> >+
>> >+palignr       m4,         m3,        1
>> >+punpcklbw     m3,         m4
>> >+
>> >+pmaddubsw     m3,         [tab_20_12]
>> >+pmulhrsw      m3,         m0
>> >+packuswb      m3,         m3
>> >+movd          [r0 + 300], m3
>> >+
>> >+; mode 21
>> >+
>> >+pmaddubsw     m3,         m1,         [tab_17_15]
>> >+pmulhrsw      m3,         m0
>> >+packuswb      m3,         m3
>> >+movd          [r0 + 304], m3
>> >+
>> >+pmaddubsw     m3,         m2,         [tab_2_30]
>> >+pmulhrsw      m3,         m0
>> >+packuswb      m3,         m3
>> >+movd          [r0 + 308], m3
>> >+
>> >+pmaddubsw     m3,         m2,         [tab_19_13]
>> >+pmulhrsw      m3,         m0
>> >+packuswb      m3,         m3
>> >+movd          [r0 + 312], m3
>> >+
>> >+movh          m3,         [r1 - 2]
>> >+pinsrb        m3,         [r2 + 4],   0
>> >+pinsrb        m3,         [r2 + 2],   1
>> >+
>> >+palignr       m4,         m3,         1
>> >+punpcklbw     m3,         m4
>> >+
>> >+pmaddubsw     m3,         [tab_4_28]
>> >+pmulhrsw      m3,         m0
>> >+packuswb      m3,         m3
>> >+movd          [r0 + 316], m3
>> >+
>> >+; mode 22
>> >+
>> >+pmaddubsw     m3,         m1,         [tab_13_19]
>> >+pmulhrsw      m3,         m0
>> >+packuswb      m3,         m3
>> >+movd          [r0 + 320], m3
>> >+
>> >+movd          [r0 + 324], m5
>> >+
>> >+pmaddubsw     m3,         m2,         [tab_7_25]
>> >+pmulhrsw      m3,         m0
>> >+packuswb      m3,         m3
>> >+movd          [r0 + 328], m3
>> >+
>> >+pmaddubsw     m3,         m2,         [tab_20_12]
>> >+pmulhrsw      m3,         m0
>> >+packuswb      m3,         m3
>> >+movd          [r0 + 332], m3
>> >+
>> >+; mode 23
>> >+
>> >+pmaddubsw     m2,         m1,         [tab_9_23]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 336], m2
>> >+
>> >+pmaddubsw     m2,         m1,         [tab_18_14]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 340], m2
>> >+
>> >+pmaddubsw     m2,         m1,         [tab_27_5]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 344], m2
>> >+
>> >+movh          m2,         [r1 - 1]
>> >+pinsrb        m2,         [r2 + 4],   0
>> >+
>> >+palignr       m3,         m2,         1
>> >+punpcklbw     m2,         m3
>> >+
>> >+pmaddubsw     m2,         [tab_4_28]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 348], m2
>> >+
>> >+; mode 24
>> >+
>> >+pmaddubsw     m2,         m1,         [tab_5_27]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 352], m2
>> >+
>> >+pmaddubsw     m2,         m1,         [tab_10_22]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 356], m2
>> >+
>> >+pmaddubsw     m2,         m1,         [tab_15_17]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 360], m2
>> >+
>> >+pmaddubsw     m2,         m1,         [tab_20_12]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 364], m2
>> >+
>> >+; mode 25
>> >+
>> >+pmaddubsw     m2,         m1,         [tab_2_30]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 368], m2
>> >+
>> >+pmaddubsw     m2,         m1,         [tab_4_28]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 372], m2
>> >+
>> >+pmaddubsw     m2,         m1,         [tab_6_26]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 376], m2
>> >+
>> >+pmaddubsw     m2,         m1,         [tab_8_24]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 380], m2
>> >+
>> >+; mode 26
>> >+
>> >+movd          m1,         [r1 + 1]
>> >+pshufd        m2,         m1,        0
>> >+movu          [r0 + 384], m2
>> >+
>> >+mova         m2,          [tab_Zero]
>> >+
>> >+pshufb       m3,          m1,       m2
>> >+punpcklbw    m3,          m2
>> >+
>> >+movd         m1,          [r2]
>> >+
>> >+pshufb       m1,          m2
>> >+punpcklbw    m1,          m2
>> >+
>> >+movd         m4,          [r2 + 1]
>> >+punpcklbw    m4,          m2
>> >+
>> >+psubw        m4,          m1
>> >+psraw        m4,          1
>> >+
>> >+paddw        m3,          m4
>> >+
>> >+packuswb     m3,          m2
>> >+
>> >+pextrb       [r0 + 384],  m3,    0
>> >+pextrb       [r0 + 388],  m3,    1
>> >+pextrb       [r0 + 392],  m3,    2
>> >+pextrb       [r0 + 396],  m3,    3
>> >+
>> >+; mode 27
>> >+
>> >+movh          m1,         [r1 + 1]
>> >+
>> >+palignr       m2,         m1,     1
>> >+punpcklbw     m1,         m2
>> >+
>> >+pmaddubsw     m2,         m1,     [tab_30_2]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 400], m2
>> >+
>> >+pmaddubsw     m2,         m1,     [tab_28_4]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 404], m2
>> >+
>> >+pmaddubsw     m2,         m1,     [tab_26_6]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 408], m2
>> >+
>> >+pmaddubsw     m2,         m1,     [tab_24_8]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 412], m2
>> >+
>> >+; mode 28
>> >+
>> >+pmaddubsw     m2,         m1,     [tab_27_5]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 416], m2
>> >+
>> >+pmaddubsw     m2,         m1,     [tab_22_10]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 420], m2
>> >+
>> >+pmaddubsw     m2,         m1,     [tab_17_15]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 424], m2
>> >+
>> >+pmaddubsw     m2,         m1,     [tab_12_20]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 428], m2
>> >+
>> >+; mode 29
>> >+
>> >+pmaddubsw     m2,         m1,     [tab_23_9]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 432], m2
>> >+
>> >+pmaddubsw     m2,         m1,     [tab_14_18]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 436], m2
>> >+
>> >+pmaddubsw     m2,         m1,     [tab_5_27]
>> >+pmulhrsw      m2,         m0
>> >+packuswb      m2,         m2
>> >+movd          [r0 + 440], m2
>> >+
>> >+movh          m2,         [r1 + 2]
>> >+
>> >+palignr       m3,         m2,     1
>> >+punpcklbw     m2,         m3
>> >+
>> >+pmaddubsw     m3,         m2,     [tab_28_4]
>> >+pmulhrsw      m3,         m0
>> >+packuswb      m3,         m3
>> >+movd          [r0 + 444], m3
>> >+
>> >+; mode 30
>> >+
>> >+pmaddubsw     m3,         m1,     [tab_19_13]
>> >+pmulhrsw      m3,         m0
>> >+packuswb      m3,         m3
>> >+movd          [r0 + 448], m3
>> >+
>> >+pmaddubsw     m6,         m1,     [tab_6_26]
>> >+pmulhrsw      m6,         m0
>> >+packuswb      m6,         m6
>> >+movd          [r0 + 452], m6
>> >+
>> >+pmaddubsw     m3,         m2,     [tab_25_7]
>> >+pmulhrsw      m3,         m0
>> >+packuswb      m3,         m3
>> >+movd          [r0 + 456], m3
>> >+
>> >+pmaddubsw     m5,         m2,     [tab_12_20]
>> >+pmulhrsw      m5,         m0
>> >+packuswb      m5,         m5
>> >+movd          [r0 + 460], m5
>> >+
>> >+; mode 31
>> >+
>> >+pmaddubsw     m3,         m1,     [tab_15_17]
>> >+pmulhrsw      m3,         m0
>> >+packuswb      m3,         m3
>> >+movd          [r0 + 464], m3
>> >+
>> >+pmaddubsw     m3,         m2,     [tab_30_2]
>> >+pmulhrsw      m3,         m0
>> >+packuswb      m3,         m3
>> >+movd          [r0 + 468], m3
>> >+
>> >+pmaddubsw     m3,         m2,     [tab_13_19]
>> >+pmulhrsw      m3,         m0
>> >+packuswb      m3,         m3
>> >+movd          [r0 + 472], m3
>> >+
>> >+movh          m3,         [r1 + 3]
>> >+
>> >+palignr       m4,         m3,     1
>> >+punpcklbw     m3,         m4
>> >+
>> >+pmaddubsw     m4,         m3,     [tab_28_4]
>> >+pmulhrsw      m4,         m0
>> >+packuswb      m4,         m4
>> >+movd          [r0 + 476], m4
>> >+
>> >+; mode 32
>> >+
>> >+pmaddubsw     m4,         m1,     [tab_11_21]
>> >+pmulhrsw      m4,         m0
>> >+packuswb      m4,         m4
>> >+movd          [r0 + 480], m4
>> >+
>> >+pmaddubsw     m4,         m2,     [tab_22_10]
>> >+pmulhrsw      m4,         m0
>> >+packuswb      m4,         m4
>> >+movd          [r0 + 484], m4
>> >+
>> >+pmaddubsw     m4,         m2,     [tab_1_31]
>> >+pmulhrsw      m4,         m0
>> >+packuswb      m4,         m4
>> >+movd          [r0 + 488], m4
>> >+
>> >+pmaddubsw     m4,         m3,     [tab_12_20]
>> >+pmulhrsw      m4,         m0
>> >+packuswb      m4,         m4
>> >+movd          [r0 + 492], m4
>> >+
>> >+; mode 33
>> >+
>> >+movd          [r0 + 496], m6
>> >+
>> >+movd          [r0 + 500], m5
>> >+
>> >+pmaddubsw     m3,         [tab_18_14]
>> >+pmulhrsw      m3,         m0
>> >+packuswb      m3,         m3
>> >+movd          [r0 + 504], m3
>> >+
>> >+movh          m1,         [r1 + 4]
>> >+
>> >+palignr       m2,         m1,     1
>> >+punpcklbw     m1,         m2
>> >+
>> >+pmaddubsw     m1,         [tab_24_8]
>> >+pmulhrsw      m1,         m0
>> >+packuswb      m1,         m1
>> >+movd          [r0 + 508], m1
>> >+
>> >+; mode 34
>> >+
>> >+movd         m0,          [r1 + 2]
>> >+movd         [r0 + 512],  m0
>> >+
>> >+movd         m0,          [r1 + 3]
>> >+movd         [r0 + 516],  m0
>> >+
>> >+movd         m0,          [r1 + 4]
>> >+movd         [r0 + 520],  m0
>> >+
>> >+movd         m0,          [r1 + 5]
>> >+movd         [r0 + 524],  m0
>> >+
>> >+RET
>> >_______________________________________________
>> >x265-devel mailing list
>> >x265-devel at videolan.org
>> >https://mailman.videolan.org/listinfo/x265-devel
>>
>> _______________________________________________
>> x265-devel mailing list
>> x265-devel at videolan.org
>> https://mailman.videolan.org/listinfo/x265-devel
>>
>>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131203/2c93b095/attachment-0001.html>


More information about the x265-devel mailing list