[x265] [PATCH Review Only] asm code for intra_allangs4x4 [all 34 modes]
Praveen Tiwari
praveen at multicorewareinc.com
Tue Dec 3 06:47:27 CET 2013
I preferred keeping results directly reusable, even though that uses only half
of each register, because I think it gives better performance: for example,
mode 6 [row 1, row 3], mode 22 [row 1] and mode 33 [row 0, row 1] are not
recalculated; they are stored directly (just a movd is enough).
Regards,
Praveen Tiwari
On Mon, Dec 2, 2013 at 8:30 PM, chen <chenm003 at 163.com> wrote:
> The code is right,
> but there are some register copy operations that could be reduced. The
> biggest problem is that half of each register is wasted, e.g. mode 4 and
> mode 32 could be generated in the same register.
>
> At 2013-12-02 22:47:06,praveen at multicorewareinc.com wrote:
>
> ># HG changeset patch
> ># User Praveen Tiwari
> ># Date 1385995608 -19800
> ># Node ID 1fccf1a770233907fefb0f5a47ed4e7c17223d4a
> ># Parent df0b4f81609e611989c5b1743e7729adeb51cb01
> >asm code for intra_allangs4x4 [all 34 modes]
> >
> >diff -r df0b4f81609e -r 1fccf1a77023 source/common/x86/allangspred.asm
> >--- /dev/null Thu Jan 01 00:00:00 1970 +0000
> >+++ b/source/common/x86/allangspred.asm Mon Dec 02 20:16:48 2013 +0530
> >@@ -0,0 +1,920 @@
>
> >+;*****************************************************************************
> >+;* Copyright (C) 2013 x265 project
> >+;*
> >+;* Authors: Praveen Kumar Tiwari <praveen at multicorewareinc.com>
> >+;*
> >+;* This program is free software; you can redistribute it and/or modify
> >+;* it under the terms of the GNU General Public License as published by
> >+;* the Free Software Foundation; either version 2 of the License, or
> >+;* (at your option) any later version.
> >+;*
> >+;* This program is distributed in the hope that it will be useful,
> >+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> >+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> >+;* GNU General Public License for more details.
> >+;*
> >+;* You should have received a copy of the GNU General Public License
> >+;* along with this program; if not, write to the Free Software
>
> >+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
> >+;*
> >+;* This program is also available under a commercial proprietary license.
> >+;* For more information, contact us at licensing at multicorewareinc.com.
>
> >+;*****************************************************************************/
> >+
> >+%include "x86inc.asm"
> >+
> >+SECTION_RODATA 32
> >+
> >+; Weight tables for pmaddubsw. Each table tab_A_B repeats the byte pair
> >+; (A, B) across all 16 bytes; in every table A + B = 32.
> >+; Every table is 16 bytes long, so each label stays 16-byte aligned, which
> >+; the memory-operand forms of pmaddubsw/pshufb/mova below rely on.
> >+tab_6_26: db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
> >+tab_12_20: db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
> >+tab_18_14: db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
> >+tab_24_8: db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
> >+tab_11_21: db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
> >+tab_22_10: db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
> >+tab_1_31: db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
> >+tab_15_17: db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
> >+tab_30_2: db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
> >+tab_13_19: db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
> >+tab_28_4: db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
> >+tab_19_13: db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13
> >+tab_25_7: db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7
> >+tab_23_9: db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9
> >+tab_14_18: db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
> >+tab_5_27: db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
> >+tab_27_5: db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5
> >+tab_17_15: db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
> >+tab_26_6: db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
> >+tab_2_30: db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
> >+tab_4_28: db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
> >+tab_8_24: db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
> >+tab_10_22: db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
> >+tab_20_12: db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
> >+tab_9_23: db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23
> >+tab_7_25: db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
> >+tab_21_11: db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11
> >+tab_31_1: db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1
> >+
> >+; Multiplier for pmulhrsw: with 1024 the instruction yields (x + 16) >> 5.
> >+pw_1024: dw 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024
> >+
> >+; pshufb byte-selection masks.
> >+; tab_Si0 picks source bytes 4, 2, 1, 0 (repeated); tab_Si1 picks 3, 2, 1, 0
> >+; into the low dword; tab_Zero broadcasts byte 0 and doubles as a zero vector.
> >+tab_Si0: db 4, 2, 1, 0, 4, 2, 1, 0, 4, 2, 1, 0, 4, 2, 1, 0
> >+tab_Si1: db 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
> >+tab_Zero: db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
> >+
> >+SECTION .text
> >+
>
> >+;-----------------------------------------------------------------------------
> >+; void all_angs_pred_4x4(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma)
> >+;
> >+; Writes the 4x4 predictions for modes 2..34 to dest, 16 bytes per mode:
> >+; mode N starts at dest + (N - 2) * 16 and each row is one 4-byte store.
> >+;
> >+; Registers (x86inc numbering): r0 = dest, r1 = above0, r2 = left0.
> >+; r3 (above1), r4 (left1) and r5 (bLuma) are not referenced in this routine.
> >+; NOTE(review): modes 10 and 26 apply their first-column adjustment without
> >+;               testing bLuma - confirm this matches the C reference.
> >+; NOTE(review): several loads start 1-2 bytes before r1/r2 (e.g. [r2 - 2]);
> >+;               those bytes are replaced with pinsrb afterwards, but the
> >+;               memory must still be readable - confirm with the caller.
> >+;
> >+; Interpolated rows: a reference vector is interleaved with itself shifted by
> >+; one byte (palignr + punpcklbw), multiplied by a weight pair with pmaddubsw,
> >+; scaled with pmulhrsw by pw_1024 (i.e. (x + 16) >> 5) and packed to bytes.
> >+; m0 holds pw_1024 from mode 3 until mode 34 reuses it as scratch.
> >+;-----------------------------------------------------------------------------
> >+INIT_XMM sse4
> >+cglobal all_angs_pred_4x4, 6, 6, 8, dest, above0, left0, above1, left1, bLuma
> >+
> >+; mode 2
> >+
> >+movd m0, [r2 + 2]
> >+movd m1, [r2 + 3]
> >+movd m2, [r2 + 4]
> >+movd m3, [r2 + 5]
> >+
> >+movd [r0], m0
> >+movd [r0 + 4], m1
> >+movd [r0 + 8], m2
> >+movd [r0 + 12], m3
> >+
> >+; mode 3
> >+
> >+mova m0, [pw_1024]
> >+
> >+movu m1, [r2 + 1]
> >+
> >+; m1 = byte pairs ([r2 + 1 + i], [r2 + 2 + i]); kept through mode 9
> >+palignr m2, m1, 1
> >+punpcklbw m1, m2
> >+
> >+; m7 is kept: mode 6 row 1 stores the same result
> >+pmaddubsw m7, m1, [tab_6_26]
> >+pmulhrsw m7, m0
> >+packuswb m7, m7
> >+movd [r0 + 16], m7
> >+
> >+movu m2, [r2 + 2]
> >+
> >+palignr m3, m2, 1
> >+punpcklbw m2, m3
> >+
> >+; m6 is kept: mode 6 row 3 stores the same result
> >+pmaddubsw m6, m2, [tab_12_20]
> >+pmulhrsw m6, m0
> >+packuswb m6, m6
> >+movd [r0 + 20], m6
> >+
> >+movu m3, [r2 + 3]
> >+
> >+palignr m4, m3, 1
> >+punpcklbw m3, m4
> >+
> >+pmaddubsw m4, m3, [tab_18_14]
> >+pmulhrsw m4, m0
> >+packuswb m4, m4
> >+movd [r0 + 24], m4
> >+
> >+movu m4, [r2 + 4]
> >+
> >+palignr m5, m4, 1
> >+punpcklbw m4, m5
> >+
> >+pmaddubsw m4, [tab_24_8]
> >+pmulhrsw m4, m0
> >+packuswb m4, m4
> >+movd [r0 + 28], m4
> >+
> >+; mode 4
> >+
> >+pmaddubsw m4, m1, [tab_11_21]
> >+pmulhrsw m4, m0
> >+packuswb m4, m4
> >+movd [r0 + 32], m4
> >+
> >+pmaddubsw m4, m2, [tab_22_10]
> >+pmulhrsw m4, m0
> >+packuswb m4, m4
> >+movd [r0 + 36], m4
> >+
> >+pmaddubsw m4, m2, [tab_1_31]
> >+pmulhrsw m4, m0
> >+packuswb m4, m4
> >+movd [r0 + 40], m4
> >+
> >+pmaddubsw m4, m3, [tab_12_20]
> >+pmulhrsw m4, m0
> >+packuswb m4, m4
> >+movd [r0 + 44], m4
> >+
> >+; mode 5
> >+
> >+pmaddubsw m4, m1, [tab_15_17]
> >+pmulhrsw m4, m0
> >+packuswb m4, m4
> >+movd [r0 + 48], m4
> >+
> >+pmaddubsw m4, m2, [tab_30_2]
> >+pmulhrsw m4, m0
> >+packuswb m4, m4
> >+movd [r0 + 52], m4
> >+
> >+pmaddubsw m4, m2, [tab_13_19]
> >+pmulhrsw m4, m0
> >+packuswb m4, m4
> >+movd [r0 + 56], m4
> >+
> >+pmaddubsw m3, [tab_28_4]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 60], m3
> >+
> >+; mode 6
> >+
> >+pmaddubsw m3, m1, [tab_19_13]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 64], m3
> >+
> >+; row 1: reuse mode 3 row 0 (still in m7)
> >+movd [r0 + 68], m7
> >+
> >+pmaddubsw m3, m2, [tab_25_7]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 72], m3
> >+
> >+; row 3: reuse mode 3 row 1 (still in m6)
> >+movd [r0 + 76], m6
> >+
> >+; mode 7
> >+
> >+pmaddubsw m3, m1, [tab_23_9]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 80], m3
> >+
> >+pmaddubsw m3, m1, [tab_14_18]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 84], m3
> >+
> >+pmaddubsw m3, m1, [tab_5_27]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 88], m3
> >+
> >+pmaddubsw m2, [tab_28_4]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 92], m2
> >+
> >+; mode 8
> >+
> >+pmaddubsw m2, m1, [tab_27_5]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 96], m2
> >+
> >+pmaddubsw m2, m1, [tab_22_10]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 100], m2
> >+
> >+pmaddubsw m2, m1, [tab_17_15]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 104], m2
> >+
> >+pmaddubsw m2, m1, [tab_12_20]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 108], m2
> >+
> >+; mode 9
> >+
> >+pmaddubsw m2, m1, [tab_30_2]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 112], m2
> >+
> >+pmaddubsw m2, m1, [tab_28_4]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 116], m2
> >+
> >+pmaddubsw m2, m1, [tab_26_6]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 120], m2
> >+
> >+pmaddubsw m1, [tab_24_8]
> >+pmulhrsw m1, m0
> >+packuswb m1, m1
> >+movd [r0 + 124], m1
> >+
> >+; mode 10
> >+
> >+; all four rows are the same dword from [r2 + 1]
> >+movd m1, [r2 + 1]
> >+pshufd m2, m1, 0
> >+movu [r0 + 128], m2
> >+
> >+mova m2, [tab_Zero]
> >+
> >+; m3 = byte [r2 + 1] broadcast, widened to words
> >+pshufb m3, m1, m2
> >+punpcklbw m3, m2
> >+
> >+movd m1, [r1]
> >+
> >+; m1 = byte [r1] broadcast, widened to words
> >+pshufb m1, m2
> >+punpcklbw m1, m2
> >+
> >+movd m4, [r1 + 1]
> >+punpcklbw m4, m2
> >+
> >+; m4 = ([r1 + 1 + k] - [r1]) >> 1 for k = 0..3
> >+psubw m4, m1
> >+psraw m4, 1
> >+
> >+paddw m3, m4
> >+
> >+packuswb m3, m2
> >+
> >+; overwrite the first byte of each of the four rows with the adjusted value
> >+pextrb [r0 + 128], m3, 0
> >+pextrb [r0 + 132], m3, 1
> >+pextrb [r0 + 136], m3, 2
> >+pextrb [r0 + 140], m3, 3
> >+
> >+; mode 11
> >+
> >+movu m1, [r2]
> >+
> >+; m1 = byte pairs ([r2 + i], [r2 + 1 + i]); kept through mode 17
> >+palignr m2, m1, 1
> >+punpcklbw m1, m2
> >+
> >+pmaddubsw m2, m1, [tab_2_30]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 144], m2
> >+
> >+pmaddubsw m2, m1, [tab_4_28]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 148], m2
> >+
> >+pmaddubsw m2, m1, [tab_6_26]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 152], m2
> >+
> >+pmaddubsw m2, m1, [tab_8_24]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 156], m2
> >+
> >+; mode 12
> >+
> >+pmaddubsw m2, m1, [tab_5_27]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 160], m2
> >+
> >+pmaddubsw m2, m1, [tab_10_22]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 164], m2
> >+
> >+pmaddubsw m2, m1, [tab_15_17]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 168], m2
> >+
> >+pmaddubsw m2, m1, [tab_20_12]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 172], m2
> >+
> >+; mode 13
> >+
> >+pmaddubsw m2, m1, [tab_9_23]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 176], m2
> >+
> >+pmaddubsw m2, m1, [tab_18_14]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 180], m2
> >+
> >+pmaddubsw m2, m1, [tab_27_5]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 184], m2
> >+
> >+; byte 0 (loaded from [r2 - 1]) is replaced by the byte at [r1 + 4]
> >+movh m2, [r2 - 1]
> >+pinsrb m2, [r1 + 4], 0
> >+
> >+palignr m3, m2, 1
> >+punpcklbw m2, m3
> >+
> >+pmaddubsw m2, [tab_4_28]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 188], m2
> >+
> >+; mode 14
> >+
> >+pmaddubsw m2, m1, [tab_13_19]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 192], m2
> >+
> >+; m5 is kept: mode 17 row 0 stores the same result
> >+pmaddubsw m5, m1, [tab_26_6]
> >+pmulhrsw m5, m0
> >+packuswb m5, m5
> >+movd [r0 + 196], m5
> >+
> >+movh m2, [r2 - 1]
> >+pinsrb m2, [r1 + 2], 0
> >+
> >+palignr m3, m2, 1
> >+punpcklbw m2, m3
> >+
> >+pmaddubsw m3, m2, [tab_7_25]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 200], m3
> >+
> >+pmaddubsw m3, m2, [tab_20_12]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 204], m3
> >+
> >+; mode 15
> >+
> >+pmaddubsw m3, m1, [tab_17_15]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 208], m3
> >+
> >+pmaddubsw m3, m2, [tab_2_30]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 212], m3
> >+
> >+pmaddubsw m3, m2, [tab_19_13]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 216], m3
> >+
> >+movh m3, [r2 - 2]
> >+pinsrb m3, [r1 + 4], 0
> >+pinsrb m3, [r1 + 2], 1
> >+
> >+palignr m4, m3, 1
> >+punpcklbw m3, m4
> >+
> >+pmaddubsw m3, [tab_4_28]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 220], m3
> >+
> >+; mode 16
> >+
> >+pmaddubsw m3, m1, [tab_21_11]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 224], m3
> >+
> >+pmaddubsw m3, m2, [tab_10_22]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 228], m3
> >+
> >+pmaddubsw m3, m2, [tab_31_1]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 232], m3
> >+
> >+movh m3, [r2 - 2]
> >+pinsrb m3, [r1 + 3], 0
> >+pinsrb m3, [r1 + 2], 1
> >+
> >+palignr m4, m3, 1
> >+punpcklbw m3, m4
> >+
> >+pmaddubsw m3, [tab_20_12]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 236], m3
> >+
> >+; mode 17
> >+
> >+; row 0: reuse mode 14 row 1 (still in m5)
> >+movd [r0 + 240], m5
> >+
> >+movh m3, [r2 - 1]
> >+pinsrb m3, [r1 + 1], 0
> >+
> >+palignr m4, m3, 1
> >+punpcklbw m3, m4
> >+
> >+pmaddubsw m3, [tab_20_12]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 244], m3
> >+
> >+movh m3, [r2 - 2]
> >+pinsrb m3, [r1 + 2], 0
> >+pinsrb m3, [r1 + 1], 1
> >+
> >+palignr m4, m3, 1
> >+punpcklbw m3, m4
> >+
> >+pmaddubsw m3, [tab_14_18]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 248], m3
> >+
> >+; bytes [r1 + 4], [r1 + 2], [r1 + 1], [r1] followed by byte [r2 + 1]
> >+movh m3, [r1]
> >+pshufb m3, [tab_Si0]
> >+pinsrb m3, [r2 + 1], 4
> >+
> >+palignr m4, m3, 1
> >+punpcklbw m3, m4
> >+
> >+pmaddubsw m3, [tab_8_24]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 252], m3
> >+
> >+; mode 18
> >+
> >+; no interpolation: rows are plain byte copies
> >+movd m3, [r1]
> >+movd [r0 + 256], m3
> >+
> >+movh m3, [r1 - 1]
> >+pinsrb m3, [r2 + 1], 0
> >+movd [r0 + 260], m3
> >+
> >+movh m3, [r1 - 2]
> >+pinsrb m3, [r2 + 2], 0
> >+pinsrb m3, [r2 + 1], 1
> >+movd [r0 + 264], m3
> >+
> >+; bytes [r2 + 3], [r2 + 2], [r2 + 1] followed by byte [r1]
> >+movh m3, [r2]
> >+pshufb m3, [tab_Si1]
> >+pinsrb m3, [r1], 3
> >+movd [r0 + 268], m3
> >+
> >+; mode 19
> >+
> >+movh m1, [r1]
> >+
> >+; m1 = byte pairs ([r1 + i], [r1 + 1 + i]); kept through mode 25
> >+palignr m2, m1, 1
> >+punpcklbw m1, m2
> >+
> >+; m5 is kept: mode 22 row 1 stores the same result
> >+pmaddubsw m5, m1, [tab_26_6]
> >+pmulhrsw m5, m0
> >+packuswb m5, m5
> >+movd [r0 + 272], m5
> >+
> >+movh m2, [r1 - 1]
> >+pinsrb m2, [r2 + 1], 0
> >+
> >+palignr m3, m2, 1
> >+punpcklbw m2, m3
> >+
> >+pmaddubsw m2, [tab_20_12]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 276], m2
> >+
> >+movh m2, [r1 - 2]
> >+pinsrb m2, [r2 + 2], 0
> >+pinsrb m2, [r2 + 1], 1
> >+
> >+palignr m3, m2, 1
> >+punpcklbw m2, m3
> >+
> >+pmaddubsw m2, [tab_14_18]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 280], m2
> >+
> >+movh m2, [r2]
> >+pshufb m2, [tab_Si0]
> >+pinsrb m2, [r1 + 1], 4
> >+
> >+palignr m3, m2, 1
> >+punpcklbw m2, m3
> >+
> >+pmaddubsw m2, [tab_8_24]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 284], m2
> >+
> >+; mode 20
> >+
> >+pmaddubsw m2, m1, [tab_21_11]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 288], m2
> >+
> >+movh m2, [r1 - 1]
> >+pinsrb m2, [r2 + 2], 0
> >+
> >+palignr m3, m2, 1
> >+punpcklbw m2, m3
> >+
> >+pmaddubsw m3, m2, [tab_10_22]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 292], m3
> >+
> >+pmaddubsw m3, m2, [tab_31_1]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 296], m3
> >+
> >+movh m3, [r1 - 2]
> >+pinsrb m3, [r2 + 3], 0
> >+pinsrb m3, [r2 + 2], 1
> >+
> >+palignr m4, m3, 1
> >+punpcklbw m3, m4
> >+
> >+pmaddubsw m3, [tab_20_12]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 300], m3
> >+
> >+; mode 21
> >+
> >+pmaddubsw m3, m1, [tab_17_15]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 304], m3
> >+
> >+pmaddubsw m3, m2, [tab_2_30]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 308], m3
> >+
> >+pmaddubsw m3, m2, [tab_19_13]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 312], m3
> >+
> >+movh m3, [r1 - 2]
> >+pinsrb m3, [r2 + 4], 0
> >+pinsrb m3, [r2 + 2], 1
> >+
> >+palignr m4, m3, 1
> >+punpcklbw m3, m4
> >+
> >+pmaddubsw m3, [tab_4_28]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 316], m3
> >+
> >+; mode 22
> >+
> >+pmaddubsw m3, m1, [tab_13_19]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 320], m3
> >+
> >+; row 1: reuse mode 19 row 0 (still in m5)
> >+movd [r0 + 324], m5
> >+
> >+pmaddubsw m3, m2, [tab_7_25]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 328], m3
> >+
> >+pmaddubsw m3, m2, [tab_20_12]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 332], m3
> >+
> >+; mode 23
> >+
> >+pmaddubsw m2, m1, [tab_9_23]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 336], m2
> >+
> >+pmaddubsw m2, m1, [tab_18_14]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 340], m2
> >+
> >+pmaddubsw m2, m1, [tab_27_5]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 344], m2
> >+
> >+movh m2, [r1 - 1]
> >+pinsrb m2, [r2 + 4], 0
> >+
> >+palignr m3, m2, 1
> >+punpcklbw m2, m3
> >+
> >+pmaddubsw m2, [tab_4_28]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 348], m2
> >+
> >+; mode 24
> >+
> >+pmaddubsw m2, m1, [tab_5_27]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 352], m2
> >+
> >+pmaddubsw m2, m1, [tab_10_22]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 356], m2
> >+
> >+pmaddubsw m2, m1, [tab_15_17]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 360], m2
> >+
> >+pmaddubsw m2, m1, [tab_20_12]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 364], m2
> >+
> >+; mode 25
> >+
> >+pmaddubsw m2, m1, [tab_2_30]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 368], m2
> >+
> >+pmaddubsw m2, m1, [tab_4_28]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 372], m2
> >+
> >+pmaddubsw m2, m1, [tab_6_26]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 376], m2
> >+
> >+pmaddubsw m2, m1, [tab_8_24]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 380], m2
> >+
> >+; mode 26
> >+
> >+; same scheme as mode 10 with the roles of r1 and r2 swapped
> >+movd m1, [r1 + 1]
> >+pshufd m2, m1, 0
> >+movu [r0 + 384], m2
> >+
> >+mova m2, [tab_Zero]
> >+
> >+pshufb m3, m1, m2
> >+punpcklbw m3, m2
> >+
> >+movd m1, [r2]
> >+
> >+pshufb m1, m2
> >+punpcklbw m1, m2
> >+
> >+movd m4, [r2 + 1]
> >+punpcklbw m4, m2
> >+
> >+psubw m4, m1
> >+psraw m4, 1
> >+
> >+paddw m3, m4
> >+
> >+packuswb m3, m2
> >+
> >+pextrb [r0 + 384], m3, 0
> >+pextrb [r0 + 388], m3, 1
> >+pextrb [r0 + 392], m3, 2
> >+pextrb [r0 + 396], m3, 3
> >+
> >+; mode 27
> >+
> >+movh m1, [r1 + 1]
> >+
> >+; m1 = byte pairs ([r1 + 1 + i], [r1 + 2 + i]); kept through mode 32
> >+palignr m2, m1, 1
> >+punpcklbw m1, m2
> >+
> >+pmaddubsw m2, m1, [tab_30_2]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 400], m2
> >+
> >+pmaddubsw m2, m1, [tab_28_4]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 404], m2
> >+
> >+pmaddubsw m2, m1, [tab_26_6]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 408], m2
> >+
> >+pmaddubsw m2, m1, [tab_24_8]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 412], m2
> >+
> >+; mode 28
> >+
> >+pmaddubsw m2, m1, [tab_27_5]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 416], m2
> >+
> >+pmaddubsw m2, m1, [tab_22_10]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 420], m2
> >+
> >+pmaddubsw m2, m1, [tab_17_15]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 424], m2
> >+
> >+pmaddubsw m2, m1, [tab_12_20]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 428], m2
> >+
> >+; mode 29
> >+
> >+pmaddubsw m2, m1, [tab_23_9]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 432], m2
> >+
> >+pmaddubsw m2, m1, [tab_14_18]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 436], m2
> >+
> >+pmaddubsw m2, m1, [tab_5_27]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 440], m2
> >+
> >+movh m2, [r1 + 2]
> >+
> >+; m2 = byte pairs ([r1 + 2 + i], [r1 + 3 + i]); kept through mode 32
> >+palignr m3, m2, 1
> >+punpcklbw m2, m3
> >+
> >+pmaddubsw m3, m2, [tab_28_4]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 444], m3
> >+
> >+; mode 30
> >+
> >+pmaddubsw m3, m1, [tab_19_13]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 448], m3
> >+
> >+; m6 is kept: mode 33 row 0 stores the same result
> >+pmaddubsw m6, m1, [tab_6_26]
> >+pmulhrsw m6, m0
> >+packuswb m6, m6
> >+movd [r0 + 452], m6
> >+
> >+pmaddubsw m3, m2, [tab_25_7]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 456], m3
> >+
> >+; m5 is kept: mode 33 row 1 stores the same result
> >+pmaddubsw m5, m2, [tab_12_20]
> >+pmulhrsw m5, m0
> >+packuswb m5, m5
> >+movd [r0 + 460], m5
> >+
> >+; mode 31
> >+
> >+pmaddubsw m3, m1, [tab_15_17]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 464], m3
> >+
> >+pmaddubsw m3, m2, [tab_30_2]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 468], m3
> >+
> >+pmaddubsw m3, m2, [tab_13_19]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 472], m3
> >+
> >+movh m3, [r1 + 3]
> >+
> >+; m3 = byte pairs ([r1 + 3 + i], [r1 + 4 + i]); kept through mode 33
> >+palignr m4, m3, 1
> >+punpcklbw m3, m4
> >+
> >+pmaddubsw m4, m3, [tab_28_4]
> >+pmulhrsw m4, m0
> >+packuswb m4, m4
> >+movd [r0 + 476], m4
> >+
> >+; mode 32
> >+
> >+pmaddubsw m4, m1, [tab_11_21]
> >+pmulhrsw m4, m0
> >+packuswb m4, m4
> >+movd [r0 + 480], m4
> >+
> >+pmaddubsw m4, m2, [tab_22_10]
> >+pmulhrsw m4, m0
> >+packuswb m4, m4
> >+movd [r0 + 484], m4
> >+
> >+pmaddubsw m4, m2, [tab_1_31]
> >+pmulhrsw m4, m0
> >+packuswb m4, m4
> >+movd [r0 + 488], m4
> >+
> >+pmaddubsw m4, m3, [tab_12_20]
> >+pmulhrsw m4, m0
> >+packuswb m4, m4
> >+movd [r0 + 492], m4
> >+
> >+; mode 33
> >+
> >+; rows 0 and 1: reuse mode 30 rows 1 and 3 (still in m6 / m5)
> >+movd [r0 + 496], m6
> >+
> >+movd [r0 + 500], m5
> >+
> >+pmaddubsw m3, [tab_18_14]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 504], m3
> >+
> >+movh m1, [r1 + 4]
> >+
> >+palignr m2, m1, 1
> >+punpcklbw m1, m2
> >+
> >+pmaddubsw m1, [tab_24_8]
> >+pmulhrsw m1, m0
> >+packuswb m1, m1
> >+movd [r0 + 508], m1
> >+
> >+; mode 34
> >+
> >+; no interpolation; m0 (pw_1024) is no longer needed and serves as scratch
> >+movd m0, [r1 + 2]
> >+movd [r0 + 512], m0
> >+
> >+movd m0, [r1 + 3]
> >+movd [r0 + 516], m0
> >+
> >+movd m0, [r1 + 4]
> >+movd [r0 + 520], m0
> >+
> >+movd m0, [r1 + 5]
> >+movd [r0 + 524], m0
> >+
> >+RET
> >_______________________________________________
> >x265-devel mailing list
> >x265-devel at videolan.org
> >https://mailman.videolan.org/listinfo/x265-devel
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131203/d599fb89/attachment-0001.html>
More information about the x265-devel
mailing list