<div style="line-height:1.7;color:#000000;font-size:14px;font-family:arial"><DIV>The code is correct, </DIV>
<DIV>but some register copy operations could be reduced. The biggest problem is that half of each register is wasted, e.g. mode 4 and mode 32 could be generated in the same register.</DIV>
<DIV><BR>At 2013-12-02 22:47:06,praveen@multicorewareinc.com wrote:<BR>># HG changeset patch<BR>># User Praveen Tiwari<BR>># Date 1385995608 -19800<BR>># Node ID 1fccf1a770233907fefb0f5a47ed4e7c17223d4a<BR>># Parent df0b4f81609e611989c5b1743e7729adeb51cb01<BR>>asm code for intra_allangs4x4 [all 34 modes]<BR>><BR>>diff -r df0b4f81609e -r 1fccf1a77023 source/common/x86/allangspred.asm<BR>>--- /dev/null Thu Jan 01 00:00:00 1970 +0000<BR>>+++ b/source/common/x86/allangspred.asm Mon Dec 02 20:16:48 2013 +0530<BR>>@@ -0,0 +1,920 @@<BR>>+;*****************************************************************************<BR>>+;* Copyright (C) 2013 x265 project<BR>>+;*<BR>>+;* Authors: Praveen Kumar Tiwari <praveen@multicorewareinc.com><BR>>+;* <BR>>+;* This program is free software; you can redistribute it and/or modify<BR>>+;* it under the terms of the GNU General Public License as published by<BR>>+;* the Free Software Foundation; either version 2 of the License, or<BR>>+;* (at your option) any later version.<BR>>+;*<BR>>+;* This program is distributed in the hope that it will be useful,<BR>>+;* but WITHOUT ANY WARRANTY; without even the implied warranty of<BR>>+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the<BR>>+;* GNU General Public License for more details.<BR>>+;*<BR>>+;* You should have received a copy of the GNU General Public License<BR>>+;* along with this program; if not, write to the Free Software<BR>>+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.<BR>>+;*<BR>>+;* This program is also available under a commercial proprietary license.<BR>>+;* For more information, contact us at licensing@multicorewareinc.com.<BR>>+;*****************************************************************************/<BR>>+<BR>>+%include "x86inc.asm"<BR>>+<BR>>+SECTION_RODATA 32<BR>>+<BR>>+tab_6_26: db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 0, 0<BR>>+tab_12_20: db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20<BR>>+tab_18_14: db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14<BR>>+tab_24_8: db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8<BR>>+tab_11_21: db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21<BR>>+tab_22_10: db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10<BR>>+tab_1_31: db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31<BR>>+tab_15_17: db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17<BR>>+tab_30_2: db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2<BR>>+tab_13_19: db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19<BR>>+tab_28_4: db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4<BR>>+tab_19_13: db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13<BR>>+tab_25_7: db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7<BR>>+tab_23_9: db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9<BR>>+tab_14_18: db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18<BR>>+tab_5_27: db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27<BR>>+tab_27_5: db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5<BR>>+tab_17_15: db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 
15<BR>>+tab_26_6: db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6<BR>>+tab_2_30: db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30<BR>>+tab_4_28: db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28<BR>>+tab_8_24: db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24<BR>>+tab_10_22: db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22<BR>>+tab_20_12: db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12<BR>>+tab_9_23: db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23<BR>>+tab_7_25: db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25<BR>>+tab_21_11: db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11<BR>>+tab_31_1: db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1<BR>>+<BR>>+pw_1024: dw 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024<BR>>+<BR>>+tab_Si0: db 4, 2, 1, 0, 4, 2, 1, 0, 4, 2, 1, 0, 4, 2, 1, 0<BR>>+tab_Si1: db 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0<BR>>+tab_Zero: db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0<BR>>+<BR>>+SECTION .text<BR>>+<BR>>+;-----------------------------------------------------------------------------<BR>>+; void all_angs_pred_4x4(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma)<BR>>+;-----------------------------------------------------------------------------<BR>>+INIT_XMM sse4<BR>>+cglobal all_angs_pred_4x4, 6, 6, 8 dest, above0, left0, above1, left1, bLuma<BR>>+<BR>>+; mode 2<BR>>+<BR>>+movd m0, [r2 + 2]<BR>>+movd m1, [r2 + 3]<BR>>+movd m2, [r2 + 4]<BR>>+movd m3, [r2 + 5]<BR>>+<BR>>+movd [r0], m0<BR>>+movd [r0 + 4], m1<BR>>+movd [r0 + 8], m2<BR>>+movd [r0 + 12], m3<BR>>+<BR>>+; mode 3<BR>>+<BR>>+mova m0, [pw_1024]<BR>>+<BR>>+movu m1, [r2 + 1]<BR>>+<BR>>+palignr m2, m1, 1<BR>>+punpcklbw m1, m2<BR>>+<BR>>+pmaddubsw m7, m1, [tab_6_26]<BR>>+pmulhrsw m7, m0<BR>>+packuswb m7, m7<BR>>+movd [r0 + 16], m7<BR>>+<BR>>+movu m2, [r2 + 2]<BR>>+<BR>>+palignr m3, m2, 1<BR>>+punpcklbw m2, m3<BR>>+<BR>>+pmaddubsw m6, m2, 
[tab_12_20]<BR>>+pmulhrsw m6, m0<BR>>+packuswb m6, m6<BR>>+movd [r0 + 20], m6<BR>>+<BR>>+movu m3, [r2 + 3]<BR>>+<BR>>+palignr m4, m3, 1<BR>>+punpcklbw m3, m4<BR>>+<BR>>+pmaddubsw m4, m3, [tab_18_14]<BR>>+pmulhrsw m4, m0<BR>>+packuswb m4, m4<BR>>+movd [r0 + 24], m4<BR>>+<BR>>+movu m4, [r2 + 4]<BR>>+<BR>>+palignr m5, m4, 1<BR>>+punpcklbw m4, m5<BR>>+<BR>>+pmaddubsw m4, [tab_24_8]<BR>>+pmulhrsw m4, m0<BR>>+packuswb m4, m4<BR>>+movd [r0 + 28], m4<BR>>+<BR>>+; mode 4<BR>>+<BR>>+pmaddubsw m4, m1, [tab_11_21]<BR>>+pmulhrsw m4, m0<BR>>+packuswb m4, m4<BR>>+movd [r0 + 32], m4<BR>>+<BR>>+pmaddubsw m4, m2, [tab_22_10]<BR>>+pmulhrsw m4, m0<BR>>+packuswb m4, m4<BR>>+movd [r0 + 36], m4<BR>>+<BR>>+pmaddubsw m4, m2, [tab_1_31]<BR>>+pmulhrsw m4, m0<BR>>+packuswb m4, m4<BR>>+movd [r0 + 40], m4<BR>>+<BR>>+pmaddubsw m4, m3, [tab_12_20]<BR>>+pmulhrsw m4, m0<BR>>+packuswb m4, m4<BR>>+movd [r0 + 44], m4<BR>>+<BR>>+; mode 5<BR>>+<BR>>+pmaddubsw m4, m1, [tab_15_17]<BR>>+pmulhrsw m4, m0<BR>>+packuswb m4, m4<BR>>+movd [r0 + 48], m4<BR>>+<BR>>+pmaddubsw m4, m2, [tab_30_2]<BR>>+pmulhrsw m4, m0<BR>>+packuswb m4, m4<BR>>+movd [r0 + 52], m4<BR>>+<BR>>+pmaddubsw m4, m2, [tab_13_19]<BR>>+pmulhrsw m4, m0<BR>>+packuswb m4, m4<BR>>+movd [r0 + 56], m4<BR>>+<BR>>+pmaddubsw m3, [tab_28_4]<BR>>+pmulhrsw m3, m0<BR>>+packuswb m3, m3<BR>>+movd [r0 + 60], m3<BR>>+<BR>>+; mode 6<BR>>+<BR>>+pmaddubsw m3, m1, [tab_19_13]<BR>>+pmulhrsw m3, m0<BR>>+packuswb m3, m3<BR>>+movd [r0 + 64], m3<BR>>+<BR>>+movd [r0 + 68], m7<BR>>+<BR>>+pmaddubsw m3, m2, [tab_25_7]<BR>>+pmulhrsw m3, m0<BR>>+packuswb m3, m3<BR>>+movd [r0 + 72], m3<BR>>+<BR>>+movd [r0 + 76], m6<BR>>+<BR>>+; mode 7<BR>>+<BR>>+pmaddubsw m3, m1, [tab_23_9]<BR>>+pmulhrsw m3, m0<BR>>+packuswb m3, m3<BR>>+movd [r0 + 80], m3<BR>>+<BR>>+pmaddubsw m3, m1, [tab_14_18]<BR>>+pmulhrsw m3, m0<BR>>+packuswb m3, m3<BR>>+movd [r0 + 84], m3<BR>>+<BR>>+pmaddubsw m3, m1, [tab_5_27]<BR>>+pmulhrsw m3, m0<BR>>+packuswb m3, m3<BR>>+movd [r0 + 88], m3<BR>>+<BR>>+pmaddubsw m2, 
[tab_28_4]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 92], m2<BR>>+<BR>>+; mode 8<BR>>+<BR>>+pmaddubsw m2, m1, [tab_27_5]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 96], m2<BR>>+<BR>>+pmaddubsw m2, m1, [tab_22_10]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 100], m2<BR>>+<BR>>+pmaddubsw m2, m1, [tab_17_15]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 104], m2<BR>>+<BR>>+pmaddubsw m2, m1, [tab_12_20]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 108], m2<BR>>+<BR>>+; mode 9<BR>>+<BR>>+pmaddubsw m2, m1, [tab_30_2]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 112], m2<BR>>+<BR>>+pmaddubsw m2, m1, [tab_28_4]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 116], m2<BR>>+<BR>>+pmaddubsw m2, m1, [tab_26_6]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 120], m2<BR>>+<BR>>+pmaddubsw m1, [tab_24_8]<BR>>+pmulhrsw m1, m0<BR>>+packuswb m1, m1<BR>>+movd [r0 + 124], m1<BR>>+<BR>>+; mode 10<BR>>+<BR>>+movd m1, [r2 + 1]<BR>>+pshufd m2, m1, 0<BR>>+movu [r0 + 128], m2<BR>>+<BR>>+mova m2, [tab_Zero]<BR>>+<BR>>+pshufb m3, m1, m2<BR>>+punpcklbw m3, m2<BR>>+<BR>>+movd m1, [r1]<BR>>+<BR>>+pshufb m1, m2<BR>>+punpcklbw m1, m2<BR>>+<BR>>+movd m4, [r1 + 1]<BR>>+punpcklbw m4, m2<BR>>+<BR>>+psubw m4, m1<BR>>+psraw m4, 1<BR>>+<BR>>+paddw m3, m4<BR>>+<BR>>+packuswb m3, m2<BR>>+<BR>>+pextrb [r0 + 128], m3, 0<BR>>+pextrb [r0 + 132], m3, 1<BR>>+pextrb [r0 + 136], m3, 2<BR>>+pextrb [r0 + 140], m3, 3<BR>>+<BR>>+; mode 11<BR>>+<BR>>+movu m1, [r2]<BR>>+<BR>>+palignr m2, m1, 1<BR>>+punpcklbw m1, m2<BR>>+<BR>>+pmaddubsw m2, m1, [tab_2_30]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 144], m2<BR>>+<BR>>+pmaddubsw m2, m1, [tab_4_28]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 148], m2<BR>>+<BR>>+pmaddubsw m2, m1, [tab_6_26]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 152], m2<BR>>+<BR>>+pmaddubsw m2, m1, [tab_8_24]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, 
m2<BR>>+movd [r0 + 156], m2<BR>>+<BR>>+; mode 12<BR>>+<BR>>+pmaddubsw m2, m1, [tab_5_27]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 160], m2<BR>>+<BR>>+pmaddubsw m2, m1, [tab_10_22]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 164], m2<BR>>+<BR>>+pmaddubsw m2, m1, [tab_15_17]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 168], m2<BR>>+<BR>>+pmaddubsw m2, m1, [tab_20_12]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 172], m2<BR>>+<BR>>+; mode 13<BR>>+<BR>>+pmaddubsw m2, m1, [tab_9_23]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 176], m2<BR>>+<BR>>+pmaddubsw m2, m1, [tab_18_14]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 180], m2<BR>>+<BR>>+pmaddubsw m2, m1, [tab_27_5]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 184], m2<BR>>+<BR>>+movh m2, [r2 - 1]<BR>>+pinsrb m2, [r1 + 4], 0<BR>>+<BR>>+palignr m3, m2, 1<BR>>+punpcklbw m2, m3<BR>>+<BR>>+pmaddubsw m2, [tab_4_28]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 188], m2<BR>>+<BR>>+; mode 14<BR>>+<BR>>+pmaddubsw m2, m1, [tab_13_19]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 192], m2<BR>>+<BR>>+pmaddubsw m5, m1, [tab_26_6]<BR>>+pmulhrsw m5, m0<BR>>+packuswb m5, m5<BR>>+movd [r0 + 196], m5<BR>>+<BR>>+movh m2, [r2 - 1]<BR>>+pinsrb m2, [r1 + 2], 0<BR>>+<BR>>+palignr m3, m2, 1<BR>>+punpcklbw m2, m3<BR>>+<BR>>+pmaddubsw m3, m2, [tab_7_25]<BR>>+pmulhrsw m3, m0<BR>>+packuswb m3, m3<BR>>+movd [r0 + 200], m3<BR>>+<BR>>+pmaddubsw m3, m2, [tab_20_12]<BR>>+pmulhrsw m3, m0<BR>>+packuswb m3, m3<BR>>+movd [r0 + 204], m3<BR>>+<BR>>+; mode 15<BR>>+<BR>>+pmaddubsw m3, m1, [tab_17_15]<BR>>+pmulhrsw m3, m0<BR>>+packuswb m3, m3<BR>>+movd [r0 + 208], m3<BR>>+<BR>>+pmaddubsw m3, m2, [tab_2_30]<BR>>+pmulhrsw m3, m0<BR>>+packuswb m3, m3<BR>>+movd [r0 + 212], m3<BR>>+<BR>>+pmaddubsw m3, m2, [tab_19_13]<BR>>+pmulhrsw m3, m0<BR>>+packuswb m3, m3<BR>>+movd [r0 + 216], m3<BR>>+<BR>>+movh m3, [r2 - 2]<BR>>+pinsrb m3, [r1 + 4], 
0<BR>>+pinsrb m3, [r1 + 2], 1<BR>>+<BR>>+palignr m4, m3, 1<BR>>+punpcklbw m3, m4<BR>>+<BR>>+pmaddubsw m3, [tab_4_28]<BR>>+pmulhrsw m3, m0<BR>>+packuswb m3, m3<BR>>+movd [r0 + 220], m3<BR>>+<BR>>+; mode 16<BR>>+<BR>>+pmaddubsw m3, m1, [tab_21_11]<BR>>+pmulhrsw m3, m0<BR>>+packuswb m3, m3<BR>>+movd [r0 + 224], m3<BR>>+<BR>>+pmaddubsw m3, m2, [tab_10_22]<BR>>+pmulhrsw m3, m0<BR>>+packuswb m3, m3<BR>>+movd [r0 + 228], m3<BR>>+<BR>>+pmaddubsw m3, m2, [tab_31_1]<BR>>+pmulhrsw m3, m0<BR>>+packuswb m3, m3<BR>>+movd [r0 + 232], m3<BR>>+<BR>>+movh m3, [r2 - 2]<BR>>+pinsrb m3, [r1 + 3], 0<BR>>+pinsrb m3, [r1 + 2], 1<BR>>+<BR>>+palignr m4, m3, 1<BR>>+punpcklbw m3, m4<BR>>+<BR>>+pmaddubsw m3, [tab_20_12]<BR>>+pmulhrsw m3, m0<BR>>+packuswb m3, m3<BR>>+movd [r0 + 236], m3<BR>>+<BR>>+; mode 17<BR>>+<BR>>+movd [r0 + 240], m5<BR>>+<BR>>+movh m3, [r2 - 1]<BR>>+pinsrb m3, [r1 + 1], 0<BR>>+<BR>>+palignr m4, m3, 1<BR>>+punpcklbw m3, m4<BR>>+<BR>>+pmaddubsw m3, [tab_20_12]<BR>>+pmulhrsw m3, m0<BR>>+packuswb m3, m3<BR>>+movd [r0 + 244], m3<BR>>+<BR>>+movh m3, [r2 - 2]<BR>>+pinsrb m3, [r1 + 2], 0<BR>>+pinsrb m3, [r1 + 1], 1<BR>>+<BR>>+palignr m4, m3, 1<BR>>+punpcklbw m3, m4<BR>>+<BR>>+pmaddubsw m3, [tab_14_18]<BR>>+pmulhrsw m3, m0<BR>>+packuswb m3, m3<BR>>+movd [r0 + 248], m3<BR>>+<BR>>+movh m3, [r1]<BR>>+pshufb m3, [tab_Si0]<BR>>+pinsrb m3, [r2 + 1], 4<BR>>+<BR>>+palignr m4, m3, 1<BR>>+punpcklbw m3, m4<BR>>+<BR>>+pmaddubsw m3, [tab_8_24]<BR>>+pmulhrsw m3, m0<BR>>+packuswb m3, m3<BR>>+movd [r0 + 252], m3<BR>>+<BR>>+; mode 18<BR>>+<BR>>+movd m3, [r1]<BR>>+movd [r0 + 256], m3<BR>>+<BR>>+movh m3, [r1 - 1]<BR>>+pinsrb m3, [r2 + 1], 0<BR>>+movd [r0 + 260], m3<BR>>+<BR>>+movh m3, [r1 - 2]<BR>>+pinsrb m3, [r2 + 2], 0<BR>>+pinsrb m3, [r2 + 1], 1<BR>>+movd [r0 + 264], m3<BR>>+<BR>>+movh m3, [r2]<BR>>+pshufb m3, [tab_Si1]<BR>>+pinsrb m3, [r1], 3<BR>>+movd [r0 + 268], m3<BR>>+<BR>>+; mode 19<BR>>+<BR>>+movh m1, [r1]<BR>>+<BR>>+palignr m2, m1, 1<BR>>+punpcklbw m1, m2<BR>>+<BR>>+pmaddubsw m5, m1, 
[tab_26_6]<BR>>+pmulhrsw m5, m0<BR>>+packuswb m5, m5<BR>>+movd [r0 + 272], m5<BR>>+<BR>>+movh m2, [r1 - 1]<BR>>+pinsrb m2, [r2 + 1], 0<BR>>+<BR>>+palignr m3, m2, 1<BR>>+punpcklbw m2, m3<BR>>+<BR>>+pmaddubsw m2, [tab_20_12]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 276], m2<BR>>+<BR>>+movh m2, [r1 - 2]<BR>>+pinsrb m2, [r2 + 2], 0<BR>>+pinsrb m2, [r2 + 1], 1<BR>>+<BR>>+palignr m3, m2, 1<BR>>+punpcklbw m2, m3<BR>>+<BR>>+pmaddubsw m2, [tab_14_18]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 280], m2<BR>>+<BR>>+movh m2, [r2]<BR>>+pshufb m2, [tab_Si0]<BR>>+pinsrb m2, [r1 + 1], 4<BR>>+<BR>>+palignr m3, m2, 1<BR>>+punpcklbw m2, m3<BR>>+<BR>>+pmaddubsw m2, [tab_8_24]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 284], m2<BR>>+<BR>>+; mode 20<BR>>+<BR>>+pmaddubsw m2, m1, [tab_21_11]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 288], m2<BR>>+<BR>>+movh m2, [r1 - 1]<BR>>+pinsrb m2, [r2 + 2], 0<BR>>+<BR>>+palignr m3, m2, 1<BR>>+punpcklbw m2, m3<BR>>+<BR>>+pmaddubsw m3, m2, [tab_10_22]<BR>>+pmulhrsw m3, m0<BR>>+packuswb m3, m3<BR>>+movd [r0 + 292], m3<BR>>+<BR>>+pmaddubsw m3, m2, [tab_31_1]<BR>>+pmulhrsw m3, m0<BR>>+packuswb m3, m3<BR>>+movd [r0 + 296], m3<BR>>+<BR>>+movh m3, [r1 - 2]<BR>>+pinsrb m3, [r2 + 3], 0<BR>>+pinsrb m3, [r2 + 2], 1<BR>>+<BR>>+palignr m4, m3, 1<BR>>+punpcklbw m3, m4<BR>>+<BR>>+pmaddubsw m3, [tab_20_12]<BR>>+pmulhrsw m3, m0<BR>>+packuswb m3, m3<BR>>+movd [r0 + 300], m3<BR>>+<BR>>+; mode 21<BR>>+<BR>>+pmaddubsw m3, m1, [tab_17_15]<BR>>+pmulhrsw m3, m0<BR>>+packuswb m3, m3<BR>>+movd [r0 + 304], m3<BR>>+<BR>>+pmaddubsw m3, m2, [tab_2_30]<BR>>+pmulhrsw m3, m0<BR>>+packuswb m3, m3<BR>>+movd [r0 + 308], m3<BR>>+<BR>>+pmaddubsw m3, m2, [tab_19_13]<BR>>+pmulhrsw m3, m0<BR>>+packuswb m3, m3<BR>>+movd [r0 + 312], m3<BR>>+<BR>>+movh m3, [r1 - 2]<BR>>+pinsrb m3, [r2 + 4], 0<BR>>+pinsrb m3, [r2 + 2], 1<BR>>+<BR>>+palignr m4, m3, 1<BR>>+punpcklbw m3, m4<BR>>+<BR>>+pmaddubsw m3, [tab_4_28]<BR>>+pmulhrsw m3, 
m0<BR>>+packuswb m3, m3<BR>>+movd [r0 + 316], m3<BR>>+<BR>>+; mode 22<BR>>+<BR>>+pmaddubsw m3, m1, [tab_13_19]<BR>>+pmulhrsw m3, m0<BR>>+packuswb m3, m3<BR>>+movd [r0 + 320], m3<BR>>+<BR>>+movd [r0 + 324], m5<BR>>+<BR>>+pmaddubsw m3, m2, [tab_7_25]<BR>>+pmulhrsw m3, m0<BR>>+packuswb m3, m3<BR>>+movd [r0 + 328], m3<BR>>+<BR>>+pmaddubsw m3, m2, [tab_20_12]<BR>>+pmulhrsw m3, m0<BR>>+packuswb m3, m3<BR>>+movd [r0 + 332], m3<BR>>+<BR>>+; mode 23<BR>>+<BR>>+pmaddubsw m2, m1, [tab_9_23]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 336], m2<BR>>+<BR>>+pmaddubsw m2, m1, [tab_18_14]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 340], m2<BR>>+<BR>>+pmaddubsw m2, m1, [tab_27_5]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 344], m2<BR>>+<BR>>+movh m2, [r1 - 1]<BR>>+pinsrb m2, [r2 + 4], 0<BR>>+<BR>>+palignr m3, m2, 1<BR>>+punpcklbw m2, m3<BR>>+<BR>>+pmaddubsw m2, [tab_4_28]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 348], m2<BR>>+<BR>>+; mode 24<BR>>+<BR>>+pmaddubsw m2, m1, [tab_5_27]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 352], m2<BR>>+<BR>>+pmaddubsw m2, m1, [tab_10_22]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 356], m2<BR>>+<BR>>+pmaddubsw m2, m1, [tab_15_17]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 360], m2<BR>>+<BR>>+pmaddubsw m2, m1, [tab_20_12]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 364], m2<BR>>+<BR>>+; mode 25<BR>>+<BR>>+pmaddubsw m2, m1, [tab_2_30]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 368], m2<BR>>+<BR>>+pmaddubsw m2, m1, [tab_4_28]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 372], m2<BR>>+<BR>>+pmaddubsw m2, m1, [tab_6_26]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 376], m2<BR>>+<BR>>+pmaddubsw m2, m1, [tab_8_24]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 380], m2<BR>>+<BR>>+; mode 26<BR>>+<BR>>+movd m1, [r1 + 1]<BR>>+pshufd m2, m1, 0<BR>>+movu [r0 + 384], m2<BR>>+<BR>>+mova m2, 
[tab_Zero]<BR>>+<BR>>+pshufb m3, m1, m2<BR>>+punpcklbw m3, m2<BR>>+<BR>>+movd m1, [r2]<BR>>+<BR>>+pshufb m1, m2<BR>>+punpcklbw m1, m2<BR>>+<BR>>+movd m4, [r2 + 1]<BR>>+punpcklbw m4, m2<BR>>+<BR>>+psubw m4, m1<BR>>+psraw m4, 1<BR>>+<BR>>+paddw m3, m4<BR>>+<BR>>+packuswb m3, m2<BR>>+<BR>>+pextrb [r0 + 384], m3, 0<BR>>+pextrb [r0 + 388], m3, 1<BR>>+pextrb [r0 + 392], m3, 2<BR>>+pextrb [r0 + 396], m3, 3<BR>>+<BR>>+; mode 27<BR>>+<BR>>+movh m1, [r1 + 1]<BR>>+<BR>>+palignr m2, m1, 1<BR>>+punpcklbw m1, m2<BR>>+<BR>>+pmaddubsw m2, m1, [tab_30_2]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 400], m2<BR>>+<BR>>+pmaddubsw m2, m1, [tab_28_4]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 404], m2<BR>>+<BR>>+pmaddubsw m2, m1, [tab_26_6]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 408], m2<BR>>+<BR>>+pmaddubsw m2, m1, [tab_24_8]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 412], m2<BR>>+<BR>>+; mode 28<BR>>+<BR>>+pmaddubsw m2, m1, [tab_27_5]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 416], m2<BR>>+<BR>>+pmaddubsw m2, m1, [tab_22_10]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 420], m2<BR>>+<BR>>+pmaddubsw m2, m1, [tab_17_15]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 424], m2<BR>>+<BR>>+pmaddubsw m2, m1, [tab_12_20]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 428], m2<BR>>+<BR>>+; mode 29<BR>>+<BR>>+pmaddubsw m2, m1, [tab_23_9]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 432], m2<BR>>+<BR>>+pmaddubsw m2, m1, [tab_14_18]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 436], m2<BR>>+<BR>>+pmaddubsw m2, m1, [tab_5_27]<BR>>+pmulhrsw m2, m0<BR>>+packuswb m2, m2<BR>>+movd [r0 + 440], m2<BR>>+<BR>>+movh m2, [r1 + 2]<BR>>+<BR>>+palignr m3, m2, 1<BR>>+punpcklbw m2, m3<BR>>+<BR>>+pmaddubsw m3, m2, [tab_28_4]<BR>>+pmulhrsw m3, m0<BR>>+packuswb m3, m3<BR>>+movd [r0 + 444], m3<BR>>+<BR>>+; mode 30<BR>>+<BR>>+pmaddubsw m3, m1, [tab_19_13]<BR>>+pmulhrsw m3, 
m0<BR>>+packuswb m3, m3<BR>>+movd [r0 + 448], m3<BR>>+<BR>>+pmaddubsw m6, m1, [tab_6_26]<BR>>+pmulhrsw m6, m0<BR>>+packuswb m6, m6<BR>>+movd [r0 + 452], m6<BR>>+<BR>>+pmaddubsw m3, m2, [tab_25_7]<BR>>+pmulhrsw m3, m0<BR>>+packuswb m3, m3<BR>>+movd [r0 + 456], m3<BR>>+<BR>>+pmaddubsw m5, m2, [tab_12_20]<BR>>+pmulhrsw m5, m0<BR>>+packuswb m5, m5<BR>>+movd [r0 + 460], m5<BR>>+<BR>>+; mode 31<BR>>+<BR>>+pmaddubsw m3, m1, [tab_15_17]<BR>>+pmulhrsw m3, m0<BR>>+packuswb m3, m3<BR>>+movd [r0 + 464], m3<BR>>+<BR>>+pmaddubsw m3, m2, [tab_30_2]<BR>>+pmulhrsw m3, m0<BR>>+packuswb m3, m3<BR>>+movd [r0 + 468], m3<BR>>+<BR>>+pmaddubsw m3, m2, [tab_13_19]<BR>>+pmulhrsw m3, m0<BR>>+packuswb m3, m3<BR>>+movd [r0 + 472], m3<BR>>+<BR>>+movh m3, [r1 + 3]<BR>>+<BR>>+palignr m4, m3, 1<BR>>+punpcklbw m3, m4<BR>>+<BR>>+pmaddubsw m4, m3, [tab_28_4]<BR>>+pmulhrsw m4, m0<BR>>+packuswb m4, m4<BR>>+movd [r0 + 476], m4<BR>>+<BR>>+; mode 32<BR>>+<BR>>+pmaddubsw m4, m1, [tab_11_21]<BR>>+pmulhrsw m4, m0<BR>>+packuswb m4, m4<BR>>+movd [r0 + 480], m4<BR>>+<BR>>+pmaddubsw m4, m2, [tab_22_10]<BR>>+pmulhrsw m4, m0<BR>>+packuswb m4, m4<BR>>+movd [r0 + 484], m4<BR>>+<BR>>+pmaddubsw m4, m2, [tab_1_31]<BR>>+pmulhrsw m4, m0<BR>>+packuswb m4, m4<BR>>+movd [r0 + 488], m4<BR>>+<BR>>+pmaddubsw m4, m3, [tab_12_20]<BR>>+pmulhrsw m4, m0<BR>>+packuswb m4, m4<BR>>+movd [r0 + 492], m4<BR>>+<BR>>+; mode 33<BR>>+<BR>>+movd [r0 + 496], m6<BR>>+<BR>>+movd [r0 + 500], m5<BR>>+<BR>>+pmaddubsw m3, [tab_18_14]<BR>>+pmulhrsw m3, m0<BR>>+packuswb m3, m3<BR>>+movd [r0 + 504], m3<BR>>+<BR>>+movh m1, [r1 + 4]<BR>>+<BR>>+palignr m2, m1, 1<BR>>+punpcklbw m1, m2<BR>>+<BR>>+pmaddubsw m1, [tab_24_8]<BR>>+pmulhrsw m1, m0<BR>>+packuswb m1, m1<BR>>+movd [r0 + 508], m1<BR>>+<BR>>+; mode 34<BR>>+<BR>>+movd m0, [r1 + 2]<BR>>+movd [r0 + 512], m0<BR>>+<BR>>+movd m0, [r1 + 3]<BR>>+movd [r0 + 516], m0<BR>>+<BR>>+movd m0, [r1 + 4]<BR>>+movd [r0 + 520], m0<BR>>+<BR>>+movd m0, [r1 + 5]<BR>>+movd [r0 + 524], 
m0<BR>>+<BR>>+RET<BR>>_______________________________________________<BR>>x265-devel mailing list<BR>>x265-devel@videolan.org<BR>>https://mailman.videolan.org/listinfo/x265-devel<BR></DIV></div>