[x265] [PATCH] moved all_angs_pred_4x4 (all modes) to intrapred8.asm file

praveen at multicorewareinc.com praveen at multicorewareinc.com
Tue Dec 3 13:57:48 CET 2013


# HG changeset patch
# User Praveen Tiwari
# Date 1386075447 -19800
# Node ID a9257e7012d7e2526176580e8e5854d5e2a9815c
# Parent  d18c574e0ce928adcbeb2438b9d291058bffb928
moved all_ands_pred_4x4, all modes to intrapred8.asm file

diff -r d18c574e0ce9 -r a9257e7012d7 source/common/CMakeLists.txt
--- a/source/common/CMakeLists.txt	Tue Dec 03 14:51:09 2013 +0530
+++ b/source/common/CMakeLists.txt	Tue Dec 03 18:27:27 2013 +0530
@@ -118,10 +118,10 @@
 endif(ENABLE_PRIMITIVES_VEC)
 
 if(ENABLE_PRIMITIVES_ASM)
-    set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h allangs-pred.h)
+    set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h)
     set(A_SRCS pixel-a.asm const-a.asm cpu-a.asm sad-a.asm ssd-a.asm mc-a.asm
                mc-a2.asm ipfilter8.asm pixel-util8.asm blockcopy8.asm intrapred8.asm
-               pixeladd8.asm dct8.asm allangs-pred8.asm)
+               pixeladd8.asm dct8.asm)
     if (NOT X64)
         set(A_SRCS ${A_SRCS} pixel-32.asm)
     endif()
diff -r d18c574e0ce9 -r a9257e7012d7 source/common/x86/allangs-pred.h
--- a/source/common/x86/allangs-pred.h	Tue Dec 03 14:51:09 2013 +0530
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,31 +0,0 @@
-/*****************************************************************************
- * allangspred.h: Intra Prediction metrics
- *****************************************************************************
- * Copyright (C) 2003-2013 x264 project
- *
- * Authors: Praveen Kumar Tiwari<praveen at multicorewareinc.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at licensing at x264.com.
- *****************************************************************************/
-
-#ifndef X265_ALLANGSPRED_H
-#define X265_ALLANGSPRED_H
-
-void x265_all_angs_pred_4x4_sse4(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma);
-
-#endif
diff -r d18c574e0ce9 -r a9257e7012d7 source/common/x86/allangs-pred8.asm
--- a/source/common/x86/allangs-pred8.asm	Tue Dec 03 14:51:09 2013 +0530
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,920 +0,0 @@
-;*****************************************************************************
-;* Copyright (C) 2013 x265 project
-;*
-;* Authors: Praveen Kumar Tiwari <praveen at multicorewareinc.com>
-;* 
-;* This program is free software; you can redistribute it and/or modify
-;* it under the terms of the GNU General Public License as published by
-;* the Free Software Foundation; either version 2 of the License, or
-;* (at your option) any later version.
-;*
-;* This program is distributed in the hope that it will be useful,
-;* but WITHOUT ANY WARRANTY; without even the implied warranty of
-;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-;* GNU General Public License for more details.
-;*
-;* You should have received a copy of the GNU General Public License
-;* along with this program; if not, write to the Free Software
-;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
-;*
-;* This program is also available under a commercial proprietary license.
-;* For more information, contact us at licensing at multicorewareinc.com.
-;*****************************************************************************/
-
-%include "x86inc.asm"
-
-SECTION_RODATA 32
-
-tab_6_26:  db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 0, 0
-tab_12_20: db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
-tab_18_14: db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
-tab_24_8:  db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
-tab_11_21: db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
-tab_22_10: db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
-tab_1_31:  db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
-tab_15_17: db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
-tab_30_2:  db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
-tab_13_19: db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
-tab_28_4:  db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
-tab_19_13: db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13
-tab_25_7:  db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7
-tab_23_9:  db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9
-tab_14_18: db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
-tab_5_27:  db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
-tab_27_5:  db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5
-tab_17_15: db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
-tab_26_6:  db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
-tab_2_30:  db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
-tab_4_28:  db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
-tab_8_24:  db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
-tab_10_22: db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
-tab_20_12: db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
-tab_9_23:  db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23
-tab_7_25:  db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
-tab_21_11: db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11
-tab_31_1:  db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1
-
-pw_1024:   dw 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024
-
-tab_Si0:  db 4, 2, 1, 0, 4, 2, 1, 0, 4, 2, 1, 0, 4, 2, 1, 0
-tab_Si1:  db 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-tab_Zero: db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-
-SECTION .text
-
-;-----------------------------------------------------------------------------
-; void all_angs_pred_4x4(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma)
-;-----------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal all_angs_pred_4x4, 6, 6, 8 dest, above0, left0, above1, left1, bLuma
-
-; mode 2
-
-movd     m0,        [r2 + 2]
-movd     m1,        [r2 + 3]
-movd     m2,        [r2 + 4]
-movd     m3,        [r2 + 5]
-
-movd     [r0],      m0
-movd     [r0 + 4],  m1
-movd     [r0 + 8],  m2
-movd     [r0 + 12], m3
-
-; mode 3
-
-mova          m0,        [pw_1024]
-
-movu          m1,        [r2 + 1]
-
-palignr       m2,        m1,        1
-punpcklbw     m1,        m2
-
-pmaddubsw     m7,        m1,        [tab_6_26]
-pmulhrsw      m7,        m0
-packuswb      m7,        m7
-movd          [r0 + 16], m7
-
-movu          m2,        [r2 + 2]
-
-palignr       m3,        m2,        1
-punpcklbw     m2,        m3
-
-pmaddubsw     m6,        m2,        [tab_12_20]
-pmulhrsw      m6,        m0
-packuswb      m6,        m6
-movd          [r0 + 20], m6
-
-movu          m3,        [r2 + 3]
-
-palignr       m4,        m3,        1
-punpcklbw     m3,        m4
-
-pmaddubsw     m4,        m3,        [tab_18_14]
-pmulhrsw      m4,        m0
-packuswb      m4,        m4
-movd          [r0 + 24], m4
-
-movu          m4,        [r2 + 4]
-
-palignr       m5,        m4,        1
-punpcklbw     m4,        m5
-
-pmaddubsw     m4,        [tab_24_8]
-pmulhrsw      m4,        m0
-packuswb      m4,        m4
-movd          [r0 + 28], m4
-
-; mode 4
-
-pmaddubsw     m4,        m1,        [tab_11_21]
-pmulhrsw      m4,        m0
-packuswb      m4,        m4
-movd          [r0 + 32], m4
-
-pmaddubsw     m4,        m2,        [tab_22_10]
-pmulhrsw      m4,        m0
-packuswb      m4,        m4
-movd          [r0 + 36], m4
-
-pmaddubsw     m4,        m2,        [tab_1_31]
-pmulhrsw      m4,        m0
-packuswb      m4,        m4
-movd          [r0 + 40], m4
-
-pmaddubsw     m4,        m3,        [tab_12_20]
-pmulhrsw      m4,        m0
-packuswb      m4,        m4
-movd          [r0 + 44], m4
-
-; mode 5
-
-pmaddubsw     m4,        m1,        [tab_15_17]
-pmulhrsw      m4,        m0
-packuswb      m4,        m4
-movd          [r0 + 48], m4
-
-pmaddubsw     m4,        m2,        [tab_30_2]
-pmulhrsw      m4,        m0
-packuswb      m4,        m4
-movd          [r0 + 52], m4
-
-pmaddubsw     m4,        m2,        [tab_13_19]
-pmulhrsw      m4,        m0
-packuswb      m4,        m4
-movd          [r0 + 56], m4
-
-pmaddubsw     m3,        [tab_28_4]
-pmulhrsw      m3,        m0
-packuswb      m3,        m3
-movd          [r0 + 60], m3
-
-; mode 6
-
-pmaddubsw     m3,        m1,        [tab_19_13]
-pmulhrsw      m3,        m0
-packuswb      m3,        m3
-movd          [r0 + 64], m3
-
-movd          [r0 + 68], m7
-
-pmaddubsw     m3,        m2,        [tab_25_7]
-pmulhrsw      m3,        m0
-packuswb      m3,        m3
-movd          [r0 + 72], m3
-
-movd          [r0 + 76], m6
-
-; mode 7
-
-pmaddubsw     m3,        m1,        [tab_23_9]
-pmulhrsw      m3,        m0
-packuswb      m3,        m3
-movd          [r0 + 80], m3
-
-pmaddubsw     m3,        m1,        [tab_14_18]
-pmulhrsw      m3,        m0
-packuswb      m3,        m3
-movd          [r0 + 84], m3
-
-pmaddubsw     m3,        m1,        [tab_5_27]
-pmulhrsw      m3,        m0
-packuswb      m3,        m3
-movd          [r0 + 88], m3
-
-pmaddubsw     m2,        [tab_28_4]
-pmulhrsw      m2,        m0
-packuswb      m2,        m2
-movd          [r0 + 92], m2
-
-; mode 8
-
-pmaddubsw     m2,         m1,       [tab_27_5]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 96],  m2
-
-pmaddubsw     m2,         m1,       [tab_22_10]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 100], m2
-
-pmaddubsw     m2,         m1,       [tab_17_15]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 104], m2
-
-pmaddubsw     m2,         m1,       [tab_12_20]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 108], m2
-
-; mode 9
-
-pmaddubsw     m2,         m1,       [tab_30_2]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 112], m2
-
-pmaddubsw     m2,         m1,       [tab_28_4]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 116], m2
-
-pmaddubsw     m2,         m1,       [tab_26_6]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 120], m2
-
-pmaddubsw     m1,         [tab_24_8]
-pmulhrsw      m1,         m0
-packuswb      m1,         m1
-movd          [r0 + 124], m1
-
-; mode 10
-
-movd          m1,         [r2 + 1]
-pshufd        m2,         m1,        0
-movu          [r0 + 128], m2
-
-mova         m2,          [tab_Zero]
-
-pshufb       m3,          m1,       m2
-punpcklbw    m3,          m2
-
-movd         m1,          [r1]
-
-pshufb       m1,          m2
-punpcklbw    m1,          m2
-
-movd         m4,          [r1 + 1]
-punpcklbw    m4,          m2
-
-psubw        m4,          m1
-psraw        m4,          1
-
-paddw        m3,          m4
-
-packuswb     m3,          m2
-
-pextrb       [r0 + 128],  m3,    0
-pextrb       [r0 + 132],  m3,    1
-pextrb       [r0 + 136],  m3,    2
-pextrb       [r0 + 140],  m3,    3
-
-; mode 11
-
-movu          m1,         [r2]
-
-palignr       m2,         m1,        1
-punpcklbw     m1,         m2
-
-pmaddubsw     m2,         m1,        [tab_2_30]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 144], m2
-
-pmaddubsw     m2,         m1,        [tab_4_28]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 148], m2
-
-pmaddubsw     m2,         m1,        [tab_6_26]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 152], m2
-
-pmaddubsw     m2,         m1,        [tab_8_24]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 156], m2
-
-; mode 12
-
-pmaddubsw     m2,         m1,        [tab_5_27]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 160], m2
-
-pmaddubsw     m2,         m1,        [tab_10_22]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 164], m2
-
-pmaddubsw     m2,         m1,        [tab_15_17]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 168], m2
-
-pmaddubsw     m2,         m1,        [tab_20_12]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 172], m2
-
-; mode 13
-
-pmaddubsw     m2,         m1,        [tab_9_23]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 176], m2
-
-pmaddubsw     m2,         m1,        [tab_18_14]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 180], m2
-
-pmaddubsw     m2,         m1,        [tab_27_5]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 184], m2
-
-movh          m2,         [r2 - 1]
-pinsrb        m2,         [r1 + 4],    0
-
-palignr       m3,         m2,        1
-punpcklbw     m2,         m3
-
-pmaddubsw     m2,         [tab_4_28]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 188], m2
-
-; mode 14
-
-pmaddubsw     m2,         m1,        [tab_13_19]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 192], m2
-
-pmaddubsw     m5,         m1,        [tab_26_6]
-pmulhrsw      m5,         m0
-packuswb      m5,         m5
-movd          [r0 + 196], m5
-
-movh          m2,         [r2 - 1]
-pinsrb        m2,         [r1 + 2],    0
-
-palignr       m3,         m2,        1
-punpcklbw     m2,         m3
-
-pmaddubsw     m3,         m2,        [tab_7_25]
-pmulhrsw      m3,         m0
-packuswb      m3,         m3
-movd          [r0 + 200], m3
-
-pmaddubsw     m3,         m2,        [tab_20_12]
-pmulhrsw      m3,         m0
-packuswb      m3,         m3
-movd          [r0 + 204], m3
-
-; mode 15
-
-pmaddubsw     m3,         m1,        [tab_17_15]
-pmulhrsw      m3,         m0
-packuswb      m3,         m3
-movd          [r0 + 208], m3
-
-pmaddubsw     m3,         m2,        [tab_2_30]
-pmulhrsw      m3,         m0
-packuswb      m3,         m3
-movd          [r0 + 212], m3
-
-pmaddubsw     m3,         m2,        [tab_19_13]
-pmulhrsw      m3,         m0
-packuswb      m3,         m3
-movd          [r0 + 216], m3
-
-movh          m3,         [r2 - 2]
-pinsrb        m3,         [r1 + 4],    0
-pinsrb        m3,         [r1 + 2],    1
-
-palignr       m4,         m3,        1
-punpcklbw     m3,         m4
-
-pmaddubsw     m3,         [tab_4_28]
-pmulhrsw      m3,         m0
-packuswb      m3,         m3
-movd          [r0 + 220], m3
-
-; mode 16
-
-pmaddubsw     m3,         m1,        [tab_21_11]
-pmulhrsw      m3,         m0
-packuswb      m3,         m3
-movd          [r0 + 224], m3
-
-pmaddubsw     m3,         m2,        [tab_10_22]
-pmulhrsw      m3,         m0
-packuswb      m3,         m3
-movd          [r0 + 228], m3
-
-pmaddubsw     m3,         m2,        [tab_31_1]
-pmulhrsw      m3,         m0
-packuswb      m3,         m3
-movd          [r0 + 232], m3
-
-movh          m3,         [r2 - 2]
-pinsrb        m3,         [r1 + 3],  0
-pinsrb        m3,         [r1 + 2],  1
-
-palignr       m4,         m3,        1
-punpcklbw     m3,         m4
-
-pmaddubsw     m3,         [tab_20_12]
-pmulhrsw      m3,         m0
-packuswb      m3,         m3
-movd          [r0 + 236], m3
-
-; mode 17
-
-movd          [r0 + 240], m5
-
-movh          m3,         [r2 - 1]
-pinsrb        m3,         [r1 + 1],  0
-
-palignr       m4,         m3,        1
-punpcklbw     m3,         m4
-
-pmaddubsw     m3,         [tab_20_12]
-pmulhrsw      m3,         m0
-packuswb      m3,         m3
-movd          [r0 + 244], m3
-
-movh          m3,         [r2 - 2]
-pinsrb        m3,         [r1 + 2],  0
-pinsrb        m3,         [r1 + 1],  1
-
-palignr       m4,         m3,        1
-punpcklbw     m3,         m4
-
-pmaddubsw     m3,         [tab_14_18]
-pmulhrsw      m3,         m0
-packuswb      m3,         m3
-movd          [r0 + 248], m3
-
-movh          m3,         [r1]
-pshufb        m3,         [tab_Si0]
-pinsrb        m3,         [r2 + 1],   4
-
-palignr       m4,         m3,        1
-punpcklbw     m3,         m4
-
-pmaddubsw     m3,         [tab_8_24]
-pmulhrsw      m3,         m0
-packuswb      m3,         m3
-movd          [r0 + 252], m3
-
-; mode 18
-
-movd          m3,         [r1]
-movd          [r0 + 256], m3
-
-movh          m3,         [r1 - 1]
-pinsrb        m3,         [r2 + 1],  0
-movd          [r0 + 260], m3
-
-movh          m3,         [r1 - 2]
-pinsrb        m3,         [r2 + 2],  0
-pinsrb        m3,         [r2 + 1],  1
-movd          [r0 + 264], m3
-
-movh          m3,         [r2]
-pshufb        m3,         [tab_Si1]
-pinsrb        m3,         [r1],       3
-movd          [r0 + 268], m3
-
-; mode 19
-
-movh          m1,         [r1]
-
-palignr       m2,         m1,        1
-punpcklbw     m1,         m2
-
-pmaddubsw     m5,         m1,        [tab_26_6]
-pmulhrsw      m5,         m0
-packuswb      m5,         m5
-movd          [r0 + 272], m5
-
-movh          m2,         [r1 - 1]
-pinsrb        m2,         [r2 + 1],  0
-
-palignr       m3,         m2,        1
-punpcklbw     m2,         m3
-
-pmaddubsw     m2,         [tab_20_12]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 276], m2
-
-movh          m2,         [r1 - 2]
-pinsrb        m2,         [r2 + 2],  0
-pinsrb        m2,         [r2 + 1],  1
-
-palignr       m3,         m2,        1
-punpcklbw     m2,         m3
-
-pmaddubsw     m2,         [tab_14_18]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 280], m2
-
-movh          m2,         [r2]
-pshufb        m2,         [tab_Si0]
-pinsrb        m2,         [r1 + 1],  4
-
-palignr       m3,         m2,        1
-punpcklbw     m2,         m3
-
-pmaddubsw     m2,         [tab_8_24]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 284], m2
-
-; mode 20
-
-pmaddubsw     m2,         m1,        [tab_21_11]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 288], m2
-
-movh          m2,         [r1 - 1]
-pinsrb        m2,         [r2 + 2],  0
-
-palignr       m3,         m2,        1
-punpcklbw     m2,         m3
-
-pmaddubsw     m3,         m2,         [tab_10_22]
-pmulhrsw      m3,         m0
-packuswb      m3,         m3
-movd          [r0 + 292], m3
-
-pmaddubsw     m3,         m2,         [tab_31_1]
-pmulhrsw      m3,         m0
-packuswb      m3,         m3
-movd          [r0 + 296], m3
-
-movh          m3,         [r1 - 2]
-pinsrb        m3,         [r2 + 3],  0
-pinsrb        m3,         [r2 + 2],  1
-
-palignr       m4,         m3,        1
-punpcklbw     m3,         m4
-
-pmaddubsw     m3,         [tab_20_12]
-pmulhrsw      m3,         m0
-packuswb      m3,         m3
-movd          [r0 + 300], m3
-
-; mode 21
-
-pmaddubsw     m3,         m1,         [tab_17_15]
-pmulhrsw      m3,         m0
-packuswb      m3,         m3
-movd          [r0 + 304], m3
-
-pmaddubsw     m3,         m2,         [tab_2_30]
-pmulhrsw      m3,         m0
-packuswb      m3,         m3
-movd          [r0 + 308], m3
-
-pmaddubsw     m3,         m2,         [tab_19_13]
-pmulhrsw      m3,         m0
-packuswb      m3,         m3
-movd          [r0 + 312], m3
-
-movh          m3,         [r1 - 2]
-pinsrb        m3,         [r2 + 4],   0
-pinsrb        m3,         [r2 + 2],   1
-
-palignr       m4,         m3,         1
-punpcklbw     m3,         m4
-
-pmaddubsw     m3,         [tab_4_28]
-pmulhrsw      m3,         m0
-packuswb      m3,         m3
-movd          [r0 + 316], m3
-
-; mode 22
-
-pmaddubsw     m3,         m1,         [tab_13_19]
-pmulhrsw      m3,         m0
-packuswb      m3,         m3
-movd          [r0 + 320], m3
-
-movd          [r0 + 324], m5
-
-pmaddubsw     m3,         m2,         [tab_7_25]
-pmulhrsw      m3,         m0
-packuswb      m3,         m3
-movd          [r0 + 328], m3
-
-pmaddubsw     m3,         m2,         [tab_20_12]
-pmulhrsw      m3,         m0
-packuswb      m3,         m3
-movd          [r0 + 332], m3
-
-; mode 23
-
-pmaddubsw     m2,         m1,         [tab_9_23]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 336], m2
-
-pmaddubsw     m2,         m1,         [tab_18_14]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 340], m2
-
-pmaddubsw     m2,         m1,         [tab_27_5]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 344], m2
-
-movh          m2,         [r1 - 1]
-pinsrb        m2,         [r2 + 4],   0
-
-palignr       m3,         m2,         1
-punpcklbw     m2,         m3
-
-pmaddubsw     m2,         [tab_4_28]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 348], m2
-
-; mode 24
-
-pmaddubsw     m2,         m1,         [tab_5_27]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 352], m2
-
-pmaddubsw     m2,         m1,         [tab_10_22]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 356], m2
-
-pmaddubsw     m2,         m1,         [tab_15_17]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 360], m2
-
-pmaddubsw     m2,         m1,         [tab_20_12]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 364], m2
-
-; mode 25
-
-pmaddubsw     m2,         m1,         [tab_2_30]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 368], m2
-
-pmaddubsw     m2,         m1,         [tab_4_28]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 372], m2
-
-pmaddubsw     m2,         m1,         [tab_6_26]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 376], m2
-
-pmaddubsw     m2,         m1,         [tab_8_24]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 380], m2
-
-; mode 26
-
-movd          m1,         [r1 + 1]
-pshufd        m2,         m1,        0
-movu          [r0 + 384], m2
-
-mova         m2,          [tab_Zero]
-
-pshufb       m3,          m1,       m2
-punpcklbw    m3,          m2
-
-movd         m1,          [r2]
-
-pshufb       m1,          m2
-punpcklbw    m1,          m2
-
-movd         m4,          [r2 + 1]
-punpcklbw    m4,          m2
-
-psubw        m4,          m1
-psraw        m4,          1
-
-paddw        m3,          m4
-
-packuswb     m3,          m2
-
-pextrb       [r0 + 384],  m3,    0
-pextrb       [r0 + 388],  m3,    1
-pextrb       [r0 + 392],  m3,    2
-pextrb       [r0 + 396],  m3,    3
-
-; mode 27
-
-movh          m1,         [r1 + 1]
-
-palignr       m2,         m1,     1
-punpcklbw     m1,         m2
-
-pmaddubsw     m2,         m1,     [tab_30_2]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 400], m2
-
-pmaddubsw     m2,         m1,     [tab_28_4]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 404], m2
-
-pmaddubsw     m2,         m1,     [tab_26_6]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 408], m2
-
-pmaddubsw     m2,         m1,     [tab_24_8]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 412], m2
-
-; mode 28
-
-pmaddubsw     m2,         m1,     [tab_27_5]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 416], m2
-
-pmaddubsw     m2,         m1,     [tab_22_10]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 420], m2
-
-pmaddubsw     m2,         m1,     [tab_17_15]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 424], m2
-
-pmaddubsw     m2,         m1,     [tab_12_20]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 428], m2
-
-; mode 29
-
-pmaddubsw     m2,         m1,     [tab_23_9]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 432], m2
-
-pmaddubsw     m2,         m1,     [tab_14_18]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 436], m2
-
-pmaddubsw     m2,         m1,     [tab_5_27]
-pmulhrsw      m2,         m0
-packuswb      m2,         m2
-movd          [r0 + 440], m2
-
-movh          m2,         [r1 + 2]
-
-palignr       m3,         m2,     1
-punpcklbw     m2,         m3
-
-pmaddubsw     m3,         m2,     [tab_28_4]
-pmulhrsw      m3,         m0
-packuswb      m3,         m3
-movd          [r0 + 444], m3
-
-; mode 30
-
-pmaddubsw     m3,         m1,     [tab_19_13]
-pmulhrsw      m3,         m0
-packuswb      m3,         m3
-movd          [r0 + 448], m3
-
-pmaddubsw     m6,         m1,     [tab_6_26]
-pmulhrsw      m6,         m0
-packuswb      m6,         m6
-movd          [r0 + 452], m6
-
-pmaddubsw     m3,         m2,     [tab_25_7]
-pmulhrsw      m3,         m0
-packuswb      m3,         m3
-movd          [r0 + 456], m3
-
-pmaddubsw     m5,         m2,     [tab_12_20]
-pmulhrsw      m5,         m0
-packuswb      m5,         m5
-movd          [r0 + 460], m5
-
-; mode 31
-
-pmaddubsw     m3,         m1,     [tab_15_17]
-pmulhrsw      m3,         m0
-packuswb      m3,         m3
-movd          [r0 + 464], m3
-
-pmaddubsw     m3,         m2,     [tab_30_2]
-pmulhrsw      m3,         m0
-packuswb      m3,         m3
-movd          [r0 + 468], m3
-
-pmaddubsw     m3,         m2,     [tab_13_19]
-pmulhrsw      m3,         m0
-packuswb      m3,         m3
-movd          [r0 + 472], m3
-
-movh          m3,         [r1 + 3]
-
-palignr       m4,         m3,     1
-punpcklbw     m3,         m4
-
-pmaddubsw     m4,         m3,     [tab_28_4]
-pmulhrsw      m4,         m0
-packuswb      m4,         m4
-movd          [r0 + 476], m4
-
-; mode 32
-
-pmaddubsw     m4,         m1,     [tab_11_21]
-pmulhrsw      m4,         m0
-packuswb      m4,         m4
-movd          [r0 + 480], m4
-
-pmaddubsw     m4,         m2,     [tab_22_10]
-pmulhrsw      m4,         m0
-packuswb      m4,         m4
-movd          [r0 + 484], m4
-
-pmaddubsw     m4,         m2,     [tab_1_31]
-pmulhrsw      m4,         m0
-packuswb      m4,         m4
-movd          [r0 + 488], m4
-
-pmaddubsw     m4,         m3,     [tab_12_20]
-pmulhrsw      m4,         m0
-packuswb      m4,         m4
-movd          [r0 + 492], m4
-
-; mode 33
-
-movd          [r0 + 496], m6
-
-movd          [r0 + 500], m5
-
-pmaddubsw     m3,         [tab_18_14]
-pmulhrsw      m3,         m0
-packuswb      m3,         m3
-movd          [r0 + 504], m3
-
-movh          m1,         [r1 + 4]
-
-palignr       m2,         m1,     1
-punpcklbw     m1,         m2
-
-pmaddubsw     m1,         [tab_24_8]
-pmulhrsw      m1,         m0
-packuswb      m1,         m1
-movd          [r0 + 508], m1
-
-; mode 34
-
-movd         m0,          [r1 + 2]
-movd         [r0 + 512],  m0
-
-movd         m0,          [r1 + 3]
-movd         [r0 + 516],  m0
-
-movd         m0,          [r1 + 4]
-movd         [r0 + 520],  m0
-
-movd         m0,          [r1 + 5]
-movd         [r0 + 524],  m0
-
-RET
diff -r d18c574e0ce9 -r a9257e7012d7 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Dec 03 14:51:09 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Dec 03 18:27:27 2013 +0530
@@ -35,7 +35,6 @@
 #include "blockcopy8.h"
 #include "intrapred.h"
 #include "dct8.h"
-#include "allangs-pred.h"
 }
 
 #define INIT2_NAME(name1, name2, cpu) \
diff -r d18c574e0ce9 -r a9257e7012d7 source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h	Tue Dec 03 14:51:09 2013 +0530
+++ b/source/common/x86/intrapred.h	Tue Dec 03 18:27:27 2013 +0530
@@ -36,4 +36,6 @@
 void x265_intra_pred_planar16_sse4(pixel* above, pixel* left, pixel* dst, intptr_t dstStride);
 void x265_intra_pred_planar32_sse4(pixel* above, pixel* left, pixel* dst, intptr_t dstStride);
 
+void x265_all_angs_pred_4x4_sse4(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma);
+
 #endif // ifndef X265_INTRAPRED_H
diff -r d18c574e0ce9 -r a9257e7012d7 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm	Tue Dec 03 14:51:09 2013 +0530
+++ b/source/common/x86/intrapred8.asm	Tue Dec 03 18:27:27 2013 +0530
@@ -32,6 +32,41 @@
 multiH2:    dw 17, 18, 19, 20, 21, 22, 23, 24
 multiH3:    dw 25, 26, 27, 28, 29, 30, 31, 32
 
+tab_6_26:  db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 0, 0
+tab_12_20: db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
+tab_18_14: db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
+tab_24_8:  db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
+tab_11_21: db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21
+tab_22_10: db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
+tab_1_31:  db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31
+tab_15_17: db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17
+tab_30_2:  db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
+tab_13_19: db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19
+tab_28_4:  db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
+tab_19_13: db 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13
+tab_25_7:  db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7
+tab_23_9:  db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9
+tab_14_18: db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
+tab_5_27:  db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27
+tab_27_5:  db 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5
+tab_17_15: db 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
+tab_26_6:  db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
+tab_2_30:  db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
+tab_4_28:  db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
+tab_8_24:  db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
+tab_10_22: db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
+tab_20_12: db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
+tab_9_23:  db 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23
+tab_7_25:  db 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
+tab_21_11: db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11
+tab_31_1:  db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1
+
+pw_1024:   dw 1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024
+
+tab_Si0:  db 4, 2, 1, 0, 4, 2, 1, 0, 4, 2, 1, 0, 4, 2, 1, 0
+tab_Si1:  db 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+tab_Zero: db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+
 SECTION .text
 
 cextern pw_8
@@ -674,3 +709,356 @@
 %undef COMP_PRED_PLANAR_ROW
 
     RET
+
+;-----------------------------------------------------------------------------
+; ALL_ANGS4_PAIRS ref, tmp
+;   Interleaves every byte of ref with its right-hand neighbour, leaving the
+;   byte pairs (p[i], p[i+1]) in ref for pmaddubsw.  Only the low 8 bytes of
+;   tmp are consumed, so the stale byte shifted into its top lane is harmless.
+;-----------------------------------------------------------------------------
+%macro ALL_ANGS4_PAIRS 2
+    palignr     %2, %1, 1
+    punpcklbw   %1, %2
+%endmacro
+
+;-----------------------------------------------------------------------------
+; ALL_ANGS4_ROW offset, out, pairs, weights
+;   Weights the byte pairs with the table, rounds with pmulhrsw against m0
+;   (pw_1024, i.e. (x + 16) >> 5), packs to bytes and stores the low four
+;   bytes at r0 + offset.  out may be the same register as pairs once the
+;   pairs are no longer needed.
+;-----------------------------------------------------------------------------
+%macro ALL_ANGS4_ROW 4
+    pmaddubsw   %2, %3, [%4]
+    pmulhrsw    %2, m0
+    packuswb    %2, %2
+    movd        [r0 + %1], %2
+%endmacro
+
+;-----------------------------------------------------------------------------
+; ALL_ANGS4_HV_EDGE offset, main, side
+;   Fills the 16 bytes at r0 + offset with four copies of the dword at
+;   main + 1, then overwrites byte 0 of each 4-byte row i with
+;   sat8(main[1] + ((side[1 + i] - side[0]) >> 1)).  Clobbers m1-m4.
+;-----------------------------------------------------------------------------
+%macro ALL_ANGS4_HV_EDGE 3
+    movd        m1, [%2 + 1]
+    pshufd      m2, m1, 0
+    movu        [r0 + %1], m2
+    mova        m2, [tab_Zero]
+    pshufb      m3, m1, m2              ; broadcast main[1] to every byte
+    punpcklbw   m3, m2                  ; ... and widen to words
+    movd        m1, [%3]
+    pshufb      m1, m2                  ; broadcast side[0]
+    punpcklbw   m1, m2
+    movd        m4, [%3 + 1]
+    punpcklbw   m4, m2                  ; side[1..4] as words
+    psubw       m4, m1
+    psraw       m4, 1
+    paddw       m3, m4
+    packuswb    m3, m2                  ; saturate back to unsigned bytes
+    pextrb      [r0 + %1], m3, 0
+    pextrb      [r0 + %1 + 4], m3, 1
+    pextrb      [r0 + %1 + 8], m3, 2
+    pextrb      [r0 + %1 + 12], m3, 3
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void all_angs_pred_4x4(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma)
+;
+; Writes the 4x4 predictions for modes 2..34 back to back: mode N occupies the
+; 16 bytes at dest + (N - 2) * 16, stored as four 4-byte rows.
+; Only r0 (dest), r1 (above0) and r2 (left0) are read; above1, left1 and bLuma
+; are part of the prototype but never referenced here, so the edge
+; smoothing of modes 10 and 26 is applied unconditionally.
+; m0 holds pw_1024 from mode 3 through mode 33; m5-m7 carry rows that are
+; shared between modes.
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal all_angs_pred_4x4, 6, 6, 8, dest, above0, left0, above1, left1, bLuma
+
+; mode 2 (plain copies of left0[2 + row ..])
+    movd        m0, [r2 + 2]
+    movd        m1, [r2 + 3]
+    movd        m2, [r2 + 4]
+    movd        m3, [r2 + 5]
+    movd        [r0], m0
+    movd        [r0 + 4], m1
+    movd        [r0 + 8], m2
+    movd        [r0 + 12], m3
+
+; mode 3 (m1/m2/m3 = pairs from left0 + 1/2/3, reused through mode 9; m7/m6 reused by mode 6)
+    mova        m0, [pw_1024]
+
+    movh        m1, [r2 + 1]            ; 8 bytes suffice: only pairs 0..3 reach the output
+    ALL_ANGS4_PAIRS m1, m2
+    ALL_ANGS4_ROW 16, m7, m1, tab_6_26
+
+    movh        m2, [r2 + 2]
+    ALL_ANGS4_PAIRS m2, m3
+    ALL_ANGS4_ROW 20, m6, m2, tab_12_20
+
+    movh        m3, [r2 + 3]
+    ALL_ANGS4_PAIRS m3, m4
+    ALL_ANGS4_ROW 24, m4, m3, tab_18_14
+
+    movh        m4, [r2 + 4]
+    ALL_ANGS4_PAIRS m4, m5
+    ALL_ANGS4_ROW 28, m4, m4, tab_24_8
+
+; mode 4
+    ALL_ANGS4_ROW 32, m4, m1, tab_11_21
+    ALL_ANGS4_ROW 36, m4, m2, tab_22_10
+    ALL_ANGS4_ROW 40, m4, m2, tab_1_31
+    ALL_ANGS4_ROW 44, m4, m3, tab_12_20
+
+; mode 5 (last row consumes the left0 + 3 pairs in m3)
+    ALL_ANGS4_ROW 48, m4, m1, tab_15_17
+    ALL_ANGS4_ROW 52, m4, m2, tab_30_2
+    ALL_ANGS4_ROW 56, m4, m2, tab_13_19
+    ALL_ANGS4_ROW 60, m3, m3, tab_28_4
+
+; mode 6 (rows 1 and 3 are the mode 3 rows kept in m7 and m6)
+    ALL_ANGS4_ROW 64, m3, m1, tab_19_13
+    movd        [r0 + 68], m7
+    ALL_ANGS4_ROW 72, m3, m2, tab_25_7
+    movd        [r0 + 76], m6
+
+; mode 7 (last row consumes the left0 + 2 pairs in m2)
+    ALL_ANGS4_ROW 80, m3, m1, tab_23_9
+    ALL_ANGS4_ROW 84, m3, m1, tab_14_18
+    ALL_ANGS4_ROW 88, m3, m1, tab_5_27
+    ALL_ANGS4_ROW 92, m2, m2, tab_28_4
+
+; mode 8
+    ALL_ANGS4_ROW 96, m2, m1, tab_27_5
+    ALL_ANGS4_ROW 100, m2, m1, tab_22_10
+    ALL_ANGS4_ROW 104, m2, m1, tab_17_15
+    ALL_ANGS4_ROW 108, m2, m1, tab_12_20
+
+; mode 9 (last row consumes the left0 + 1 pairs in m1)
+    ALL_ANGS4_ROW 112, m2, m1, tab_30_2
+    ALL_ANGS4_ROW 116, m2, m1, tab_28_4
+    ALL_ANGS4_ROW 120, m2, m1, tab_26_6
+    ALL_ANGS4_ROW 124, m1, m1, tab_24_8
+
+; mode 10 (left0[1..4] in every row, first byte of each row corrected from above0)
+    ALL_ANGS4_HV_EDGE 128, r2, r1
+
+; mode 11 (m1 = pairs from left0 + 0, reused through mode 17)
+    movh        m1, [r2]                ; 8 bytes suffice: only pairs 0..3 reach the output
+    ALL_ANGS4_PAIRS m1, m2
+    ALL_ANGS4_ROW 144, m2, m1, tab_2_30
+    ALL_ANGS4_ROW 148, m2, m1, tab_4_28
+    ALL_ANGS4_ROW 152, m2, m1, tab_6_26
+    ALL_ANGS4_ROW 156, m2, m1, tab_8_24
+
+; mode 12
+    ALL_ANGS4_ROW 160, m2, m1, tab_5_27
+    ALL_ANGS4_ROW 164, m2, m1, tab_10_22
+    ALL_ANGS4_ROW 168, m2, m1, tab_15_17
+    ALL_ANGS4_ROW 172, m2, m1, tab_20_12
+
+; mode 13 (last row: above0[4] placed in front of left0[0..])
+    ALL_ANGS4_ROW 176, m2, m1, tab_9_23
+    ALL_ANGS4_ROW 180, m2, m1, tab_18_14
+    ALL_ANGS4_ROW 184, m2, m1, tab_27_5
+    movh        m2, [r2 - 1]
+    pinsrb      m2, [r1 + 4], 0
+    ALL_ANGS4_PAIRS m2, m3
+    ALL_ANGS4_ROW 188, m2, m2, tab_4_28
+
+; mode 14 (m5 is reused by mode 17; m2 = above0[2], left0[0..] pairs reused by modes 15/16)
+    ALL_ANGS4_ROW 192, m2, m1, tab_13_19
+    ALL_ANGS4_ROW 196, m5, m1, tab_26_6
+    movh        m2, [r2 - 1]
+    pinsrb      m2, [r1 + 2], 0
+    ALL_ANGS4_PAIRS m2, m3
+    ALL_ANGS4_ROW 200, m3, m2, tab_7_25
+    ALL_ANGS4_ROW 204, m3, m2, tab_20_12
+
+; mode 15 (last row: above0[4], above0[2], left0[0..])
+    ALL_ANGS4_ROW 208, m3, m1, tab_17_15
+    ALL_ANGS4_ROW 212, m3, m2, tab_2_30
+    ALL_ANGS4_ROW 216, m3, m2, tab_19_13
+    movh        m3, [r2 - 2]
+    pinsrb      m3, [r1 + 4], 0
+    pinsrb      m3, [r1 + 2], 1
+    ALL_ANGS4_PAIRS m3, m4
+    ALL_ANGS4_ROW 220, m3, m3, tab_4_28
+
+; mode 16 (last row: above0[3], above0[2], left0[0..])
+    ALL_ANGS4_ROW 224, m3, m1, tab_21_11
+    ALL_ANGS4_ROW 228, m3, m2, tab_10_22
+    ALL_ANGS4_ROW 232, m3, m2, tab_31_1
+    movh        m3, [r2 - 2]
+    pinsrb      m3, [r1 + 3], 0
+    pinsrb      m3, [r1 + 2], 1
+    ALL_ANGS4_PAIRS m3, m4
+    ALL_ANGS4_ROW 236, m3, m3, tab_20_12
+
+; mode 17 (row 0 is the mode 14 row kept in m5)
+    movd        [r0 + 240], m5
+    movh        m3, [r2 - 1]
+    pinsrb      m3, [r1 + 1], 0
+    ALL_ANGS4_PAIRS m3, m4
+    ALL_ANGS4_ROW 244, m3, m3, tab_20_12
+    movh        m3, [r2 - 2]
+    pinsrb      m3, [r1 + 2], 0
+    pinsrb      m3, [r1 + 1], 1
+    ALL_ANGS4_PAIRS m3, m4
+    ALL_ANGS4_ROW 248, m3, m3, tab_14_18
+    movh        m3, [r1]
+    pshufb      m3, [tab_Si0]           ; above0[4], above0[2], above0[1], above0[0]
+    pinsrb      m3, [r2 + 1], 4
+    ALL_ANGS4_PAIRS m3, m4
+    ALL_ANGS4_ROW 252, m3, m3, tab_8_24
+
+; mode 18 (no weighting: each row is above0 shifted right by one more left0 byte)
+    movd        m3, [r1]
+    movd        [r0 + 256], m3
+    movh        m3, [r1 - 1]
+    pinsrb      m3, [r2 + 1], 0
+    movd        [r0 + 260], m3
+    movh        m3, [r1 - 2]
+    pinsrb      m3, [r2 + 2], 0
+    pinsrb      m3, [r2 + 1], 1
+    movd        [r0 + 264], m3
+    movh        m3, [r2]
+    pshufb      m3, [tab_Si1]           ; left0[3], left0[2], left0[1], left0[0]
+    pinsrb      m3, [r1], 3
+    movd        [r0 + 268], m3
+
+; mode 19 (m1 = pairs from above0 + 0, reused through mode 25; m5 reused by mode 22)
+    movh        m1, [r1]
+    ALL_ANGS4_PAIRS m1, m2
+    ALL_ANGS4_ROW 272, m5, m1, tab_26_6
+    movh        m2, [r1 - 1]
+    pinsrb      m2, [r2 + 1], 0
+    ALL_ANGS4_PAIRS m2, m3
+    ALL_ANGS4_ROW 276, m2, m2, tab_20_12
+    movh        m2, [r1 - 2]
+    pinsrb      m2, [r2 + 2], 0
+    pinsrb      m2, [r2 + 1], 1
+    ALL_ANGS4_PAIRS m2, m3
+    ALL_ANGS4_ROW 280, m2, m2, tab_14_18
+    movh        m2, [r2]
+    pshufb      m2, [tab_Si0]           ; left0[4], left0[2], left0[1], left0[0]
+    pinsrb      m2, [r1 + 1], 4
+    ALL_ANGS4_PAIRS m2, m3
+    ALL_ANGS4_ROW 284, m2, m2, tab_8_24
+
+; mode 20 (m2 = left0[2], above0[0..] pairs reused by modes 21/22)
+    ALL_ANGS4_ROW 288, m2, m1, tab_21_11
+    movh        m2, [r1 - 1]
+    pinsrb      m2, [r2 + 2], 0
+    ALL_ANGS4_PAIRS m2, m3
+    ALL_ANGS4_ROW 292, m3, m2, tab_10_22
+    ALL_ANGS4_ROW 296, m3, m2, tab_31_1
+    movh        m3, [r1 - 2]
+    pinsrb      m3, [r2 + 3], 0
+    pinsrb      m3, [r2 + 2], 1
+    ALL_ANGS4_PAIRS m3, m4
+    ALL_ANGS4_ROW 300, m3, m3, tab_20_12
+
+; mode 21 (last row: left0[4], left0[2], above0[0..])
+    ALL_ANGS4_ROW 304, m3, m1, tab_17_15
+    ALL_ANGS4_ROW 308, m3, m2, tab_2_30
+    ALL_ANGS4_ROW 312, m3, m2, tab_19_13
+    movh        m3, [r1 - 2]
+    pinsrb      m3, [r2 + 4], 0
+    pinsrb      m3, [r2 + 2], 1
+    ALL_ANGS4_PAIRS m3, m4
+    ALL_ANGS4_ROW 316, m3, m3, tab_4_28
+
+; mode 22 (row 1 is the mode 19 row kept in m5)
+    ALL_ANGS4_ROW 320, m3, m1, tab_13_19
+    movd        [r0 + 324], m5
+    ALL_ANGS4_ROW 328, m3, m2, tab_7_25
+    ALL_ANGS4_ROW 332, m3, m2, tab_20_12
+
+; mode 23 (last row: left0[4] placed in front of above0[0..])
+    ALL_ANGS4_ROW 336, m2, m1, tab_9_23
+    ALL_ANGS4_ROW 340, m2, m1, tab_18_14
+    ALL_ANGS4_ROW 344, m2, m1, tab_27_5
+    movh        m2, [r1 - 1]
+    pinsrb      m2, [r2 + 4], 0
+    ALL_ANGS4_PAIRS m2, m3
+    ALL_ANGS4_ROW 348, m2, m2, tab_4_28
+
+; mode 24
+    ALL_ANGS4_ROW 352, m2, m1, tab_5_27
+    ALL_ANGS4_ROW 356, m2, m1, tab_10_22
+    ALL_ANGS4_ROW 360, m2, m1, tab_15_17
+    ALL_ANGS4_ROW 364, m2, m1, tab_20_12
+
+; mode 25
+    ALL_ANGS4_ROW 368, m2, m1, tab_2_30
+    ALL_ANGS4_ROW 372, m2, m1, tab_4_28
+    ALL_ANGS4_ROW 376, m2, m1, tab_6_26
+    ALL_ANGS4_ROW 380, m2, m1, tab_8_24
+
+; mode 26 (above0[1..4] in every row, first byte of each row corrected from left0)
+    ALL_ANGS4_HV_EDGE 384, r1, r2
+
+; mode 27 (m1 = pairs from above0 + 1, reused through mode 32)
+    movh        m1, [r1 + 1]
+    ALL_ANGS4_PAIRS m1, m2
+    ALL_ANGS4_ROW 400, m2, m1, tab_30_2
+    ALL_ANGS4_ROW 404, m2, m1, tab_28_4
+    ALL_ANGS4_ROW 408, m2, m1, tab_26_6
+    ALL_ANGS4_ROW 412, m2, m1, tab_24_8
+
+; mode 28
+    ALL_ANGS4_ROW 416, m2, m1, tab_27_5
+    ALL_ANGS4_ROW 420, m2, m1, tab_22_10
+    ALL_ANGS4_ROW 424, m2, m1, tab_17_15
+    ALL_ANGS4_ROW 428, m2, m1, tab_12_20
+
+; mode 29 (m2 = pairs from above0 + 2, reused through mode 32)
+    ALL_ANGS4_ROW 432, m2, m1, tab_23_9
+    ALL_ANGS4_ROW 436, m2, m1, tab_14_18
+    ALL_ANGS4_ROW 440, m2, m1, tab_5_27
+    movh        m2, [r1 + 2]
+    ALL_ANGS4_PAIRS m2, m3
+    ALL_ANGS4_ROW 444, m3, m2, tab_28_4
+
+; mode 30 (m6 and m5 are reused as rows 0 and 1 of mode 33)
+    ALL_ANGS4_ROW 448, m3, m1, tab_19_13
+    ALL_ANGS4_ROW 452, m6, m1, tab_6_26
+    ALL_ANGS4_ROW 456, m3, m2, tab_25_7
+    ALL_ANGS4_ROW 460, m5, m2, tab_12_20
+
+; mode 31 (m3 = pairs from above0 + 3, reused through mode 33)
+    ALL_ANGS4_ROW 464, m3, m1, tab_15_17
+    ALL_ANGS4_ROW 468, m3, m2, tab_30_2
+    ALL_ANGS4_ROW 472, m3, m2, tab_13_19
+    movh        m3, [r1 + 3]
+    ALL_ANGS4_PAIRS m3, m4
+    ALL_ANGS4_ROW 476, m4, m3, tab_28_4
+
+; mode 32
+    ALL_ANGS4_ROW 480, m4, m1, tab_11_21
+    ALL_ANGS4_ROW 484, m4, m2, tab_22_10
+    ALL_ANGS4_ROW 488, m4, m2, tab_1_31
+    ALL_ANGS4_ROW 492, m4, m3, tab_12_20
+
+; mode 33 (rows 0 and 1 are the mode 30 rows kept in m6 and m5)
+    movd        [r0 + 496], m6
+    movd        [r0 + 500], m5
+    ALL_ANGS4_ROW 504, m3, m3, tab_18_14
+    movh        m1, [r1 + 4]
+    ALL_ANGS4_PAIRS m1, m2
+    ALL_ANGS4_ROW 508, m1, m1, tab_24_8
+
+; mode 34 (plain copies of above0[2 + row ..]; m0 is free again from here on)
+    movd        m0, [r1 + 2]
+    movd        [r0 + 512], m0
+    movd        m0, [r1 + 3]
+    movd        [r0 + 516], m0
+    movd        m0, [r1 + 4]
+    movd        [r0 + 520], m0
+    movd        m0, [r1 + 5]
+    movd        [r0 + 524], m0
+
+    RET


More information about the x265-devel mailing list