[x265] [PATCH 2 of 2] asm: improved intra_ang8x8 modes 3 to 17 AVX2 asm, over 20% faster than the previous AVX2 asm

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Sat Jan 9 11:24:51 CET 2016


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1449813841 -19800
#      Fri Dec 11 11:34:01 2015 +0530
# Node ID ee47dd944e08ebb49fd54114979c65dadabfe0df
# Parent  593a1907e915c9bad7bd3ff608a30770289c249a
asm: improved intra_ang8x8 modes 3 to 17 AVX2 asm, over 20% faster than the previous AVX2 asm

diff -r 593a1907e915 -r ee47dd944e08 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Sat Dec 12 09:56:10 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Fri Dec 11 11:34:01 2015 +0530
@@ -2932,6 +2932,7 @@
         p.cu[BLOCK_8x8].intra_pred[14] = PFX(intra_pred_ang8_14_avx2);
         p.cu[BLOCK_8x8].intra_pred[15] = PFX(intra_pred_ang8_15_avx2);
         p.cu[BLOCK_8x8].intra_pred[16] = PFX(intra_pred_ang8_16_avx2);
+        p.cu[BLOCK_8x8].intra_pred[17] = PFX(intra_pred_ang8_17_avx2);
         p.cu[BLOCK_8x8].intra_pred[20] = PFX(intra_pred_ang8_20_avx2);
         p.cu[BLOCK_8x8].intra_pred[21] = PFX(intra_pred_ang8_21_avx2);
         p.cu[BLOCK_8x8].intra_pred[22] = PFX(intra_pred_ang8_22_avx2);
diff -r 593a1907e915 -r ee47dd944e08 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm	Sat Dec 12 09:56:10 2015 +0530
+++ b/source/common/x86/intrapred8.asm	Fri Dec 11 11:34:01 2015 +0530
@@ -355,55 +355,55 @@
                             times 8 db (32-22), 22
                             times 8 db (32-11), 11
 
-const ang16_shuf_mode9,    times 8 db 0, 1
-                           times 8 db 1, 2
-
-const angHor_tab_9,  db (32-2), 2, (32-4), 4, (32-6), 6, (32-8), 8, (32-10), 10, (32-12), 12, (32-14), 14, (32-16), 16
-                     db (32-18), 18, (32-20), 20, (32-22), 22, (32-24),  24, (32-26),  26, (32-28), 28, (32-30), 30, (32-32), 32
-
-const angHor_tab_11, db (32-30), 30, (32-28), 28, (32-26), 26, (32-24), 24, (32-22), 22, (32-20), 20, (32-18), 18, (32-16), 16
-                     db (32-14), 14, (32-12), 12, (32-10), 10, (32- 8),  8, (32- 6),  6, (32- 4),  4, (32- 2),  2, (32- 0),  0
-
-const ang16_shuf_mode12,   db 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 2, 3, 2, 3
-                           db 1, 2, 1, 2, 1, 2, 1, 2, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 1, 2, 1, 2
-
-const angHor_tab_12, db (32-27), 27, (32-22), 22, (32-17), 17, (32-12), 12, (32-7), 7, (32-2), 2, (32-29), 29, (32-24), 24
-                     db (32-19), 19, (32-14), 14, (32-9), 9, (32-4), 4, (32-31), 31, (32-26),  26, (32-21), 21, (32-16), 16
-
-const ang16_shuf_mode13,   db 4, 5, 4, 5, 4, 5, 3, 4, 3, 4, 3, 4, 3, 4, 2, 3, 5, 6, 5, 6, 5, 6, 4, 5, 4, 5, 4, 5, 4, 5, 3, 4
-                           db 2, 3, 2, 3, 1, 2, 1, 2, 1, 2, 1, 2, 0, 1, 0, 1, 3, 4, 3, 4, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2
-                           db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4, 0, 0 ,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4, 0
-
-const angHor_tab_13, db (32-23), 23, (32-14), 14, (32-5), 5, (32-28), 28, (32-19), 19, (32-10), 10, (32-1), 1, (32-24), 24
-                     db (32-15), 15, (32-6), 6, (32-29), 29, (32-20), 20, (32-11), 11, (32-2), 2, (32-25), 25, (32-16), 16
-
-const ang16_shuf_mode14,   db 6, 7, 6, 7, 5, 6, 5, 6, 4, 5, 4, 5, 4, 5, 3, 4, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6, 5, 6, 5, 6, 4, 5
-                           db 3, 4, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 0, 1, 0, 1, 4, 5, 3, 4, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2, 1, 2
-                           db 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2, 0
-
-const angHor_tab_14, db (32-19), 19, (32-6), 6, (32-25), 25, (32-12), 12, (32-31), 31, (32-18), 18, (32-5), 5, (32-24), 24
-                     db (32-11), 11, (32-30), 30, (32-17), 17, (32-4), 4, (32-23), 23, (32-10), 10, (32-29), 29, (32-16), 16
-
-const ang16_shuf_mode15,   db 8, 9, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6, 5, 6, 4, 5, 9, 10, 8, 9, 8, 9, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6
-                           db 4, 5, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2, 1, 2, 0, 1, 5, 6, 4, 5, 4, 5, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2
-                           db 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2, 0
-
-const angHor_tab_15, db (32-15), 15, (32-30), 30, (32-13), 13, (32-28), 28, (32-11), 11, (32-26), 26, (32-9), 9, (32-24), 24
-                     db (32-7), 7, (32-22), 22, (32-5), 5, (32-20), 20, (32-3), 3, (32-18), 18, (32-1), 1, (32- 16), 16
-
-const ang16_shuf_mode16,   db 10, 11, 9, 10, 9, 10, 8, 9, 7, 8, 7, 8, 6, 7, 5, 6, 11, 12, 10, 11, 10, 11, 9, 10, 8, 9, 8, 9, 7, 8, 6, 7
-                           db 5, 6, 4, 5, 3, 4, 3, 4, 2, 3, 1, 2, 1, 2, 0, 1, 6, 7, 5, 6, 4, 5, 4, 5, 3, 4, 2, 3, 2, 3, 1, 2
-                           db 0 ,0, 0, 0, 0, 15, 14, 12 , 11, 9, 8, 6, 5, 3, 2, 0, 0, 0, 0, 0, 0, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0
-
-const angHor_tab_16, db (32-11), 11, (32-22), 22, (32-1), 1, (32-12), 12, (32-23), 23, (32-2), 2, (32-13), 13, (32-24), 24
-                     db (32-3), 3, (32-14), 14, (32-25), 25, (32-4), 4, (32-15), 15, (32-26), 26, (32-5), 5, (32-16), 16
-
-const ang16_shuf_mode17,   db 12, 13, 11, 12, 10, 11, 9, 10, 8, 9, 8, 9, 7, 8, 6, 7, 13, 14, 12, 13, 11, 12, 10, 11, 9, 10, 9, 10, 8, 9, 7, 8
-                           db 5, 6, 4, 5, 4, 5, 3, 4, 2, 3, 1, 2, 0, 1, 0, 1, 6, 7, 5, 6, 5, 6, 4, 5, 3, 4, 2, 3, 1, 2, 1, 2
-                           db 0, 0, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 0, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0
-
-const angHor_tab_17, db (32- 6),  6, (32-12), 12, (32-18), 18, (32-24), 24, (32-30), 30, (32- 4),  4, (32-10), 10, (32-16), 16
-                     db (32-22), 22, (32-28), 28, (32- 2),  2, (32- 8),  8, (32-14), 14, (32-20), 20, (32-26), 26, (32- 0),  0
+const ang16_shuf_mode9,     times 8 db 0, 1
+                            times 8 db 1, 2
+
+const angHor_tab_9,         db (32-2), 2, (32-4), 4, (32-6), 6, (32-8), 8, (32-10), 10, (32-12), 12, (32-14), 14, (32-16), 16
+                            db (32-18), 18, (32-20), 20, (32-22), 22, (32-24),  24, (32-26),  26, (32-28), 28, (32-30), 30, (32-32), 32
+
+const angHor_tab_11,        db (32-30), 30, (32-28), 28, (32-26), 26, (32-24), 24, (32-22), 22, (32-20), 20, (32-18), 18, (32-16), 16
+                            db (32-14), 14, (32-12), 12, (32-10), 10, (32- 8),  8, (32- 6),  6, (32- 4),  4, (32- 2),  2, (32- 0),  0
+
+const ang16_shuf_mode12,    db 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 2, 3, 2, 3
+                            db 1, 2, 1, 2, 1, 2, 1, 2, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 1, 2, 1, 2
+
+const angHor_tab_12,        db (32-27), 27, (32-22), 22, (32-17), 17, (32-12), 12, (32-7), 7, (32-2), 2, (32-29), 29, (32-24), 24
+                            db (32-19), 19, (32-14), 14, (32-9), 9, (32-4), 4, (32-31), 31, (32-26),  26, (32-21), 21, (32-16), 16
+
+const ang16_shuf_mode13,    db 4, 5, 4, 5, 4, 5, 3, 4, 3, 4, 3, 4, 3, 4, 2, 3, 5, 6, 5, 6, 5, 6, 4, 5, 4, 5, 4, 5, 4, 5, 3, 4
+                            db 2, 3, 2, 3, 1, 2, 1, 2, 1, 2, 1, 2, 0, 1, 0, 1, 3, 4, 3, 4, 2, 3, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2
+                            db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4, 0, 0 ,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 11, 7, 4, 0
+
+const angHor_tab_13,        db (32-23), 23, (32-14), 14, (32-5), 5, (32-28), 28, (32-19), 19, (32-10), 10, (32-1), 1, (32-24), 24
+                            db (32-15), 15, (32-6), 6, (32-29), 29, (32-20), 20, (32-11), 11, (32-2), 2, (32-25), 25, (32-16), 16
+
+const ang16_shuf_mode14,    db 6, 7, 6, 7, 5, 6, 5, 6, 4, 5, 4, 5, 4, 5, 3, 4, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6, 5, 6, 5, 6, 4, 5
+                            db 3, 4, 2, 3, 2, 3, 2, 3, 1, 2, 1, 2, 0, 1, 0, 1, 4, 5, 3, 4, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2, 1, 2
+                            db 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 12, 10, 7, 5, 2, 0
+
+const angHor_tab_14,        db (32-19), 19, (32-6), 6, (32-25), 25, (32-12), 12, (32-31), 31, (32-18), 18, (32-5), 5, (32-24), 24
+                            db (32-11), 11, (32-30), 30, (32-17), 17, (32-4), 4, (32-23), 23, (32-10), 10, (32-29), 29, (32-16), 16
+
+const ang16_shuf_mode15,    db 8, 9, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6, 5, 6, 4, 5, 9, 10, 8, 9, 8, 9, 7, 8, 7, 8, 6, 7, 6, 7, 5, 6
+                            db 4, 5, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2, 1, 2, 0, 1, 5, 6, 4, 5, 4, 5, 3, 4, 3, 4, 2, 3, 2, 3, 1, 2
+                            db 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 8, 6, 4, 2, 0
+
+const angHor_tab_15,        db (32-15), 15, (32-30), 30, (32-13), 13, (32-28), 28, (32-11), 11, (32-26), 26, (32-9), 9, (32-24), 24
+                            db (32-7), 7, (32-22), 22, (32-5), 5, (32-20), 20, (32-3), 3, (32-18), 18, (32-1), 1, (32- 16), 16
+
+const ang16_shuf_mode16,    db 10, 11, 9, 10, 9, 10, 8, 9, 7, 8, 7, 8, 6, 7, 5, 6, 11, 12, 10, 11, 10, 11, 9, 10, 8, 9, 8, 9, 7, 8, 6, 7
+                            db 5, 6, 4, 5, 3, 4, 3, 4, 2, 3, 1, 2, 1, 2, 0, 1, 6, 7, 5, 6, 4, 5, 4, 5, 3, 4, 2, 3, 2, 3, 1, 2
+                            db 0 ,0, 0, 0, 0, 15, 14, 12 , 11, 9, 8, 6, 5, 3, 2, 0, 0, 0, 0, 0, 0, 15, 14, 12, 11, 9, 8, 6, 5, 3, 2, 0
+
+const angHor_tab_16,        db (32-11), 11, (32-22), 22, (32-1), 1, (32-12), 12, (32-23), 23, (32-2), 2, (32-13), 13, (32-24), 24
+                            db (32-3), 3, (32-14), 14, (32-25), 25, (32-4), 4, (32-15), 15, (32-26), 26, (32-5), 5, (32-16), 16
+
+const ang16_shuf_mode17,    db 12, 13, 11, 12, 10, 11, 9, 10, 8, 9, 8, 9, 7, 8, 6, 7, 13, 14, 12, 13, 11, 12, 10, 11, 9, 10, 9, 10, 8, 9, 7, 8
+                            db 5, 6, 4, 5, 4, 5, 3, 4, 2, 3, 1, 2, 0, 1, 0, 1, 6, 7, 5, 6, 5, 6, 4, 5, 3, 4, 2, 3, 1, 2, 1, 2
+                            db 0, 0, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0, 0, 0, 0, 15, 14, 12, 11, 10, 9, 7, 6, 5, 4, 2, 1, 0
+
+const angHor_tab_17,        db (32- 6),  6, (32-12), 12, (32-18), 18, (32-24), 24, (32-30), 30, (32- 4),  4, (32-10), 10, (32-16), 16
+                            db (32-22), 22, (32-28), 28, (32- 2),  2, (32- 8),  8, (32-14), 14, (32-20), 20, (32-26), 26, (32- 0),  0
 
 ; Intrapred_angle32x32, modes 1 to 33 constants
 const ang32_shuf_mode9,         times 8 db 0, 1
@@ -467,6 +467,39 @@
                                 dd  0,  0,  2,  3,  0,  0,  7,  1
                                 dd  0,  0,  5,  6,  0,  0,  0,  0
 
+; Intrapred_angle8x8, modes 1 to 33 constants
+const ang8_shuf_mode3,          db  0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  4,  5,  5,  6,  6,  7,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  5,  6,  6,  7,  7,  8  ; pshufb masks: 8 (i, i+1) byte-index pairs per 128-bit lane; in each first row below the high-lane indices equal the low-lane indices + 1
+const ang8_shuf_mode4,          db  0,  1,  1,  2,  1,  2,  2,  3,  3,  4,  3,  4,  4,  5,  5,  6,  1,  2,  2,  3,  2,  3,  3,  4,  4,  5,  4,  5,  5,  6,  6,  7
+const ang8_shuf_mode5,          db  0,  1,  1,  2,  1,  2,  2,  3,  2,  3,  3,  4,  3,  4,  4,  5,  1,  2,  2,  3,  2,  3,  3,  4,  3,  4,  4,  5,  4,  5,  5,  6
+const ang8_shuf_mode6,          db  0,  1,  0,  1,  1,  2,  1,  2,  2,  3,  2,  3,  2,  3,  3,  4,  1,  2,  1,  2,  2,  3,  2,  3,  3,  4,  3,  4,  3,  4,  4,  5
+const ang8_shuf_mode7,          db  0,  1,  0,  1,  0,  1,  1,  2,  1,  2,  1,  2,  1,  2,  2,  3,  1,  2,  1,  2,  1,  2,  2,  3,  2,  3,  2,  3,  2,  3,  3,  4
+const ang8_shuf_mode8,          db  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  1,  2,  1,  2,  1,  2,  1,  2,  1,  2,  1,  2,  1,  2,  1,  2,  2,  3,  2,  3
+const ang8_shuf_mode9,          db  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  1,  2,  1,  2,  1,  2,  1,  2,  1,  2,  1,  2,  1,  2,  1,  2
+const ang8_shuf_mode12,         db  7,  8,  7,  8,  7,  8,  7,  8,  7,  8,  7,  8,  6,  7,  6,  7,  8,  9,  8,  9,  8,  9,  8,  9,  8,  9,  8,  9,  7,  8,  7,  8
+const ang8_shuf_mode13,         db  8,  9,  8,  9,  8,  9,  7,  8,  7,  8,  7,  8,  7,  8,  6,  7,  9, 10,  9, 10,  9, 10,  8,  9,  8,  9,  8,  9,  8,  9,  7,  8
+const ang8_shuf_mode14,         db  9, 10,  9, 10,  8,  9,  8,  9,  7,  8,  7,  8,  7,  8,  6,  7, 10, 11, 10, 11,  9, 10,  9, 10,  8,  9,  8,  9,  8,  9,  7,  8
+const ang8_shuf_mode15,         db 10, 11,  9, 10,  9, 10,  8,  9,  8,  9,  7,  8,  7,  8,  6,  7, 11, 12, 10, 11, 10, 11,  9, 10,  9, 10,  8,  9,  8,  9,  7,  8
+                                db  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  8,  6,  4,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  8,  6,  4,  2,  0  ; NOTE(review): second 32-byte row (modes 15-17 only) is not referenced in the visible part of this patch - presumably a shuffle that gathers projected reference samples; confirm against the mode 15-17 functions
+const ang8_shuf_mode16,         db 11, 12, 10, 11, 10, 11,  9, 10,  8,  9,  8,  9,  7,  8,  6,  7, 12, 13, 11, 12, 11, 12, 10, 11,  9, 10,  9, 10,  8,  9,  7,  8
+                                db  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  8,  6,  5,  3,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  8,  6,  5,  3,  2,  0
+const ang8_shuf_mode17,         db 12, 13, 11, 12, 10, 11,  9, 10,  8,  9,  8,  9,  7,  8,  6,  7, 13, 14, 12, 13, 11, 12, 10, 11,  9, 10,  9, 10,  8,  9,  7,  8
+                                db  0,  0,  0,  0,  0,  0,  0,  0,  0,  7,  6,  5,  4,  2,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  7,  6,  5,  4,  2,  1,  0
+
+const ang8_fact_mode3,          db (32-26), 26, (32-20), 20, (32-14), 14, (32- 8),  8, (32- 2),  2, (32-28), 28, (32-22), 22, (32-16), 16  ; each table: 8 (32 - f, f) byte pairs (16 bytes); the functions broadcast one table to both lanes and feed it to pmaddubsw, one pair per output byte of a row
+const ang8_fact_mode4,          db (32-21), 21, (32-10), 10, (32-31), 31, (32-20), 20, (32- 9),  9, (32-30), 30, (32-19), 19, (32- 8),  8
+const ang8_fact_mode5,          db (32-17), 17, (32- 2),  2, (32-19), 19, (32- 4),  4, (32-21), 21, (32- 6),  6, (32-23), 23, (32- 8),  8
+const ang8_fact_mode6,          db (32-13), 13, (32-26), 26, (32- 7),  7, (32-20), 20, (32- 1),  1, (32-14), 14, (32-27), 27, (32- 8),  8
+const ang8_fact_mode7,          db (32- 9),  9, (32-18), 18, (32-27), 27, (32- 4),  4, (32-13), 13, (32-22), 22, (32-31), 31, (32- 8),  8
+const ang8_fact_mode8,          db (32- 5),  5, (32-10), 10, (32-15), 15, (32-20), 20, (32-25), 25, (32-30), 30, (32- 3),  3, (32- 8),  8
+const ang8_fact_mode9,          db (32- 2),  2, (32- 4),  4, (32- 6),  6, (32- 8),  8, (32-10), 10, (32-12), 12, (32-14), 14, (32-16), 16
+const ang8_fact_mode11,         db (32-30), 30, (32-28), 28, (32-26), 26, (32-24), 24, (32-22), 22, (32-20), 20, (32-18), 18, (32-16), 16
+const ang8_fact_mode12,         db (32-27), 27, (32-22), 22, (32-17), 17, (32-12), 12, (32- 7),  7, (32- 2),  2, (32-29), 29, (32-24), 24
+const ang8_fact_mode13,         db (32-23), 23, (32-14), 14, (32- 5),  5, (32-28), 28, (32-19), 19, (32-10), 10, (32- 1),  1, (32-24), 24
+const ang8_fact_mode14,         db (32-19), 19, (32- 6),  6, (32-25), 25, (32-12), 12, (32-31), 31, (32-18), 18, (32- 5),  5, (32-24), 24
+const ang8_fact_mode15,         db (32-15), 15, (32-30), 30, (32-13), 13, (32-28), 28, (32-11), 11, (32-26), 26, (32- 9),  9, (32-24), 24
+const ang8_fact_mode16,         db (32-11), 11, (32-22), 22, (32- 1),  1, (32-12), 12, (32-23), 23, (32- 2),  2, (32-13), 13, (32-24), 24
+const ang8_fact_mode17,         db (32- 6),  6, (32-12), 12, (32-18), 18, (32-24), 24, (32-30), 30, (32- 4),  4, (32-10), 10, (32-16), 16
+
 const ang_table
 %assign x 0
 %rep 32
@@ -490,6 +523,7 @@
 
 SECTION .text
 cextern pb_1
+cextern pb_2
 cextern pw_2
 cextern pw_3
 cextern pw_4
@@ -18582,48 +18616,48 @@
 ; void intraPredAng8(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
 ;-----------------------------------------------------------------------------------------
 INIT_YMM avx2
-cglobal intra_pred_ang8_3, 3,4,5
+%macro ang8_store8x8 0  ; store an 8x8 byte block; in: r0 = dst, r1 = stride, m1 = rows 0|2 (low lane) and 1|3 (high lane), m4 = rows 4|6 and 5|7; clobbers r3, m2, m5; leaves r0 advanced by 4 * r1
+    lea               r3, [3 * r1]  ; r3 = 3 * stride, offset of the 4th row in each group of four
+    vextracti128      xm2, m1, 1  ; xm2 = high lane of m1 (rows 1|3)
+    vextracti128      xm5, m4, 1  ; xm5 = high lane of m4 (rows 5|7)
+    movq              [r0], xm1  ; row 0 (low 8 bytes)
+    movq              [r0 + r1], xm2  ; row 1
+    movhps            [r0 + 2 * r1], xm1  ; row 2 (high 8 bytes)
+    movhps            [r0 + r3], xm2  ; row 3
+    lea               r0, [r0 + 4 * r1]  ; move dst down four rows
+    movq              [r0], xm4  ; row 4
+    movq              [r0 + r1], xm5  ; row 5
+    movhps            [r0 + 2 * r1], xm4  ; row 6
+    movhps            [r0 + r3], xm5  ; row 7
+%endmacro
+
+cglobal intra_pred_ang8_3, 3,4,6
+    vbroadcasti128    m0, [r2 + 17]  ; r2 = src (third argument): 16 reference bytes at src + 17, duplicated into both 128-bit lanes
+    mova              m5, [ang8_shuf_mode3]  ; byte-pair indices: low lane builds dst row 0, high lane builds dst row 1
+    mova              m3, [pb_2]  ; index step; pb_2 is defined elsewhere - presumably every byte = 2
+
+    pshufb            m1, m0, m5  ; reference pairs for rows 0 (low lane) / 1 (high lane)
+    paddb             m5, m3  ; shift every index so both lanes move on to the next pair of rows
+    pshufb            m2, m0, m5  ; rows 2 / 3
+    paddb             m5, m3
+    pshufb            m4, m0, m5  ; rows 4 / 5
+    paddb             m5, m3
+    pshufb            m0, m5  ; rows 6 / 7
+
+    vbroadcasti128    m5, [ang8_fact_mode3]  ; (32 - f, f) weight pairs, one per output byte of a row, identical for every row
     mova              m3, [pw_1024]
-    vbroadcasti128    m0, [r2 + 17]
-
-    pshufb            m1, m0, [c_ang8_src1_9_2_10]
-    pshufb            m2, m0, [c_ang8_src3_11_4_12]
-    pshufb            m4, m0, [c_ang8_src5_13_5_13]
-    pshufb            m0,     [c_ang8_src6_14_7_15]
-
-    pmaddubsw         m1, [c_ang8_26_20]
+    pmaddubsw         m1, m5  ; weighted sum of each reference pair -> 16-bit words
+    pmaddubsw         m2, m5
+    pmaddubsw         m4, m5
+    pmaddubsw         m0, m5
     pmulhrsw          m1, m3
-    pmaddubsw         m2, [c_ang8_14_8]
     pmulhrsw          m2, m3
-    pmaddubsw         m4, [c_ang8_2_28]
     pmulhrsw          m4, m3
-    pmaddubsw         m0, [c_ang8_22_16]
     pmulhrsw          m0, m3
     packuswb          m1, m2
     packuswb          m4, m0
 
-    vperm2i128        m2, m1, m4, 00100000b
-    vperm2i128        m1, m1, m4, 00110001b
-    punpcklbw         m4, m2, m1
-    punpckhbw         m2, m1
-    punpcklwd         m1, m4, m2
-    punpckhwd         m4, m2
-    mova              m0, [trans8_shuf]
-    vpermd            m1, m0, m1
-    vpermd            m4, m0, m4
-
-    lea               r3, [3 * r1]
-    movq              [r0], xm1
-    movhps            [r0 + r1], xm1
-    vextracti128      xm2, m1, 1
-    movq              [r0 + 2 * r1], xm2
-    movhps            [r0 + r3], xm2
-    lea               r0, [r0 + 4 * r1]
-    movq              [r0], xm4
-    movhps            [r0 + r1], xm4
-    vextracti128      xm2, m4, 1
-    movq              [r0 + 2 * r1], xm2
-    movhps            [r0 + r3], xm2
+    ang8_store8x8  ; m1 = rows 0|2 / 1|3, m4 = rows 4|6 / 5|7 -> eight 8-byte stores at r0 with stride r1
     RET
 
 INIT_YMM avx2
@@ -18662,48 +18696,33 @@
     RET
 
 INIT_YMM avx2
-cglobal intra_pred_ang8_4, 3,4,5
+cglobal intra_pred_ang8_4, 3,4,6
+    vbroadcasti128    m0, [r2 + 17]  ; 16 reference bytes at r2 + 17, duplicated into both 128-bit lanes
+    mova              m5, [ang8_shuf_mode4]  ; byte-pair indices: low lane builds dst row 0, high lane builds dst row 1
+    mova              m3, [pb_2]  ; index step; pb_2 is defined elsewhere - presumably every byte = 2
+
+    pshufb            m1, m0, m5  ; reference pairs for rows 0 (low lane) / 1 (high lane)
+    paddb             m5, m3  ; move both lanes on to the next pair of rows
+    pshufb            m2, m0, m5  ; rows 2 / 3
+    paddb             m5, m3
+    pshufb            m4, m0, m5  ; rows 4 / 5
+    paddb             m5, m3
+    pshufb            m0, m5  ; rows 6 / 7
+
+    vbroadcasti128    m5, [ang8_fact_mode4]  ; (32 - f, f) weight pairs, one per output byte of a row, identical for every row
     mova              m3, [pw_1024]
-    vbroadcasti128    m0, [r2 + 17]
-
-    pshufb            m1, m0, [c_ang8_src1_9_2_10]
-    pshufb            m2, m0, [c_ang8_src2_10_3_11]
-    pshufb            m4, m0, [c_ang8_src4_12_4_12]
-    pshufb            m0,     [c_ang8_src5_13_6_14]
-
-    pmaddubsw         m1, [c_ang8_21_10]
+    pmaddubsw         m1, m5  ; weighted sum of each reference pair -> 16-bit words
+    pmaddubsw         m2, m5
+    pmaddubsw         m4, m5
+    pmaddubsw         m0, m5
     pmulhrsw          m1, m3
-    pmaddubsw         m2, [c_ang8_31_20]
     pmulhrsw          m2, m3
-    pmaddubsw         m4, [c_ang8_9_30]
     pmulhrsw          m4, m3
-    pmaddubsw         m0, [c_ang8_19_8]
     pmulhrsw          m0, m3
     packuswb          m1, m2
     packuswb          m4, m0
 
-    vperm2i128        m2, m1, m4, 00100000b
-    vperm2i128        m1, m1, m4, 00110001b
-    punpcklbw         m4, m2, m1
-    punpckhbw         m2, m1
-    punpcklwd         m1, m4, m2
-    punpckhwd         m4, m2
-    mova              m0, [trans8_shuf]
-    vpermd            m1, m0, m1
-    vpermd            m4, m0, m4
-
-    lea               r3, [3 * r1]
-    movq              [r0], xm1
-    movhps            [r0 + r1], xm1
-    vextracti128      xm2, m1, 1
-    movq              [r0 + 2 * r1], xm2
-    movhps            [r0 + r3], xm2
-    lea               r0, [r0 + 4 * r1]
-    movq              [r0], xm4
-    movhps            [r0 + r1], xm4
-    vextracti128      xm2, m4, 1
-    movq              [r0 + 2 * r1], xm2
-    movhps            [r0 + r3], xm2
+    ang8_store8x8  ; m1 = rows 0|2 / 1|3, m4 = rows 4|6 / 5|7 -> eight 8-byte stores at r0 with stride r1
     RET
 
 INIT_YMM avx2
@@ -18743,48 +18762,33 @@
 
 
 INIT_YMM avx2
-cglobal intra_pred_ang8_5, 3, 4, 5
+cglobal intra_pred_ang8_5, 3, 4, 6
+    vbroadcasti128    m0, [r2 + 17]  ; 16 reference bytes at r2 + 17, duplicated into both 128-bit lanes
+    mova              m5, [ang8_shuf_mode5]  ; byte-pair indices: low lane builds dst row 0, high lane builds dst row 1
+    mova              m3, [pb_2]  ; index step; pb_2 is defined elsewhere - presumably every byte = 2
+
+    pshufb            m1, m0, m5  ; reference pairs for rows 0 (low lane) / 1 (high lane)
+    paddb             m5, m3  ; move both lanes on to the next pair of rows
+    pshufb            m2, m0, m5  ; rows 2 / 3
+    paddb             m5, m3
+    pshufb            m4, m0, m5  ; rows 4 / 5
+    paddb             m5, m3
+    pshufb            m0, m5  ; rows 6 / 7
+
+    vbroadcasti128    m5, [ang8_fact_mode5]  ; (32 - f, f) weight pairs, one per output byte of a row, identical for every row
     mova              m3, [pw_1024]
-    vbroadcasti128    m0, [r2 + 17]
-
-    pshufb            m1, m0, [c_ang8_src1_9_2_10]
-    pshufb            m2, m0, [c_ang8_src2_10_3_11]
-    pshufb            m4, m0, [c_ang8_src3_11_4_12]
-    pshufb            m0,     [c_ang8_src4_12_5_13]
-
-    pmaddubsw         m1, [c_ang8_17_2]
+    pmaddubsw         m1, m5  ; weighted sum of each reference pair -> 16-bit words
+    pmaddubsw         m2, m5
+    pmaddubsw         m4, m5
+    pmaddubsw         m0, m5
     pmulhrsw          m1, m3
-    pmaddubsw         m2, [c_ang8_19_4]
     pmulhrsw          m2, m3
-    pmaddubsw         m4, [c_ang8_21_6]
     pmulhrsw          m4, m3
-    pmaddubsw         m0, [c_ang8_23_8]
     pmulhrsw          m0, m3
     packuswb          m1, m2
     packuswb          m4, m0
 
-    vperm2i128        m2, m1, m4, 00100000b
-    vperm2i128        m1, m1, m4, 00110001b
-    punpcklbw         m4, m2, m1
-    punpckhbw         m2, m1
-    punpcklwd         m1, m4, m2
-    punpckhwd         m4, m2
-    mova              m0, [trans8_shuf]
-    vpermd            m1, m0, m1
-    vpermd            m4, m0, m4
-
-    lea               r3, [3 * r1]
-    movq              [r0], xm1
-    movhps            [r0 + r1], xm1
-    vextracti128      xm2, m1, 1
-    movq              [r0 + 2 * r1], xm2
-    movhps            [r0 + r3], xm2
-    lea               r0, [r0 + 4 * r1]
-    movq              [r0], xm4
-    movhps            [r0 + r1], xm4
-    vextracti128      xm2, m4, 1
-    movq              [r0 + 2 * r1], xm2
-    movhps            [r0 + r3], xm2
+    ang8_store8x8  ; m1 = rows 0|2 / 1|3, m4 = rows 4|6 / 5|7 -> eight 8-byte stores at r0 with stride r1
     RET
 
 INIT_YMM avx2
@@ -18824,48 +18828,33 @@
 
 
 INIT_YMM avx2
-cglobal intra_pred_ang8_6, 3, 4, 5
+cglobal intra_pred_ang8_6, 3, 4, 6
+    vbroadcasti128    m0, [r2 + 17]  ; 16 reference bytes at r2 + 17, duplicated into both 128-bit lanes
+    mova              m5, [ang8_shuf_mode6]  ; byte-pair indices: low lane builds dst row 0, high lane builds dst row 1
+    mova              m3, [pb_2]  ; index step; pb_2 is defined elsewhere - presumably every byte = 2
+
+    pshufb            m1, m0, m5  ; reference pairs for rows 0 (low lane) / 1 (high lane)
+    paddb             m5, m3  ; move both lanes on to the next pair of rows
+    pshufb            m2, m0, m5  ; rows 2 / 3
+    paddb             m5, m3
+    pshufb            m4, m0, m5  ; rows 4 / 5
+    paddb             m5, m3
+    pshufb            m0, m5  ; rows 6 / 7
+
+    vbroadcasti128    m5, [ang8_fact_mode6]  ; (32 - f, f) weight pairs, one per output byte of a row, identical for every row
     mova              m3, [pw_1024]
-    vbroadcasti128    m0, [r2 + 17]
-
-    pshufb            m1, m0, [intra_pred_shuff_0_8]
-    pshufb            m2, m0, [c_ang8_src2_10_2_10]
-    pshufb            m4, m0, [c_ang8_src3_11_3_11]
-    pshufb            m0,     [c_ang8_src3_11_4_12]
-
-    pmaddubsw         m1, [c_ang8_13_26]
+    pmaddubsw         m1, m5  ; weighted sum of each reference pair -> 16-bit words
+    pmaddubsw         m2, m5
+    pmaddubsw         m4, m5
+    pmaddubsw         m0, m5
     pmulhrsw          m1, m3
-    pmaddubsw         m2, [c_ang8_7_20]
     pmulhrsw          m2, m3
-    pmaddubsw         m4, [c_ang8_1_14]
     pmulhrsw          m4, m3
-    pmaddubsw         m0, [c_ang8_27_8]
     pmulhrsw          m0, m3
     packuswb          m1, m2
     packuswb          m4, m0
 
-    vperm2i128        m2, m1, m4, 00100000b
-    vperm2i128        m1, m1, m4, 00110001b
-    punpcklbw         m4, m2, m1
-    punpckhbw         m2, m1
-    punpcklwd         m1, m4, m2
-    punpckhwd         m4, m2
-    mova              m0, [trans8_shuf]
-    vpermd            m1, m0, m1
-    vpermd            m4, m0, m4
-
-    lea               r3, [3 * r1]
-    movq              [r0], xm1
-    movhps            [r0 + r1], xm1
-    vextracti128      xm2, m1, 1
-    movq              [r0 + 2 * r1], xm2
-    movhps            [r0 + r3], xm2
-    lea               r0, [r0 + 4 * r1]
-    movq              [r0], xm4
-    movhps            [r0 + r1], xm4
-    vextracti128      xm2, m4, 1
-    movq              [r0 + 2 * r1], xm2
-    movhps            [r0 + r3], xm2
+    ang8_store8x8  ; m1 = rows 0|2 / 1|3, m4 = rows 4|6 / 5|7 -> eight 8-byte stores at r0 with stride r1
     RET
 
 INIT_YMM avx2
@@ -18905,46 +18894,33 @@
 
 
 INIT_YMM avx2
-cglobal intra_pred_ang8_9, 3, 5, 5
+cglobal intra_pred_ang8_9, 3, 4, 6
+    vbroadcasti128    m0, [r2 + 17]  ; 16 reference bytes at r2 + 17, duplicated into both 128-bit lanes (only r0-r3 are used now that the r4 table pointer is gone, hence 4 GPRs above)
+    mova              m5, [ang8_shuf_mode9]  ; byte-pair indices: low lane builds dst row 0, high lane builds dst row 1
+    mova              m3, [pb_2]  ; index step; pb_2 is defined elsewhere - presumably every byte = 2
+
+    pshufb            m1, m0, m5  ; reference pairs for rows 0 (low lane) / 1 (high lane)
+    paddb             m5, m3  ; move both lanes on to the next pair of rows
+    pshufb            m2, m0, m5  ; rows 2 / 3
+    paddb             m5, m3
+    pshufb            m4, m0, m5  ; rows 4 / 5
+    paddb             m5, m3
+    pshufb            m0, m5  ; rows 6 / 7
+
+    vbroadcasti128    m5, [ang8_fact_mode9]  ; (32 - f, f) weight pairs, one per output byte of a row, identical for every row
     mova              m3, [pw_1024]
-    vbroadcasti128    m0, [r2 + 17]
-
-    pshufb            m0, [intra_pred_shuff_0_8]
-
-    lea               r4, [c_ang8_mode_27]
-    pmaddubsw         m1, m0, [r4]
+    pmaddubsw         m1, m5  ; weighted sum of each reference pair -> 16-bit words
+    pmaddubsw         m2, m5
+    pmaddubsw         m4, m5
+    pmaddubsw         m0, m5
     pmulhrsw          m1, m3
-    pmaddubsw         m2, m0, [r4 + mmsize]
     pmulhrsw          m2, m3
-    pmaddubsw         m4, m0, [r4 + 2 * mmsize]
     pmulhrsw          m4, m3
-    pmaddubsw         m0, [r4 + 3 * mmsize]
     pmulhrsw          m0, m3
     packuswb          m1, m2
     packuswb          m4, m0
 
-    vperm2i128        m2, m1, m4, 00100000b
-    vperm2i128        m1, m1, m4, 00110001b
-    punpcklbw         m4, m2, m1
-    punpckhbw         m2, m1
-    punpcklwd         m1, m4, m2
-    punpckhwd         m4, m2
-    mova              m0, [trans8_shuf]
-    vpermd            m1, m0, m1
-    vpermd            m4, m0, m4
-
-    lea               r3, [3 * r1]
-    movq              [r0], xm1
-    movhps            [r0 + r1], xm1
-    vextracti128      xm2, m1, 1
-    movq              [r0 + 2 * r1], xm2
-    movhps            [r0 + r3], xm2
-    lea               r0, [r0 + 4 * r1]
-    movq              [r0], xm4
-    movhps            [r0 + r1], xm4
-    vextracti128      xm2, m4, 1
-    movq              [r0 + 2 * r1], xm2
-    movhps            [r0 + r3], xm2
+    ang8_store8x8  ; m1 = rows 0|2 / 1|3, m4 = rows 4|6 / 5|7 -> eight 8-byte stores at r0 with stride r1
     RET
 
 INIT_YMM avx2
@@ -19015,48 +18991,33 @@
 
 
 INIT_YMM avx2
-cglobal intra_pred_ang8_7, 3, 4, 5
+cglobal intra_pred_ang8_7, 3, 4, 6
+    vbroadcasti128    m0, [r2 + 17]  ; 16 reference bytes at r2 + 17, duplicated into both 128-bit lanes
+    mova              m5, [ang8_shuf_mode7]  ; byte-pair indices: low lane builds dst row 0, high lane builds dst row 1
+    mova              m3, [pb_2]  ; index step; pb_2 is defined elsewhere - presumably every byte = 2
+
+    pshufb            m1, m0, m5  ; reference pairs for rows 0 (low lane) / 1 (high lane)
+    paddb             m5, m3  ; move both lanes on to the next pair of rows
+    pshufb            m2, m0, m5  ; rows 2 / 3
+    paddb             m5, m3
+    pshufb            m4, m0, m5  ; rows 4 / 5
+    paddb             m5, m3
+    pshufb            m0, m5  ; rows 6 / 7
+
+    vbroadcasti128    m5, [ang8_fact_mode7]  ; (32 - f, f) weight pairs, one per output byte of a row, identical for every row
     mova              m3, [pw_1024]
-    vbroadcasti128    m0, [r2 + 17]
-
-    pshufb            m1, m0, [intra_pred_shuff_0_8]
-    pshufb            m2, m0, [c_ang8_src1_9_2_10]
-    pshufb            m4, m0, [c_ang8_src2_10_2_10]
-    pshufb            m0,     [c_ang8_src2_10_3_11]
-
-    pmaddubsw         m1, [c_ang8_9_18]
+    pmaddubsw         m1, m5  ; weighted sum of each reference pair -> 16-bit words
+    pmaddubsw         m2, m5
+    pmaddubsw         m4, m5
+    pmaddubsw         m0, m5
     pmulhrsw          m1, m3
-    pmaddubsw         m2, [c_ang8_27_4]
     pmulhrsw          m2, m3
-    pmaddubsw         m4, [c_ang8_13_22]
     pmulhrsw          m4, m3
-    pmaddubsw         m0, [c_ang8_31_8]
     pmulhrsw          m0, m3
     packuswb          m1, m2
     packuswb          m4, m0
 
-    vperm2i128        m2, m1, m4, 00100000b
-    vperm2i128        m1, m1, m4, 00110001b
-    punpcklbw         m4, m2, m1
-    punpckhbw         m2, m1
-    punpcklwd         m1, m4, m2
-    punpckhwd         m4, m2
-    mova              m0, [trans8_shuf]
-    vpermd            m1, m0, m1
-    vpermd            m4, m0, m4
-
-    lea               r3, [3 * r1]
-    movq              [r0], xm1
-    movhps            [r0 + r1], xm1
-    vextracti128      xm2, m1, 1
-    movq              [r0 + 2 * r1], xm2
-    movhps            [r0 + r3], xm2
-    lea               r0, [r0 + 4 * r1]
-    movq              [r0], xm4
-    movhps            [r0 + r1], xm4
-    vextracti128      xm2, m4, 1
-    movq              [r0 + 2 * r1], xm2
-    movhps            [r0 + r3], xm2
+    ang8_store8x8  ; m1 = rows 0|2 / 1|3, m4 = rows 4|6 / 5|7 -> eight 8-byte stores at r0 with stride r1
     RET
 
 INIT_YMM avx2
@@ -19097,48 +19058,32 @@
 
 INIT_YMM avx2
 cglobal intra_pred_ang8_8, 3, 4, 6
+    vbroadcasti128    m0, [r2 + 17]  ; 16 reference bytes at r2 + 17, duplicated into both 128-bit lanes
+    mova              m5, [ang8_shuf_mode8]  ; byte-pair indices: low lane builds dst row 0, high lane builds dst row 1
+    mova              m3, [pb_2]  ; index step; pb_2 is defined elsewhere - presumably every byte = 2
+
+    pshufb            m1, m0, m5  ; reference pairs for rows 0 (low lane) / 1 (high lane)
+    paddb             m5, m3  ; move both lanes on to the next pair of rows
+    pshufb            m2, m0, m5  ; rows 2 / 3
+    paddb             m5, m3
+    pshufb            m4, m0, m5  ; rows 4 / 5
+    paddb             m5, m3
+    pshufb            m0, m5  ; rows 6 / 7
+
+    vbroadcasti128    m5, [ang8_fact_mode8]  ; (32 - f, f) weight pairs, one per output byte of a row, identical for every row
     mova              m3, [pw_1024]
-    vbroadcasti128    m0, [r2 + 17]
-    mova              m5, [intra_pred_shuff_0_8]
-
-    pshufb            m1, m0, m5
-    pshufb            m2, m0, m5
-    pshufb            m4, m0, m5
-    pshufb            m0,     [c_ang8_src2_10_2_10]
-
-    pmaddubsw         m1, [c_ang8_5_10]
+    pmaddubsw         m1, m5  ; weighted sum of each reference pair -> 16-bit words
+    pmaddubsw         m2, m5
+    pmaddubsw         m4, m5
+    pmaddubsw         m0, m5
     pmulhrsw          m1, m3
-    pmaddubsw         m2, [c_ang8_15_20]
     pmulhrsw          m2, m3
-    pmaddubsw         m4, [c_ang8_25_30]
     pmulhrsw          m4, m3
-    pmaddubsw         m0, [c_ang8_3_8]
     pmulhrsw          m0, m3
     packuswb          m1, m2
     packuswb          m4, m0
 
-    vperm2i128        m2, m1, m4, 00100000b
-    vperm2i128        m1, m1, m4, 00110001b
-    punpcklbw         m4, m2, m1
-    punpckhbw         m2, m1
-    punpcklwd         m1, m4, m2
-    punpckhwd         m4, m2
-    mova              m0, [trans8_shuf]
-    vpermd            m1, m0, m1
-    vpermd            m4, m0, m4
-
-    lea               r3, [3 * r1]
-    movq              [r0], xm1
-    movhps            [r0 + r1], xm1
-    vextracti128      xm2, m1, 1
-    movq              [r0 + 2 * r1], xm2
-    movhps            [r0 + r3], xm2
-    lea               r0, [r0 + 4 * r1]
-    movq              [r0], xm4
-    movhps            [r0 + r1], xm4
-    vextracti128      xm2, m4, 1
-    movq              [r0 + 2 * r1], xm2
-    movhps            [r0 + r3], xm2
+    ang8_store8x8  ; m1 = rows 0|2 / 1|3, m4 = rows 4|6 / 5|7 -> eight 8-byte stores at r0 with stride r1
     RET
 
 INIT_YMM avx2
@@ -19179,163 +19124,139 @@
 
 
 INIT_YMM avx2
-cglobal intra_pred_ang8_11, 3, 5, 5
-    mova              m3, [pw_1024]
+cglobal intra_pred_ang8_11, 3, 5, 6              ; NOTE(review): r4 is not referenced by the added lines; 4 GPRs may be enough - confirm against ang8_store8x8
+    ; m3 is loaded further down: pb_2 while stepping the shuffle mask, then pw_1024 for the rounding multiply
     movu              xm1, [r2 + 16]
     pinsrb            xm1, [r2], 0
-    pshufb            xm1, [intra_pred_shuff_0_8]
-    vinserti128       m0, m1, xm1, 1
-
-    lea               r4, [c_ang8_mode_25]
-    pmaddubsw         m1, m0, [r4]
+    vinserti128       m0, m1, xm1, 1             ; both lanes of m0 = xm1 (byte 0 = [r2], bytes 1.. = [r2 + 17 ..])
+
+    mova              m5, [ang8_shuf_mode9]      ; starting pshufb mask; this function loads the mode-9 table
+    mova              m3, [pb_2]                 ; step added to every mask byte between shuffles (presumably all 2s - pb_2 is defined elsewhere)
+
+    pshufb            m1, m0, m5                 ; m1 = bytes of m0 selected by the starting mask
+    paddb             m5, m3                     ; advance every mask index by the step
+    pshufb            m2, m0, m5                 ; m2 = selection after one step
+    paddb             m5, m3
+    pshufb            m4, m0, m5                 ; m4 = selection after two steps
+    paddb             m5, m3
+    pshufb            m0, m5                     ; m0 = selection after three steps (overwrites the source bytes)
+
+    vbroadcasti128    m5, [ang8_fact_mode11]     ; one 16-byte factor table shared by all four products
+    mova              m3, [pw_1024]              ; multiplier for the pmulhrsw rounding step below
+    pmaddubsw         m1, m5                     ; unsigned bytes of m1 * signed bytes of m5, adjacent products summed into 16-bit words
+    pmaddubsw         m2, m5
+    pmaddubsw         m4, m5
+    pmaddubsw         m0, m5
     pmulhrsw          m1, m3
-    pmaddubsw         m2, m0, [r4 + mmsize]
     pmulhrsw          m2, m3
-    pmaddubsw         m4, m0, [r4 + 2 * mmsize]
     pmulhrsw          m4, m3
-    pmaddubsw         m0, [r4 + 3 * mmsize]
     pmulhrsw          m0, m3
     packuswb          m1, m2
     packuswb          m4, m0
 
-    vperm2i128        m2, m1, m4, 00100000b
-    vperm2i128        m1, m1, m4, 00110001b
-    punpcklbw         m4, m2, m1
-    punpckhbw         m2, m1
-    punpcklwd         m1, m4, m2
-    punpckhwd         m4, m2
-    mova              m0, [trans8_shuf]
-    vpermd            m1, m0, m1
-    vpermd            m4, m0, m4
-
-    lea               r3, [3 * r1]
-    movq              [r0], xm1
-    movhps            [r0 + r1], xm1
-    vextracti128      xm2, m1, 1
-    movq              [r0 + 2 * r1], xm2
-    movhps            [r0 + r3], xm2
-    lea               r0, [r0 + 4 * r1]
-    movq              [r0], xm4
-    movhps            [r0 + r1], xm4
-    vextracti128      xm2, m4, 1
-    movq              [r0 + 2 * r1], xm2
-    movhps            [r0 + r3], xm2
+    ang8_store8x8                                ; macro defined outside this chunk; NOTE(review): presumably the transpose + 8-row store the removed lines did on m1/m4 - confirm
     RET
 
 INIT_YMM avx2
 cglobal intra_pred_ang8_15, 3, 6, 6
+    vbroadcasti128    m1, [r2 + 17]              ; m1 = 16 bytes at r2 + 17, duplicated in both 128-bit lanes
+    vbroadcasti128    m2, [r2]                   ; m2 = 16 bytes at r2, duplicated in both lanes
+    mova              m3, [ang8_shuf_mode15 + mmsize] ; second 32-byte row of the table: mask applied to the [r2] bytes
+    pshufb            m2, m3                     ; m2 = bytes of [r2 .. r2 + 15] picked by that mask
+    palignr           m1, m2, 11                 ; per lane: m1 = bytes 11..15 of m2 followed by bytes 0..10 of m1
+
+    mova              m5, [ang8_shuf_mode15]     ; first 32-byte row: starting pshufb mask
+    mova              m3, [pb_2]                 ; step subtracted from every mask byte between shuffles (presumably all 2s - pb_2 is defined elsewhere)
+    pshufb            m0, m1, m5                 ; m0 = bytes of m1 selected by the starting mask
+    psubb             m5, m3                     ; move every mask index down by the step
+    pshufb            m4, m1, m5                 ; m4 = selection after one step
+    psubb             m5, m3
+    pshufb            m2, m1, m5                 ; m2 = selection after two steps
+    psubb             m5, m3
+    pshufb            m1, m5                     ; m1 = selection after three steps (overwrites the source bytes)
+
+    vbroadcasti128    m5, [ang8_fact_mode15]     ; one 16-byte factor table shared by all four products (old code read four rows of c_ang8_mode_15)
     mova              m3, [pw_1024]
-    movu              xm5, [r2 + 16]
-    pinsrb            xm5, [r2], 0
-    lea               r5, [intra_pred_shuff_0_8]
-    mova              xm0, xm5
-    pslldq            xm5, 1
-    pinsrb            xm5, [r2 + 2], 0
-    vinserti128       m0, m0, xm5, 1
-    pshufb            m0, [r5]
-
-    lea               r4, [c_ang8_mode_15]
-    pmaddubsw         m1, m0, [r4]
+    pmaddubsw         m1, m5                     ; unsigned bytes of m1 * signed bytes of m5, adjacent products summed into 16-bit words
+    pmaddubsw         m2, m5
+    pmaddubsw         m4, m5
+    pmaddubsw         m0, m5
     pmulhrsw          m1, m3
-    mova              xm0, xm5
-    pslldq            xm5, 1
-    pinsrb            xm5, [r2 + 4], 0
-    vinserti128       m0, m0, xm5, 1
-    pshufb            m0, [r5]
-    pmaddubsw         m2, m0, [r4 + mmsize]
     pmulhrsw          m2, m3
-    mova              xm0, xm5
-    pslldq            xm5, 1
-    pinsrb            xm5, [r2 + 6], 0
-    vinserti128       m0, m0, xm5, 1
-    pshufb            m0, [r5]
-    pmaddubsw         m4, m0, [r4 + 2 * mmsize]
     pmulhrsw          m4, m3
-    mova              xm0, xm5
-    pslldq            xm5, 1
-    pinsrb            xm5, [r2 + 8], 0
-    vinserti128       m0, m0, xm5, 1
-    pshufb            m0, [r5]
-    pmaddubsw         m0, [r4 + 3 * mmsize]
     pmulhrsw          m0, m3
     packuswb          m1, m2
     packuswb          m4, m0
 
-    vperm2i128        m2, m1, m4, 00100000b
-    vperm2i128        m1, m1, m4, 00110001b
-    punpcklbw         m4, m2, m1
-    punpckhbw         m2, m1
-    punpcklwd         m1, m4, m2
-    punpckhwd         m4, m2
-    mova              m0, [trans8_shuf]
-    vpermd            m1, m0, m1
-    vpermd            m4, m0, m4
-
-    lea               r3, [3 * r1]
-    movq              [r0], xm1
-    movhps            [r0 + r1], xm1
-    vextracti128      xm2, m1, 1
-    movq              [r0 + 2 * r1], xm2
-    movhps            [r0 + r3], xm2
-    lea               r0, [r0 + 4 * r1]
-    movq              [r0], xm4
-    movhps            [r0 + r1], xm4
-    vextracti128      xm2, m4, 1
-    movq              [r0 + 2 * r1], xm2
-    movhps            [r0 + r3], xm2
-    RET
-
-INIT_YMM avx2
-cglobal intra_pred_ang8_16, 3,4,7
-    lea                 r0, [r0 + r1 * 8]
-    sub                 r0, r1
-    neg                 r1
-    lea                 r3, [r1 * 3]
-    vbroadcasti128      m0, [angHor8_tab_16]            ; m0 = factor
-    mova                m1, [intra_pred8_shuff16]       ; m1 = 4 of Row shuffle
-    movu                m2, [intra_pred8_shuff16 + 8]   ; m2 = 4 of Row shuffle
-
-    ; prepare reference pixel
-    movq                xm3, [r2 + 16 + 1]              ; m3 = [-1 -2 -3 -4 -5 -6 -7 -8 x x x x x x x x]
-    movhps              xm3, [r2 + 2]                   ; m3 = [-1 -2 -3 -4 -5 -6 -7 -8 2 3 x 5 6 x 8 x]
-    pslldq              xm3, 1
-    pinsrb              xm3, [r2], 0                    ; m3 = [ 0 -1 -2 -3 -4 -5 -6 -7 -8 2 3 x 5 6 x 8]
-    pshufb              xm3, [c_ang8_mode_16]
-    vinserti128         m3, m3, xm3, 1                  ; m3 = [-8 -7 -6 -5 -4 -3 -2 -1  0 2 3 5 6 8]
-
-    ; process 4 rows
-    pshufb              m4, m3, m1
-    pshufb              m5, m3, m2
-    psrldq              m3, 4
-    punpcklbw           m6, m5, m4
-    punpckhbw           m5, m4
-    pmaddubsw           m6, m0
-    pmulhrsw            m6, [pw_1024]
-    pmaddubsw           m5, m0
-    pmulhrsw            m5, [pw_1024]
-    packuswb            m6, m5
-    vextracti128        xm5, m6, 1
-    movq                [r0], xm6
-    movhps              [r0 + r1], xm6
-    movq                [r0 + r1 * 2], xm5
-    movhps              [r0 + r3], xm5
-
-    ; process 4 rows
-    lea                 r0, [r0 + r1 * 4]
-    pshufb              m4, m3, m1
-    pshufb              m5, m3, m2
-    punpcklbw           m6, m5, m4
-    punpckhbw           m5, m4
-    pmaddubsw           m6, m0
-    pmulhrsw            m6, [pw_1024]
-    pmaddubsw           m5, m0
-    pmulhrsw            m5, [pw_1024]
-    packuswb            m6, m5
-    vextracti128        xm5, m6, 1
-    movq                [r0], xm6
-    movhps              [r0 + r1], xm6
-    movq                [r0 + r1 * 2], xm5
-    movhps              [r0 + r3], xm5
+    ang8_store8x8                                ; macro defined outside this chunk; NOTE(review): presumably the transpose + 8-row store the removed lines did on m1/m4 - confirm
+    RET
+
 INIT_YMM avx2
+cglobal intra_pred_ang8_16, 3,4,6                ; previously declared 3,4,7; the added lines reference m0-m5 only
+    vbroadcasti128    m1, [r2 + 17]              ; m1 = 16 bytes at r2 + 17, duplicated in both 128-bit lanes
+    vbroadcasti128    m2, [r2]                   ; m2 = 16 bytes at r2, duplicated in both lanes
+    mova              m3, [ang8_shuf_mode16 + mmsize] ; second 32-byte row of the table: mask applied to the [r2] bytes
+    pshufb            m2, m3                     ; m2 = bytes of [r2 .. r2 + 15] picked by that mask
+    palignr           m1, m2, 10                 ; per lane: m1 = bytes 10..15 of m2 followed by bytes 0..9 of m1
+
+    mova              m5, [ang8_shuf_mode16]     ; first 32-byte row: starting pshufb mask
+    mova              m3, [pb_2]                 ; step subtracted from every mask byte between shuffles (presumably all 2s - pb_2 is defined elsewhere)
+    pshufb            m0, m1, m5                 ; m0 = bytes of m1 selected by the starting mask
+    psubb             m5, m3                     ; move every mask index down by the step
+    pshufb            m4, m1, m5                 ; m4 = selection after one step
+    psubb             m5, m3
+    pshufb            m2, m1, m5                 ; m2 = selection after two steps
+    psubb             m5, m3
+    pshufb            m1, m5                     ; m1 = selection after three steps (overwrites the source bytes)
+
+    vbroadcasti128    m5, [ang8_fact_mode16]     ; one 16-byte factor table shared by all four products
+    mova              m3, [pw_1024]              ; multiplier for the pmulhrsw rounding step (presumably words of 1024, i.e. (x + 16) >> 5 - confirm)
+    pmaddubsw         m1, m5                     ; unsigned bytes of m1 * signed bytes of m5, adjacent products summed into 16-bit words
+    pmaddubsw         m2, m5
+    pmaddubsw         m4, m5
+    pmaddubsw         m0, m5
+    pmulhrsw          m1, m3                     ; rounded high-half multiply of each word by m3
+    pmulhrsw          m2, m3
+    pmulhrsw          m4, m3
+    pmulhrsw          m0, m3
+    packuswb          m1, m2                     ; words -> bytes with unsigned saturation, per lane: m1 low half, m2 high half
+    packuswb          m4, m0                     ; same for m4 / m0
+
+    ang8_store8x8                                ; macro defined outside this chunk; NOTE(review): presumably transposes and stores the 8x8 block held in m1/m4 - confirm
+    RET
+
+INIT_YMM avx2
+cglobal intra_pred_ang8_17, 3,4,6                ; entry point added by this patch: 3 args, 4 GPRs, 6 vector registers
+    vbroadcasti128    m1, [r2 + 17]              ; m1 = 16 bytes at r2 + 17, duplicated in both 128-bit lanes
+    vbroadcasti128    m2, [r2]                   ; m2 = 16 bytes at r2, duplicated in both lanes
+    mova              m3, [ang8_shuf_mode17 + mmsize] ; second 32-byte row of the table: mask applied to the [r2] bytes
+    pshufb            m2, m3                     ; m2 = bytes of [r2 .. r2 + 15] picked by that mask
+    palignr           m1, m2, 9                  ; per lane: m1 = bytes 9..15 of m2 followed by bytes 0..8 of m1
+
+    mova              m5, [ang8_shuf_mode17]     ; first 32-byte row: starting pshufb mask
+    mova              m3, [pb_2]                 ; step subtracted from every mask byte between shuffles (presumably all 2s - pb_2 is defined elsewhere)
+    pshufb            m0, m1, m5                 ; m0 = bytes of m1 selected by the starting mask
+    psubb             m5, m3                     ; move every mask index down by the step
+    pshufb            m4, m1, m5                 ; m4 = selection after one step
+    psubb             m5, m3
+    pshufb            m2, m1, m5                 ; m2 = selection after two steps
+    psubb             m5, m3
+    pshufb            m1, m5                     ; m1 = selection after three steps (overwrites the source bytes)
+
+    vbroadcasti128    m5, [ang8_fact_mode17]     ; one 16-byte factor table shared by all four products
+    mova              m3, [pw_1024]              ; multiplier for the pmulhrsw rounding step (presumably words of 1024, i.e. (x + 16) >> 5 - confirm)
+    pmaddubsw         m1, m5                     ; unsigned bytes of m1 * signed bytes of m5, adjacent products summed into 16-bit words
+    pmaddubsw         m2, m5
+    pmaddubsw         m4, m5
+    pmaddubsw         m0, m5
+    pmulhrsw          m1, m3                     ; rounded high-half multiply of each word by m3
+    pmulhrsw          m2, m3
+    pmulhrsw          m4, m3
+    pmulhrsw          m0, m3
+    packuswb          m1, m2                     ; words -> bytes with unsigned saturation, per lane: m1 low half, m2 high half
+    packuswb          m4, m0                     ; same for m4 / m0
+
+    ang8_store8x8                                ; macro defined outside this chunk; NOTE(review): presumably transposes and stores the 8x8 block held in m1/m4 - confirm
     RET
 
 %if 1
@@ -19548,113 +19469,73 @@
 
 INIT_YMM avx2
 cglobal intra_pred_ang8_14, 3, 6, 6
+    movu              xm1, [r2 + 13]             ; bytes 4.. of xm1 = [r2 + 17 ..]; bytes 0..3 are overwritten below
+    ; no lane duplication yet: xm1 is still being patched, and the vinserti128 after the pinsrb run fills both lanes
+
+    pinsrb            xm1, [r2 + 0], 3           ; byte 3 = [r2 + 0]
+    pinsrb            xm1, [r2 + 2], 2           ; byte 2 = [r2 + 2]
+    pinsrb            xm1, [r2 + 5], 1           ; byte 1 = [r2 + 5]
+    pinsrb            xm1, [r2 + 7], 0           ; byte 0 = [r2 + 7]
+    vinserti128       m1, m1, xm1, 1             ; both lanes of m1 = patched xm1
+
+    mova              m5, [ang8_shuf_mode14]     ; starting pshufb mask
+    mova              m3, [pb_2]                 ; step subtracted from every mask byte between shuffles (presumably all 2s - pb_2 is defined elsewhere)
+    pshufb            m0, m1, m5                 ; m0 = bytes of m1 selected by the starting mask
+    psubb             m5, m3                     ; move every mask index down by the step
+    pshufb            m4, m1, m5                 ; m4 = selection after one step
+    psubb             m5, m3
+    pshufb            m2, m1, m5                 ; m2 = selection after two steps
+    psubb             m5, m3
+    pshufb            m1, m5                     ; m1 = selection after three steps (overwrites the source bytes)
+
+    vbroadcasti128    m5, [ang8_fact_mode14]     ; one 16-byte factor table shared by all four products (old code read four rows of c_ang8_mode_14)
     mova              m3, [pw_1024]
-    movu              xm5, [r2 + 16]
-    pinsrb            xm5, [r2], 0
-    lea               r5, [intra_pred_shuff_0_8]
-    vinserti128       m0, m5, xm5, 1
-    pshufb            m0, [r5]
-
-    lea               r4, [c_ang8_mode_14]
-    pmaddubsw         m1, m0, [r4]
+    pmaddubsw         m1, m5                     ; unsigned bytes of m1 * signed bytes of m5, adjacent products summed into 16-bit words
+    pmaddubsw         m2, m5
+    pmaddubsw         m4, m5
+    pmaddubsw         m0, m5
     pmulhrsw          m1, m3
-    pslldq            xm5, 1
-    pinsrb            xm5, [r2 + 2], 0
-    vinserti128       m0, m5, xm5, 1
-    pshufb            m0, [r5]
-    pmaddubsw         m2, m0, [r4 + mmsize]
     pmulhrsw          m2, m3
-    pslldq            xm5, 1
-    pinsrb            xm5, [r2 + 5], 0
-    vinserti128       m0, m5, xm5, 1
-    pshufb            m0, [r5]
-    pmaddubsw         m4, m0, [r4 + 2 * mmsize]
     pmulhrsw          m4, m3
-    pslldq            xm5, 1
-    pinsrb            xm5, [r2 + 7], 0
-    pshufb            xm5, [r5]
-    vinserti128       m0, m0, xm5, 1
-    pmaddubsw         m0, [r4 + 3 * mmsize]
     pmulhrsw          m0, m3
     packuswb          m1, m2
     packuswb          m4, m0
 
-    vperm2i128        m2, m1, m4, 00100000b
-    vperm2i128        m1, m1, m4, 00110001b
-    punpcklbw         m4, m2, m1
-    punpckhbw         m2, m1
-    punpcklwd         m1, m4, m2
-    punpckhwd         m4, m2
-    mova              m0, [trans8_shuf]
-    vpermd            m1, m0, m1
-    vpermd            m4, m0, m4
-
-    lea               r3, [3 * r1]
-    movq              [r0], xm1
-    movhps            [r0 + r1], xm1
-    vextracti128      xm2, m1, 1
-    movq              [r0 + 2 * r1], xm2
-    movhps            [r0 + r3], xm2
-    lea               r0, [r0 + 4 * r1]
-    movq              [r0], xm4
-    movhps            [r0 + r1], xm4
-    vextracti128      xm2, m4, 1
-    movq              [r0 + 2 * r1], xm2
-    movhps            [r0 + r3], xm2
+    ang8_store8x8                                ; macro defined outside this chunk; NOTE(review): presumably the transpose + 8-row store the removed lines did on m1/m4 - confirm
     RET
 
 INIT_YMM avx2
 cglobal intra_pred_ang8_13, 3, 6, 6
+    movu              xm1, [r2 + 14]             ; bytes 3.. of xm1 = [r2 + 17 ..]; bytes 0..2 are overwritten below
+    pinsrb            xm1, [r2 + 0], 2           ; byte 2 = [r2 + 0]
+    pinsrb            xm1, [r2 + 4], 1           ; byte 1 = [r2 + 4]
+    pinsrb            xm1, [r2 + 7], 0           ; byte 0 = [r2 + 7]
+    vinserti128       m1, m1, xm1, 1             ; both lanes of m1 = patched xm1
+
+    mova              m5, [ang8_shuf_mode13]     ; starting pshufb mask
+    mova              m3, [pb_2]                 ; step subtracted from every mask byte between shuffles (presumably all 2s - pb_2 is defined elsewhere)
+    pshufb            m0, m1, m5                 ; m0 = bytes of m1 selected by the starting mask
+    psubb             m5, m3                     ; move every mask index down by the step
+    pshufb            m4, m1, m5                 ; m4 = selection after one step
+    psubb             m5, m3
+    pshufb            m2, m1, m5                 ; m2 = selection after two steps
+    psubb             m5, m3
+    pshufb            m1, m5                     ; m1 = selection after three steps (overwrites the source bytes)
+
+    vbroadcasti128    m5, [ang8_fact_mode13]     ; one 16-byte factor table shared by all four products (old code read four rows of c_ang8_mode_13)
     mova              m3, [pw_1024]
-    movu              xm5, [r2 + 16]
-    pinsrb            xm5, [r2], 0
-    lea               r5, [intra_pred_shuff_0_8]
-    vinserti128       m0, m5, xm5, 1
-    pshufb            m0, [r5]
-
-    lea               r4, [c_ang8_mode_13]
-    pmaddubsw         m1, m0, [r4]
+    pmaddubsw         m1, m5                     ; unsigned bytes of m1 * signed bytes of m5, adjacent products summed into 16-bit words
+    pmaddubsw         m2, m5
+    pmaddubsw         m4, m5
+    pmaddubsw         m0, m5
     pmulhrsw          m1, m3
-    pslldq            xm5, 1
-    pinsrb            xm5, [r2 + 4], 0
-    pshufb            xm4, xm5, [r5]
-    vinserti128       m0, m0, xm4, 1
-    pmaddubsw         m2, m0, [r4 + mmsize]
     pmulhrsw          m2, m3
-    vinserti128       m0, m0, xm4, 0
-    pmaddubsw         m4, m0, [r4 + 2 * mmsize]
     pmulhrsw          m4, m3
-    pslldq            xm5, 1
-    pinsrb            xm5, [r2 + 7], 0
-    pshufb            xm5, [r5]
-    vinserti128       m0, m0, xm5, 1
-    pmaddubsw         m0, [r4 + 3 * mmsize]
     pmulhrsw          m0, m3
     packuswb          m1, m2
     packuswb          m4, m0
 
-    vperm2i128        m2, m1, m4, 00100000b
-    vperm2i128        m1, m1, m4, 00110001b
-    punpcklbw         m4, m2, m1
-    punpckhbw         m2, m1
-    punpcklwd         m1, m4, m2
-    punpckhwd         m4, m2
-    mova              m0, [trans8_shuf]
-    vpermd            m1, m0, m1
-    vpermd            m4, m0, m4
-
-    lea               r3, [3 * r1]
-    movq              [r0], xm1
-    movhps            [r0 + r1], xm1
-    vextracti128      xm2, m1, 1
-    movq              [r0 + 2 * r1], xm2
-    movhps            [r0 + r3], xm2
-    lea               r0, [r0 + 4 * r1]
-    movq              [r0], xm4
-    movhps            [r0 + r1], xm4
-    vextracti128      xm2, m4, 1
-    movq              [r0 + 2 * r1], xm2
-    movhps            [r0 + r3], xm2
+    ang8_store8x8                                ; macro defined outside this chunk; NOTE(review): presumably the transpose + 8-row store the removed lines did on m1/m4 - confirm
     RET
 
 
@@ -19703,51 +19584,36 @@
     RET
 
 INIT_YMM avx2
-cglobal intra_pred_ang8_12, 3, 5, 5
+cglobal intra_pred_ang8_12, 3, 5, 6              ; NOTE(review): r4 is not referenced by the added lines; 4 GPRs may be enough - confirm against ang8_store8x8
+    movu              xm1, [r2 + 15]             ; bytes 2.. of xm1 = [r2 + 17 ..]; bytes 0..1 are overwritten below
+    pinsrb            xm1, [r2 + 0], 1           ; byte 1 = [r2 + 0]
+    pinsrb            xm1, [r2 + 6], 0           ; byte 0 = [r2 + 6]
+    vinserti128       m1, m1, xm1, 1             ; both lanes of m1 = patched xm1
+
+    mova              m5, [ang8_shuf_mode12]     ; starting pshufb mask
+    mova              m3, [pb_2]                 ; step subtracted from every mask byte between shuffles (presumably all 2s - pb_2 is defined elsewhere)
+    pshufb            m0, m1, m5                 ; m0 = bytes of m1 selected by the starting mask
+    psubb             m5, m3                     ; move every mask index down by the step
+    pshufb            m4, m1, m5                 ; m4 = selection after one step
+    psubb             m5, m3
+    pshufb            m2, m1, m5                 ; m2 = selection after two steps
+    psubb             m5, m3
+    pshufb            m1, m5                     ; m1 = selection after three steps (overwrites the source bytes)
+
+    vbroadcasti128    m5, [ang8_fact_mode12]     ; one 16-byte factor table shared by all four products (old code read four rows of c_ang8_mode_24)
     mova              m3, [pw_1024]
-    movu              xm1, [r2 + 16]
-    pinsrb            xm1, [r2], 0
-    pshufb            xm1, [intra_pred_shuff_0_8]
-    vinserti128       m0, m1, xm1, 1
-
-    lea               r4, [c_ang8_mode_24]
-    pmaddubsw         m1, m0, [r4]
+    pmaddubsw         m1, m5                     ; unsigned bytes of m1 * signed bytes of m5, adjacent products summed into 16-bit words
+    pmaddubsw         m2, m5
+    pmaddubsw         m4, m5
+    pmaddubsw         m0, m5
     pmulhrsw          m1, m3
-    pmaddubsw         m2, m0, [r4 + mmsize]
     pmulhrsw          m2, m3
-    pmaddubsw         m4, m0, [r4 + 2 * mmsize]
     pmulhrsw          m4, m3
-    pslldq            xm0, 2
-    pinsrb            xm0, [r2 + 6], 0
-    pinsrb            xm0, [r2 + 0], 1
-    vinserti128       m0, m0, xm0, 1
-    pmaddubsw         m0, [r4 + 3 * mmsize]
     pmulhrsw          m0, m3
     packuswb          m1, m2
     packuswb          m4, m0
 
-    vperm2i128        m2, m1, m4, 00100000b
-    vperm2i128        m1, m1, m4, 00110001b
-    punpcklbw         m4, m2, m1
-    punpckhbw         m2, m1
-    punpcklwd         m1, m4, m2
-    punpckhwd         m4, m2
-    mova              m0, [trans8_shuf]
-    vpermd            m1, m0, m1
-    vpermd            m4, m0, m4
-
-    lea               r3, [3 * r1]
-    movq              [r0], xm1
-    movhps            [r0 + r1], xm1
-    vextracti128      xm2, m1, 1
-    movq              [r0 + 2 * r1], xm2
-    movhps            [r0 + r3], xm2
-    lea               r0, [r0 + 4 * r1]
-    movq              [r0], xm4
-    movhps            [r0 + r1], xm4
-    vextracti128      xm2, m4, 1
-    movq              [r0 + 2 * r1], xm2
-    movhps            [r0 + r3], xm2
+    ang8_store8x8                                ; macro defined outside this chunk; NOTE(review): presumably the transpose + 8-row store the removed lines did on m1/m4 - confirm
     RET
 
 INIT_YMM avx2



More information about the x265-devel mailing list