[x265] [PATCH] asm: reduced 'vpermq' instructions, improved 1040c->800c, 22%

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Fri Sep 25 06:55:04 CEST 2015


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1442311334 -19800
#      Tue Sep 15 15:32:14 2015 +0530
# Node ID df65e7f6acf475d33a6ce377167df1a5a2355950
# Parent  d890ce2af3de2f1c23f5bf07fef6471417b7f8ef
asm: reduced 'vpermq' instructions, improved 1040c->800c, 22%

diff -r d890ce2af3de -r df65e7f6acf4 source/common/x86/intrapred8_allangs.asm
--- a/source/common/x86/intrapred8_allangs.asm	Fri Sep 25 10:20:30 2015 +0530
+++ b/source/common/x86/intrapred8_allangs.asm	Tue Sep 15 15:32:14 2015 +0530
@@ -27,62 +27,63 @@
 
 SECTION_RODATA 32
 
-all_ang4_shuff: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
-                db 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 3, 4, 4, 5, 5, 6, 6, 7
-                db 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6
-                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5
-                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5
-                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4
-                db 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
-                db 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12
-                db 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 4, 0, 0, 9, 9, 10, 10, 11
-                db 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 2, 0, 0, 9, 9, 10, 10, 11, 2, 0, 0, 9, 9, 10, 10, 11
-                db 0, 9, 9, 10, 10, 11, 11, 12, 2, 0, 0, 9, 9, 10, 10, 11, 2, 0, 0, 9, 9, 10, 10, 11, 4, 2, 2, 0, 0, 9, 9, 10
-                db 0, 9, 9, 10, 10, 11, 11, 12, 2, 0, 0, 9, 9, 10, 10, 11, 2, 0, 0, 9, 9, 10, 10, 11, 3, 2, 2, 0, 0, 9, 9, 10
-                db 0, 9, 9, 10, 10, 11, 11, 12, 1, 0, 0, 9, 9, 10, 10, 11, 2, 1, 1, 0, 0, 9, 9, 10, 4, 2, 2, 1, 1, 0, 0, 9
-                db 0, 1, 2, 3, 9, 0, 1, 2, 10, 9, 0, 1, 11, 10, 9, 0, 0, 1, 2, 3, 9, 0, 1, 2, 10, 9, 0, 1, 11, 10, 9, 0
-                db 0, 1, 1, 2, 2, 3, 3, 4, 9, 0, 0, 1, 1, 2, 2, 3, 10, 9, 9, 0, 0, 1, 1, 2, 12, 10, 10, 9, 9, 0, 0, 1
-                db 0, 1, 1, 2, 2, 3, 3, 4, 10, 0, 0, 1, 1, 2, 2, 3, 10, 0, 0, 1, 1, 2, 2, 3, 11, 10, 10, 0, 0, 1, 1, 2
-                db 0, 1, 1, 2, 2, 3, 3, 4, 10, 0, 0, 1, 1, 2, 2, 3, 10, 0, 0, 1, 1, 2, 2, 3, 12, 10, 10, 0, 0, 1, 1, 2
-                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 10, 0, 0, 1, 1, 2, 2, 3, 10, 0, 0, 1, 1, 2, 2, 3
-                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 12, 0, 0, 1, 1, 2, 2, 3
-                db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4
-                db 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4
-                db 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5
-                db 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6
-                db 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 2, 3, 3, 4, 4, 5, 5, 6
-                db 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 2, 3, 3, 4, 4, 5, 5, 6, 3, 4, 4, 5, 5, 6, 6, 7
-                db 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 3, 4, 4, 5, 5, 6, 6, 7, 4, 5, 5, 6, 6, 7, 7, 8
-                db 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8
-
-all_ang4: db 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 18, 14, 18, 14, 18, 14, 18, 14, 24, 8, 24, 8, 24, 8, 24, 8
-          db 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10, 1, 31, 1, 31, 1, 31, 1, 31, 12, 20, 12, 20, 12, 20, 12, 20
-          db 15, 17, 15, 17, 15, 17, 15, 17, 30, 2, 30, 2, 30, 2, 30, 2, 13, 19, 13, 19, 13, 19, 13, 19, 28, 4, 28, 4, 28, 4, 28, 4
-          db 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20
-          db 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4
-          db 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20
-          db 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8
-          db 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24
-          db 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12
-          db 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 27, 5, 27, 5, 27, 5, 27, 5, 4, 28, 4, 28, 4, 28, 4, 28
-          db 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12
-          db 17, 15, 17, 15, 17, 15, 17, 15, 2, 30, 2, 30, 2, 30, 2, 30, 19, 13, 19, 13, 19, 13, 19, 13, 4, 28, 4, 28, 4, 28, 4, 28
-          db 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22, 31, 1, 31, 1, 31, 1, 31, 1, 20, 12, 20, 12, 20, 12, 20, 12
-          db 26, 6, 26, 6, 26, 6, 26, 6, 20, 12, 20, 12, 20, 12, 20, 12, 14, 18, 14, 18, 14, 18, 14, 18, 8, 24, 8, 24, 8, 24, 8, 24
-          db 26, 6, 26, 6, 26, 6, 26, 6, 20, 12, 20, 12, 20, 12, 20, 12, 14, 18, 14, 18, 14, 18, 14, 18, 8, 24, 8, 24, 8, 24, 8, 24
-          db 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22, 31, 1, 31, 1, 31, 1, 31, 1, 20, 12, 20, 12, 20, 12, 20, 12
-          db 17, 15, 17, 15, 17, 15, 17, 15, 2, 30, 2, 30, 2, 30, 2, 30, 19, 13, 19, 13, 19, 13, 19, 13, 4, 28, 4, 28, 4, 28, 4, 28
-          db 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12
-          db 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 27, 5, 27, 5, 27, 5, 27, 5, 4, 28, 4, 28, 4, 28, 4, 28
-          db 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12
-          db 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24
-          db 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8
-          db 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20
-          db 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4
-          db 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20
-          db 15, 17, 15, 17, 15, 17, 15, 17, 30, 2, 30, 2, 30, 2, 30, 2, 13, 19, 13, 19, 13, 19, 13, 19, 28, 4, 28, 4, 28, 4, 28, 4
-          db 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10, 1, 31, 1, 31, 1, 31, 1, 31, 12, 20, 12, 20, 12, 20, 12, 20
-          db 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 18, 14, 18, 14, 18, 14, 18, 14, 24, 8, 24, 8, 24, 8, 24, 8
+const allAng4_shuf_mode2,       db  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6,  4,  5,  6,  7,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6,  4,  5,  6,  7
+const allAng4_shuf_mode3_4,     db  0,  1,  1,  2,  2,  3,  3,  4,  1,  2,  2,  3,  3,  4,  4,  5,  0,  1,  1,  2,  2,  3,  3,  4,  1,  2,  2,  3,  3,  4,  4,  5
+                                db  2,  3,  3,  4,  4,  5,  5,  6,  3,  4,  4,  5,  5,  6,  6,  7,  1,  2,  2,  3,  3,  4,  4,  5,  2,  3,  3,  4,  4,  5,  5,  6 ; two-row (modeA_B) tables: the row above selects source bytes for output pixels 0-7 of mode A (low 16 bytes) and of mode B (high 16 bytes); this row does the same for pixels 8-15, so the in-lane packuswb in all_angs_pred_4x4 leaves mode A in the low lane and mode B in the high lane
+const allAng4_shuf_mode5_6,     db  0,  1,  1,  2,  2,  3,  3,  4,  1,  2,  2,  3,  3,  4,  4,  5,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4
+                                db  1,  2,  2,  3,  3,  4,  4,  5,  2,  3,  3,  4,  4,  5,  5,  6,  1,  2,  2,  3,  3,  4,  4,  5,  1,  2,  2,  3,  3,  4,  4,  5
+const allAng4_shuf_mode7_8,     db  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4
+                                db  0,  1,  1,  2,  2,  3,  3,  4,  1,  2,  2,  3,  3,  4,  4,  5,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4
+const allAng4_shuf_mode10,      db  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3,  0,  1,  2,  3
+const allAng4_shuf_mode11_12,   db  0,  9,  9, 10, 10, 11, 11, 12,  0,  9,  9, 10, 10, 11, 11, 12,  0,  9,  9, 10, 10, 11, 11, 12,  0,  9,  9, 10, 10, 11, 11, 12
+const allAng4_shuf_mode13_14,   db  0,  9,  9, 10, 10, 11, 11, 12,  4,  0,  0,  9,  9, 10, 10, 11,  2,  0,  0,  9,  9, 10, 10, 11,  2,  0,  0,  9,  9, 10, 10, 11
+const allAng4_shuf_mode15_16,   db  0,  9,  9, 10, 10, 11, 11, 12,  2,  0,  0,  9,  9, 10, 10, 11,  0,  9,  9, 10, 10, 11, 11, 12,  2,  0,  0,  9,  9, 10, 10, 11
+                                db  2,  0,  0,  9,  9, 10, 10, 11,  4,  2,  2,  0,  0,  9,  9, 10,  2,  0,  0,  9,  9, 10, 10, 11,  3,  2,  2,  0,  0,  9,  9, 10
+const allAng4_shuf_mode17,      db  0,  9,  9, 10, 10, 11, 11, 12,  1,  0,  0,  9,  9, 10, 10, 11,  2,  1,  1,  0,  0,  9,  9, 10,  4,  2,  2,  1,  1,  0,  0,  9
+                                db  0,  1,  2,  3,  9,  0,  1,  2, 10,  9,  0,  1, 11, 10,  9,  0,  0,  1,  2,  3,  9,  0,  1,  2, 10,  9,  0,  1, 11, 10,  9,  0 ; NOTE(review): this row is byte-identical to allAng4_shuf_mode18 below and the visible code never reads [allAng4_shuf_mode17 + mmsize] -- looks like 32 bytes of unused data; confirm before dropping
+const allAng4_shuf_mode18,      db  0,  1,  2,  3,  9,  0,  1,  2, 10,  9,  0,  1, 11, 10,  9,  0,  0,  1,  2,  3,  9,  0,  1,  2, 10,  9,  0,  1, 11, 10,  9,  0
+const allAng4_shuf_mode19_20,   db  0,  1,  1,  2,  2,  3,  3,  4,  9,  0,  0,  1,  1,  2,  2,  3,  0,  1,  1,  2,  2,  3,  3,  4, 10,  0,  0,  1,  1,  2,  2,  3
+                                db 10,  9,  9,  0,  0,  1,  1,  2, 12, 10, 10,  9,  9,  0,  0,  1, 10,  0,  0,  1,  1,  2,  2,  3, 11, 10, 10,  0,  0,  1,  1,  2
+const allAng4_shuf_mode21_22,   db  0,  1,  1,  2,  2,  3,  3,  4, 10,  0,  0,  1,  1,  2,  2,  3,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4
+                                db 10,  0,  0,  1,  1,  2,  2,  3, 12, 10, 10,  0,  0,  1,  1,  2, 10,  0,  0,  1,  1,  2,  2,  3, 10,  0,  0,  1,  1,  2,  2,  3
+const allAng4_shuf_mode23_24,   db  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4
+                                db  0,  1,  1,  2,  2,  3,  3,  4, 12,  0,  0,  1,  1,  2,  2,  3,  0,  1,  1,  2,  2,  3,  3,  4,  0,  1,  1,  2,  2,  3,  3,  4
+const allAng4_shuf_mode26,      db  1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4,  1,  2,  3,  4
+const allAng4_shuf_mode27_28,   db  1,  2,  2,  3,  3,  4,  4,  5,  1,  2,  2,  3,  3,  4,  4,  5,  1,  2,  2,  3,  3,  4,  4,  5,  1,  2,  2,  3,  3,  4,  4,  5
+const allAng4_shuf_mode29_30,   db  1,  2,  2,  3,  3,  4,  4,  5,  2,  3,  3,  4,  4,  5,  5,  6,  2,  3,  3,  4,  4,  5,  5,  6,  2,  3,  3,  4,  4,  5,  5,  6
+const allAng4_shuf_mode31_32,   db  1,  2,  2,  3,  3,  4,  4,  5,  2,  3,  3,  4,  4,  5,  5,  6,  1,  2,  2,  3,  3,  4,  4,  5,  2,  3,  3,  4,  4,  5,  5,  6
+                                db  2,  3,  3,  4,  4,  5,  5,  6,  3,  4,  4,  5,  5,  6,  6,  7,  2,  3,  3,  4,  4,  5,  5,  6,  3,  4,  4,  5,  5,  6,  6,  7
+const allAng4_shuf_mode33,      db  1,  2,  2,  3,  3,  4,  4,  5,  2,  3,  3,  4,  4,  5,  5,  6,  3,  4,  4,  5,  5,  6,  6,  7,  4,  5,  5,  6,  6,  7,  7,  8
+const allAng4_shuf_mode34,      db  2,  3,  4,  5,  3,  4,  5,  6,  4,  5,  6,  7,  5,  6,  7,  8,  2,  3,  4,  5,  3,  4,  5,  6,  4,  5,  6,  7,  5,  6,  7,  8
+
+const allAng4_fact_mode3_4,     db  6, 26,  6, 26,  6, 26,  6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10
+                                db 18, 14, 18, 14, 18, 14, 18, 14, 24,  8, 24,  8, 24,  8, 24,  8,  1, 31,  1, 31,  1, 31,  1, 31, 12, 20, 12, 20, 12, 20, 12, 20 ; byte weight pairs consumed by pmaddubsw, laid out like the matching allAng4_shuf_* table (low 16 bytes: first mode, high 16 bytes: second mode; row above = pixels 0-7, this row = pixels 8-15); every pair in these tables sums to 32
+const allAng4_fact_mode5_6,     db 15, 17, 15, 17, 15, 17, 15, 17, 30,  2, 30,  2, 30,  2, 30,  2, 19, 13, 19, 13, 19, 13, 19, 13,  6, 26,  6, 26,  6, 26,  6, 26
+                                db 13, 19, 13, 19, 13, 19, 13, 19, 28,  4, 28,  4, 28,  4, 28,  4, 25,  7, 25,  7, 25,  7, 25,  7, 12, 20, 12, 20, 12, 20, 12, 20
+const allAng4_fact_mode7_8,     db 23,  9, 23,  9, 23,  9, 23,  9, 14, 18, 14, 18, 14, 18, 14, 18, 27,  5, 27,  5, 27,  5, 27,  5, 22, 10, 22, 10, 22, 10, 22, 10
+                                db  5, 27,  5, 27,  5, 27,  5, 27, 28,  4, 28,  4, 28,  4, 28,  4, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20
+const allAng4_fact_mode9,       db 30,  2, 30,  2, 30,  2, 30,  2, 28,  4, 28,  4, 28,  4, 28,  4, 26,  6, 26,  6, 26,  6, 26,  6, 24,  8, 24,  8, 24,  8, 24,  8
+const allAng4_fact_mode11_12,   db  2, 30,  2, 30,  2, 30,  2, 30,  4, 28,  4, 28,  4, 28,  4, 28,  5, 27,  5, 27,  5, 27,  5, 27, 10, 22, 10, 22, 10, 22, 10, 22
+                                db  6, 26,  6, 26,  6, 26,  6, 26,  8, 24,  8, 24,  8, 24,  8, 24, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12
+const allAng4_fact_mode13_14,   db  9, 23,  9, 23,  9, 23,  9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 13, 19, 13, 19, 13, 19, 13, 19, 26,  6, 26,  6, 26,  6, 26,  6
+                                db 27,  5, 27,  5, 27,  5, 27,  5,  4, 28,  4, 28,  4, 28,  4, 28,  7, 25,  7, 25,  7, 25,  7, 25, 20, 12, 20, 12, 20, 12, 20, 12
+const allAng4_fact_mode15_16,   db 17, 15, 17, 15, 17, 15, 17, 15,  2, 30,  2, 30,  2, 30,  2, 30, 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22
+                                db 19, 13, 19, 13, 19, 13, 19, 13,  4, 28,  4, 28,  4, 28,  4, 28, 31,  1, 31,  1, 31,  1, 31,  1, 20, 12, 20, 12, 20, 12, 20, 12
+const allAng4_fact_mode17,      db 26,  6, 26,  6, 26,  6, 26,  6, 20, 12, 20, 12, 20, 12, 20, 12, 14, 18, 14, 18, 14, 18, 14, 18,  8, 24,  8, 24,  8, 24,  8, 24
+const allAng4_fact_mode19_20,   db 26,  6, 26,  6, 26,  6, 26,  6, 20, 12, 20, 12, 20, 12, 20, 12, 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22
+                                db 14, 18, 14, 18, 14, 18, 14, 18,  8, 24,  8, 24,  8, 24,  8, 24, 31,  1, 31,  1, 31,  1, 31,  1, 20, 12, 20, 12, 20, 12, 20, 12
+const allAng4_fact_mode21_22,   db 17, 15, 17, 15, 17, 15, 17, 15,  2, 30,  2, 30,  2, 30,  2, 30, 13, 19, 13, 19, 13, 19, 13, 19, 26,  6, 26,  6, 26,  6, 26,  6
+                                db 19, 13, 19, 13, 19, 13, 19, 13,  4, 28,  4, 28,  4, 28,  4, 28,  7, 25,  7, 25,  7, 25,  7, 25, 20, 12, 20, 12, 20, 12, 20, 12
+const allAng4_fact_mode23_24,   db  9, 23,  9, 23,  9, 23,  9, 23, 18, 14, 18, 14, 18, 14, 18, 14,  5, 27,  5, 27,  5, 27,  5, 27, 10, 22, 10, 22, 10, 22, 10, 22
+                                db 27,  5, 27,  5, 27,  5, 27,  5,  4, 28,  4, 28,  4, 28,  4, 28, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12
+const allAng4_fact_mode25,      db  2, 30,  2, 30,  2, 30,  2, 30,  4, 28,  4, 28,  4, 28,  4, 28,  6, 26,  6, 26,  6, 26,  6, 26,  8, 24,  8, 24,  8, 24,  8, 24
+const allAng4_fact_mode27_28,   db 30,  2, 30,  2, 30,  2, 30,  2, 28,  4, 28,  4, 28,  4, 28,  4, 27,  5, 27,  5, 27,  5, 27,  5, 22, 10, 22, 10, 22, 10, 22, 10
+                                db 26,  6, 26,  6, 26,  6, 26,  6, 24,  8, 24,  8, 24,  8, 24,  8, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20
+const allAng4_fact_mode29_30,   db 23,  9, 23,  9, 23,  9, 23,  9, 14, 18, 14, 18, 14, 18, 14, 18, 19, 13, 19, 13, 19, 13, 19, 13,  6, 26,  6, 26,  6, 26,  6, 26
+                                db  5, 27,  5, 27,  5, 27,  5, 27, 28,  4, 28,  4, 28,  4, 28,  4, 25,  7, 25,  7, 25,  7, 25,  7, 12, 20, 12, 20, 12, 20, 12, 20
+const allAng4_fact_mode31_32,   db 15, 17, 15, 17, 15, 17, 15, 17, 30,  2, 30,  2, 30,  2, 30,  2, 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10
+                                db 13, 19, 13, 19, 13, 19, 13, 19, 28,  4, 28,  4, 28,  4, 28,  4,  1, 31,  1, 31,  1, 31,  1, 31, 12, 20, 12, 20, 12, 20, 12, 20
+const allAng4_fact_mode33,      db  6, 26,  6, 26,  6, 26,  6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 18, 14, 18, 14, 18, 14, 18, 14, 24,  8, 24,  8, 24,  8, 24,  8
 
 
 SECTION .text
@@ -23075,80 +23076,69 @@
 ; void all_angs_pred_4x4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
 ;-----------------------------------------------------------------------------
 INIT_YMM avx2
-cglobal all_angs_pred_4x4, 4, 4, 6
+cglobal all_angs_pred_4x4, 2, 2, 6                ; r2/r3 are no longer loaded as table pointers; the visible code only touches r0 (dest) and r1 (refPix)
 
     mova           m5, [pw_1024]
-    lea            r2, [all_ang4]
-    lea            r3, [all_ang4_shuff]
 
 ; mode 2
 
     vbroadcasti128 m0, [r1 + 9]
-    mova           xm1, xm0
-    psrldq         xm1, 1
-    pshufb         xm1, [r3]
+    pshufb         m1, m0, [allAng4_shuf_mode2]    ; the removed one-byte psrldq is folded into the table (indices start at 1 instead of 0)
     movu           [r0], xm1
 
 ; mode 3
 
-    pshufb         m1, m0, [r3 + 1 * mmsize]
-    pmaddubsw      m1, [r2]
+    pshufb         m1, m0, [allAng4_shuf_mode3_4]  ; low lane: byte pairs for mode 3 pixels 0-7, high lane: same for mode 4
+    pmaddubsw      m1, [allAng4_fact_mode3_4]      ; weighted pair sums; the following pmulhrsw by m5 (pw_1024, presumably words of 1024, i.e. a rounded >> 5 -- defined outside this patch) normalises them
     pmulhrsw       m1, m5
 
 ; mode 4
 
-    pshufb         m2, m0, [r3 + 2 * mmsize]
-    pmaddubsw      m2, [r2 + 1 * mmsize]
+    pshufb         m2, m0, [allAng4_shuf_mode3_4 + mmsize] ; second table row: pixels 8-15 of mode 3 (low lane) and mode 4 (high lane)
+    pmaddubsw      m2, [allAng4_fact_mode3_4 + mmsize]     ; packuswb is in-lane, so the pack below already yields [mode 3 | mode 4] and the old vpermq is unnecessary
     pmulhrsw       m2, m5
     packuswb       m1, m2
-    vpermq         m1, m1, 11011000b
     movu           [r0 + (3 - 2) * 16], m1
 
 ; mode 5
 
-    pshufb         m1, m0, [r3 + 2 * mmsize]
-    pmaddubsw      m1, [r2 + 2 * mmsize]
+    pshufb         m1, m0, [allAng4_shuf_mode5_6]  ; same interleaved scheme as modes 3/4: pixels 0-7 of mode 5 | mode 6
+    pmaddubsw      m1, [allAng4_fact_mode5_6]
     pmulhrsw       m1, m5
 
 ; mode 6
 
-    pshufb         m2, m0, [r3 + 3 * mmsize]
-    pmaddubsw      m2, [r2 + 3 * mmsize]
+    pshufb         m2, m0, [allAng4_shuf_mode5_6 + mmsize] ; pixels 8-15 of mode 5 | mode 6
+    pmaddubsw      m2, [allAng4_fact_mode5_6 + mmsize]
     pmulhrsw       m2, m5
     packuswb       m1, m2
-    vpermq         m1, m1, 11011000b
     movu           [r0 + (5 - 2) * 16], m1
 
-    add            r3, 4 * mmsize
-    add            r2, 4 * mmsize
-
 ; mode 7
 
-    pshufb         m1, m0, [r3 + 0 * mmsize]
-    pmaddubsw      m1, [r2 + 0 * mmsize]
+    pshufb         m3, m0, [allAng4_shuf_mode7_8]  ; shuffle result kept in m3 because mode 9 below reuses it
+    pmaddubsw      m1, m3, [allAng4_fact_mode7_8]  ; 3-operand form leaves m3 intact
     pmulhrsw       m1, m5
 
 ; mode 8
 
-    pshufb         m2, m0, [r3 + 1 * mmsize]
-    pmaddubsw      m2, [r2 + 1 * mmsize]
+    pshufb         m2, m0, [allAng4_shuf_mode7_8 + mmsize] ; pixels 8-15 of mode 7 | mode 8
+    pmaddubsw      m2, [allAng4_fact_mode7_8 + mmsize]
     pmulhrsw       m2, m5
     packuswb       m1, m2
-    vpermq         m1, m1, 11011000b
     movu           [r0 + (7 - 2) * 16], m1
 
 ; mode 9
 
-    pshufb         m1, m0, [r3 + 1 * mmsize]
-    pmaddubsw      m1, [r2 + 2 * mmsize]
-    pmulhrsw       m1, m5
-    packuswb       m1, m1
-    vpermq         m1, m1, 11011000b
-    movu           [r0 + (9 - 2) * 16], xm1
+    pmaddubsw      m3, [allAng4_fact_mode9]        ; m3 still holds the mode 7 shuffle (every 8-byte group is 0,1,1,2,2,3,3,4), which is the pattern the removed mode 9 pshufb produced
+    pmulhrsw       m3, m5
+    packuswb       m3, m3                          ; each lane now holds its 8 result bytes twice
+    vpermq         m3, m3, 11011000b               ; qword order 0,2,1,3: brings both lanes' results together in the low 128 bits
+    movu           [r0 + (9 - 2) * 16], xm3        ; a single mode cannot use the paired layout, so this vpermq has to stay
 
 ; mode 10
 
-    pshufb         xm1, xm0, [r3 + 2 * mmsize]
+    pshufb         xm1, xm0, [allAng4_shuf_mode10] ; pure byte replication, no weighting
     movu           [r0 + (10 - 2) * 16], xm1
 
     pxor           xm1, xm1
@@ -23173,135 +23163,111 @@
 ; mode 11
 
     vbroadcasti128 m0, [r1]
-    pshufb         m1, m0, [r3 + 3 * mmsize]
-    pmaddubsw      m1, [r2 + 3 * mmsize]
+    pshufb         m3, m0, [allAng4_shuf_mode11_12] ; one shuffle (all four 8-byte groups identical) feeds modes 11 and 12 entirely and pixels 0-7 of modes 13/14
+    pmaddubsw      m1, m3, [allAng4_fact_mode11_12] ; pixels 0-7 of mode 11 (low lane) | mode 12 (high lane); m3 preserved
     pmulhrsw       m1, m5
 
 ; mode 12
 
-    add            r2, 4 * mmsize
-
-    pshufb         m2, m0, [r3 + 3 * mmsize]
-    pmaddubsw      m2, [r2 + 0 * mmsize]
+    pmaddubsw      m2, m3, [allAng4_fact_mode11_12 + mmsize] ; pixels 8-15 of mode 11 | mode 12, same shuffled source
     pmulhrsw       m2, m5
     packuswb       m1, m2
-    vpermq         m1, m1, 11011000b
     movu           [r0 + (11 - 2) * 16], m1
 
 ; mode 13
 
-    add            r3, 4 * mmsize
-
-    pshufb         m1, m0, [r3 + 0 * mmsize]
-    pmaddubsw      m1, [r2 + 1 * mmsize]
+    pmaddubsw      m3, [allAng4_fact_mode13_14]    ; last use of the mode11_12 shuffle; overwrites m3 with pixels 0-7 of mode 13 | mode 14
+    pmulhrsw       m3, m5
+
+; mode 14
+
+    pshufb         m2, m0, [allAng4_shuf_mode13_14] ; single-row table: only pixels 8-15 of modes 13/14 need their own shuffle
+    pmaddubsw      m2, [allAng4_fact_mode13_14 + mmsize]
+    pmulhrsw       m2, m5
+    packuswb       m3, m2
+    movu           [r0 + (13 - 2) * 16], m3
+
+; mode 15
+
+    pshufb         m1, m0, [allAng4_shuf_mode15_16] ; pixels 0-7 of mode 15 | mode 16
+    pmaddubsw      m1, [allAng4_fact_mode15_16]
     pmulhrsw       m1, m5
 
-; mode 14
-
-    pshufb         m2, m0, [r3 + 1 * mmsize]
-    pmaddubsw      m2, [r2 + 2 * mmsize]
+; mode 16
+
+    pshufb         m2, m0, [allAng4_shuf_mode15_16 + mmsize] ; pixels 8-15 of mode 15 | mode 16
+    pmaddubsw      m2, [allAng4_fact_mode15_16 + mmsize]
     pmulhrsw       m2, m5
     packuswb       m1, m2
-    vpermq         m1, m1, 11011000b
-    movu           [r0 + (13 - 2) * 16], m1
-
-; mode 15
-
-    pshufb         m1, m0, [r3 + 2 * mmsize]
-    pmaddubsw      m1, [r2 + 3 * mmsize]
-    pmulhrsw       m1, m5
-
-; mode 16
-
-    add            r2, 4 * mmsize
-
-    pshufb         m2, m0, [r3 + 3 * mmsize]
-    pmaddubsw      m2, [r2 + 0 * mmsize]
-    pmulhrsw       m2, m5
-    packuswb       m1, m2
-    vpermq         m1, m1, 11011000b
     movu           [r0 + (15 - 2) * 16], m1
 
 ; mode 17
 
-    add            r3, 4 * mmsize
-
-    pshufb         m1, m0, [r3 + 0 * mmsize]
-    pmaddubsw      m1, [r2 + 1 * mmsize]
+    pshufb         m1, m0, [allAng4_shuf_mode17]   ; unpaired mode: all 16 pixels in one register; only the first 32-byte row of this table is read
+    pmaddubsw      m1, [allAng4_fact_mode17]
     pmulhrsw       m1, m5
     packuswb       m1, m1
     vpermq         m1, m1, 11011000b
 
 ; mode 18
 
-    pshufb         m2, m0, [r3 + 1 * mmsize]
+    pshufb         m2, m0, [allAng4_shuf_mode18]   ; pure byte shuffle; its low 128 bits become the high half of the mode 17 store
     vinserti128    m1, m1, xm2, 1
     movu           [r0 + (17 - 2) * 16], m1
 
 ; mode 19
 
-    pshufb         m1, m0, [r3 + 2 * mmsize]
-    pmaddubsw      m1, [r2 + 2 * mmsize]
+    pshufb         m1, m0, [allAng4_shuf_mode19_20] ; pixels 0-7 of mode 19 | mode 20
+    pmaddubsw      m1, [allAng4_fact_mode19_20]
     pmulhrsw       m1, m5
 
 ; mode 20
 
-    pshufb         m2, m0, [r3 + 3 * mmsize]
-    pmaddubsw      m2, [r2 + 3 * mmsize]
+    pshufb         m2, m0, [allAng4_shuf_mode19_20 + mmsize] ; pixels 8-15 of mode 19 | mode 20
+    pmaddubsw      m2, [allAng4_fact_mode19_20 + mmsize]
     pmulhrsw       m2, m5
     packuswb       m1, m2
-    vpermq         m1, m1, 11011000b
     movu           [r0 + (19 - 2) * 16], m1
 
 ; mode 21
 
-    add            r2, 4 * mmsize
-    add            r3, 4 * mmsize
-
-    pshufb         m1, m0, [r3 + 0 * mmsize]
-    pmaddubsw      m1, [r2 + 0 * mmsize]
+    pshufb         m1, m0, [allAng4_shuf_mode21_22] ; pixels 0-7 of mode 21 | mode 22
+    pmaddubsw      m1, [allAng4_fact_mode21_22]
     pmulhrsw       m1, m5
 
 ; mode 22
 
-    pshufb         m2, m0, [r3 + 1 * mmsize]
-    pmaddubsw      m2, [r2 + 1 * mmsize]
+    pshufb         m2, m0, [allAng4_shuf_mode21_22 + mmsize] ; pixels 8-15 of mode 21 | mode 22
+    pmaddubsw      m2, [allAng4_fact_mode21_22 + mmsize]
     pmulhrsw       m2, m5
     packuswb       m1, m2
-    vpermq         m1, m1, 11011000b
     movu           [r0 + (21 - 2) * 16], m1
 
 ; mode 23
 
-    pshufb         m1, m0, [r3 + 2 * mmsize]
-    pmaddubsw      m1, [r2 + 2 * mmsize]
+    pshufb         m3, m0, [allAng4_shuf_mode23_24] ; kept in m3: mode 25 below reuses this shuffle (all 8-byte groups are 0,1,1,2,2,3,3,4)
+    pmaddubsw      m1, m3, [allAng4_fact_mode23_24] ; 3-operand form leaves m3 intact
     pmulhrsw       m1, m5
 
 ; mode 24
 
-    pshufb         m2, m0, [r3 + 3 * mmsize]
-    pmaddubsw      m2, [r2 + 3 * mmsize]
+    pshufb         m2, m0, [allAng4_shuf_mode23_24 + mmsize] ; pixels 8-15 of mode 23 | mode 24
+    pmaddubsw      m2, [allAng4_fact_mode23_24 + mmsize]
     pmulhrsw       m2, m5
     packuswb       m1, m2
-    vpermq         m1, m1, 11011000b
     movu           [r0 + (23 - 2) * 16], m1
 
 ; mode 25
 
-    add            r2, 4 * mmsize
-
-    pshufb         m1, m0, [r3 + 3 * mmsize]
-    pmaddubsw      m1, [r2 + 0 * mmsize]
-    pmulhrsw       m1, m5
-    packuswb       m1, m1
-    vpermq         m1, m1, 11011000b
-    movu           [r0 + (25 - 2) * 16], xm1
+    pmaddubsw      m3, [allAng4_fact_mode25]       ; reuses the mode 23 shuffle held in m3 instead of a second pshufb
+    pmulhrsw       m3, m5
+    packuswb       m3, m3
+    vpermq         m3, m3, 11011000b               ; qword order 0,2,1,3: collects both lanes' results into the low 128 bits
+    movu           [r0 + (25 - 2) * 16], xm3
 
 ; mode 26
 
-    add            r3, 4 * mmsize
-
-    pshufb         xm1, xm0, [r3 + 0 * mmsize]
+    pshufb         m1, m0, [allAng4_shuf_mode26]   ; NOTE(review): ymm-wide shuffle although only xm1 is stored below; mode 10 uses the xmm form for the same kind of operation
     movu           [r0 + (26 - 2) * 16], xm1
 
     pxor           xm1, xm1
@@ -23326,64 +23292,55 @@
 
 ; mode 27
 
-    pshufb        m1, m0, [r3 + 1 * mmsize]
-    pmaddubsw     m1, [r2 + 1 * mmsize]
+    pshufb        m3, m0, [allAng4_shuf_mode27_28] ; one shuffle (all 8-byte groups are 1,2,2,3,3,4,4,5) feeds modes 27 and 28 entirely and pixels 0-7 of modes 29/30
+    pmaddubsw     m1, m3, [allAng4_fact_mode27_28] ; pixels 0-7 of mode 27 (low lane) | mode 28 (high lane); m3 preserved
     pmulhrsw      m1, m5
 
 ; mode 28
 
-    pshufb        m2, m0, [r3 + 1 * mmsize]
-    pmaddubsw     m2, [r2 + 2 * mmsize]
+    pmaddubsw     m2, m3, [allAng4_fact_mode27_28 + mmsize] ; pixels 8-15 of mode 27 | mode 28, same shuffled source
     pmulhrsw      m2, m5
     packuswb      m1, m2
-    vpermq        m1, m1, 11011000b
     movu          [r0 + (27 - 2) * 16], m1
 
 ; mode 29
 
-    pshufb        m1, m0, [r3 + 2 * mmsize]
-    pmaddubsw     m1, [r2 + 3 * mmsize]
+    pmaddubsw     m3, [allAng4_fact_mode29_30]     ; last use of the mode27_28 shuffle; overwrites m3 with pixels 0-7 of mode 29 | mode 30
+    pmulhrsw      m3, m5
+
+; mode 30
+
+    pshufb        m2, m0, [allAng4_shuf_mode29_30] ; single-row table: only pixels 8-15 of modes 29/30 need their own shuffle
+    pmaddubsw     m2, [allAng4_fact_mode29_30 + mmsize]
+    pmulhrsw      m2, m5
+    packuswb      m3, m2
+    movu          [r0 + (29 - 2) * 16], m3
+
+; mode 31
+
+    pshufb        m1, m0, [allAng4_shuf_mode31_32] ; pixels 0-7 of mode 31 | mode 32
+    pmaddubsw     m1, [allAng4_fact_mode31_32]
     pmulhrsw      m1, m5
 
-; mode 30
-
-    add           r2, 4 * mmsize
-
-    pshufb        m2, m0, [r3 + 3 * mmsize]
-    pmaddubsw     m2, [r2 + 0 * mmsize]
+; mode 32
+
+    pshufb        m2, m0, [allAng4_shuf_mode31_32 + mmsize] ; pixels 8-15 of mode 31 | mode 32
+    pmaddubsw     m2, [allAng4_fact_mode31_32 + mmsize]
     pmulhrsw      m2, m5
     packuswb      m1, m2
-    vpermq        m1, m1, 11011000b
-    movu          [r0 + (29 - 2) * 16], m1
-
-; mode 31
-
-    add           r3, 4 * mmsize
-
-    pshufb        m1, m0, [r3 + 0 * mmsize]
-    pmaddubsw     m1, [r2 + 1 * mmsize]
-    pmulhrsw      m1, m5
-
-; mode 32
-
-    pshufb        m2, m0, [r3 + 0 * mmsize]
-    pmaddubsw     m2, [r2 + 2 * mmsize]
-    pmulhrsw      m2, m5
-    packuswb      m1, m2
-    vpermq        m1, m1, 11011000b
     movu          [r0 + (31 - 2) * 16], m1
 
 ; mode 33
 
-    pshufb        m1, m0, [r3 + 1 * mmsize]
-    pmaddubsw     m1, [r2 + 3 * mmsize]
+    pshufb        m1, m0, [allAng4_shuf_mode33]    ; unpaired mode: all 16 pixels of mode 33 in one register
+    pmaddubsw     m1, [allAng4_fact_mode33]        ; NOTE(review): the unchanged packuswb below packs with m2, which still holds mode 32 data; harmless because vpermq moves those bytes to the high lane and vinserti128 then overwrites it, but mode 17 uses "packuswb m1, m1" for the same step
     pmulhrsw      m1, m5
     packuswb      m1, m2
     vpermq        m1, m1, 11011000b
 
 ; mode 34
 
-    pshufb        m0, [r3 + 2 * mmsize]
+    pshufb        m0, [allAng4_shuf_mode34]        ; pure byte shuffle; m0 may be clobbered here because this is its last use before RET
     vinserti128   m1, m1, xm0, 1
     movu          [r0 + (33 - 2) * 16], m1
     RET


More information about the x265-devel mailing list