[x265] [PATCH] all_angs_pred_32x32: asm code improvement (replace each pinsrb pair with a single pinsrw)

praveen at multicorewareinc.com praveen at multicorewareinc.com
Thu Feb 27 11:56:06 CET 2014


# HG changeset patch
# User Praveen Tiwari
# Date 1393498551 -19800
# Node ID d3e3baaf80b490f330d2171e454ad5b7856acaa7
# Parent  7de2875c614058648475618d2b9faa5a9611225b
all_angs_pred_32x32: asm code improvement (replace each pinsrb pair with a single pinsrw)

diff -r 7de2875c6140 -r d3e3baaf80b4 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm	Wed Feb 26 17:58:24 2014 +0530
+++ b/source/common/x86/intrapred8.asm	Thu Feb 27 16:25:51 2014 +0530
@@ -23689,8 +23689,6 @@
 pmaddubsw     m3,    m1,          m6
 pmulhrsw      m3,    m7
 pslldq        m4,    2
-;pinsrb        m4,    [r4 + 4],    1
-;pinsrb        m4,    [r4 + 3],    0
 pinsrw        m4,     [r4 + 3],   0
 pmaddubsw     m5,    m4,          m6
 pmulhrsw      m5,    m7
@@ -24461,13 +24459,11 @@
 packuswb      m4,    m5
 movu          [r0 +  1120 * 16],    m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 4],      1
-pinsrb        m1,    [r3 + 3],      0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 12],     1
-pinsrb        m3,    [r3 + 11],     0
+pinsrw        m1,    [r3 + 3],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 11],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -24488,13 +24484,11 @@
 packuswb      m4,    m5
 movu          [r0 +  1122 * 16],    m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 3],      1
-pinsrb        m1,    [r3 + 2],      0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 11],     1
-pinsrb        m3,    [r3 + 10],     0
+pinsrw        m1,    [r3 + 2],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 10],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -24515,13 +24509,11 @@
 packuswb      m4,    m5
 movu          [r0 +  1124 * 16],    m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 2],      1
-pinsrb        m1,    [r3 + 1],      0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 10],     1
-pinsrb        m3,    [r3 +  9],     0
+pinsrw        m1,    [r3 + 1],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 +  9],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -24542,13 +24534,11 @@
 packuswb      m4,    m5
 movu          [r0 +  1126 * 16],    m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 1],      1
-pinsrb        m1,    [r3 + 0],      0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 9],      1
-pinsrb        m3,    [r3 + 8],      0
+pinsrw        m1,    [r3 + 0],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 8],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -24616,8 +24606,7 @@
 pmaddubsw     m4,    m1,           m6
 pmulhrsw      m4,    m7
 pslldq        m3,    2
-pinsrb        m3,    [r3 + 7],     1
-pinsrb        m3,    [r3 + 6],     0
+pinsrw        m3,    [r3 + 6],      0
 pmaddubsw     m5,    m3,           m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -24643,8 +24632,7 @@
 pmaddubsw     m4,    m1,           m6
 pmulhrsw      m4,    m7
 pslldq        m3,    2
-pinsrb        m3,    [r3 + 6],     1
-pinsrb        m3,    [r3 + 5],     0
+pinsrw        m3,    [r3 + 5],      0
 pmaddubsw     m5,    m3,           m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -24670,8 +24658,7 @@
 pmaddubsw     m4,    m1,           m6
 pmulhrsw      m4,    m7
 pslldq        m3,    2
-pinsrb        m3,    [r3 + 5],     1
-pinsrb        m3,    [r3 + 4],     0
+pinsrw        m3,    [r3 + 4],      0
 pmaddubsw     m5,    m3,           m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -24697,8 +24684,7 @@
 pmaddubsw     m4,    m1,           m6
 pmulhrsw      m4,    m7
 pslldq        m3,    2
-pinsrb        m3,    [r3 + 4],     1
-pinsrb        m3,    [r3 + 3],     0
+pinsrw        m3,    [r3 + 3],      0
 pmaddubsw     m5,    m3,           m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -24739,8 +24725,7 @@
 pmaddubsw     m4,    m1,           m6
 pmulhrsw      m4,    m7
 pslldq        m3,    2
-pinsrb        m3,    [r3 + 3],     1
-pinsrb        m3,    [r3 + 2],     0
+pinsrw        m3,    [r3 + 2],      0
 pmaddubsw     m5,    m3,           m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -24766,8 +24751,7 @@
 pmaddubsw     m4,    m1,           m6
 pmulhrsw      m4,    m7
 pslldq        m3,    2
-pinsrb        m3,    [r3 + 2],     1
-pinsrb        m3,    [r3 + 1],     0
+pinsrw        m3,    [r3 + 1],      0
 pmaddubsw     m5,    m3,           m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -24793,8 +24777,7 @@
 pmaddubsw     m4,    m1,           m6
 pmulhrsw      m4,    m7
 pslldq        m3,    2
-pinsrb        m3,    [r3 + 1],     1
-pinsrb        m3,    [r3 + 0],     0
+pinsrw        m3,    [r3 + 0],      0
 pmaddubsw     m5,    m3,           m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -24873,20 +24856,17 @@
 pmaddubsw     m4,    m0,            m6
 pmulhrsw      m4,    m7
 pslldq        m2,    2
-pinsrb        m2,    [r3 + 8],      1
-pinsrb        m2,    [r3 + 7],      0
+pinsrw        m2,    [r3 + 7],      0
 pmaddubsw     m5,    m2,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
 movu          [r0 + 1154 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 16],     1
-pinsrb        m1,    [r3 + 15],     0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 24],     1
-pinsrb        m3,    [r3 + 23],     0
+pinsrw        m1,    [r3 + 15],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 23],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -24915,20 +24895,17 @@
 pmaddubsw     m4,    m0,            m6
 pmulhrsw      m4,    m7
 pslldq        m2,    2
-pinsrb        m2,    [r3 + 7],      1
-pinsrb        m2,    [r3 + 6],      0
+pinsrw        m2,    [r3 + 6],      0
 pmaddubsw     m5,    m2,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
 movu          [r0 + 1158 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 15],     1
-pinsrb        m1,    [r3 + 14],     0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 23],     1
-pinsrb        m3,    [r3 + 22],     0
+pinsrw        m1,    [r3 + 14],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 22],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -24942,20 +24919,17 @@
 pmaddubsw     m4,    m0,            m6
 pmulhrsw      m4,    m7
 pslldq        m2,    2
-pinsrb        m2,    [r3 + 6],      1
-pinsrb        m2,    [r3 + 5],      0
+pinsrw        m2,    [r3 + 5],      0
 pmaddubsw     m5,    m2,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
 movu          [r0 + 1160 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 14],     1
-pinsrb        m1,    [r3 + 13],     0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 22],     1
-pinsrb        m3,    [r3 + 21],     0
+pinsrw        m1,    [r3 + 13],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 21],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -24984,20 +24958,17 @@
 pmaddubsw     m4,    m0,            m6
 pmulhrsw      m4,    m7
 pslldq        m2,    2
-pinsrb        m2,    [r3 + 5],      1
-pinsrb        m2,    [r3 + 4],      0
+pinsrw        m2,    [r3 + 4],      0
 pmaddubsw     m5,    m2,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
 movu          [r0 + 1164 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 13],     1
-pinsrb        m1,    [r3 + 12],     0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 21],     1
-pinsrb        m3,    [r3 + 20],     0
+pinsrw        m1,    [r3 + 12],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 20],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -25011,20 +24982,17 @@
 pmaddubsw     m4,    m0,            m6
 pmulhrsw      m4,    m7
 pslldq        m2,    2
-pinsrb        m2,    [r3 + 4],      1
-pinsrb        m2,    [r3 + 3],      0
+pinsrw        m2,    [r3 + 3],      0
 pmaddubsw     m5,    m2,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
 movu          [r0 + 1166 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 12],     1
-pinsrb        m1,    [r3 + 11],     0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 20],     1
-pinsrb        m3,    [r3 + 19],     0
+pinsrw        m1,    [r3 + 11],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 19],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -25060,13 +25028,11 @@
 packuswb      m4,    m5
 movu          [r0 + 1170 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 11],     1
-pinsrb        m1,    [r3 + 10],     0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 19],     1
-pinsrb        m3,    [r3 + 18],     0
+pinsrw        m1,    [r3 + 10],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 18],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -25080,20 +25046,17 @@
 pmaddubsw     m4,    m0,            m6
 pmulhrsw      m4,    m7
 pslldq        m2,    2
-pinsrb        m2,    [r3 + 2],      1
-pinsrb        m2,    [r3 + 1],      0
+pinsrw        m2,    [r3 + 1],      0
 pmaddubsw     m5,    m2,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
 movu          [r0 + 1172 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 10],     1
-pinsrb        m1,    [r3 + 9],     0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 18],     1
-pinsrb        m3,    [r3 + 17],     0
+pinsrw        m1,    [r3 + 9],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 17],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -25129,13 +25092,11 @@
 packuswb      m4,    m5
 movu          [r0 + 1176 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 9],      1
-pinsrb        m1,    [r3 + 8],      0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 17],     1
-pinsrb        m3,    [r3 + 16],     0
+pinsrw        m1,    [r3 + 8],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 16],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -25156,13 +25117,11 @@
 packuswb      m4,    m5
 movu          [r0 + 1178 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 8],      1
-pinsrb        m1,    [r3 + 7],      0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 16],     1
-pinsrb        m3,    [r3 + 15],     0
+pinsrw        m1,    [r3 + 7],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 15],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -25198,13 +25157,11 @@
 packuswb      m4,    m5
 movu          [r0 + 1182 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 7],      1
-pinsrb        m1,    [r3 + 6],      0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 15],     1
-pinsrb        m3,    [r3 + 14],     0
+pinsrw        m1,    [r3 + 6],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 14],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -25225,13 +25182,11 @@
 packuswb      m4,    m5
 movu          [r0 + 1184 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 6],      1
-pinsrb        m1,    [r3 + 5],      0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 14],     1
-pinsrb        m3,    [r3 + 13],     0
+pinsrw        m1,    [r3 + 5],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 13],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -25267,13 +25222,11 @@
 packuswb      m4,    m5
 movu          [r0 + 1188 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 5],      1
-pinsrb        m1,    [r3 + 4],      0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 13],     1
-pinsrb        m3,    [r3 + 12],     0
+pinsrw        m1,    [r3 + 4],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 12],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -25294,13 +25247,11 @@
 packuswb      m4,    m5
 movu          [r0 + 1190 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 4],      1
-pinsrb        m1,    [r3 + 3],      0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 12],     1
-pinsrb        m3,    [r3 + 11],     0
+pinsrw        m1,    [r3 + 3],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 11],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -25336,13 +25287,11 @@
 packuswb      m4,    m5
 movu          [r0 + 1194 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 3],      1
-pinsrb        m1,    [r3 + 2],      0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 11],     1
-pinsrb        m3,    [r3 + 10],     0
+pinsrw        m1,    [r3 + 2],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 10],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -25363,13 +25312,11 @@
 packuswb      m4,    m5
 movu          [r0 + 1196 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 2],      1
-pinsrb        m1,    [r3 + 1],      0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 10],     1
-pinsrb        m3,    [r3 +  9],     0
+pinsrw        m1,    [r3 + 1],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 9],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -25405,13 +25352,11 @@
 packuswb      m4,    m5
 movu          [r0 + 1200 * 16],      m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 1],       1
-pinsrb        m1,    [r3 + 0],       0
+pinsrw        m1,    [r3 + 0],      0
 pmaddubsw     m4,    m1,             m6
 pmulhrsw      m4,    m7
 pslldq        m3,    2
-pinsrb        m3,    [r3 + 9],       1
-pinsrb        m3,    [r3 + 8],       0
+pinsrw        m3,    [r3 + 8],       0
 pmaddubsw     m5,    m3,             m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -25437,8 +25382,7 @@
 pmaddubsw     m4,    m1,             m6
 pmulhrsw      m4,    m7
 pslldq        m3,    2
-pinsrb        m3,    [r3 + 8],       1
-pinsrb        m3,    [r3 + 7],       0
+pinsrw        m3,    [r3 + 7],      0
 pmaddubsw     m5,    m3,             m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -25479,8 +25423,7 @@
 pmaddubsw     m4,    m1,             m6
 pmulhrsw      m4,    m7
 pslldq        m3,    2
-pinsrb        m3,    [r3 + 7],       1
-pinsrb        m3,    [r3 + 6],       0
+pinsrw        m3,    [r3 + 6],      0
 pmaddubsw     m5,    m3,             m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -25506,8 +25449,7 @@
 pmaddubsw     m4,    m1,             m6
 pmulhrsw      m4,    m7
 pslldq        m3,    2
-pinsrb        m3,    [r3 + 6],       1
-pinsrb        m3,    [r3 + 5],       0
+pinsrw        m3,    [r3 + 5],      0
 pmaddubsw     m5,    m3,             m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -25548,8 +25490,7 @@
 pmaddubsw     m4,    m1,             m6
 pmulhrsw      m4,    m7
 pslldq        m3,    2
-pinsrb        m3,    [r3 + 5],       1
-pinsrb        m3,    [r3 + 4],       0
+pinsrw        m3,    [r3 + 4],      0
 pmaddubsw     m5,    m3,             m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -25601,20 +25542,17 @@
 pmaddubsw     m4,    m0,            m6
 pmulhrsw      m4,    m7
 pslldq        m2,    2
-pinsrb        m2,    [r3 + 8],      1
-pinsrb        m2,    [r3 + 7],      0
+pinsrw        m2,    [r3 + 7],      0
 pmaddubsw     m5,    m2,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
 movu          [r0 + 1218 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 16],     1
-pinsrb        m1,    [r3 + 15],     0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 24],     1
-pinsrb        m3,    [r3 + 23],     0
+pinsrw        m1,    [r3 + 15],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 23],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -25643,20 +25581,17 @@
 pmaddubsw     m4,    m0,            m6
 pmulhrsw      m4,    m7
 pslldq        m2,    2
-pinsrb        m2,    [r3 + 7],      1
-pinsrb        m2,    [r3 + 6],      0
+pinsrw        m2,    [r3 + 6],      0
 pmaddubsw     m5,    m2,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
 movu          [r0 + 1222 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 15],     1
-pinsrb        m1,    [r3 + 14],     0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 23],     1
-pinsrb        m3,    [r3 + 22],     0
+pinsrw        m1,    [r3 + 14],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 22],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -25685,20 +25620,17 @@
 pmaddubsw     m4,    m0,            m6
 pmulhrsw      m4,    m7
 pslldq        m2,    2
-pinsrb        m2,    [r3 + 6],      1
-pinsrb        m2,    [r3 + 5],      0
+pinsrw        m2,    [r3 + 5],      0
 pmaddubsw     m5,    m2,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
 movu          [r0 + 1226 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 14],     1
-pinsrb        m1,    [r3 + 13],     0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 22],     1
-pinsrb        m3,    [r3 + 21],     0
+pinsrw        m1,    [r3 + 13],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 21],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -25727,20 +25659,17 @@
 pmaddubsw     m4,    m0,            m6
 pmulhrsw      m4,    m7
 pslldq        m2,    2
-pinsrb        m2,    [r3 + 5],      1
-pinsrb        m2,    [r3 + 4],      0
+pinsrw        m2,    [r3 + 4],      0
 pmaddubsw     m5,    m2,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
 movu          [r0 + 1230 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 13],     1
-pinsrb        m1,    [r3 + 12],     0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 21],     1
-pinsrb        m3,    [r3 + 20],     0
+pinsrw        m1,    [r3 + 12],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 20],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -25769,20 +25698,17 @@
 pmaddubsw     m4,    m0,            m6
 pmulhrsw      m4,    m7
 pslldq        m2,    2
-pinsrb        m2,    [r3 + 4],      1
-pinsrb        m2,    [r3 + 3],      0
+pinsrw        m2,    [r3 + 3],      0
 pmaddubsw     m5,    m2,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
 movu          [r0 + 1234 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 12],     1
-pinsrb        m1,    [r3 + 11],     0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 20],     1
-pinsrb        m3,    [r3 + 19],     0
+pinsrw        m1,    [r3 + 11],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 19],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -25811,20 +25737,17 @@
 pmaddubsw     m4,    m0,            m6
 pmulhrsw      m4,    m7
 pslldq        m2,    2
-pinsrb        m2,    [r3 + 3],      1
-pinsrb        m2,    [r3 + 2],      0
+pinsrw        m2,    [r3 + 2],      0
 pmaddubsw     m5,    m2,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
 movu          [r0 + 1238 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 11],     1
-pinsrb        m1,    [r3 + 10],     0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 19],     1
-pinsrb        m3,    [r3 + 18],     0
+pinsrw        m1,    [r3 + 10],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 18],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -25853,20 +25776,17 @@
 pmaddubsw     m4,    m0,            m6
 pmulhrsw      m4,    m7
 pslldq        m2,    2
-pinsrb        m2,    [r3 + 2],      1
-pinsrb        m2,    [r3 + 1],      0
+pinsrw        m2,    [r3 + 1],      0
 pmaddubsw     m5,    m2,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
 movu          [r0 + 1242 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 10],     1
-pinsrb        m1,    [r3 +  9],     0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 18],     1
-pinsrb        m3,    [r3 + 17],     0
+pinsrw        m1,    [r3 + 9],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 17],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -25895,20 +25815,17 @@
 pmaddubsw     m4,    m0,            m6
 pmulhrsw      m4,    m7
 pslldq        m2,    2
-pinsrb        m2,    [r3 + 1],      1
-pinsrb        m2,    [r3 + 0],      0
+pinsrw        m2,    [r3 + 0],      0
 pmaddubsw     m5,    m2,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
 movu          [r0 + 1246 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 9],      1
-pinsrb        m1,    [r3 + 8],      0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 17],     1
-pinsrb        m3,    [r3 + 16],     0
+pinsrw        m1,    [r3 + 8],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 16],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -25929,13 +25846,11 @@
 packuswb      m4,    m5
 movu          [r0 + 1248 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 8],      1
-pinsrb        m1,    [r3 + 7],      0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 16],     1
-pinsrb        m3,    [r3 + 15],     0
+pinsrw        m1,    [r3 + 7],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 15],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -26013,13 +25928,11 @@
 packuswb      m4,    m5
 movu          [r0 + 1256 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 6],      1
-pinsrb        m1,    [r3 + 5],      0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 14],     1
-pinsrb        m3,    [r3 + 13],     0
+pinsrw        m1,    [r3 + 5],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 13],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -26055,13 +25968,11 @@
 packuswb      m4,    m5
 movu          [r0 + 1260 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 5],      1
-pinsrb        m1,    [r3 + 4],      0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 13],     1
-pinsrb        m3,    [r3 + 12],     0
+pinsrw        m1,    [r3 + 4],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 12],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -26097,13 +26008,11 @@
 packuswb      m4,    m5
 movu          [r0 + 1264 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 4],      1
-pinsrb        m1,    [r3 + 3],      0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 12],     1
-pinsrb        m3,    [r3 + 11],     0
+pinsrw        m1,    [r3 + 3],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 11],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -26139,13 +26048,11 @@
 packuswb      m4,    m5
 movu          [r0 + 1268 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 3],      1
-pinsrb        m1,    [r3 + 2],      0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 11],     1
-pinsrb        m3,    [r3 + 10],     0
+pinsrw        m1,    [r3 + 2],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 10],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -26181,13 +26088,11 @@
 packuswb      m4,    m5
 movu          [r0 + 1272 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 2],      1
-pinsrb        m1,    [r3 + 1],      0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 10],     1
-pinsrb        m3,    [r3 +  9],     0
+pinsrw        m1,    [r3 + 1],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 9],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -26223,13 +26128,11 @@
 packuswb      m4,    m5
 movu          [r0 + 1276 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 1],      1
-pinsrb        m1,    [r3 + 0],      0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 9],      1
-pinsrb        m3,    [r3 + 8],      0
+pinsrw        m1,    [r3 + 0],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 8],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -26296,20 +26199,17 @@
 pmaddubsw     m4,    m0,            m6
 pmulhrsw      m4,    m7
 pslldq        m2,    2
-pinsrb        m2,    [r3 + 8],      1
-pinsrb        m2,    [r3 + 7],      0
+pinsrw        m2,    [r3 + 7],      0
 pmaddubsw     m5,    m2,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
 movu          [r0 + 1284 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 16],     1
-pinsrb        m1,    [r3 + 15],     0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 24],     1
-pinsrb        m3,    [r3 + 23],     0
+pinsrw        m1,    [r3 + 15],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 23],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -26338,20 +26238,17 @@
 pmaddubsw     m4,    m0,            m6
 pmulhrsw      m4,    m7
 pslldq        m2,    2
-pinsrb        m2,    [r3 + 7],      1
-pinsrb        m2,    [r3 + 6],      0
+pinsrw        m2,    [r3 + 6],      0
 pmaddubsw     m5,    m2,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
 movu          [r0 + 1288 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 15],     1
-pinsrb        m1,    [r3 + 14],     0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 23],     1
-pinsrb        m3,    [r3 + 22],     0
+pinsrw        m1,    [r3 + 14],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 22],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -26395,20 +26292,17 @@
 pmaddubsw     m4,    m0,            m6
 pmulhrsw      m4,    m7
 pslldq        m2,    2
-pinsrb        m2,    [r3 + 6],      1
-pinsrb        m2,    [r3 + 5],      0
+pinsrw        m2,    [r3 + 5],      0
 pmaddubsw     m5,    m2,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
 movu          [r0 + 1294 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 14],     1
-pinsrb        m1,    [r3 + 13],     0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 22],     1
-pinsrb        m3,    [r3 + 21],     0
+pinsrw        m1,    [r3 + 13],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 21],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -26437,20 +26331,17 @@
 pmaddubsw     m4,    m0,            m6
 pmulhrsw      m4,    m7
 pslldq        m2,    2
-pinsrb        m2,    [r3 + 5],      1
-pinsrb        m2,    [r3 + 4],      0
+pinsrw        m2,    [r3 + 4],      0
 pmaddubsw     m5,    m2,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
 movu          [r0 + 1298 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 13],     1
-pinsrb        m1,    [r3 + 12],     0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 21],     1
-pinsrb        m3,    [r3 + 20],     0
+pinsrw        m1,    [r3 + 12],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 20],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -26494,20 +26385,17 @@
 pmaddubsw     m4,    m0,            m6
 pmulhrsw      m4,    m7
 pslldq        m2,    2
-pinsrb        m2,    [r3 + 4],      1
-pinsrb        m2,    [r3 + 3],      0
+pinsrw        m2,    [r3 + 3],      0
 pmaddubsw     m5,    m2,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
 movu          [r0 + 1304 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 12],     1
-pinsrb        m1,    [r3 + 11],     0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 20],     1
-pinsrb        m3,    [r3 + 19],     0
+pinsrw        m1,    [r3 + 11],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 19],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -26536,20 +26424,17 @@
 pmaddubsw     m4,    m0,            m6
 pmulhrsw      m4,    m7
 pslldq        m2,    2
-pinsrb        m2,    [r3 + 3],      1
-pinsrb        m2,    [r3 + 2],      0
+pinsrw        m2,    [r3 + 2],      0
 pmaddubsw     m5,    m2,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
 movu          [r0 + 1308 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 11],     1
-pinsrb        m1,    [r3 + 10],     0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 19],     1
-pinsrb        m3,    [r3 + 18],     0
+pinsrw        m1,    [r3 + 10],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 18],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -26593,20 +26478,17 @@
 pmaddubsw     m4,    m0,            m6
 pmulhrsw      m4,    m7
 pslldq        m2,    2
-pinsrb        m2,    [r3 + 2],      1
-pinsrb        m2,    [r3 + 1],      0
+pinsrw        m2,    [r3 + 1],      0
 pmaddubsw     m5,    m2,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
 movu          [r0 + 1314 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 10],     1
-pinsrb        m1,    [r3 +  9],     0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 18],     1
-pinsrb        m3,    [r3 + 17],     0
+pinsrw        m1,    [r3 + 9],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 17],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -26635,20 +26517,17 @@
 pmaddubsw     m4,    m0,            m6
 pmulhrsw      m4,    m7
 pslldq        m2,    2
-pinsrb        m2,    [r3 + 1],      1
-pinsrb        m2,    [r3 + 0],      0
+pinsrw        m2,    [r3 + 0],      0
 pmaddubsw     m5,    m2,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
 movu          [r0 + 1318 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 9],      1
-pinsrb        m1,    [r3 + 8],      0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 17],     1
-pinsrb        m3,    [r3 + 16],     0
+pinsrw        m1,    [r3 + 8],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 16],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -26699,13 +26578,11 @@
 packuswb      m4,    m5
 movu          [r0 + 1324 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 8],      1
-pinsrb        m1,    [r3 + 7],      0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 16],     1
-pinsrb        m3,    [r3 + 15],     0
+pinsrw        m1,    [r3 + 7],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 15],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -26741,13 +26618,11 @@
 packuswb      m4,    m5
 movu          [r0 + 1328 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 7],      1
-pinsrb        m1,    [r3 + 6],      0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 15],     1
-pinsrb        m3,    [r3 + 14],     0
+pinsrw        m1,    [r3 + 6],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 14],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -26798,13 +26673,11 @@
 packuswb      m4,    m5
 movu          [r0 + 1334 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 6],      1
-pinsrb        m1,    [r3 + 5],      0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 14],     1
-pinsrb        m3,    [r3 + 13],     0
+pinsrw        m1,    [r3 +  5],     0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 13],     0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -26840,13 +26713,11 @@
 packuswb      m4,    m5
 movu          [r0 + 1338 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 5],      1
-pinsrb        m1,    [r3 + 4],      0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 13],     1
-pinsrb        m3,    [r3 + 12],     0
+pinsrw        m1,    [r3 +  4],     0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 12],     0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -26943,20 +26814,17 @@
 pmaddubsw     m4,    m0,            m6
 pmulhrsw      m4,    m7
 pslldq        m2,    2
-pinsrb        m2,    [r3 + 8],      1
-pinsrb        m2,    [r3 + 7],      0
+pinsrw        m2,    [r3 + 7],     0
 pmaddubsw     m5,    m2,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
 movu          [r0 + 1350 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 16],     1
-pinsrb        m1,    [r3 + 15],     0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 24],     1
-pinsrb        m3,    [r3 + 23],     0
+pinsrw        m1,    [r3 + 15],     0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 23],     0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -27015,20 +26883,17 @@
 pmaddubsw     m4,    m0,            m6
 pmulhrsw      m4,    m7
 pslldq        m2,    2
-pinsrb        m2,    [r3 + 7],      1
-pinsrb        m2,    [r3 + 6],      0
+pinsrw        m2,    [r3 + 6],      0
 pmaddubsw     m5,    m2,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
 movu          [r0 + 1358 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 15],     1
-pinsrb        m1,    [r3 + 14],     0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 23],     1
-pinsrb        m3,    [r3 + 22],     0
+pinsrw        m1,    [r3 + 14],     0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 22],     0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -27072,20 +26937,17 @@
 pmaddubsw     m4,    m0,            m6
 pmulhrsw      m4,    m7
 pslldq        m2,    2
-pinsrb        m2,    [r3 + 6],      1
-pinsrb        m2,    [r3 + 5],      0
+pinsrw        m2,    [r3 + 5],      0
 pmaddubsw     m5,    m2,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
 movu          [r0 + 1364 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 14],     1
-pinsrb        m1,    [r3 + 13],     0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 22],     1
-pinsrb        m3,    [r3 + 21],     0
+pinsrw        m1,    [r3 + 13],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 21],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -27144,20 +27006,17 @@
 pmaddubsw     m4,    m0,            m6
 pmulhrsw      m4,    m7
 pslldq        m2,    2
-pinsrb        m2,    [r3 + 5],      1
-pinsrb        m2,    [r3 + 4],      0
+pinsrw        m2,    [r3 + 4],      0
 pmaddubsw     m5,    m2,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
 movu          [r0 + 1372 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 13],     1
-pinsrb        m1,    [r3 + 12],     0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 21],     1
-pinsrb        m3,    [r3 + 20],     0
+pinsrw        m1,    [r3 + 12],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 20],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -27201,20 +27060,17 @@
 pmaddubsw     m4,    m0,            m6
 pmulhrsw      m4,    m7
 pslldq        m2,    2
-pinsrb        m2,    [r3 + 4],      1
-pinsrb        m2,    [r3 + 3],      0
+pinsrw        m2,    [r3 + 3],      0
 pmaddubsw     m5,    m2,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
 movu          [r0 + 1378 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 12],     1
-pinsrb        m1,    [r3 + 11],     0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 20],     1
-pinsrb        m3,    [r3 + 19],     0
+pinsrw        m1,    [r3 + 11],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 19],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -27273,20 +27129,17 @@
 pmaddubsw     m4,    m0,            m6
 pmulhrsw      m4,    m7
 pslldq        m2,    2
-pinsrb        m2,    [r3 + 3],      1
-pinsrb        m2,    [r3 + 2],      0
+pinsrw        m2,    [r3 + 2],      0
 pmaddubsw     m5,    m2,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
 movu          [r0 + 1386 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 11],     1
-pinsrb        m1,    [r3 + 10],     0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 19],     1
-pinsrb        m3,    [r3 + 18],     0
+pinsrw        m1,    [r3 + 10],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 18],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -27330,20 +27183,17 @@
 pmaddubsw     m4,    m0,            m6
 pmulhrsw      m4,    m7
 pslldq        m2,    2
-pinsrb        m2,    [r3 + 2],      1
-pinsrb        m2,    [r3 + 1],      0
+pinsrw        m2,    [r3 + 1],      0
 pmaddubsw     m5,    m2,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
 movu          [r0 + 1392 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 10],     1
-pinsrb        m1,    [r3 +  9],     0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 18],     1
-pinsrb        m3,    [r3 + 17],     0
+pinsrw        m1,    [r3 + 9],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 17],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -27402,20 +27252,17 @@
 pmaddubsw     m4,    m0,            m6
 pmulhrsw      m4,    m7
 pslldq        m2,    2
-pinsrb        m2,    [r3 + 1],      1
-pinsrb        m2,    [r3 + 0],      0
+pinsrw        m2,    [r3 + 0],      0
 pmaddubsw     m5,    m2,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
 movu          [r0 + 1400 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 9],     1
-pinsrb        m1,    [r3 + 8],     0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 17],     1
-pinsrb        m3,    [r3 + 16],     0
+pinsrw        m1,    [r3 + 8],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 16],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -27572,20 +27419,17 @@
 pmaddubsw     m4,    m0,            m6
 pmulhrsw      m4,    m7
 pslldq        m2,    2
-pinsrb        m2,    [r3 + 8],      1
-pinsrb        m2,    [r3 + 7],      0
+pinsrw        m2,    [r3 + 7],      0
 pmaddubsw     m5,    m2,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
 movu          [r0 + 1420 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 16],     1
-pinsrb        m1,    [r3 + 15],     0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 24],     1
-pinsrb        m3,    [r3 + 23],     0
+pinsrw        m1,    [r3 + 15],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 23],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -27674,20 +27518,17 @@
 pmaddubsw     m4,    m0,            m6
 pmulhrsw      m4,    m7
 pslldq        m2,    2
-pinsrb        m2,    [r3 + 7],      1
-pinsrb        m2,    [r3 + 6],      0
+pinsrw        m2,    [r3 + 6],      0
 pmaddubsw     m5,    m2,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
 movu          [r0 + 1432 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 15],     1
-pinsrb        m1,    [r3 + 14],     0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 23],     1
-pinsrb        m3,    [r3 + 22],     0
+pinsrw        m1,    [r3 + 14],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 22],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -27791,20 +27632,17 @@
 pmaddubsw     m4,    m0,            m6
 pmulhrsw      m4,    m7
 pslldq        m2,    2
-pinsrb        m2,    [r3 + 6],      1
-pinsrb        m2,    [r3 + 5],      0
+pinsrw        m2,    [r3 + 5],      0
 pmaddubsw     m5,    m2,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
 movu          [r0 + 1446 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 14],     1
-pinsrb        m1,    [r3 + 13],     0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 22],     1
-pinsrb        m3,    [r3 + 21],     0
+pinsrw        m1,    [r3 + 13],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 21],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -27893,20 +27731,17 @@
 pmaddubsw     m4,    m0,            m6
 pmulhrsw      m4,    m7
 pslldq        m2,    2
-pinsrb        m2,    [r3 + 5],      1
-pinsrb        m2,    [r3 + 4],      0
+pinsrw        m2,    [r3 + 4],      0
 pmaddubsw     m5,    m2,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
 movu          [r0 + 1458 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 13],     1
-pinsrb        m1,    [r3 + 12],     0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 21],     1
-pinsrb        m3,    [r3 + 20],     0
+pinsrw        m1,    [r3 + 12],      0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 20],      0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
@@ -28249,24 +28084,21 @@
 movu          m6,    [r5 + 30 * 16]
 pslldq        m0,    2
 pinsrb        m0,    [r4 + 0],      1
-pinsrb        m0,    [r4 + 16],      0
-pmaddubsw     m4,    m0,            m6
-pmulhrsw      m4,    m7
-pslldq        m2,    2
-pinsrb        m2,    [r3 + 8],      1
-pinsrb        m2,    [r3 + 7],      0
+pinsrb        m0,    [r4 + 16],     0
+pmaddubsw     m4,    m0,            m6
+pmulhrsw      m4,    m7
+pslldq        m2,    2
+pinsrw        m2,    [r3 + 7],      0
 pmaddubsw     m5,    m2,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5
 movu          [r0 + 1504 * 16],     m4
 pslldq        m1,    2
-pinsrb        m1,    [r3 + 16],     1
-pinsrb        m1,    [r3 + 15],     0
-pmaddubsw     m4,    m1,            m6
-pmulhrsw      m4,    m7
-pslldq        m3,    2
-pinsrb        m3,    [r3 + 24],     1
-pinsrb        m3,    [r3 + 23],     0
+pinsrw        m1,    [r3 + 15],     0
+pmaddubsw     m4,    m1,            m6
+pmulhrsw      m4,    m7
+pslldq        m3,    2
+pinsrw        m3,    [r3 + 23],     0
 pmaddubsw     m5,    m3,            m6
 pmulhrsw      m5,    m7
 packuswb      m4,    m5


More information about the x265-devel mailing list