[x265] [PATCH 1 of 7] asm: intra_pred_ang32_23 improved by ~10% over AVX2, 1925.55c -> 1738.47c

Praveen Tiwari praveen at multicorewareinc.com
Thu Apr 2 06:44:22 CEST 2015


# HG changeset patch
# User Praveen Tiwari <praveen at multicorewareinc.com>
# Date 1427871926 -19800
#      Wed Apr 01 12:35:26 2015 +0530
# Node ID bd86ef402456d97515f28a71404e12ccd432e8db
# Parent  066092a3e5600c1c900d1d9b80e3ec08c3962a23
asm: intra_pred_ang32_23 improved by ~10% over AVX2, 1925.55c -> 1738.47c

diff -r 066092a3e560 -r bd86ef402456 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm	Tue Mar 31 18:27:56 2015 +0530
+++ b/source/common/x86/intrapred8.asm	Wed Apr 01 12:35:26 2015 +0530
@@ -14562,8 +14562,8 @@
     lea               r4, [c_ang32_mode_23]
 
     ;row[0, 1]
-    vbroadcasti128    m2, [r2 + 0]
-    pshufb            m2, m1
+    vbroadcasti128    m11, [r2 + 0]
+    pshufb            m2, m11, m1
     vbroadcasti128    m3, [r2 + 8]
     pshufb            m3, m1
     vbroadcasti128    m4, [r2 + 16]
@@ -14589,9 +14589,9 @@
     movu              [r0 + 2 * r1], m6
 
     ;row[3, 4]
-    movu              xm2, [r2 - 1]
-    pinsrb            xm2, [r2 + 68], 0
-    vinserti128       m2, m2, xm2, 1
+    pslldq            xm11, 1
+    pinsrb            xm11, [r2 + 68], 0
+    vinserti128       m2, m11, xm11, 1
     pshufb            m2, m1
     vbroadcasti128    m3, [r2 + 7]
     pshufb            m3, m1
@@ -14615,10 +14615,9 @@
     movu              [r0 + 2 * r1], m6
 
     ;row[7, 8]
-    movu              xm2, [r2 - 2]
-    pinsrb            xm2, [r2 + 71], 0
-    pinsrb            xm2, [r2 + 68], 1
-    vinserti128       m2, m2, xm2, 1
+    pslldq            xm11, 1
+    pinsrb            xm11, [r2 + 71], 0
+    vinserti128       m2, m11, xm11, 1
     pshufb            m2, m1
     vbroadcasti128    m3, [r2 + 6]
     pshufb            m3, m1
@@ -14647,11 +14646,9 @@
     movu              [r0 + r1], m6
 
     ;row[10, 11]
-    movu              xm2, [r2 - 3]
-    pinsrb            xm2, [r2 + 75], 0
-    pinsrb            xm2, [r2 + 71], 1
-    pinsrb            xm2, [r2 + 68], 2
-    vinserti128       m2, m2, xm2, 1
+    pslldq            xm11, 1
+    pinsrb            xm11, [r2 + 75], 0
+    vinserti128       m2, m11, xm11, 1
     pshufb            m2, m1
     vbroadcasti128    m3, [r2 + 5]
     pshufb            m3, m1
@@ -14675,14 +14672,9 @@
     movu              [r0 + r1], m6
 
     ;row[14, 15]
-    movu              xm11, [r2 + 68]
-    movu              xm2, xm11
-    pshufb            xm2, [c_mode32_23_shuff4]
-    pinsrw            xm2, [r2 +  0], 2
-    pinsrw            xm2, [r2 +  2], 3
-    pinsrb            xm2, [r2 +  4], 8
-
-    vinserti128       m2, m2, xm2, 1
+    pslldq            xm11, 1
+    pinsrb            xm11, [r2 + 78], 0
+    vinserti128       m2, m11, xm11, 1
     pshufb            m2, m1
     vbroadcasti128    m3, [r2 + 4]
     pshufb            m3, m1
@@ -14711,12 +14703,9 @@
     movu              [r0], m6
 
     ;row[17, 18]
-    mova              xm2, xm11
-    pshufb            xm2, [c_mode32_23_shuff3]
-    pinsrb            xm2, [r2 +  0], 5
-    pinsrw            xm2, [r2 +  1], 3
-    pinsrb            xm2, [r2 +  3], 8
-    vinserti128       m2, m2, xm2, 1
+    pslldq            xm11, 1
+    pinsrb            xm11, [r2 + 82], 0
+    vinserti128       m2, m11, xm11, 1
     pshufb            m2, m1
     vbroadcasti128    m3, [r2 + 3]
     pshufb            m3, m1
@@ -14740,12 +14729,9 @@
     movu              [r0], m6
 
     ;row[21, 22]
-    mova              xm2, xm11
-    pshufb            xm2, [c_mode32_23_shuff2]
-    pinsrb            xm2, [r2 + 85], 0
-    pinsrw            xm2, [r2 +  0], 3
-    pinsrb            xm2, [r2 +  2], 8
-    vinserti128       m2, m2, xm2, 1
+    pslldq            xm11, 1
+    pinsrb            xm11, [r2 + 85], 0
+    vinserti128       m2, m11, xm11, 1
     pshufb            m2, m1
     vbroadcasti128    m3, [r2 + 2]
     pshufb            m3, m1
@@ -14773,14 +14759,9 @@
     movu              [r0 + r3], m6
 
     ;row[24, 25]
-    mova              xm2, xm11
-    pshufb            xm2, [c_mode32_23_shuff1]
-    pinsrb            xm2, [r2 + 89], 0
-    pinsrb            xm2, [r2 + 85], 1
-    pinsrb            xm2, [r2 +  0], 7
-    pinsrb            xm2, [r2 +  1], 8
-
-    vinserti128       m2, m2, xm2, 1
+    pslldq            xm11, 1
+    pinsrb            xm11, [r2 + 89], 0
+    vinserti128       m2, m11, xm11, 1
     pshufb            m2, m1
     vbroadcasti128    m3, [r2 + 1]
     pshufb            m3, m1
@@ -14804,13 +14785,9 @@
     movu              [r0 + r3], m6
 
     ;row[28, 29]
-    mova              xm2, xm11
-    pshufb            xm2, [c_mode32_23_shuff]
-    pinsrb            xm2, [r2 + 92], 0
-    pinsrb            xm2, [r2 + 89], 1
-    pinsrb            xm2, [r2 + 85], 2
-    pinsrb            xm2, [r2 +  0], 8
-    vinserti128       m2, m2, xm2, 1
+    pslldq            xm11, 1
+    pinsrb            xm11, [r2 + 92], 0
+    vinserti128       m2, m11, xm11, 1
     pshufb            m2, m1
     vbroadcasti128    m3, [r2 + 0]
     pshufb            m3, m1


More information about the x265-devel mailing list