[x265] [PATCH 3 of 3] asm: intra_pred_ang32_22 improved by ~5% over AVX2, 2308.11c -> 2207.80c

Praveen Tiwari praveen at multicorewareinc.com
Wed Apr 1 07:39:32 CEST 2015


# HG changeset patch
# User Praveen Tiwari <praveen at multicorewareinc.com>
# Date 1427806676 -19800
#      Tue Mar 31 18:27:56 2015 +0530
# Node ID 066092a3e5600c1c900d1d9b80e3ec08c3962a23
# Parent  51f4a0edf3c9f62fb5fe5167da0553143f28e0ba
asm: intra_pred_ang32_22 improved by ~5% over AVX2, 2308.11c -> 2207.80c

diff -r 51f4a0edf3c9 -r 066092a3e560 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm	Tue Mar 31 17:49:48 2015 +0530
+++ b/source/common/x86/intrapred8.asm	Tue Mar 31 18:27:56 2015 +0530
@@ -490,17 +490,6 @@
                  db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13
                  db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
 
-ALIGN 32
-c_mode32_22_shuff:       times 2 db 0, 15, 13, 10, 8, 5, 3, 0
-c_mode32_22_shuff1:      times 2 db 15, 13, 10, 8, 5, 3, 0, 0
-c_mode32_22_shuff2:      times 2 db 13, 10, 8, 5, 3, 0, 0, 0
-c_mode32_22_shuff3:      times 2 db 10, 8, 5, 3, 0, 0, 0, 0
-c_mode32_22_shuff4:      times 2 db 8, 5, 3, 0, 0, 0, 0, 0
-c_mode32_22_shuff5:      times 2 db 0, 0, 15, 13, 10, 8, 5, 3
-c_mode32_22_shuff6:      db 0, 0, 15, 12, 10, 7, 5, 2, 0, 0, 0, 0, 0, 0, 0, 0
-c_mode32_22_shuff7:      db 0, 0, 15, 13, 10, 8, 5, 3, 0, 0, 0, 0, 0, 0, 0, 0
-c_mode32_22_shuff8:      times 4 db 0, 3, 5, 8
-
 
 ALIGN 32
 c_mode32_23_shuff:       times 2 db 0, 0, 0, 14, 10, 7, 3, 0
@@ -14847,15 +14836,15 @@
     RET
 
 INIT_YMM avx2
-cglobal intra_pred_ang32_22, 3, 5, 12
+cglobal intra_pred_ang32_22, 3, 5, 13
     mova              m0, [pw_1024]
     mova              m1, [intra_pred_shuff_0_8]
     lea               r3, [3 * r1]
     lea               r4, [c_ang32_mode_22]
 
     ;row[0, 1]
-    vbroadcasti128    m2, [r2 + 0]
-    pshufb            m2, m1
+    vbroadcasti128    m11, [r2 + 0]
+    pshufb            m2, m11, m1
     vbroadcasti128    m3, [r2 + 8]
     pshufb            m3, m1
     vbroadcasti128    m4, [r2 + 16]
@@ -14870,9 +14859,9 @@
     movu              [r0 + r1], m6
 
     ;row[2, 3]
-    movu              xm2, [r2 - 1]
-    pinsrb            xm2, [r2 + 66], 0
-    vinserti128       m2, m2, xm2, 1
+    pslldq            xm11, 1
+    pinsrb            xm11, [r2 + 66], 0
+    vinserti128       m2, m11, xm11, 1
     pshufb            m2, m1
     vbroadcasti128    m3, [r2 + 7]
     pshufb            m3, m1
@@ -14888,10 +14877,9 @@
     movu              [r0 + r3], m6
 
     ;row[4, 5]
-    movu              xm2, [r2 - 2]
-    pinsrb            xm2, [r2 + 69], 0
-    pinsrb            xm2, [r2 + 66], 1
-    vinserti128       m2, m2, xm2, 1
+    pslldq            xm11, 1
+    pinsrb            xm11, [r2 + 69], 0
+    vinserti128       m2, m11, xm11, 1
     pshufb            m2, m1
     vbroadcasti128    m3, [r2 + 6]
     pshufb            m3, m1
@@ -14919,11 +14907,9 @@
     movu              [r0 + 2 * r1], m6
 
     ;row[7, 8]
-    movu              xm2, [r2 - 3]
-    pinsrb            xm2, [r2 + 71], 0
-    pinsrb            xm2, [r2 + 69], 1
-    pinsrb            xm2, [r2 + 66], 2
-    vinserti128       m2, m2, xm2, 1
+    pslldq            xm11, 1
+    pinsrb            xm11, [r2 + 71], 0
+    vinserti128       m2, m11, xm11, 1
     pshufb            m2, m1
     vbroadcasti128    m3, [r2 + 5]
     pshufb            m3, m1
@@ -14941,12 +14927,9 @@
     movu              [r0], m6
 
     ;row[9, 10]
-    movu              xm11, [r2 + 66]
-    mova              xm2, xm11
-    pshufb            xm2, [c_mode32_22_shuff4]
-    pinsrw            xm2, [r2 + 0], 2
-    pinsrw            xm2, [r2 + 2], 3
-    pinsrb            xm2, [r2 + 4], 8
+    pslldq            xm11, 1
+    pinsrb            xm11, [r2 + 74], 0
+    vinserti128       m2, m11, xm11, 1
     vinserti128       m2, m2, xm2, 1
     pshufb            m2, m1
     vbroadcasti128    m3, [r2 + 4]
@@ -14974,12 +14957,9 @@
     movu              [r0 + r3], m6
 
     ;row[12, 13]
-    mova              xm2, xm11
-    pshufb            xm2, [c_mode32_22_shuff3]
-    pinsrb            xm2, [r2 + 0], 5
-    pinsrw            xm2, [r2 + 1], 3
-    pinsrb            xm2, [r2 + 3], 8
-    vinserti128       m2, m2, xm2, 1
+    pslldq            xm11, 1
+    pinsrb            xm11, [r2 + 76], 0
+    vinserti128       m2, m11, xm11, 1
     pshufb            m2, m1
     vbroadcasti128    m3, [r2 + 3]
     pshufb            m3, m1
@@ -14996,11 +14976,9 @@
     movu              [r0 + r1], m6
 
     ;row[14, 15]
-    mova              xm2, xm11
-    pshufb            xm2, [c_mode32_22_shuff2]
-    pinsrw            xm2, [r2 + 0], 3
-    pinsrb            xm2, [r2 + 2], 8
-    vinserti128       m2, m2, xm2, 1
+    pslldq            xm11, 1
+    pinsrb            xm11, [r2 + 79], 0
+    vinserti128       m2, m11, xm11, 1
     pshufb            m2, m1
     vbroadcasti128    m3, [r2 + 2]
     pshufb            m3, m1
@@ -15029,11 +15007,9 @@
     movu              [r0], m6
 
     ;row[17, 18]
-    mova              xm2, xm11
-    pshufb            xm2, [c_mode32_22_shuff1]
-    pinsrb            xm2, [r2 + 0], 7
-    pinsrb            xm2, [r2 + 1], 8
-    vinserti128       m2, m2, xm2, 1
+    pslldq            xm11, 1
+    pinsrb            xm11, [r2 + 81], 0
+    vinserti128       m2, m11, xm11, 1
     pshufb            m2, m1
     vbroadcasti128    m3, [r2 + 1]
     pshufb            m3, m1
@@ -15049,14 +15025,12 @@
     movu              [r0 + 2 * r1], m6
 
     ;row[19, 20]
-    mova              xm2, xm11
-    pshufb            xm2, [c_mode32_22_shuff]
-    pinsrb            xm2, [r2 + 84], 0
-    pinsrb            xm2, [r2 + 0], 8
-    vinserti128       m2, m2, xm2, 1
-    pshufb            m2, m1
-    vbroadcasti128    m3, [r2 + 0]
-    pshufb            m3, m1
+    pslldq            xm11, 1
+    pinsrb            xm11, [r2 + 84], 0
+    vinserti128       m2, m11, xm11, 1
+    pshufb            m2, m1
+    vbroadcasti128    m12, [r2 + 0]
+    pshufb            m3, m12, m1
     vbroadcasti128    m4, [r2 + 8]
     pshufb            m4, m1
     vbroadcasti128    m5, [r2 + 16]
@@ -15082,15 +15056,13 @@
     movu              [r0 + r1], m6
 
     ;row[22, 23]
-    mova              xm2, xm11
-    pshufb            xm2, [c_mode32_22_shuff5]
-    pinsrb            xm2, [r2 + 86], 0
-    pinsrb            xm2, [r2 + 84], 1
-    vinserti128       m2, m2, xm2, 1
-    pshufb            m2, m1
-    movu              xm3, [r2 - 1]
-    pinsrb            xm3, [r2 + 66], 0
-    vinserti128       m3, m3, xm3, 1
+    pslldq            xm11, 1
+    pinsrb            xm11, [r2 + 86], 0
+    vinserti128       m2, m11, xm11, 1
+    pshufb            m2, m1
+    pslldq            xm12, 1
+    pinsrb            xm12, [r2 + 66], 0
+    vinserti128       m3, m12, xm12, 1
     pshufb            m3, m1
     vbroadcasti128    m4, [r2 + 7]
     pshufb            m4, m1
@@ -15104,16 +15076,13 @@
     movu              [r0 + r3], m6
 
     ;row[24, 25]
-    movu              xm2, [r2 + 69]
-    pshufb            xm2, [c_mode32_22_shuff6]
-    pinsrb            xm2, [r2 + 86], 1
-    pinsrb            xm2, [r2 + 89], 0
-    vinserti128       m2, m2, xm2, 1
-    pshufb            m2, m1
-    movu              xm3, [r2 - 2]
-    pinsrb            xm3, [r2 + 69], 0
-    pinsrb            xm3, [r2 + 66], 1
-    vinserti128       m3, m3, xm3, 1
+    pslldq            xm11, 1
+    pinsrb            xm11, [r2 + 89], 0
+    vinserti128       m2, m11, xm11, 1
+    pshufb            m2, m1
+    pslldq            xm12, 1
+    pinsrb            xm12, [r2 + 69], 0
+    vinserti128       m3, m12, xm12, 1
     pshufb            m3, m1
     vbroadcasti128    m4, [r2 + 6]
     pshufb            m4, m1
@@ -15139,17 +15108,13 @@
     movu              [r0 + 2 * r1], m6
 
     ;row[27, 28]
-    movu              xm2, [r2 + 71]
-    pshufb            xm2, [c_mode32_22_shuff7]
-    pinsrb            xm2, [r2 + 89], 1
-    pinsrb            xm2, [r2 + 91], 0
-    vinserti128       m2, m2, xm2, 1
-    pshufb            m2, m1
-    movu              xm3, [r2 - 3]
-    pinsrb            xm3, [r2 + 71], 0
-    pinsrb            xm3, [r2 + 69], 1
-    pinsrb            xm3, [r2 + 66], 2
-    vinserti128       m3, m3, xm3, 1
+    pslldq            xm11, 1
+    pinsrb            xm11, [r2 + 91], 0
+    vinserti128       m2, m11, xm11, 1
+    pshufb            m2, m1
+    pslldq            xm12, 1
+    pinsrb            xm12, [r2 + 71], 0
+    vinserti128       m3, m12, xm12, 1
     pshufb            m3, m1
     vbroadcasti128    m4, [r2 + 5]
     pshufb            m4, m1
@@ -15165,19 +15130,13 @@
     movu              [r0], m6
 
     ;row[29, 30]
-    movu              xm2, [r2 + 74]
-    pshufb            xm2, [c_mode32_22_shuff6]
-    pinsrb            xm2, [r2 + 91], 1
-    pinsrb            xm2, [r2 + 94], 0
-    vinserti128       m2, m2, xm2, 1
-    pshufb            m2, m1
-
-    movu              xm3, [r2 - 4]
-    pinsrb            xm3, [r2 + 74], 0
-    pinsrb            xm3, [r2 + 71], 1
-    pinsrb            xm3, [r2 + 69], 2
-    pinsrb            xm3, [r2 + 66], 3
-    vinserti128       m3, m3, xm3, 1
+    pslldq            xm11, 1
+    pinsrb            xm11, [r2 + 94], 0
+    vinserti128       m2, m11, xm11, 1
+    pshufb            m2, m1
+    pslldq            xm12, 1
+    pinsrb            xm12, [r2 + 74], 0
+    vinserti128       m3, m12, xm12, 1
     pshufb            m3, m1
     vbroadcasti128    m4, [r2 + 4]
     pshufb            m4, m1


More information about the x265-devel mailing list