[x265] [PATCH 1 of 7] asm: intra_pred_ang32_23 improved by ~10% over AVX2, 1925.55c -> 1738.47c
Praveen Tiwari
praveen at multicorewareinc.com
Thu Apr 2 06:44:22 CEST 2015
# HG changeset patch
# User Praveen Tiwari <praveen at multicorewareinc.com>
# Date 1427871926 -19800
# Wed Apr 01 12:35:26 2015 +0530
# Node ID bd86ef402456d97515f28a71404e12ccd432e8db
# Parent 066092a3e5600c1c900d1d9b80e3ec08c3962a23
asm: intra_pred_ang32_23 improved by ~10% over AVX2, 1925.55c -> 1738.47c
diff -r 066092a3e560 -r bd86ef402456 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm Tue Mar 31 18:27:56 2015 +0530
+++ b/source/common/x86/intrapred8.asm Wed Apr 01 12:35:26 2015 +0530
@@ -14562,8 +14562,8 @@
lea r4, [c_ang32_mode_23]
;row[0, 1]
- vbroadcasti128 m2, [r2 + 0]
- pshufb m2, m1
+ vbroadcasti128 m11, [r2 + 0]
+ pshufb m2, m11, m1
vbroadcasti128 m3, [r2 + 8]
pshufb m3, m1
vbroadcasti128 m4, [r2 + 16]
@@ -14589,9 +14589,9 @@
movu [r0 + 2 * r1], m6
;row[3, 4]
- movu xm2, [r2 - 1]
- pinsrb xm2, [r2 + 68], 0
- vinserti128 m2, m2, xm2, 1
+ pslldq xm11, 1
+ pinsrb xm11, [r2 + 68], 0
+ vinserti128 m2, m11, xm11, 1
pshufb m2, m1
vbroadcasti128 m3, [r2 + 7]
pshufb m3, m1
@@ -14615,10 +14615,9 @@
movu [r0 + 2 * r1], m6
;row[7, 8]
- movu xm2, [r2 - 2]
- pinsrb xm2, [r2 + 71], 0
- pinsrb xm2, [r2 + 68], 1
- vinserti128 m2, m2, xm2, 1
+ pslldq xm11, 1
+ pinsrb xm11, [r2 + 71], 0
+ vinserti128 m2, m11, xm11, 1
pshufb m2, m1
vbroadcasti128 m3, [r2 + 6]
pshufb m3, m1
@@ -14647,11 +14646,9 @@
movu [r0 + r1], m6
;row[10, 11]
- movu xm2, [r2 - 3]
- pinsrb xm2, [r2 + 75], 0
- pinsrb xm2, [r2 + 71], 1
- pinsrb xm2, [r2 + 68], 2
- vinserti128 m2, m2, xm2, 1
+ pslldq xm11, 1
+ pinsrb xm11, [r2 + 75], 0
+ vinserti128 m2, m11, xm11, 1
pshufb m2, m1
vbroadcasti128 m3, [r2 + 5]
pshufb m3, m1
@@ -14675,14 +14672,9 @@
movu [r0 + r1], m6
;row[14, 15]
- movu xm11, [r2 + 68]
- movu xm2, xm11
- pshufb xm2, [c_mode32_23_shuff4]
- pinsrw xm2, [r2 + 0], 2
- pinsrw xm2, [r2 + 2], 3
- pinsrb xm2, [r2 + 4], 8
-
- vinserti128 m2, m2, xm2, 1
+ pslldq xm11, 1
+ pinsrb xm11, [r2 + 78], 0
+ vinserti128 m2, m11, xm11, 1
pshufb m2, m1
vbroadcasti128 m3, [r2 + 4]
pshufb m3, m1
@@ -14711,12 +14703,9 @@
movu [r0], m6
;row[17, 18]
- mova xm2, xm11
- pshufb xm2, [c_mode32_23_shuff3]
- pinsrb xm2, [r2 + 0], 5
- pinsrw xm2, [r2 + 1], 3
- pinsrb xm2, [r2 + 3], 8
- vinserti128 m2, m2, xm2, 1
+ pslldq xm11, 1
+ pinsrb xm11, [r2 + 82], 0
+ vinserti128 m2, m11, xm11, 1
pshufb m2, m1
vbroadcasti128 m3, [r2 + 3]
pshufb m3, m1
@@ -14740,12 +14729,9 @@
movu [r0], m6
;row[21, 22]
- mova xm2, xm11
- pshufb xm2, [c_mode32_23_shuff2]
- pinsrb xm2, [r2 + 85], 0
- pinsrw xm2, [r2 + 0], 3
- pinsrb xm2, [r2 + 2], 8
- vinserti128 m2, m2, xm2, 1
+ pslldq xm11, 1
+ pinsrb xm11, [r2 + 85], 0
+ vinserti128 m2, m11, xm11, 1
pshufb m2, m1
vbroadcasti128 m3, [r2 + 2]
pshufb m3, m1
@@ -14773,14 +14759,9 @@
movu [r0 + r3], m6
;row[24, 25]
- mova xm2, xm11
- pshufb xm2, [c_mode32_23_shuff1]
- pinsrb xm2, [r2 + 89], 0
- pinsrb xm2, [r2 + 85], 1
- pinsrb xm2, [r2 + 0], 7
- pinsrb xm2, [r2 + 1], 8
-
- vinserti128 m2, m2, xm2, 1
+ pslldq xm11, 1
+ pinsrb xm11, [r2 + 89], 0
+ vinserti128 m2, m11, xm11, 1
pshufb m2, m1
vbroadcasti128 m3, [r2 + 1]
pshufb m3, m1
@@ -14804,13 +14785,9 @@
movu [r0 + r3], m6
;row[28, 29]
- mova xm2, xm11
- pshufb xm2, [c_mode32_23_shuff]
- pinsrb xm2, [r2 + 92], 0
- pinsrb xm2, [r2 + 89], 1
- pinsrb xm2, [r2 + 85], 2
- pinsrb xm2, [r2 + 0], 8
- vinserti128 m2, m2, xm2, 1
+ pslldq xm11, 1
+ pinsrb xm11, [r2 + 92], 0
+ vinserti128 m2, m11, xm11, 1
pshufb m2, m1
vbroadcasti128 m3, [r2 + 0]
pshufb m3, m1
More information about the x265-devel mailing list