[x265] [PATCH 3 of 3] asm: intra_pred_ang32_23 improved ~5% over AVX2 code, 1925.55c -> 1833.59c
praveen at multicorewareinc.com
praveen at multicorewareinc.com
Tue Mar 31 09:24:48 CEST 2015
# HG changeset patch
# User Praveen Tiwari <praveen at multicorewareinc.com>
# Date 1427783740 -19800
# Tue Mar 31 12:05:40 2015 +0530
# Node ID f640b6d029e68dfc7a2f5ee55ffc61ceb2689550
# Parent d3e47b334726728ab200fed173168cdf581d3680
asm: intra_pred_ang32_23 improved ~5% over AVX2 code, 1925.55c -> 1833.59c
diff -r d3e47b334726 -r f640b6d029e6 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm Mon Mar 30 21:22:16 2015 +0530
+++ b/source/common/x86/intrapred8.asm Tue Mar 31 12:05:40 2015 +0530
@@ -501,6 +501,14 @@
c_mode32_22_shuff7: db 0, 0, 15, 13, 10, 8, 5, 3, 0, 0, 0, 0, 0, 0, 0, 0
c_mode32_22_shuff8: times 4 db 0, 3, 5, 8
+
+ALIGN 32
+c_mode32_23_shuff: times 2 db 0, 0, 0, 14, 10, 7, 3, 0      ; pshufb mask for rows 28/29; each entry is 8 bytes x 2 = 16 bytes
+c_mode32_23_shuff1: times 2 db 0, 0, 14, 10, 7, 3, 0, 0     ; pshufb mask for rows 24/25
+c_mode32_23_shuff2: times 2 db 0, 14, 10, 7, 3, 0, 0, 0     ; pshufb mask for rows 21/22
+c_mode32_23_shuff3: times 2 db 14, 10, 7, 3, 0, 0, 0, 0     ; pshufb mask for rows 17/18; 8 bytes like its siblings so the 16-byte read stays inside this entry
+c_mode32_23_shuff4: times 2 db 10, 7, 3, 0, 0, 0, 0, 0      ; pshufb mask for rows 14/15; indices select bytes of the vector loaded from [r2 + 68]
+
ALIGN 32
;; (blkSize - 1 - x)
pw_planar4_0: dw 3, 2, 1, 0, 3, 2, 1, 0
@@ -14560,7 +14568,7 @@
RET
INIT_YMM avx2
-cglobal intra_pred_ang32_23, 3, 5, 11
+cglobal intra_pred_ang32_23, 3, 5, 12
mova m0, [pw_1024]
mova m1, [intra_pred_shuff_0_8]
lea r3, [3 * r1]
@@ -14680,11 +14688,13 @@
movu [r0 + r1], m6
;row[14, 15]
- movu xm2, [r2 - 4]
- pinsrb xm2, [r2 + 78], 0
- pinsrb xm2, [r2 + 75], 1
- pinsrb xm2, [r2 + 71], 2
- pinsrb xm2, [r2 + 68], 3
+ movu xm11, [r2 + 68]
+ movu xm2, xm11
+ pshufb xm2, [c_mode32_23_shuff4]
+ pinsrw xm2, [r2 + 0], 2
+ pinsrw xm2, [r2 + 2], 3
+ pinsrb xm2, [r2 + 4], 8
+
vinserti128 m2, m2, xm2, 1
pshufb m2, m1
vbroadcasti128 m3, [r2 + 4]
@@ -14714,12 +14724,11 @@
movu [r0], m6
;row[17, 18]
- movu xm2, [r2 - 5]
- pinsrb xm2, [r2 + 82], 0
- pinsrb xm2, [r2 + 78], 1
- pinsrb xm2, [r2 + 75], 2
- pinsrb xm2, [r2 + 71], 3
- pinsrb xm2, [r2 + 68], 4
+ mova xm2, xm11
+ pshufb xm2, [c_mode32_23_shuff3]
+ pinsrb xm2, [r2 + 0], 5
+ pinsrw xm2, [r2 + 1], 3
+ pinsrb xm2, [r2 + 3], 8
vinserti128 m2, m2, xm2, 1
pshufb m2, m1
vbroadcasti128 m3, [r2 + 3]
@@ -14744,13 +14753,11 @@
movu [r0], m6
;row[21, 22]
- movu xm2, [r2 - 6]
+ mova xm2, xm11
+ pshufb xm2, [c_mode32_23_shuff2]
pinsrb xm2, [r2 + 85], 0
- pinsrb xm2, [r2 + 82], 1
- pinsrb xm2, [r2 + 78], 2
- pinsrb xm2, [r2 + 75], 3
- pinsrb xm2, [r2 + 71], 4
- pinsrb xm2, [r2 + 68], 5
+ pinsrw xm2, [r2 + 0], 3
+ pinsrb xm2, [r2 + 2], 8
vinserti128 m2, m2, xm2, 1
pshufb m2, m1
vbroadcasti128 m3, [r2 + 2]
@@ -14779,14 +14786,13 @@
movu [r0 + r3], m6
;row[24, 25]
- movu xm2, [r2 - 7]
+ mova xm2, xm11
+ pshufb xm2, [c_mode32_23_shuff1]
pinsrb xm2, [r2 + 89], 0
pinsrb xm2, [r2 + 85], 1
- pinsrb xm2, [r2 + 82], 2
- pinsrb xm2, [r2 + 78], 3
- pinsrb xm2, [r2 + 75], 4
- pinsrb xm2, [r2 + 71], 5
- pinsrb xm2, [r2 + 68], 6
+ pinsrb xm2, [r2 + 0], 7
+ pinsrb xm2, [r2 + 1], 8
+
vinserti128 m2, m2, xm2, 1
pshufb m2, m1
vbroadcasti128 m3, [r2 + 1]
@@ -14811,15 +14817,12 @@
movu [r0 + r3], m6
;row[28, 29]
- movu xm2, [r2 - 8]
+ mova xm2, xm11
+ pshufb xm2, [c_mode32_23_shuff]
pinsrb xm2, [r2 + 92], 0
pinsrb xm2, [r2 + 89], 1
pinsrb xm2, [r2 + 85], 2
- pinsrb xm2, [r2 + 82], 3
- pinsrb xm2, [r2 + 78], 4
- pinsrb xm2, [r2 + 75], 5
- pinsrb xm2, [r2 + 71], 6
- pinsrb xm2, [r2 + 68], 7
+ pinsrb xm2, [r2 + 0], 8
vinserti128 m2, m2, xm2, 1
pshufb m2, m1
vbroadcasti128 m3, [r2 + 0]
More information about the x265-devel mailing list