[x265] [PATCH 4 of 7] asm: optimized intra_ang16 mode 11 avx2 asm, 520c->370c
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Wed Aug 26 12:24:33 CEST 2015
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1440480737 -19800
# Tue Aug 25 11:02:17 2015 +0530
# Node ID a27ac3b998f5677570a48285d22e1b771c08ab75
# Parent 630bae9a91392fdf9a327673f7c00eeedf60139f
asm: optimized intra_ang16 mode 11 avx2 asm, 520c->370c
diff -r 630bae9a9139 -r a27ac3b998f5 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm Tue Aug 25 10:48:24 2015 +0530
+++ b/source/common/x86/intrapred8.asm Tue Aug 25 11:02:17 2015 +0530
@@ -16610,127 +16610,105 @@
RET
INIT_YMM avx2
-cglobal intra_pred_ang16_11, 3,4,5
- mova m0, [angHor_tab_11]
- mova m1, [pw_1024]
+cglobal intra_pred_ang16_11, 3,4,8 ; intra_ang16 mode 11 (per patch title); last count raised 5 -> 8, the body below uses m0-m7
+ vbroadcasti128 m0, [angHor_tab_11] ; first 16 table bytes, copied into both 128-bit lanes
+ vbroadcasti128 m1, [angHor_tab_11 + mmsize/2] ; table bytes at offset mmsize/2, copied into both lanes
+ mova m2, [pw_1024] ; pmulhrsw multiplier; presumably words of 1024, i.e. (x + 16) >> 5 -- confirm against the constant
+ mova m7, [ang32_shuf_mode9] ; pshufb control reused for every pair of output rows
lea r3, [r1 * 3]
- ; prepare for [0 -1 -2 ...]
- movu xm2, [r2 + 32]
- ; TODO: input reference pixel buffer need a duplicate of pixel_lt to avoid reduce instruction in every mode
- pinsrb xm2, [r2], 0
- pshufb xm2, [intra_pred_shuff_0_8] ; [0 1 1 2 2 3 3 4 4 5 5 6 6 7 7 8]
-
-
- vpbroadcastw m3, xm2 ; word [1 0]
- psrldq xm2, 2
- vpbroadcastw m4, xm2 ; word [2 1]
- psrldq xm2, 2
- pmaddubsw m3, m0
- pmaddubsw m4, m0
- pmulhrsw m3, m1
- pmulhrsw m4, m1
- packuswb m3, m4
- vpermq m3, m3, q3120
- movu [r0], xm3
- vextracti128 [r0 + r1], m3, 1
-
- vpbroadcastw m3, xm2 ; word [3 2]
- psrldq xm2, 2
- vpbroadcastw m4, xm2 ; word [4 3]
- psrldq xm2, 2
- pmaddubsw m3, m0
- pmaddubsw m4, m0
- pmulhrsw m3, m1
- pmulhrsw m4, m1
- packuswb m3, m4
- vpermq m3, m3, q3120
- movu [r0 + r1 * 2], xm3
- vextracti128 [r0 + r3], m3, 1
+ ; prepare for [0 -1 -2...]
+
+ movu xm3, [r2 + mmsize] ; 16 source bytes starting at r2 + mmsize
+ pinsrb xm3, [r2], 0 ; overwrite byte 0 with the byte stored at [r2]
+ vbroadcasti128 m6, [r2 + mmsize + 16] ; the following 16 source bytes in both lanes (upper half of each palignr window)
+ vinserti128 m3, m3, xm3, 1 ; duplicate the low lane into the high lane
+
+ pshufb m5, m3, m7 ; [ 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 1 2 1 2 1 2 1 2 1 2 1 2 1 2 1 2]
+ pmaddubsw m4, m5, m0 ; byte pairs times the m0 table bytes, summed into words
+ pmaddubsw m5, m1 ; same byte pairs times the m1 table bytes
+ pmulhrsw m4, m2
+ pmulhrsw m5, m2
+ packuswb m4, m5 ; per lane: 8 saturated bytes from m4 followed by 8 from m5
+ movu [r0], xm4 ; low lane -> 16 bytes at [r0]
+ vextracti128 [r0 + r1], m4, 1 ; high lane -> 16 bytes at [r0 + r1]
+
+ palignr m5, m6, m3, 2 ; per lane: m6:m3 concatenation shifted right by 2 bytes
+ pshufb m5, m7
+ pmaddubsw m4, m5, m0
+ pmaddubsw m5, m1
+ pmulhrsw m4, m2
+ pmulhrsw m5, m2
+ packuswb m4, m5
+ movu [r0 + r1 * 2], xm4
+ vextracti128 [r0 + r3], m4, 1
+
lea r0, [r0 + r1 * 4]
- vpbroadcastw m3, xm2 ; word [5 4]
- psrldq xm2, 2
- vpbroadcastw m4, xm2 ; word [6 5]
- psrldq xm2, 2
- pmaddubsw m3, m0
- pmaddubsw m4, m0
- pmulhrsw m3, m1
- pmulhrsw m4, m1
- packuswb m3, m4
- vpermq m3, m3, q3120
- movu [r0], xm3
- vextracti128 [r0 + r1], m3, 1
-
- vpbroadcastw m3, xm2 ; word [7 6]
- psrldq xm2, 2
- vpbroadcastw m4, xm2 ; word [8 7]
- pmaddubsw m3, m0
- pmaddubsw m4, m0
- pmulhrsw m3, m1
- pmulhrsw m4, m1
- packuswb m3, m4
- vpermq m3, m3, q3120
- movu [r0 + r1 * 2], xm3
- vextracti128 [r0 + r3], m3, 1
+ palignr m5, m6, m3, 4 ; window shifted right by 4 bytes
+ pshufb m5, m7
+ pmaddubsw m4, m5, m0
+ pmaddubsw m5, m1
+ pmulhrsw m4, m2
+ pmulhrsw m5, m2
+ packuswb m4, m5
+ movu [r0], xm4
+ vextracti128 [r0 + r1], m4, 1
+
+ palignr m5, m6, m3, 6 ; window shifted right by 6 bytes
+ pshufb m5, m7
+ pmaddubsw m4, m5, m0
+ pmaddubsw m5, m1
+ pmulhrsw m4, m2
+ pmulhrsw m5, m2
+ packuswb m4, m5
+ movu [r0 + r1 * 2], xm4
+ vextracti128 [r0 + r3], m4, 1
+
lea r0, [r0 + r1 * 4]
- ; loading new reference pixels
- movu xm2, [r2 + 32 + 8]
- pshufb xm2, [intra_pred_shuff_0_8] ; [8 9 9 A A B B C C D D E E F F 10]
-
- vpbroadcastw m3, xm2 ; word [9 8]
- psrldq xm2, 2
- vpbroadcastw m4, xm2 ; word [A 9]
- psrldq xm2, 2
- pmaddubsw m3, m0
- pmaddubsw m4, m0
- pmulhrsw m3, m1
- pmulhrsw m4, m1
- packuswb m3, m4
- vpermq m3, m3, q3120
- movu [r0], xm3
- vextracti128 [r0 + r1], m3, 1
-
- vpbroadcastw m3, xm2 ; word [B A]
- psrldq xm2, 2
- vpbroadcastw m4, xm2 ; word [C B]
- psrldq xm2, 2
- pmaddubsw m3, m0
- pmaddubsw m4, m0
- pmulhrsw m3, m1
- pmulhrsw m4, m1
- packuswb m3, m4
- vpermq m3, m3, q3120
- movu [r0 + r1 * 2], xm3
- vextracti128 [r0 + r3], m3, 1
+ palignr m5, m6, m3, 8 ; window shifted right by 8 bytes
+ pshufb m5, m7
+ pmaddubsw m4, m5, m0
+ pmaddubsw m5, m1
+ pmulhrsw m4, m2
+ pmulhrsw m5, m2
+ packuswb m4, m5
+ movu [r0], xm4
+ vextracti128 [r0 + r1], m4, 1
+
+ palignr m5, m6, m3, 10 ; window shifted right by 10 bytes
+ pshufb m5, m7
+ pmaddubsw m4, m5, m0
+ pmaddubsw m5, m1
+ pmulhrsw m4, m2
+ pmulhrsw m5, m2
+ packuswb m4, m5
+ movu [r0 + r1 * 2], xm4
+ vextracti128 [r0 + r3], m4, 1
+
lea r0, [r0 + r1 * 4]
- vpbroadcastw m3, xm2 ; word [D C]
- psrldq xm2, 2
- vpbroadcastw m4, xm2 ; word [E D]
- psrldq xm2, 2
- pmaddubsw m3, m0
- pmaddubsw m4, m0
- pmulhrsw m3, m1
- pmulhrsw m4, m1
- packuswb m3, m4
- vpermq m3, m3, q3120
- movu [r0], xm3
- vextracti128 [r0 + r1], m3, 1
-
- vpbroadcastw m3, xm2 ; word [F E]
- psrldq xm2, 2
- vpbroadcastw m4, xm2 ; word [10 F]
- pmaddubsw m3, m0
- pmaddubsw m4, m0
- pmulhrsw m3, m1
- pmulhrsw m4, m1
- packuswb m3, m4
- vpermq m3, m3, q3120
- movu [r0 + r1 * 2], xm3
- vextracti128 [r0 + r3], m3, 1
- RET
+ palignr m5, m6, m3, 12 ; window shifted right by 12 bytes
+ pshufb m5, m7
+ pmaddubsw m4, m5, m0
+ pmaddubsw m5, m1
+ pmulhrsw m4, m2
+ pmulhrsw m5, m2
+ packuswb m4, m5
+ movu [r0], xm4
+ vextracti128 [r0 + r1], m4, 1
+
+ palignr m5, m6, m3, 14 ; window shifted right by 14 bytes (last pair of stores)
+ pshufb m5, m7
+ pmaddubsw m4, m5, m0
+ pmaddubsw m5, m1
+ pmulhrsw m4, m2
+ pmulhrsw m5, m2
+ packuswb m4, m5
+ movu [r0 + r1 * 2], xm4
+ vextracti128 [r0 + r3], m4, 1
+ RET
; transpose 8x32 to 16x16, used for intra_ang16x16 avx2 asm
More information about the x265-devel
mailing list