<div dir="ltr">Please ignore the duplicate (second) patch; it was sent by mistake.</div><div class="gmail_extra"><br clear="all"><div><div class="gmail_signature"><div dir="ltr">Regards,<div>Praveen</div></div></div></div>
<br><div class="gmail_quote">On Fri, Mar 27, 2015 at 10:41 AM, <span dir="ltr">&lt;<a href="mailto:praveen@multicorewareinc.com" target="_blank">praveen@multicorewareinc.com</a>&gt;</span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><div class="HOEnZb"><div class="h5"># HG changeset patch<br>
# User Praveen Tiwari &lt;<a href="mailto:praveen@multicorewareinc.com">praveen@multicorewareinc.com</a>&gt;<br>
# Date 1427356204 -19800<br>
# Thu Mar 26 13:20:04 2015 +0530<br>
# Branch stable<br>
# Node ID 24bdb3e594556ca6e12ee9dae58100a6bd115d2a<br>
# Parent 3d0f23cb0e58585e490362587022e67cfded143a<br>
asm: intra_pred_ang32_33 improved by ~35% over SSE4<br>
<br>
AVX2:<br>
intra_ang_32x32[33] 11.11x 2618.69 29084.27<br>
<br>
SSE4:<br>
intra_ang_32x32[33] 7.59x 4055.42 30792.64<br>
<br>
diff -r 3d0f23cb0e58 -r 24bdb3e59455 source/common/x86/asm-primitives.cpp<br>
--- a/source/common/x86/asm-primitives.cpp Thu Mar 26 15:09:51 2015 -0500<br>
+++ b/source/common/x86/asm-primitives.cpp Thu Mar 26 13:20:04 2015 +0530<br>
@@ -1642,6 +1642,7 @@<br>
<a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_32x32].intra_pred[30] = x265_intra_pred_ang32_30_avx2;<br>
<a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_32x32].intra_pred[31] = x265_intra_pred_ang32_31_avx2;<br>
<a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_32x32].intra_pred[32] = x265_intra_pred_ang32_32_avx2;<br>
+ <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_32x32].intra_pred[33] = x265_intra_pred_ang32_33_avx2;<br>
<br>
// copy_sp primitives<br>
<a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_16x16].copy_sp = x265_blockcopy_sp_16x16_avx2;<br>
diff -r 3d0f23cb0e58 -r 24bdb3e59455 source/common/x86/intrapred.h<br>
--- a/source/common/x86/intrapred.h Thu Mar 26 15:09:51 2015 -0500<br>
+++ b/source/common/x86/intrapred.h Thu Mar 26 13:20:04 2015 +0530<br>
@@ -212,6 +212,7 @@<br>
void x265_intra_pred_ang32_30_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);<br>
void x265_intra_pred_ang32_31_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);<br>
void x265_intra_pred_ang32_32_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);<br>
+void x265_intra_pred_ang32_33_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);<br>
void x265_all_angs_pred_4x4_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);<br>
void x265_all_angs_pred_8x8_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);<br>
void x265_all_angs_pred_16x16_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);<br>
diff -r 3d0f23cb0e58 -r 24bdb3e59455 source/common/x86/intrapred8.asm<br>
--- a/source/common/x86/intrapred8.asm Thu Mar 26 15:09:51 2015 -0500<br>
+++ b/source/common/x86/intrapred8.asm Thu Mar 26 13:20:04 2015 +0530<br>
@@ -376,6 +376,37 @@<br>
db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11<br>
db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0<br>
<br>
+<br>
+ALIGN 32<br>
+c_ang32_mode_33: db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26<br>
+ db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20<br>
+ db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14<br>
+ db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8<br>
+ db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28<br>
+ db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22<br>
+ db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16<br>
+ db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10<br>
+ db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30<br>
+ db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24<br>
+ db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18<br>
+ db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12<br>
+ db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6<br>
+ db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26<br>
+ db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20<br>
+ db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14<br>
+ db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8<br>
+ db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28<br>
+ db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22<br>
+ db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16<br>
+ db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10<br>
+ db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30<br>
+ db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24<br>
+ db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18<br>
+ db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12<br>
+ db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6<br>
+ db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0<br>
+<br>
+<br>
ALIGN 32<br>
;; (blkSize - 1 - x)<br>
pw_planar4_0: dw 3, 2, 1, 0, 3, 2, 1, 0<br>
@@ -13514,5 +13545,568 @@<br>
vpermq m6, m6, 11011000b<br>
movu [r0 + r3], m6<br>
RET<br>
+<br>
+INIT_YMM avx2<br>
+cglobal intra_pred_ang32_33, 3, 5, 11<br>
+ mova m0, [pw_1024]<br>
+ mova m1, [intra_pred_shuff_0_8]<br>
+ lea r3, [3 * r1]<br>
+ lea r4, [c_ang32_mode_33]<br>
+<br>
+ ;row [0]<br>
+ vbroadcasti128 m2, [r2 + 1]<br>
+ pshufb m2, m1<br>
+ vbroadcasti128 m3, [r2 + 9]<br>
+ pshufb m3, m1<br>
+ vbroadcasti128 m4, [r2 + 17]<br>
+ pshufb m4, m1<br>
+ vbroadcasti128 m5, [r2 + 25]<br>
+ pshufb m5, m1<br>
+<br>
+ mova m10, [r4 + 0 * mmsize]<br>
+ vperm2i128 m6, m2, m3, 00100000b<br>
+ pmaddubsw m6, m10<br>
+ pmulhrsw m6, m0<br>
+ vperm2i128 m7, m4, m5, 00100000b<br>
+ pmaddubsw m7, m10<br>
+ pmulhrsw m7, m0<br>
+ packuswb m6, m7<br>
+ vpermq m6, m6, 11011000b<br>
+ movu [r0], m6<br>
+<br>
+ ;row [1]<br>
+ vbroadcasti128 m2, [r2 + 2]<br>
+ pshufb m2, m1<br>
+ vbroadcasti128 m3, [r2 + 10]<br>
+ pshufb m3, m1<br>
+ vbroadcasti128 m4, [r2 + 18]<br>
+ pshufb m4, m1<br>
+ vbroadcasti128 m5, [r2 + 26]<br>
+ pshufb m5, m1<br>
+<br>
+ mova m10, [r4 + 1 * mmsize]<br>
+ vperm2i128 m6, m2, m3, 00100000b<br>
+ pmaddubsw m6, m10<br>
+ pmulhrsw m6, m0<br>
+ vperm2i128 m7, m4, m5, 00100000b<br>
+ pmaddubsw m7, m10<br>
+ pmulhrsw m7, m0<br>
+ packuswb m6, m7<br>
+ vpermq m6, m6, 11011000b<br>
+ movu [r0 + r1], m6<br>
+<br>
+ ;row [2]<br>
+ vbroadcasti128 m2, [r2 + 3]<br>
+ pshufb m2, m1<br>
+ vbroadcasti128 m3, [r2 + 11]<br>
+ pshufb m3, m1<br>
+ vbroadcasti128 m4, [r2 + 19]<br>
+ pshufb m4, m1<br>
+ vbroadcasti128 m5, [r2 + 27]<br>
+ pshufb m5, m1<br>
+<br>
+ mova m10, [r4 + 2 * mmsize]<br>
+ vperm2i128 m6, m2, m3, 00100000b<br>
+ pmaddubsw m6, m10<br>
+ pmulhrsw m6, m0<br>
+ vperm2i128 m7, m4, m5, 00100000b<br>
+ pmaddubsw m7, m10<br>
+ pmulhrsw m7, m0<br>
+ packuswb m6, m7<br>
+ vpermq m6, m6, 11011000b<br>
+ movu [r0 + 2 * r1], m6<br>
+<br>
+ ;row [3]<br>
+ vbroadcasti128 m2, [r2 + 4]<br>
+ pshufb m2, m1<br>
+ vbroadcasti128 m3, [r2 + 12]<br>
+ pshufb m3, m1<br>
+ vbroadcasti128 m4, [r2 + 20]<br>
+ pshufb m4, m1<br>
+ vbroadcasti128 m5, [r2 + 28]<br>
+ pshufb m5, m1<br>
+<br>
+ mova m10, [r4 + 3 * mmsize]<br>
+ vperm2i128 m6, m2, m3, 00100000b<br>
+ pmaddubsw m6, m10<br>
+ pmulhrsw m6, m0<br>
+ vperm2i128 m7, m4, m5, 00100000b<br>
+ pmaddubsw m7, m10<br>
+ pmulhrsw m7, m0<br>
+ packuswb m6, m7<br>
+ vpermq m6, m6, 11011000b<br>
+ movu [r0 + r3], m6<br>
+<br>
+ ;row [4, 5]<br>
+ vbroadcasti128 m2, [r2 + 5]<br>
+ pshufb m2, m1<br>
+ vbroadcasti128 m3, [r2 + 13]<br>
+ pshufb m3, m1<br>
+ vbroadcasti128 m4, [r2 + 21]<br>
+ pshufb m4, m1<br>
+ vbroadcasti128 m5, [r2 + 29]<br>
+ pshufb m5, m1<br>
+<br>
+ add r4, 4 * mmsize<br>
+ lea r0, [r0 + 4 * r1]<br>
+ mova m10, [r4 + 0 * mmsize]<br>
+<br>
+ INTRA_PRED_ANG32_CAL_ROW<br>
+ movu [r0], m7<br>
+ movu [r0 + r1], m6<br>
+<br>
+ ;row [6]<br>
+ vbroadcasti128 m2, [r2 + 6]<br>
+ pshufb m2, m1<br>
+ vbroadcasti128 m3, [r2 + 14]<br>
+ pshufb m3, m1<br>
+ vbroadcasti128 m4, [r2 + 22]<br>
+ pshufb m4, m1<br>
+ vbroadcasti128 m5, [r2 + 30]<br>
+ pshufb m5, m1<br>
+<br>
+ mova m10, [r4 + 1 * mmsize]<br>
+ vperm2i128 m6, m2, m3, 00100000b<br>
+ pmaddubsw m6, m10<br>
+ pmulhrsw m6, m0<br>
+ vperm2i128 m7, m4, m5, 00100000b<br>
+ pmaddubsw m7, m10<br>
+ pmulhrsw m7, m0<br>
+ packuswb m6, m7<br>
+ vpermq m6, m6, 11011000b<br>
+ movu [r0 + 2 * r1], m6<br>
+<br>
+ ;row [7]<br>
+ vbroadcasti128 m2, [r2 + 7]<br>
+ pshufb m2, m1<br>
+ vbroadcasti128 m3, [r2 + 15]<br>
+ pshufb m3, m1<br>
+ vbroadcasti128 m4, [r2 + 23]<br>
+ pshufb m4, m1<br>
+ vbroadcasti128 m5, [r2 + 31]<br>
+ pshufb m5, m1<br>
+<br>
+ mova m10, [r4 + 2 * mmsize]<br>
+ vperm2i128 m6, m2, m3, 00100000b<br>
+ pmaddubsw m6, m10<br>
+ pmulhrsw m6, m0<br>
+ vperm2i128 m7, m4, m5, 00100000b<br>
+ pmaddubsw m7, m10<br>
+ pmulhrsw m7, m0<br>
+ packuswb m6, m7<br>
+ vpermq m6, m6, 11011000b<br>
+ movu [r0 + r3], m6<br>
+<br>
+ ;row [8]<br>
+ vbroadcasti128 m2, [r2 + 8]<br>
+ pshufb m2, m1<br>
+ vbroadcasti128 m3, [r2 + 16]<br>
+ pshufb m3, m1<br>
+ vbroadcasti128 m4, [r2 + 24]<br>
+ pshufb m4, m1<br>
+ vbroadcasti128 m5, [r2 + 32]<br>
+ pshufb m5, m1<br>
+<br>
+ lea r0, [r0 + 4 * r1]<br>
+ mova m10, [r4 + 3 * mmsize]<br>
+ vperm2i128 m6, m2, m3, 00100000b<br>
+ pmaddubsw m6, m10<br>
+ pmulhrsw m6, m0<br>
+ vperm2i128 m7, m4, m5, 00100000b<br>
+ pmaddubsw m7, m10<br>
+ pmulhrsw m7, m0<br>
+ packuswb m6, m7<br>
+ vpermq m6, m6, 11011000b<br>
+ movu [r0], m6<br>
+<br>
+ ;row [9, 10]<br>
+ vbroadcasti128 m2, [r2 + 9]<br>
+ pshufb m2, m1<br>
+ vbroadcasti128 m3, [r2 + 17]<br>
+ pshufb m3, m1<br>
+ vbroadcasti128 m4, [r2 + 25]<br>
+ pshufb m4, m1<br>
+ vbroadcasti128 m5, [r2 + 33]<br>
+ pshufb m5, m1<br>
+<br>
+ add r4, 4 * mmsize<br>
+ mova m10, [r4 + 0 * mmsize]<br>
+<br>
+ INTRA_PRED_ANG32_CAL_ROW<br>
+ movu [r0 + r1], m7<br>
+ movu [r0 + 2 * r1], m6<br>
+<br>
+ ;row [11]<br>
+ vbroadcasti128 m2, [r2 + 10]<br>
+ pshufb m2, m1<br>
+ vbroadcasti128 m3, [r2 + 18]<br>
+ pshufb m3, m1<br>
+ vbroadcasti128 m4, [r2 + 26]<br>
+ pshufb m4, m1<br>
+ vbroadcasti128 m5, [r2 + 34]<br>
+ pshufb m5, m1<br>
+<br>
+ mova m10, [r4 + 1 * mmsize]<br>
+ vperm2i128 m6, m2, m3, 00100000b<br>
+ pmaddubsw m6, m10<br>
+ pmulhrsw m6, m0<br>
+ vperm2i128 m7, m4, m5, 00100000b<br>
+ pmaddubsw m7, m10<br>
+ pmulhrsw m7, m0<br>
+ packuswb m6, m7<br>
+ vpermq m6, m6, 11011000b<br>
+ movu [r0 + r3], m6<br>
+<br>
+ ;row [12]<br>
+ vbroadcasti128 m2, [r2 + 11]<br>
+ pshufb m2, m1<br>
+ vbroadcasti128 m3, [r2 + 19]<br>
+ pshufb m3, m1<br>
+ vbroadcasti128 m4, [r2 + 27]<br>
+ pshufb m4, m1<br>
+ vbroadcasti128 m5, [r2 + 35]<br>
+ pshufb m5, m1<br>
+<br>
+ lea r0, [r0 + 4 * r1]<br>
+ mova m10, [r4 + 2 * mmsize]<br>
+ vperm2i128 m6, m2, m3, 00100000b<br>
+ pmaddubsw m6, m10<br>
+ pmulhrsw m6, m0<br>
+ vperm2i128 m7, m4, m5, 00100000b<br>
+ pmaddubsw m7, m10<br>
+ pmulhrsw m7, m0<br>
+ packuswb m6, m7<br>
+ vpermq m6, m6, 11011000b<br>
+ movu [r0], m6<br>
+<br>
+ ;row [13]<br>
+ vbroadcasti128 m2, [r2 + 12]<br>
+ pshufb m2, m1<br>
+ vbroadcasti128 m3, [r2 + 20]<br>
+ pshufb m3, m1<br>
+ vbroadcasti128 m4, [r2 + 28]<br>
+ pshufb m4, m1<br>
+ vbroadcasti128 m5, [r2 + 36]<br>
+ pshufb m5, m1<br>
+<br>
+ mova m10, [r4 + 3 * mmsize]<br>
+ vperm2i128 m6, m2, m3, 00100000b<br>
+ pmaddubsw m6, m10<br>
+ pmulhrsw m6, m0<br>
+ vperm2i128 m7, m4, m5, 00100000b<br>
+ pmaddubsw m7, m10<br>
+ pmulhrsw m7, m0<br>
+ packuswb m6, m7<br>
+ vpermq m6, m6, 11011000b<br>
+ movu [r0 + r1], m6<br>
+<br>
+ ;row [14]<br>
+ vbroadcasti128 m2, [r2 + 13]<br>
+ pshufb m2, m1<br>
+ vbroadcasti128 m3, [r2 + 21]<br>
+ pshufb m3, m1<br>
+ vbroadcasti128 m4, [r2 + 29]<br>
+ pshufb m4, m1<br>
+ vbroadcasti128 m5, [r2 + 37]<br>
+ pshufb m5, m1<br>
+<br>
+ add r4, 4 * mmsize<br>
+ mova m10, [r4 + 0 * mmsize]<br>
+ vperm2i128 m6, m2, m3, 00100000b<br>
+ pmaddubsw m6, m10<br>
+ pmulhrsw m6, m0<br>
+ vperm2i128 m7, m4, m5, 00100000b<br>
+ pmaddubsw m7, m10<br>
+ pmulhrsw m7, m0<br>
+ packuswb m6, m7<br>
+ vpermq m6, m6, 11011000b<br>
+ movu [r0 + 2 * r1], m6<br>
+<br>
+ ;row [15, 16]<br>
+ vbroadcasti128 m2, [r2 + 14]<br>
+ pshufb m2, m1<br>
+ vbroadcasti128 m3, [r2 + 22]<br>
+ pshufb m3, m1<br>
+ vbroadcasti128 m4, [r2 + 30]<br>
+ pshufb m4, m1<br>
+ vbroadcasti128 m5, [r2 + 38]<br>
+ pshufb m5, m1<br>
+<br>
+ mova m10, [r4 + 1 * mmsize]<br>
+<br>
+ INTRA_PRED_ANG32_CAL_ROW<br>
+ movu [r0 + r3], m7<br>
+ lea r0, [r0 + 4 * r1]<br>
+ movu [r0], m6<br>
+<br>
+ ;row [17]<br>
+ vbroadcasti128 m2, [r2 + 15]<br>
+ pshufb m2, m1<br>
+ vbroadcasti128 m3, [r2 + 23]<br>
+ pshufb m3, m1<br>
+ vbroadcasti128 m4, [r2 + 31]<br>
+ pshufb m4, m1<br>
+ vbroadcasti128 m5, [r2 + 39]<br>
+ pshufb m5, m1<br>
+<br>
+ mova m10, [r4 + 2 * mmsize]<br>
+ vperm2i128 m6, m2, m3, 00100000b<br>
+ pmaddubsw m6, m10<br>
+ pmulhrsw m6, m0<br>
+ vperm2i128 m7, m4, m5, 00100000b<br>
+ pmaddubsw m7, m10<br>
+ pmulhrsw m7, m0<br>
+ packuswb m6, m7<br>
+ vpermq m6, m6, 11011000b<br>
+ movu [r0 + r1], m6<br>
+<br>
+ ;row [18]<br>
+ vbroadcasti128 m2, [r2 + 16]<br>
+ pshufb m2, m1<br>
+ vbroadcasti128 m3, [r2 + 24]<br>
+ pshufb m3, m1<br>
+ vbroadcasti128 m4, [r2 + 32]<br>
+ pshufb m4, m1<br>
+ vbroadcasti128 m5, [r2 + 40]<br>
+ pshufb m5, m1<br>
+<br>
+ mova m10, [r4 + 3 * mmsize]<br>
+ vperm2i128 m6, m2, m3, 00100000b<br>
+ pmaddubsw m6, m10<br>
+ pmulhrsw m6, m0<br>
+ vperm2i128 m7, m4, m5, 00100000b<br>
+ pmaddubsw m7, m10<br>
+ pmulhrsw m7, m0<br>
+ packuswb m6, m7<br>
+ vpermq m6, m6, 11011000b<br>
+ movu [r0 + 2 * r1], m6<br>
+<br>
+ ;row [19]<br>
+ vbroadcasti128 m2, [r2 + 17]<br>
+ pshufb m2, m1<br>
+ vbroadcasti128 m3, [r2 + 25]<br>
+ pshufb m3, m1<br>
+ vbroadcasti128 m4, [r2 + 33]<br>
+ pshufb m4, m1<br>
+ vbroadcasti128 m5, [r2 + 41]<br>
+ pshufb m5, m1<br>
+<br>
+ add r4, 4 * mmsize<br>
+ mova m10, [r4 + 0 * mmsize]<br>
+ vperm2i128 m6, m2, m3, 00100000b<br>
+ pmaddubsw m6, m10<br>
+ pmulhrsw m6, m0<br>
+ vperm2i128 m7, m4, m5, 00100000b<br>
+ pmaddubsw m7, m10<br>
+ pmulhrsw m7, m0<br>
+ packuswb m6, m7<br>
+ vpermq m6, m6, 11011000b<br>
+ movu [r0 + r3], m6<br>
+<br>
+ ;row [20, 21]<br>
+ vbroadcasti128 m2, [r2 + 18]<br>
+ pshufb m2, m1<br>
+ vbroadcasti128 m3, [r2 + 26]<br>
+ pshufb m3, m1<br>
+ vbroadcasti128 m4, [r2 + 34]<br>
+ pshufb m4, m1<br>
+ vbroadcasti128 m5, [r2 + 42]<br>
+ pshufb m5, m1<br>
+<br>
+ lea r0, [r0 + 4 * r1]<br>
+ mova m10, [r4 + 1 * mmsize]<br>
+<br>
+ INTRA_PRED_ANG32_CAL_ROW<br>
+ movu [r0], m7<br>
+ movu [r0 + r1], m6<br>
+<br>
+ ;row [22]<br>
+ vbroadcasti128 m2, [r2 + 19]<br>
+ pshufb m2, m1<br>
+ vbroadcasti128 m3, [r2 + 27]<br>
+ pshufb m3, m1<br>
+ vbroadcasti128 m4, [r2 + 35]<br>
+ pshufb m4, m1<br>
+ vbroadcasti128 m5, [r2 + 43]<br>
+ pshufb m5, m1<br>
+<br>
+ mova m10, [r4 + 2 * mmsize]<br>
+ vperm2i128 m6, m2, m3, 00100000b<br>
+ pmaddubsw m6, m10<br>
+ pmulhrsw m6, m0<br>
+ vperm2i128 m7, m4, m5, 00100000b<br>
+ pmaddubsw m7, m10<br>
+ pmulhrsw m7, m0<br>
+ packuswb m6, m7<br>
+ vpermq m6, m6, 11011000b<br>
+ movu [r0 + 2 * r1], m6<br>
+<br>
+ ;row [23]<br>
+ vbroadcasti128 m2, [r2 + 20]<br>
+ pshufb m2, m1<br>
+ vbroadcasti128 m3, [r2 + 28]<br>
+ pshufb m3, m1<br>
+ vbroadcasti128 m4, [r2 + 36]<br>
+ pshufb m4, m1<br>
+ vbroadcasti128 m5, [r2 + 44]<br>
+ pshufb m5, m1<br>
+<br>
+ mova m10, [r4 + 3 * mmsize]<br>
+ vperm2i128 m6, m2, m3, 00100000b<br>
+ pmaddubsw m6, m10<br>
+ pmulhrsw m6, m0<br>
+ vperm2i128 m7, m4, m5, 00100000b<br>
+ pmaddubsw m7, m10<br>
+ pmulhrsw m7, m0<br>
+ packuswb m6, m7<br>
+ vpermq m6, m6, 11011000b<br>
+ movu [r0 + r3], m6<br>
+<br>
+ ;row [24]<br>
+ vbroadcasti128 m2, [r2 + 21]<br>
+ pshufb m2, m1<br>
+ vbroadcasti128 m3, [r2 + 29]<br>
+ pshufb m3, m1<br>
+ vbroadcasti128 m4, [r2 + 37]<br>
+ pshufb m4, m1<br>
+ vbroadcasti128 m5, [r2 + 45]<br>
+ pshufb m5, m1<br>
+<br>
+ add r4, 4 * mmsize<br>
+ lea r0, [r0 + 4 * r1]<br>
+ mova m10, [r4 + 0 * mmsize]<br>
+ vperm2i128 m6, m2, m3, 00100000b<br>
+ pmaddubsw m6, m10<br>
+ pmulhrsw m6, m0<br>
+ vperm2i128 m7, m4, m5, 00100000b<br>
+ pmaddubsw m7, m10<br>
+ pmulhrsw m7, m0<br>
+ packuswb m6, m7<br>
+ vpermq m6, m6, 11011000b<br>
+ movu [r0], m6<br>
+<br>
+ ;row [25, 26]<br>
+ vbroadcasti128 m2, [r2 + 22]<br>
+ pshufb m2, m1<br>
+ vbroadcasti128 m3, [r2 + 30]<br>
+ pshufb m3, m1<br>
+ vbroadcasti128 m4, [r2 + 38]<br>
+ pshufb m4, m1<br>
+ vbroadcasti128 m5, [r2 + 46]<br>
+ pshufb m5, m1<br>
+<br>
+ mova m10, [r4 + 1 * mmsize]<br>
+<br>
+ INTRA_PRED_ANG32_CAL_ROW<br>
+ movu [r0 + r1], m7<br>
+ movu [r0 + 2 * r1], m6<br>
+<br>
+ ;row [27]<br>
+ vbroadcasti128 m2, [r2 + 23]<br>
+ pshufb m2, m1<br>
+ vbroadcasti128 m3, [r2 + 31]<br>
+ pshufb m3, m1<br>
+ vbroadcasti128 m4, [r2 + 39]<br>
+ pshufb m4, m1<br>
+ vbroadcasti128 m5, [r2 + 47]<br>
+ pshufb m5, m1<br>
+<br>
+ mova m10, [r4 + 2 * mmsize]<br>
+ vperm2i128 m6, m2, m3, 00100000b<br>
+ pmaddubsw m6, m10<br>
+ pmulhrsw m6, m0<br>
+ vperm2i128 m7, m4, m5, 00100000b<br>
+ pmaddubsw m7, m10<br>
+ pmulhrsw m7, m0<br>
+ packuswb m6, m7<br>
+ vpermq m6, m6, 11011000b<br>
+ movu [r0 + r3], m6<br>
+<br>
+ ;row [28]<br>
+ vbroadcasti128 m2, [r2 + 24]<br>
+ pshufb m2, m1<br>
+ vbroadcasti128 m3, [r2 + 32]<br>
+ pshufb m3, m1<br>
+ vbroadcasti128 m4, [r2 + 40]<br>
+ pshufb m4, m1<br>
+ vbroadcasti128 m5, [r2 + 48]<br>
+ pshufb m5, m1<br>
+<br>
+ lea r0, [r0 + 4 * r1]<br>
+ mova m10, [r4 + 3 * mmsize]<br>
+ vperm2i128 m6, m2, m3, 00100000b<br>
+ pmaddubsw m6, m10<br>
+ pmulhrsw m6, m0<br>
+ vperm2i128 m7, m4, m5, 00100000b<br>
+ pmaddubsw m7, m10<br>
+ pmulhrsw m7, m0<br>
+ packuswb m6, m7<br>
+ vpermq m6, m6, 11011000b<br>
+ movu [r0], m6<br>
+<br>
+ ;row [29]<br>
+ vbroadcasti128 m2, [r2 + 25]<br>
+ pshufb m2, m1<br>
+ vbroadcasti128 m3, [r2 + 33]<br>
+ pshufb m3, m1<br>
+ vbroadcasti128 m4, [r2 + 41]<br>
+ pshufb m4, m1<br>
+ vbroadcasti128 m5, [r2 + 49]<br>
+ pshufb m5, m1<br>
+<br>
+ add r4, 4 * mmsize<br>
+ mova m10, [r4 + 0 * mmsize]<br>
+ vperm2i128 m6, m2, m3, 00100000b<br>
+ pmaddubsw m6, m10<br>
+ pmulhrsw m6, m0<br>
+ vperm2i128 m7, m4, m5, 00100000b<br>
+ pmaddubsw m7, m10<br>
+ pmulhrsw m7, m0<br>
+ packuswb m6, m7<br>
+ vpermq m6, m6, 11011000b<br>
+ movu [r0 + r1], m6<br>
+<br>
+ ;row [30]<br>
+ vbroadcasti128 m2, [r2 + 26]<br>
+ pshufb m2, m1<br>
+ vbroadcasti128 m3, [r2 + 34]<br>
+ pshufb m3, m1<br>
+ vbroadcasti128 m4, [r2 + 42]<br>
+ pshufb m4, m1<br>
+ vbroadcasti128 m5, [r2 + 50]<br>
+ pshufb m5, m1<br>
+<br>
+ mova m10, [r4 + 1 * mmsize]<br>
+ vperm2i128 m6, m2, m3, 00100000b<br>
+ pmaddubsw m6, m10<br>
+ pmulhrsw m6, m0<br>
+ vperm2i128 m7, m4, m5, 00100000b<br>
+ pmaddubsw m7, m10<br>
+ pmulhrsw m7, m0<br>
+ packuswb m6, m7<br>
+ vpermq m6, m6, 11011000b<br>
+ movu [r0 + 2 * r1], m6<br>
+<br>
+ ;row [31]<br>
+ vbroadcasti128 m2, [r2 + 27]<br>
+ pshufb m2, m1<br>
+ vbroadcasti128 m3, [r2 + 35]<br>
+ pshufb m3, m1<br>
+ vbroadcasti128 m4, [r2 + 43]<br>
+ pshufb m4, m1<br>
+ vbroadcasti128 m5, [r2 + 51]<br>
+ pshufb m5, m1<br>
+<br>
+ mova m10, [r4 + 2 * mmsize]<br>
+ vperm2i128 m6, m2, m3, 00100000b<br>
+ pmaddubsw m6, m10<br>
+ pmulhrsw m6, m0<br>
+ vperm2i128 m7, m4, m5, 00100000b<br>
+ pmaddubsw m7, m10<br>
+ pmulhrsw m7, m0<br>
+ packuswb m6, m7<br>
+ vpermq m6, m6, 11011000b<br>
+ movu [r0 + r3], m6<br>
+ RET<br>
%endif<br>
<br>
</div></div></blockquote></div><br></div>