<div dir="ltr"><br clear="all"><div><div class="gmail_signature"><div dir="ltr">Regards,<div>Praveen</div></div></div></div>
---------- Forwarded message ----------
From: <dnyaneshwar@multicorewareinc.com>
Date: Thu, Apr 16, 2015 at 3:43 PM
Subject: [x265] [PATCH] asm: avx2 code for satd_48x64 and 64xN, improved over ~100% than SSE
To: x265-devel@videolan.org


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar@multicorewareinc.com>
# Date 1429173485 -19800
#      Thu Apr 16 14:08:05 2015 +0530
# Node ID 04e7526a8bde9e46867f5c4cfb63b98409c7fb44
# Parent  ebca2a0d3ab905b62c346d5d0b23d50c618d5827
asm: avx2 code for satd_48x64 and 64xN, improved over ~100% than SSE

AVX2 (speedup / cycles / reference cycles):
satd[48x64]  12.52x   7696.91    96366.03
satd[64x48]  12.16x   8103.43    98564.64
satd[64x16]  12.15x   2759.65    33537.19
satd[64x32]  12.12x   5372.52    65090.38
satd[64x64]  13.02x  10260.38   133615.69

SSE (speedup / cycles / reference cycles):
satd[48x64]   5.32x  18146.13    96505.38
satd[64x48]   5.33x  18201.03    96975.23
satd[64x16]   5.21x   6272.14    32651.24
satd[64x32]   5.42x  11910.58    64529.81
satd[64x64]   5.30x  26665.73   141387.59


Please correct the commit message: a 100% improvement would mean the new code takes zero cycles. The % improvement is calculated as below:

(old cycles - new cycles) * 100 / old cycles

so for 48x64: (18146 - 7696) * 100 / 18146 = ~57%.

Please also correct any previous commit messages where the improvement was calculated the same way.

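For reference, a minimal sketch applying that formula to all five sizes (assuming, per the calculation above, old = the SSE cycle counts and new = the AVX2 cycle counts quoted in this mail):

// Hedged illustration only: recomputes the % improvement from the cycle
// counts quoted in the tables above.
#include <cstdio>

int main()
{
    struct Row { const char* block; double sse, avx2; } rows[] = {
        { "satd[48x64]", 18146.13,  7696.91 },
        { "satd[64x48]", 18201.03,  8103.43 },
        { "satd[64x16]",  6272.14,  2759.65 },
        { "satd[64x32]", 11910.58,  5372.52 },
        { "satd[64x64]", 26665.73, 10260.38 },
    };
    for (const Row& r : rows)
        printf("%s: %.0f%% improvement over SSE\n",
               r.block, (r.sse - r.avx2) * 100.0 / r.sse);
    return 0;
}

That works out to roughly 55-62% across the five sizes, not "over ~100%".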

diff -r ebca2a0d3ab9 -r 04e7526a8bde source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Apr 16 12:22:53 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Apr 16 14:08:05 2015 +0530
@@ -1707,6 +1707,11 @@
p.pu[LUMA_32x24].satd = x265_pixel_satd_32x24_avx2;
p.pu[LUMA_32x32].satd = x265_pixel_satd_32x32_avx2;
p.pu[LUMA_32x64].satd = x265_pixel_satd_32x64_avx2;
+ p.pu[LUMA_48x64].satd = x265_pixel_satd_48x64_avx2;
+ p.pu[LUMA_64x16].satd = x265_pixel_satd_64x16_avx2;
+ p.pu[LUMA_64x32].satd = x265_pixel_satd_64x32_avx2;
+ p.pu[LUMA_64x48].satd = x265_pixel_satd_64x48_avx2;
+ p.pu[LUMA_64x64].satd = x265_pixel_satd_64x64_avx2;

p.pu[LUMA_32x8].sad = x265_pixel_sad_32x8_avx2;
p.pu[LUMA_32x16].sad = x265_pixel_sad_32x16_avx2;
diff -r ebca2a0d3ab9 -r 04e7526a8bde source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Thu Apr 16 12:22:53 2015 +0530
+++ b/source/common/x86/pixel-a.asm Thu Apr 16 14:08:05 2015 +0530
@@ -10903,4 +10903,279 @@
movd eax, xm0
RET

+cglobal pixel_satd_48x64, 4,8,10 ; if WIN64 && cpuflag(avx2)
+ mova m7, [hmul_16p]
+ lea r4, [3 * r1]
+ lea r5, [3 * r3]
+ pxor m6, m6
+ mov r6, r0
+ mov r7, r2
+
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ lea r0, [r6 + 16]
+ lea r2, [r7 + 16]
+ mova m9, m6 ; to avoid overflow, move to another register
+ pxor m6, m6
+ pmaddwd m9, [pw_1]
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ lea r0, [r6 + 32]
+ lea r2, [r7 + 32]
+ mova m8, m6 ; to avoid overflow, move to another register
+ pxor m6, m6
+ pmaddwd m8, [pw_1]
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+
+ pmaddwd m6, [pw_1]
+ vextracti128 xm2, m9, 1
+ vextracti128 xm1, m8, 1
+ vextracti128 xm0, m6, 1
+ paddd xm2, xm9
+ paddd xm1, xm8
+ paddd xm0, xm6
+ paddd xm0, xm1
+ paddd xm0, xm2
+ movhlps xm7, xm0
+ paddd xm0, xm7
+ pshuflw xm7, xm0, q0032
+ paddd xm0, xm7
+ movd eax, xm0
+ RET
+
+cglobal pixel_satd_64x16, 4,8,8 ; if WIN64 && cpuflag(avx2)
+ mova m7, [hmul_16p]
+ lea r4, [3 * r1]
+ lea r5, [3 * r3]
+ pxor m6, m6
+ mov r6, r0
+ mov r7, r2
+
+ call calc_satd_16x8
+ call calc_satd_16x8
+ lea r0, [r6 + 16]
+ lea r2, [r7 + 16]
+ call calc_satd_16x8
+ call calc_satd_16x8
+ lea r0, [r6 + 32]
+ lea r2, [r7 + 32]
+ call calc_satd_16x8
+ call calc_satd_16x8
+ lea r0, [r6 + 48]
+ lea r2, [r7 + 48]
+ call calc_satd_16x8
+ call calc_satd_16x8
+
+ vextracti128 xm0, m6, 1
+ paddw xm0, xm6
+ pmaddwd xm0, [pw_1]
+ movhlps xm7, xm0
+ paddd xm0, xm7
+ pshuflw xm7, xm0, q0032
+ paddd xm0, xm7
+ movd eax, xm0
+ RET
+
+cglobal pixel_satd_64x32, 4,8,9 ; if WIN64 && cpuflag(avx2)
+ mova m7, [hmul_16p]
+ lea r4, [3 * r1]
+ lea r5, [3 * r3]
+ pxor m6, m6
+ mov r6, r0
+ mov r7, r2
+
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ lea r0, [r6 + 16]
+ lea r2, [r7 + 16]
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ lea r0, [r6 + 32]
+ lea r2, [r7 + 32]
+ mova m8, m6 ; to avoid overflow, move to another register
+ pxor m6, m6
+ pmaddwd m8, [pw_1]
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ lea r0, [r6 + 48]
+ lea r2, [r7 + 48]
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+
+ pmaddwd m6, [pw_1]
+ vextracti128 xm1, m8, 1
+ vextracti128 xm0, m6, 1
+ paddd xm1, xm8
+ paddd xm0, xm6
+ paddd xm0, xm1
+ movhlps xm7, xm0
+ paddd xm0, xm7
+ pshuflw xm7, xm0, q0032
+ paddd xm0, xm7
+ movd eax, xm0
+ RET
+
+cglobal pixel_satd_64x48, 4,8,10 ; if WIN64 && cpuflag(avx2)
+ mova m7, [hmul_16p]
+ lea r4, [3 * r1]
+ lea r5, [3 * r3]
+ pxor m6, m6
+ mov r6, r0
+ mov r7, r2
+
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ lea r0, [r6 + 16]
+ lea r2, [r7 + 16]
+ call calc_satd_16x8
+ call calc_satd_16x8
+ mova m8, m6 ; to avoid overflow, move to another register
+ pxor m6, m6
+ pmaddwd m8, [pw_1]
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ lea r0, [r6 + 32]
+ lea r2, [r7 + 32]
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ mova m9, m6 ; to avoid overflow, move to another register
+ pxor m6, m6
+ pmaddwd m9, [pw_1]
+ call calc_satd_16x8
+ call calc_satd_16x8
+ lea r0, [r6 + 48]
+ lea r2, [r7 + 48]
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+
+ pmaddwd m6, [pw_1]
+ vextracti128 xm2, m9, 1
+ vextracti128 xm1, m8, 1
+ vextracti128 xm0, m6, 1
+ paddd xm2, xm9
+ paddd xm1, xm8
+ paddd xm0, xm6
+ paddd xm0, xm2
+ paddd xm0, xm1
+ movhlps xm7, xm0
+ paddd xm0, xm7
+ pshuflw xm7, xm0, q0032
+ paddd xm0, xm7
+ movd eax, xm0
+ RET
+
+cglobal pixel_satd_64x64, 4,8,11 ; if WIN64 && cpuflag(avx2)
+ mova m7, [hmul_16p]
+ lea r4, [3 * r1]
+ lea r5, [3 * r3]
+ pxor m6, m6
+ mov r6, r0
+ mov r7, r2
+
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ lea r0, [r6 + 16]
+ lea r2, [r7 + 16]
+ mova m10, m6 ; to avoid overflow, move to another register
+ pxor m6, m6
+ pmaddwd m10, [pw_1]
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ lea r0, [r6 + 32]
+ lea r2, [r7 + 32]
+ mova m9, m6 ; to avoid overflow, move to another register
+ pxor m6, m6
+ pmaddwd m9, [pw_1]
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ lea r0, [r6 + 48]
+ lea r2, [r7 + 48]
+ mova m8, m6 ; to avoid overflow, move to another register
+ pxor m6, m6
+ pmaddwd m8, [pw_1]
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+
+ pmaddwd m6, [pw_1]
+ vextracti128 xm3, m10, 1
+ vextracti128 xm2, m9, 1
+ vextracti128 xm1, m8, 1
+ vextracti128 xm0, m6, 1
+ paddd xm3, xm10
+ paddd xm2, xm9
+ paddd xm1, xm8
+ paddd xm0, xm6
+ paddd xm0, xm3
+ paddd xm0, xm2
+ paddd xm0, xm1
+ movhlps xm7, xm0
+ paddd xm0, xm7
+ pshuflw xm7, xm0, q0032
+ paddd xm0, xm7
+ movd eax, xm0
+ RET
+
%endif ; if ARCH_X86_64 == 1
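
A note on the structure of the new routines, in case it helps review: each one walks the block one 16-pixel-wide column at a time and lets calc_satd_16x8 accumulate packed 16-bit partial sums in m6; when the running sum risks overflowing the word lanes it is parked in a spare register and widened to 32 bits with pmaddwd against pw_1, and everything is added and reduced horizontally at the end. A rough scalar sketch of that traversal (illustration only; calc16x8 is a hypothetical stand-in for the shared calc_satd_16x8 helper):

// Hedged sketch, not the x265 code: the column-by-column traversal used by
// the satd_48x64 / 64xN routines above.  In the asm the inner sum lives in a
// packed 16-bit register (m6) and is widened with pmaddwd [pw_1] before it
// can overflow; here a plain 32-bit accumulator stands in for both steps.
#include <cstdint>
#include <functional>

uint32_t satd_wide(int width, int height,
                   const std::function<uint32_t(int col16, int row8)>& calc16x8)
{
    uint32_t total = 0;
    for (int c = 0; c < width / 16; c++)       // 3 columns for 48xN, 4 for 64xN
    {
        uint32_t colSum = 0;                   // per-column running sum
        for (int r = 0; r < height / 8; r++)   // e.g. 8 calls for a 64-row column
            colSum += calc16x8(c, r);          // SATD of one 16x8 sub-block
        total += colSum;                       // final reduction across columns
    }
    return total;
}
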
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel