<div dir="ltr"><div>Please ignore the previous mail. Below is the most recent updated version of the patch.</div><div><br></div><div># HG changeset patch</div><div># User Ramya Sriraman <<a href="mailto:ramya@multicorewareinc.com">ramya@multicorewareinc.com</a>></div><div># Date 1443592336 -19800</div><div># Wed Sep 30 11:22:16 2015 +0530</div><div># Node ID 73b301b038c84d7520337c1097d5e2307766a9e4</div><div># Parent 6e7761bdfe23addb862483f8407b388800de7d92</div><div>asm: Add sse_ss for [16x16],[32x32] & [64x64] for 8bpp avx2</div><div><br></div><div>diff -r 6e7761bdfe23 -r 73b301b038c8 source/common/x86/asm-primitives.cpp</div><div>--- a/source/common/x86/asm-primitives.cpp<span class="" style="white-space:pre"> </span>Wed Sep 30 14:57:15 2015 +0530</div><div>+++ b/source/common/x86/asm-primitives.cpp<span class="" style="white-space:pre"> </span>Wed Sep 30 11:22:16 2015 +0530</div><div>@@ -2677,6 +2677,10 @@</div><div> #if X86_64</div><div> if (cpuMask & X265_CPU_AVX2)</div><div> {</div><div>+ <a href="http://p.cu">p.cu</a>[BLOCK_16x16].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_16x16_avx2);</div><div>+ <a href="http://p.cu">p.cu</a>[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_32x32_avx2);</div><div>+ <a href="http://p.cu">p.cu</a>[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_avx2);</div><div>+</div><div> <a href="http://p.cu">p.cu</a>[BLOCK_16x16].var = PFX(pixel_var_16x16_avx2);</div><div> <a href="http://p.cu">p.cu</a>[BLOCK_32x32].var = PFX(pixel_var_32x32_avx2);</div><div> <a href="http://p.cu">p.cu</a>[BLOCK_64x64].var = PFX(pixel_var_64x64_avx2);</div><div>diff -r 6e7761bdfe23 -r 73b301b038c8 source/common/x86/ssd-a.asm</div><div>--- a/source/common/x86/ssd-a.asm<span class="" style="white-space:pre"> </span>Wed Sep 30 14:57:15 2015 +0530</div><div>+++ b/source/common/x86/ssd-a.asm<span class="" style="white-space:pre"> </span>Wed Sep 30 11:22:16 2015 +0530</div><div>@@ -1016,8 +1016,171 @@</div><div> SSD_SS_32xN</div><div> 
SSD_SS_48</div><div> SSD_SS_64xN</div><div>+</div><div>+INIT_YMM avx2</div><div>+cglobal pixel_ssd_ss_16x16, 4,4,3</div><div>+ add r1d, r1d</div><div>+ add r3d, r3d</div><div>+ pxor m2, m2</div><div>+</div><div>+ movu m0, [r0]</div><div>+ movu m1, [r0 + r1]</div><div>+ psubw m0, [r2]</div><div>+ psubw m1, [r2 + r3]</div><div>+ lea r0, [r0 + 2 * r1]</div><div>+ lea r2, [r2 + 2 * r3]</div><div>+ pmaddwd m0, m0</div><div>+ pmaddwd m1, m1</div><div>+ paddd m2, m1</div><div>+ paddd m2, m0</div><div>+</div><div>+ movu m0, [r0]</div><div>+ movu m1, [r0 + r1]</div><div>+ psubw m0, [r2]</div><div>+ psubw m1, [r2 + r3]</div><div>+ lea r0, [r0 + 2 * r1]</div><div>+ lea r2, [r2 + 2 * r3]</div><div>+ pmaddwd m0, m0</div><div>+ pmaddwd m1, m1</div><div>+ paddd m2, m1</div><div>+ paddd m2, m0</div><div>+</div><div>+ movu m0, [r0]</div><div>+ movu m1, [r0 + r1]</div><div>+ psubw m0, [r2]</div><div>+ psubw m1, [r2 + r3]</div><div>+ lea r0, [r0 + 2 * r1]</div><div>+ lea r2, [r2 + 2 * r3]</div><div>+ pmaddwd m0, m0</div><div>+ pmaddwd m1, m1</div><div>+ paddd m2, m1</div><div>+ paddd m2, m0</div><div>+</div><div>+ movu m0, [r0]</div><div>+ movu m1, [r0 + r1]</div><div>+ psubw m0, [r2]</div><div>+ psubw m1, [r2 + r3]</div><div>+ lea r0, [r0 + 2 * r1]</div><div>+ lea r2, [r2 + 2 * r3]</div><div>+ pmaddwd m0, m0</div><div>+ pmaddwd m1, m1</div><div>+ paddd m2, m1</div><div>+ paddd m2, m0</div><div>+</div><div>+ movu m0, [r0]</div><div>+ movu m1, [r0 + r1]</div><div>+ psubw m0, [r2]</div><div>+ psubw m1, [r2 + r3]</div><div>+ lea r0, [r0 + 2 * r1]</div><div>+ lea r2, [r2 + 2 * r3]</div><div>+ pmaddwd m0, m0</div><div>+ pmaddwd m1, m1</div><div>+ paddd m2, m1</div><div>+ paddd m2, m0</div><div>+</div><div>+ movu m0, [r0]</div><div>+ movu m1, [r0 + r1]</div><div>+ psubw m0, [r2]</div><div>+ psubw m1, [r2 + r3]</div><div>+ lea r0, [r0 + 2 * r1]</div><div>+ lea r2, [r2 + 2 * r3]</div><div>+ pmaddwd m0, m0</div><div>+ pmaddwd m1, m1</div><div>+ paddd m2, m1</div><div>+ paddd m2, 
m0</div><div>+</div><div>+ movu m0, [r0]</div><div>+ movu m1, [r0 + r1]</div><div>+ psubw m0, [r2]</div><div>+ psubw m1, [r2 + r3]</div><div>+ lea r0, [r0 + 2 * r1]</div><div>+ lea r2, [r2 + 2 * r3]</div><div>+ pmaddwd m0, m0</div><div>+ pmaddwd m1, m1</div><div>+ paddd m2, m1</div><div>+ paddd m2, m0</div><div>+</div><div>+ movu m0, [r0]</div><div>+ movu m1, [r0 + r1]</div><div>+ psubw m0, [r2]</div><div>+ psubw m1, [r2 + r3]</div><div>+ lea r0, [r0 + 2 * r1]</div><div>+ lea r2, [r2 + 2 * r3]</div><div>+ pmaddwd m0, m0</div><div>+ pmaddwd m1, m1</div><div>+ paddd m2, m1</div><div>+ paddd m2, m0</div><div>+</div><div>+ HADDD m2,m0</div><div>+ movd eax, xm2</div><div>+ RET</div><div>+</div><div>+INIT_YMM avx2</div><div>+cglobal pixel_ssd_ss_32x32, 4,5,3</div><div>+ add r1d, r1d</div><div>+ add r3d, r3d</div><div>+ pxor m2, m2</div><div>+ mov r4d, 16</div><div>+.loop:</div><div>+ movu m0, [r0]</div><div>+ movu m1, [r0 + mmsize] </div><div>+ psubw m0, [r2]</div><div>+ psubw m1, [r2 + mmsize]</div><div>+ pmaddwd m0, m0</div><div>+ pmaddwd m1, m1</div><div>+ paddd m2, m0</div><div>+ paddd m2, m1</div><div>+ movu m0, [r0 + r1]</div><div>+ movu m1, [r0 + r1 + mmsize]</div><div>+ psubw m0, [r2 + r3]</div><div>+ psubw m1, [r2 + r3 + mmsize]</div><div>+ pmaddwd m0, m0</div><div>+ pmaddwd m1, m1</div><div>+ paddd m2, m0</div><div>+ paddd m2, m1</div><div>+ lea r0, [r0 + 2 * r1]</div><div>+ lea r2, [r2 + 2 * r3]</div><div>+ dec r4d</div><div>+ jne .loop</div><div>+</div><div>+ HADDD m2,m0</div><div>+ movd eax, xm2</div><div>+ RET</div><div>+INIT_YMM avx2</div><div>+cglobal pixel_ssd_ss_64x64, 4,5,3</div><div>+ add r1d, r1d</div><div>+ add r3d, r3d</div><div>+ pxor m2, m2</div><div>+ mov r4d,64</div><div>+.loop:</div><div>+ movu m0, [r0]</div><div>+ movu m1, [r0 + mmsize]</div><div>+ psubw m0, [r2]</div><div>+ psubw m1, [r2 + mmsize]</div><div>+ pmaddwd m0, m0</div><div>+ pmaddwd m1, m1</div><div>+ paddd m2, m0</div><div>+ paddd m2, m1</div><div>+ movu m0, [r0 + 2 * 
mmsize]</div><div>+ movu m1, [r0 + 3 * mmsize]</div><div>+ psubw m0, [r2 + 2 * mmsize]</div><div>+ psubw m1, [r2 + 3 * mmsize]</div><div>+ pmaddwd m0, m0</div><div>+ pmaddwd m1, m1</div><div>+ paddd m2, m0</div><div>+ paddd m2, m1</div><div>+</div><div>+ add r0, r1</div><div>+ add r2, r3</div><div>+</div><div>+ dec r4d</div><div>+ jne .loop</div><div>+</div><div>+ HADDD m2,m0</div><div>+ movd eax, xm2</div><div>+ RET</div><div>+</div><div> %endif ; !HIGH_BIT_DEPTH</div><div>-</div><div> %if HIGH_BIT_DEPTH == 0</div><div> %macro SSD_LOAD_FULL 5</div><div> movu m1, [t0+%1]</div><div><br></div></div><div class="gmail_extra"><br clear="all"><div><div class="gmail_signature"><div dir="ltr"><div><div dir="ltr"><div><div><span style="color:rgb(56,118,29)"><br></span></div><div><span style="color:rgb(56,118,29)">Thank you<br></span></div><span style="color:rgb(56,118,29)">Regards<br></span></div><span style="color:rgb(56,118,29)">Ramya</span><br></div></div></div></div></div>
<br><div class="gmail_quote">On Thu, Oct 1, 2015 at 5:27 PM, Ramya Sriraman <span dir="ltr"><<a href="mailto:ramya@multicorewareinc.com" target="_blank">ramya@multicorewareinc.com</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><div dir="ltr"><span class=""><div># HG changeset patch</div><div># User Ramya Sriraman <<a href="mailto:ramya@multicorewareinc.com" target="_blank">ramya@multicorewareinc.com</a>></div><div># Date 1443592336 -19800</div><div># Wed Sep 30 11:22:16 2015 +0530</div></span><div># Node ID f56066fbfc4de2deb969d65efdb9045f37681808</div><div># Parent 6e7761bdfe23addb862483f8407b388800de7d92</div><span class=""><div>asm: Add sse_ss for [16x16],[32x32] & [64x64] for 8bpp avx2</div><div><br></div></span><div>diff -r 6e7761bdfe23 -r f56066fbfc4d source/common/x86/asm-primitives.cpp</div><div>--- a/source/common/x86/asm-primitives.cpp<span style="white-space:pre-wrap"> </span>Wed Sep 30 14:57:15 2015 +0530</div><div>+++ b/source/common/x86/asm-primitives.cpp<span style="white-space:pre-wrap"> </span>Wed Sep 30 11:22:16 2015 +0530</div><div>@@ -2677,6 +2677,10 @@</div><span class=""><div> #if X86_64</div><div> if (cpuMask & X265_CPU_AVX2)</div><div> {</div><div>+ <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_16x16].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_16x16_avx2);</div><div>+ <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_32x32_avx2);</div><div>+ <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_avx2);</div><div>+</div><div> <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_16x16].var = PFX(pixel_var_16x16_avx2);</div><div> <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_32x32].var = PFX(pixel_var_32x32_avx2);</div><div> <a href="http://p.cu" target="_blank">p.cu</a>[BLOCK_64x64].var = PFX(pixel_var_64x64_avx2);</div></span><div>diff -r 6e7761bdfe23 
-r f56066fbfc4d source/common/x86/ssd-a.asm</div><div>--- a/source/common/x86/ssd-a.asm<span style="white-space:pre-wrap"> </span>Wed Sep 30 14:57:15 2015 +0530</div><span class=""><div>+++ b/source/common/x86/ssd-a.asm<span style="white-space:pre-wrap"> </span>Wed Sep 30 11:22:16 2015 +0530</div></span><div>@@ -1016,8 +1016,175 @@</div><span class=""><div> SSD_SS_32xN</div><div> SSD_SS_48</div><div> SSD_SS_64xN</div><div>+</div><div>+INIT_YMM avx2</div><div>+cglobal pixel_ssd_ss_16x16, 4,4,5</div><div>+ add r1d, r1d</div><div>+ add r3d, r3d</div><div>+ pxor m4, m4</div><div>+</div><div>+ movu m0, [r0]</div><div>+ movu m1, [r0+r1]</div></span><div>+ psubw m0, [r2]</div><div>+ psubw m1, [r2+r3]</div><span class=""><div>+ lea r0, [r0+2*r1]</div><div>+ lea r2, [r2+2*r3]</div><div>+ pmaddwd m0, m0</div><div>+ pmaddwd m1, m1</div><div>+ paddd m0 , m1</div><div>+ paddd m4, m0</div></span><span class=""><div>+</div><div>+ movu m0, [r0]</div><div>+ movu m1, [r0+r1]</div></span><div>+ psubw m0, [r2]</div><div>+ psubw m1, [r2+r3]</div><span class=""><div>+ lea r0, [r0+2*r1]</div><div>+ lea r2, [r2+2*r3]</div><div>+ pmaddwd m0, m0</div><div>+ pmaddwd m1, m1</div></span><div>+ paddd m0, m1</div><div>+ paddd m4, m0</div><span class=""><div>+</div><div>+ movu m0, [r0]</div><div>+ movu m1, [r0+r1]</div></span><div>+ psubw m0, [r2]</div><div>+ psubw m1, [r2+r3]</div><span class=""><div>+ lea r0, [r0+2*r1]</div><div>+ lea r2, [r2+2*r3]</div><div>+ pmaddwd m0, m0</div><div>+ pmaddwd m1, m1</div></span><div>+ paddd m0, m1</div><div>+ paddd m4, m0</div><span class=""><div>+</div><div>+ movu m0, [r0]</div><div>+ movu m1, [r0+r1]</div></span><div>+ psubw m0, [r2]</div><div>+ psubw m1, [r2+r3]</div><span class=""><div>+ lea r0, [r0+2*r1]</div><div>+ lea r2, [r2+2*r3]</div><div>+ pmaddwd m0, m0</div><div>+ pmaddwd m1, m1</div></span><div>+ paddd m0, m1</div><div>+ paddd m4, m0</div><span class=""><div>+</div><div>+ movu m0, [r0]</div><div>+ movu m1, [r0+r1]</div></span><div>+ psubw m0, 
[r2]</div><div>+ psubw m1, [r2+r3]</div><span class=""><div>+ lea r0, [r0+2*r1]</div><div>+ lea r2, [r2+2*r3]</div><div>+ pmaddwd m0, m0</div><div>+ pmaddwd m1, m1</div></span><div>+ paddd m0, m1</div><div>+ paddd m4, m0</div><span class=""><div>+</div><div>+ movu m0, [r0]</div><div>+ movu m1, [r0+r1]</div></span><div>+ psubw m0, [r2]</div><div>+ psubw m1, [r2+r3]</div><span class=""><div>+ lea r0, [r0+2*r1]</div><div>+ lea r2, [r2+2*r3]</div><div>+ pmaddwd m0, m0</div><div>+ pmaddwd m1, m1</div></span><div>+ paddd m0, m1</div><div>+ paddd m4, m0</div><span class=""><div>+</div><div>+ movu m0, [r0]</div><div>+ movu m1, [r0+r1]</div></span><div>+ psubw m0, [r2]</div><div>+ psubw m1, [r2+r3]</div><span class=""><div>+ lea r0, [r0+2*r1]</div><div>+ lea r2, [r2+2*r3]</div><div>+ pmaddwd m0, m0</div><div>+ pmaddwd m1, m1</div></span><div>+ paddd m0, m1</div><div>+ paddd m4, m0</div><span class=""><div>+</div><div>+ movu m0, [r0]</div><div>+ movu m1, [r0+r1]</div></span><div>+ psubw m0, [r2]</div><div>+ psubw m1, [r2+r3]</div><span class=""><div>+ lea r0, [r0+2*r1]</div><div>+ lea r2, [r2+2*r3]</div><div>+ pmaddwd m0, m0</div><div>+ pmaddwd m1, m1</div></span><div>+ paddd m0, m1</div><div>+ paddd m4, m0</div><span class=""><div>+</div><div>+ HADDD m4,m0</div><div>+ movd eax, xm4</div><div>+ RET</div><div>+</div><div>+INIT_YMM avx2</div><div>+cglobal pixel_ssd_ss_32x32, 4,5,5</div></span><span class=""><div>+ add r1d, r1d</div><div>+ add r3d, r3d</div><div>+ pxor m4, m4</div></span><div>+ mov r4d, 16</div><div>+.loop:</div><div>+ movu m0, [r0]</div><div>+ movu m1, [r0+mmsize] </div><div>+ psubw m0, [r2]</div><div>+ psubw m1, [r2+mmsize]</div><span class=""><div>+ pmaddwd m0, m0</div><div>+ pmaddwd m1, m1</div></span><span class=""><div>+ paddd m4, m0</div><div>+ paddd m4, m1</div><div>+ movu m0, [r0+r1]</div><div>+ movu m1, [r0+r1+mmsize]</div><div>+ movu m2, [r2+r3]</div><div>+ movu m3, [r2+r3+mmsize]</div><div>+ psubw m0, m2</div></span><div>+ psubw m1, m3</div><div>+ 
pmaddwd m0, m0</div><div>+ pmaddwd m1, m1</div><span class=""><div>+ paddd m4, m0</div><div>+ paddd m4, m1</div></span><span class=""><div>+ lea r0, [r0+2*r1]</div><div>+ lea r2, [r2+2*r3]</div></span><span class=""><div>+ dec r4d</div><div>+ jne .loop</div><div>+</div><div>+ HADDD m4,m0</div><div>+ movd eax, xm4</div><div>+ RET</div><div>+INIT_YMM avx2</div><div>+cglobal pixel_ssd_ss_64x64, 4,5,5</div></span><span class=""><div>+ add r1d, r1d</div><div>+ add r3d, r3d</div><div>+ pxor m4, m4</div></span><div>+ mov r4d,64</div><div>+.loop:</div><div>+ movu m0, [r0]</div><div>+ movu m1, [r0+mmsize]</div><div>+ psubw m0, [r2]</div><div>+ psubw m1, [r2+mmsize]</div><span class=""><div>+ pmaddwd m0, m0</div><div>+ pmaddwd m1, m1</div></span><span class=""><div>+ paddd m4, m0</div><div>+ paddd m4, m1</div><div>+ movu m0, [r0+2*mmsize]</div><div>+ movu m1, [r0+3*mmsize]</div><div>+ movu m2, [r2+2*mmsize]</div><div>+ movu m3, [r2+3*mmsize]</div><div>+ psubw m0, m2</div></span><div>+ psubw m1, m3</div><div>+ pmaddwd m0, m0</div><div>+ pmaddwd m1, m1</div><span class=""><div>+ paddd m4, m0</div><div>+ paddd m4, m1</div><div>+</div><div>+ add r0, r1</div><div>+ add r2, r3</div><div>+</div><div>+ dec r4d</div><div>+ jne .loop</div><div>+</div><div>+ HADDD m4,m0</div><div>+ movd eax, xm4</div><div>+ RET</div><div>+</div><div> %endif ; !HIGH_BIT_DEPTH</div><div>-</div><div> %if HIGH_BIT_DEPTH == 0</div><div> %macro SSD_LOAD_FULL 5</div><div> movu m1, [t0+%1]</div><div><br></div></span></div><div class="gmail_extra"><br clear="all"><div><div><div dir="ltr"><div><div dir="ltr"><div><div><span style="color:rgb(56,118,29)"><br></span></div><div><span style="color:rgb(56,118,29)">Thank you<br></span></div><span style="color:rgb(56,118,29)">Regards<br></span></div><span style="color:rgb(56,118,29)">Ramya</span><br></div></div></div></div></div>
<br><div class="gmail_quote"><div><div class="h5">On Wed, Sep 30, 2015 at 8:29 PM, chen <span dir="ltr"><<a href="mailto:chenm003@163.com" target="_blank">chenm003@163.com</a>></span> wrote:<br></div></div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><div><div class="h5"><div style="line-height:1.7;color:#000000;font-size:14px;font-family:arial"><div><br></div><pre><span><br>At 2015-09-30 13:53:18,<a href="mailto:ramya@multicorewareinc.com" target="_blank">ramya@multicorewareinc.com</a> wrote:
># HG changeset patch
># User Ramya Sriraman &lt;<a href="mailto:ramya@multicorewareinc.com" target="_blank">ramya@multicorewareinc.com</a>&gt;
># Date 1443592336 -19800
># Wed Sep 30 11:22:16 2015 +0530
># Node ID 29b61906162c657da241aecee9012e3f2da34b6d
># Parent 5f1451e5842252b31442e8b6519138d8033bbb2b
>asm: Add sse_ss for [16x16],[32x32] & [64x64] for 8bpp avx2
>
</span>diff -r 5f1451e58422 -r 29b61906162c source/common/x86/ssd-a.asm
>--- a/source/common/x86/ssd-a.asm Mon Sep 28 16:43:47 2015 +0530
>+++ b/source/common/x86/ssd-a.asm Wed Sep 30 11:22:16 2015 +0530
>@@ -1100,8 +1100,195 @@
> SSD_SS_32xN
> SSD_SS_48
> SSD_SS_64xN
>+
>+INIT_YMM avx2
>+cglobal pixel_ssd_ss_16x16, 4,4,5
>+ add r1d, r1d
>+ add r3d, r3d
>+ pxor m4, m4
>+
>+ movu m0, [r0]
>+ movu m1, [r0+r1]
>+ movu m2, [r2]
>+ movu m3, [r2+r3]
>+ psubw m0, m2<br>In AVX2, vpsubw can take an unaligned memory operand directly, so the separate movu loads of [r2] and [r2+r3] are not needed.<span><br>
>+ psubw m1, m3
>+ lea r0, [r0+2*r1]
>+ lea r2, [r2+2*r3]
>+ pmaddwd m0, m0
>+ pmaddwd m1, m1
>+ paddd m0 , m1
>+ paddd m4, m0
</span></pre><pre><br></pre></div><br></div></div>_______________________________________________<br>
x265-devel mailing list<br>
<a href="mailto:x265-devel@videolan.org" target="_blank">x265-devel@videolan.org</a><br>
<a href="https://mailman.videolan.org/listinfo/x265-devel" rel="noreferrer" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>
<br></blockquote></div><br></div>
</blockquote></div><br></div>