Thanks Praveen. Will correct the message and resend.

On Thu, Apr 16, 2015 at 4:06 PM, Praveen Tiwari <praveen@multicorewareinc.com> wrote:

Regards,
Praveen
---------- Forwarded message ----------
From: <dnyaneshwar@multicorewareinc.com>
Date: Thu, Apr 16, 2015 at 3:43 PM
Subject: [x265] [PATCH] asm: avx2 code for satd_48x64 and 64xN, improved over ~100% than SSE
To: x265-devel@videolan.org

# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar@multicorewareinc.com>
# Date 1429173485 -19800
#      Thu Apr 16 14:08:05 2015 +0530
# Node ID 04e7526a8bde9e46867f5c4cfb63b98409c7fb44
# Parent  ebca2a0d3ab905b62c346d5d0b23d50c618d5827
asm: avx2 code for satd_48x64 and 64xN, improved over ~100% than SSE

AVX2 (speedup vs C, avx2 cycles, C cycles):
satd[48x64]        12.52x   7696.91         96366.03
satd[64x48]        12.16x   8103.43         98564.64
satd[64x16]        12.15x   2759.65         33537.19
satd[64x32]        12.12x   5372.52         65090.38
satd[64x64]        13.02x   10260.38        133615.69

SSE (speedup vs C, sse cycles, C cycles):
satd[48x64]        5.32x    18146.13        96505.38
satd[64x48]        5.33x    18201.03        96975.23
satd[64x16]        5.21x    6272.14         32651.24
satd[64x32]        5.42x    11910.58        64529.81
satd[64x64]        5.30x    26665.73        141387.59

Please correct the commit message: a 100% improvement would mean your new code takes zero cycles. The % improvement is calculated as:

(old cycles - new cycles) * 100 / old cycles

So for 48x64: (18146 - 7696) * 100 / 18146 = ~57%.

Please also correct the previous commit messages if you have done the same thing.
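For reference, a small standalone C++ sketch (not part of the patch) that recomputes the improvement from the cycle counts quoted in the tables above, using that formula:

// Standalone sketch: recompute "% improvement over SSE" from the cycle
// counts in the tables above, as (old - new) * 100 / old.
#include <cstdio>

int main()
{
    struct Row { const char* name; double sseCycles, avx2Cycles; };
    const Row rows[] = {
        { "satd[48x64]", 18146.13,  7696.91 },
        { "satd[64x48]", 18201.03,  8103.43 },
        { "satd[64x16]",  6272.14,  2759.65 },
        { "satd[64x32]", 11910.58,  5372.52 },
        { "satd[64x64]", 26665.73, 10260.38 },
    };
    for (const Row& r : rows)
        printf("%s  %.1f%% faster than SSE\n",
               r.name, (r.sseCycles - r.avx2Cycles) * 100.0 / r.sseCycles);
    return 0;
}

By that measure the new AVX2 kernels come out roughly 55-62% faster than the SSE versions (about a 2.3x-2.6x speedup), not 100%.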

diff -r ebca2a0d3ab9 -r 04e7526a8bde source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp      Thu Apr 16 12:22:53 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp      Thu Apr 16 14:08:05 2015 +0530
@@ -1707,6 +1707,11 @@
         p.pu[LUMA_32x24].satd   = x265_pixel_satd_32x24_avx2;
         p.pu[LUMA_32x32].satd   = x265_pixel_satd_32x32_avx2;
         p.pu[LUMA_32x64].satd   = x265_pixel_satd_32x64_avx2;
+        p.pu[LUMA_48x64].satd   = x265_pixel_satd_48x64_avx2;
+        p.pu[LUMA_64x16].satd   = x265_pixel_satd_64x16_avx2;
+        p.pu[LUMA_64x32].satd   = x265_pixel_satd_64x32_avx2;
+        p.pu[LUMA_64x48].satd   = x265_pixel_satd_64x48_avx2;
+        p.pu[LUMA_64x64].satd   = x265_pixel_satd_64x64_avx2;

         p.pu[LUMA_32x8].sad = x265_pixel_sad_32x8_avx2;
         p.pu[LUMA_32x16].sad = x265_pixel_sad_32x16_avx2;
diff -r ebca2a0d3ab9 -r 04e7526a8bde source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm     Thu Apr 16 12:22:53 2015 +0530
+++ b/source/common/x86/pixel-a.asm     Thu Apr 16 14:08:05 2015 +0530
@@ -10903,4 +10903,279 @@
     movd            eax, xm0
     RET

+cglobal pixel_satd_48x64, 4,8,10        ; if WIN64 && cpuflag(avx2)
+    mova            m7, [hmul_16p]
+    lea             r4, [3 * r1]
+    lea             r5, [3 * r3]
+    pxor            m6, m6
+    mov             r6, r0
+    mov             r7, r2
+
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    lea             r0, [r6 + 16]
+    lea             r2, [r7 + 16]
+    mova            m9, m6              ; to avoid overflow, move to another register
+    pxor            m6, m6
+    pmaddwd         m9, [pw_1]
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    lea             r0, [r6 + 32]
+    lea             r2, [r7 + 32]
+    mova            m8, m6              ; to avoid overflow, move to another register
+    pxor            m6, m6
+    pmaddwd         m8, [pw_1]
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+
+    pmaddwd         m6, [pw_1]
+    vextracti128    xm2, m9, 1
+    vextracti128    xm1, m8, 1
+    vextracti128    xm0, m6, 1
+    paddd           xm2, xm9
+    paddd           xm1, xm8
+    paddd           xm0, xm6
+    paddd           xm0, xm1
+    paddd           xm0, xm2
+    movhlps         xm7, xm0
+    paddd           xm0, xm7
+    pshuflw         xm7, xm0, q0032
+    paddd           xm0, xm7
+    movd            eax, xm0
+    RET
+
+cglobal pixel_satd_64x16, 4,8,8         ; if WIN64 && cpuflag(avx2)
+    mova            m7, [hmul_16p]
+    lea             r4, [3 * r1]
+    lea             r5, [3 * r3]
+    pxor            m6, m6
+    mov             r6, r0
+    mov             r7, r2
+
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    lea             r0, [r6 + 16]
+    lea             r2, [r7 + 16]
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    lea             r0, [r6 + 32]
+    lea             r2, [r7 + 32]
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    lea             r0, [r6 + 48]
+    lea             r2, [r7 + 48]
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+
+    vextracti128    xm0, m6, 1
+    paddw           xm0, xm6
+    pmaddwd         xm0, [pw_1]
+    movhlps         xm7, xm0
+    paddd           xm0, xm7
+    pshuflw         xm7, xm0, q0032
+    paddd           xm0, xm7
+    movd            eax, xm0
+    RET
+
+cglobal pixel_satd_64x32, 4,8,9         ; if WIN64 && cpuflag(avx2)
+    mova            m7, [hmul_16p]
+    lea             r4, [3 * r1]
+    lea             r5, [3 * r3]
+    pxor            m6, m6
+    mov             r6, r0
+    mov             r7, r2
+
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    lea             r0, [r6 + 16]
+    lea             r2, [r7 + 16]
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    lea             r0, [r6 + 32]
+    lea             r2, [r7 + 32]
+    mova            m8, m6              ; to avoid overflow, move to another register
+    pxor            m6, m6
+    pmaddwd         m8, [pw_1]
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    lea             r0, [r6 + 48]
+    lea             r2, [r7 + 48]
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+
+    pmaddwd         m6, [pw_1]
+    vextracti128    xm1, m8, 1
+    vextracti128    xm0, m6, 1
+    paddd           xm1, xm8
+    paddd           xm0, xm6
+    paddd           xm0, xm1
+    movhlps         xm7, xm0
+    paddd           xm0, xm7
+    pshuflw         xm7, xm0, q0032
+    paddd           xm0, xm7
+    movd            eax, xm0
+    RET
+
+cglobal pixel_satd_64x48, 4,8,10        ; if WIN64 && cpuflag(avx2)
+    mova            m7, [hmul_16p]
+    lea             r4, [3 * r1]
+    lea             r5, [3 * r3]
+    pxor            m6, m6
+    mov             r6, r0
+    mov             r7, r2
+
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    lea             r0, [r6 + 16]
+    lea             r2, [r7 + 16]
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    mova            m8, m6              ; to avoid overflow, move to another register
+    pxor            m6, m6
+    pmaddwd         m8, [pw_1]
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    lea             r0, [r6 + 32]
+    lea             r2, [r7 + 32]
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    mova            m9, m6              ; to avoid overflow, move to another register
+    pxor            m6, m6
+    pmaddwd         m9, [pw_1]
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    lea             r0, [r6 + 48]
+    lea             r2, [r7 + 48]
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+
+    pmaddwd         m6, [pw_1]
+    vextracti128    xm2, m9, 1
+    vextracti128    xm1, m8, 1
+    vextracti128    xm0, m6, 1
+    paddd           xm2, xm9
+    paddd           xm1, xm8
+    paddd           xm0, xm6
+    paddd           xm0, xm2
+    paddd           xm0, xm1
+    movhlps         xm7, xm0
+    paddd           xm0, xm7
+    pshuflw         xm7, xm0, q0032
+    paddd           xm0, xm7
+    movd            eax, xm0
+    RET
+
+cglobal pixel_satd_64x64, 4,8,11        ; if WIN64 && cpuflag(avx2)
+    mova            m7, [hmul_16p]
+    lea             r4, [3 * r1]
+    lea             r5, [3 * r3]
+    pxor            m6, m6
+    mov             r6, r0
+    mov             r7, r2
+
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    lea             r0, [r6 + 16]
+    lea             r2, [r7 + 16]
+    mova            m10, m6              ; to avoid overflow, move to another register
+    pxor            m6, m6
+    pmaddwd         m10, [pw_1]
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    lea             r0, [r6 + 32]
+    lea             r2, [r7 + 32]
+    mova            m9, m6              ; to avoid overflow, move to another register
+    pxor            m6, m6
+    pmaddwd         m9, [pw_1]
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    lea             r0, [r6 + 48]
+    lea             r2, [r7 + 48]
+    mova            m8, m6              ; to avoid overflow, move to another register
+    pxor            m6, m6
+    pmaddwd         m8, [pw_1]
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+    call            calc_satd_16x8
+
+    pmaddwd         m6, [pw_1]
+    vextracti128    xm3, m10, 1
+    vextracti128    xm2, m9, 1
+    vextracti128    xm1, m8, 1
+    vextracti128    xm0, m6, 1
+    paddd           xm3, xm10
+    paddd           xm2, xm9
+    paddd           xm1, xm8
+    paddd           xm0, xm6
+    paddd           xm0, xm3
+    paddd           xm0, xm2
+    paddd           xm0, xm1
+    movhlps         xm7, xm0
+    paddd           xm0, xm7
+    pshuflw         xm7, xm0, q0032
+    paddd           xm0, xm7
+    movd            eax, xm0
+    RET
+
 %endif  ; if ARCH_X86_64 == 1
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel