<div style="line-height:1.7;color:#000000;font-size:14px;font-family:arial"><DIV>+%macro PROCESS_SAD_12x4 0<BR>+    movu    m1,  [r2]<BR>+    movu    m2,  [r0]<BR>+    pand    m1,  m4<BR>+    pand    m2,  m4<BR>+    psadbw  m1,  m2<BR>+    paddd   m0,  m1<BR>+    lea     r2,  [r2 + r3]<BR>+    lea     r0,  [r0 + r1]<BR>+    movu    m1,  [r2]<BR>+    movu    m2,  [r0]<BR>+    pand    m1,  m4<BR>+    pand    m2,  m4<BR>+    psadbw  m1,  m2<BR>+    paddd   m0,  m1</DIV>

<BLOCKQUOTE id="isReplyContent" style="PADDING-LEFT: 1ex; MARGIN: 0px 0px 0px 0.8ex; BORDER-LEFT: #ccc 1px solid">

<DIV dir="ltr">

<DIV class="gmail_quote">>>+    lea     r2,  [r2 + r3]<BR>>>+    lea     r0,  [r0 + r1]</DIV>

<DIV class="gmail_quote">>>+    movu    m1,  [r2]<BR>>>+    movu    m2,  [r0]</DIV>

<DIV class="gmail_quote"><BR></DIV>

<DIV class="gmail_quote">

<DIV class="gmail_quote">We don't need to load the address every time we add the stride to it. We should first try to calculate the address using a multiply by 1, 2, 4, or 8; only if that is not possible should we load it.</DIV>

<DIV class="gmail_quote">For example, the four instructions above can be replaced with just these two:</DIV>

<DIV class="gmail_quote"><BR></DIV>

<DIV class="gmail_quote">movu    m1,  [r2 + 2 * r3]</DIV>movu    m2,  [r0 + 2 * r1]<BR></DIV>

<DIV class="gmail_quote"><BR>+    pand    m1,  m4<BR>+    pand    m2,  m4<BR>+    psadbw  m1,  m2<BR>+    paddd   m0,  m1<BR>+    lea     r2,  [r2 + r3]<BR>+    lea     r0,  [r0 + r1]<BR>+    movu    m1,  [r2]<BR>+    movu    m2,  [r0]<BR>+    pand    m1,  m4<BR>+    pand    m2,  m4<BR>+    psadbw  m1,  m2<BR>+    paddd   m0,  m1<BR>+%endmacro<BR>+<BR> %macro PROCESS_SAD_16x4 0<BR>     movu    m1,  [r2]<BR>     movu    m2,  [r2 + r3]<BR>@@ -1007,6 +1041,29 @@<BR>     movd    eax, m0<BR>     RET<BR><BR>+;-----------------------------------------------------------------------------<BR>+; int pixel_sad_12x16(

  uint8_t *, intptr_t, uint8_t *, intptr_t )<BR>+;-----------------------------------------------------------------------------<BR>+cglobal pixel_sad_12x16, 4,4,4<BR>+    mova  m4,  [MSK]<BR>+    pxor  m0,  m0<BR>+<BR>+    PROCESS_SAD_12x4<BR>+    lea         r2,  [r2 + r3]<BR>+    lea         r0,  [r0 + r1]<BR>+    PROCESS_SAD_12x4<BR>+    lea         r2,  [r2 + r3]<BR>+    lea         r0,  [r0 + r1]<BR>+    PROCESS_SAD_12x4<BR>+    lea         r2,  [r2 + r3]<BR>+    lea         r0,  [r0 + r1]<BR>+    PROCESS_SAD_12x4<BR>+<BR>+    movhlps m1,  m0<BR>+    paddd   m0,  m1<BR>+    movd    eax, m0<BR>+    RET<BR>+<BR> %endmacro<BR><BR>

</DIV>

<DIV class="gmail_quote">The lea instruction is overused; please eliminate these and use the available registers to save load operations.</DIV><BR>Excuse me, I forgot something: for 12xN, using MOVQ+MOVD is better than MOVU+PAND.</DIV></BLOCKQUOTE></div>