[x265] [PATCH] asm: assembly code for pixel_sad_12x16

Wed Oct 30 19:54:12 CET 2013

On Wed, Oct 30, 2013 at 11:03 AM, chen <chenm003 at 163.com> wrote:

> +%macro PROCESS_SAD_12x4 0
> +    movu    m1,  [r2]
> +    movu    m2,  [r0]
> +    pand    m1,  m4
> +    pand    m2,  m4
> +    psadbw  m1,  m2
> +    paddd   m0,  m1
> +    lea     r2,  [r2 + r3]
> +    lea     r0,  [r0 + r1]
> +    movu    m1,  [r2]
> +    movu    m2,  [r0]
> +    pand    m1,  m4
> +    pand    m2,  m4
> +    psadbw  m1,  m2
> +    paddd   m0,  m1
>
>  >>+    lea     r2,  [r2 + r3]
> >>+    lea     r0,  [r0 + r1]
> >>+    movu    m1,  [r2]
> >>+    movu    m2,  [r0]
>
>  We don't need to recompute the address every time we add the stride to
> it. We should first try to form the address directly in the memory operand,
> scaling the stride by 1, 2, 4, or 8; only if that is not possible should we
> compute it separately.
>  For example, the four instructions above can be replaced with just these two:
>
> movu    m1,  [r2 + 2 * r3]
> movu    m2,  [r0 + 2 * r1]
>
> +    pand    m1,  m4
> +    pand    m2,  m4
> +    psadbw  m1,  m2
> +    paddd   m0,  m1
> +    lea     r2,  [r2 + r3]
> +    lea     r0,  [r0 + r1]
> +    movu    m1,  [r2]
> +    movu    m2,  [r0]
> +    pand    m1,  m4
> +    pand    m2,  m4
> +    psadbw  m1,  m2
> +    paddd   m0,  m1
> +%endmacro
> +
>  %macro PROCESS_SAD_16x4 0
>      movu    m1,  [r2]
>      movu    m2,  [r2 + r3]
> @@ -1007,6 +1041,29 @@
>      movd    eax, m0
>      RET
>
>
> +;-----------------------------------------------------------------------------
> +; int pixel_sad_12x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
>
> +;-----------------------------------------------------------------------------
> +cglobal pixel_sad_12x16, 4,4,4
> +    mova  m4,  [MSK]
> +    pxor  m0,  m0
> +
> +    PROCESS_SAD_12x4
> +    lea         r2,  [r2 + r3]
> +    lea         r0,  [r0 + r1]
> +    PROCESS_SAD_12x4
> +    lea         r2,  [r2 + r3]
> +    lea         r0,  [r0 + r1]
> +    PROCESS_SAD_12x4
> +    lea         r2,  [r2 + r3]
> +    lea         r0,  [r0 + r1]
> +    PROCESS_SAD_12x4
> +
> +    movhlps m1,  m0
> +    paddd   m0,  m1
> +    movd    eax, m0
> +    RET
> +
>  %endmacro
> **
> The lea instruction is overused here; please eliminate these, and use the
> available registers to save load operations.
>
> Excuse me, I forgot something: for 12xN, using MOVQ+MOVD is better than
> MOVU+PAND.
>
>

I've queued all of these changes for the default branch, since they are
already faster than the intrinsics and this allows us to remove quite a
number of them.  Further optimizations should be made on top of the
versions that have been applied.

-- 
Steve Borho
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131030/da124203/attachment-0001.html>