[x264-devel] Re: [patch] faster SAD_INC_2x16P ported to amd64

Josef Zlomek josef.zlomek at xeris.cz
Wed Jul 13 15:29:12 CEST 2005


> The attached patch ports the recently committed faster
> SAD_INC_2x16P implementation to the amd64 (x86_64) architecture.

Could somebody please apply this patch?
It improves the overall performance of the codec by about 1%
and makes the i386 and amd64 ports equivalent again, which
simplifies maintenance.

Josef Zlomek

> Index: common/amd64/pixel-a.asm
> ===================================================================
> --- common/amd64/pixel-a.asm	(revision 272)
> +++ common/amd64/pixel-a.asm	(working copy)
> @@ -38,27 +38,19 @@
>  
>  %macro SAD_INC_2x16P 0
>      movq    mm1,    [rax]
> -    movq    mm2,    [rcx]
> -    movq    mm3,    [rax+8]
> -    movq    mm4,    [rcx+8]
> -
> -    psadbw  mm1,    mm2
> -    psadbw  mm3,    mm4
> -    paddw   mm0,    mm1
> -    paddw   mm0,    mm3
> -
> -    movq    mm1,    [rax+rbx]
> -    movq    mm2,    [rcx+rdx]
> -    movq    mm3,    [rax+rbx+8]
> -    movq    mm4,    [rcx+rdx+8]
> -
> -    psadbw  mm1,    mm2
> -    psadbw  mm3,    mm4
> -    paddw   mm0,    mm1
> -    paddw   mm0,    mm3
> -
> +    movq    mm2,    [rax+8]
> +    movq    mm3,    [rax+rbx]
> +    movq    mm4,    [rax+rbx+8]
> +    psadbw  mm1,    [rcx]
> +    psadbw  mm2,    [rcx+8]
> +    psadbw  mm3,    [rcx+rdx]
> +    psadbw  mm4,    [rcx+rdx+8]
>      lea     rax,    [rax+2*rbx]
> +    paddw   mm1,    mm2
> +    paddw   mm3,    mm4
>      lea     rcx,    [rcx+2*rdx]
> +    paddw   mm0,    mm1
> +    paddw   mm0,    mm3
>  %endmacro
>  
>  %macro SAD_INC_2x8P 0


-- 
Josef Zlomek
josef.zlomek at email.cz
zlomj9am at artax.karlin.mff.cuni.cz
http://zlomek.matfyz.cz/
ICQ: 152422432
GPG fingerprint: 74E6 31D3 56D7 91FD 5A06  6BD5 96FF 99C4 25C0 EC0B

-- 
This is the x264-devel mailing-list
To unsubscribe, go to: http://developers.videolan.org/lists.html



More information about the x264-devel mailing list