[x265] [PATCH] copy_cnt_8 AVX2 asm code, as per new interface

Steve Borho steve at borho.org
Tue Sep 9 15:35:41 CEST 2014


On 09/09, praveen at multicorewareinc.com wrote:
> # HG changeset patch
> # User Praveen Tiwari
> # Date 1410265389 -19800
> # Node ID cbb7e2beff033e441a13cd82bbfd85b362cd6d24
> # Parent  d011073f35258cb2f0ad95db6038c2d9fb840b27
> copy_cnt_8 AVX2 asm code, as per new interface

These commit messages should describe the performance difference from
the previous versions.

> diff -r d011073f3525 -r cbb7e2beff03 source/common/x86/blockcopy8.asm
> --- a/source/common/x86/blockcopy8.asm	Tue Sep 09 14:07:14 2014 +0530
> +++ b/source/common/x86/blockcopy8.asm	Tue Sep 09 17:53:09 2014 +0530
> @@ -4076,85 +4076,49 @@
>  
>  
>  INIT_YMM avx2
> -%if ARCH_X86_64 == 1
> -cglobal copy_cnt_8, 3,4,6
> -  %define tmpd eax
> -%else
> -cglobal copy_cnt_8, 3,5,6
> -  %define tmpd r4d
> -%endif
> +cglobal copy_cnt_8, 3,3,6
>      add         r2d, r2d
> -    pxor        m4, m4
> -    lea         r3, [r2 * 3]
> -
> -    ; row 0
> +    xorpd       m5, m5
> +
> +    ; row 0 - 1
>      movu        xm0, [r1]
> -    mova        xm2, xm0
> -    pmovsxwd    m1, xm0
> -    movu        [r0 + 0 * mmsize], m1
> -
> -    ; row 1
> -    movu        xm0, [r1 + r2]
> -    vinserti128 m2, m2, xm0, 1
> -    pmovsxwd    m1, xm0
> -    movu        [r0 + 1 * mmsize], m1
> -
> -    ; row 2
> -    movu        xm0, [r1 + r2 * 2]
> -    mova        xm5, xm0
> -    pmovsxwd    m1, xm0
> -    movu        [r0 + 2 * mmsize], m1
> -
> -    ; row 3
> -    movu        xm0, [r1 + r3]
> -    vinserti128 m5, m5, xm0, 1
> -    packsswb    m2, m5
> -    pcmpeqb     m2, m4
> -    pmovmskb    tmpd, m2
> -    not         tmpd
> -    popcnt      tmpd, tmpd
> -    pmovsxwd    m1, xm0
> -    movu        [r0 + 3 * mmsize], m1
> -
> -    add         r0, 4 * mmsize
> -    lea         r1, [r1 + r2 * 4]
> -
> -    ; row 4
> -    movu        xm0, [r1]
> -    mova        xm2, xm0
> -    pmovsxwd    m1, xm0
> -    movu        [r0 + 0 * mmsize], m1
> -
> -    ; row 5
> -    movu        xm0, [r1 + r2]
> -    vinserti128 m2, m2, xm0, 1
> -    pmovsxwd    m1, xm0
> -    movu        [r0 + 1 * mmsize], m1
> -
> -    ; row 6
> -    movu        xm0, [r1 + r2 * 2]
> -    mova        xm5, xm0
> -    pmovsxwd    m1, xm0
> -    movu        [r0 + 2 * mmsize], m1
> -
> -    ; row 7
> -    movu        xm0, [r1 + r3]
> -    pmovsxwd    m1, xm0
> -    movu        [r0 + 3 * mmsize], m1
> -    vinserti128 m5, m5, xm0, 1
> +    movu        xm1, [r1 + r2]
> +    vinserti128 m0, m0, xm1, 1
> +    movu        [r0], m0
> +
> +    ; row 2 - 3
> +    movu        xm1, [r1 + r2 * 2]
> +    lea         r1,  [r1 + r2 * 2]
> +    movu        xm2, [r1 + r2]
> +    vinserti128 m1, m1, xm2, 1
> +    movu        [r0 + 32], m1
> +
> +    ; row 4 - 5
> +    movu        xm2, [r1 + r2 * 2]
> +    lea         r1,  [r1 + r2 * 2]
> +    movu        xm3, [r1 + r2]
> +    vinserti128 m2, m2, xm3, 1
> +    movu        [r0 + 64], m2
> +
> +    ; row 6 - 7
> +    movu        xm3, [r1 + r2 * 2]
> +    lea         r1,  [r1 + r2 * 2]
> +    movu        xm4, [r1 + r2]
> +    vinserti128 m3, m3, xm4, 1
> +    movu        [r0 + 96], m3
>  
>      ; get count
> -    packsswb    m2, m5
> -    pcmpeqb     m2, m4
> -    pmovmskb    r0d, m2
> -    not         r0d
> -    popcnt      r0d, r0d
> -
> -%if ARCH_X86_64 == 1
> -    add         tmpd, r0d
> -%else
> -    add         r0d, tmpd
> -%endif
> +    vpacksswb    m0, m1
> +    vpcmpeqb     m0, m5
> +    vpmovmskb    eax, m0
> +    not          eax
> +    popcnt       eax, eax
> +    vpacksswb    m2, m3
> +    vpcmpeqb     m2, m5
> +    vpmovmskb    r1d, m2
> +    not          r1d
> +    popcnt       r1d, r1d
> +    add          eax, r1d
>      RET
>  
>  
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel

-- 
Steve Borho


More information about the x265-devel mailing list