[x265] [PATCH] asm: avx2 version cvt16to32_shr[]

Steve Borho steve at borho.org
Wed Aug 27 21:53:33 CEST 2014


On 08/27, Min Chen wrote:
> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1409166976 25200
> # Node ID 6cdcf1a7fa9803898e8f04818865cc150db250ea
> # Parent  77fe0cc583e8ec10275bc1b3c4bb116d5ceb51ac
> asm: avx2 version cvt16to32_shr[]
> 
> 4x4      135c ->  105c
> 8x8      375c ->  233c (unrolled variant: 228c)
> 16x16   1333c ->  816c
> 32x32   5278c -> 2690c

This is failing tests on my Mac:

Testing primitives: AVX2
cvt16to32_shr failed!
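
For whoever picks this up: going by the prototype quoted further below, the
scalar behavior the AVX2 kernels have to match should be roughly the sketch
here. The name and the explicit 'size' parameter are mine for illustration
(each cglobal bakes in 4/8/16/32); note the asm doubles r2, so stride is
counted in int16_t elements:

    #include <stdint.h>

    /* Illustrative scalar reference, assumed from the prototype in the
     * patch: widen to 32 bits, add the offset, arithmetic shift right. */
    static void cvt16to32_shr_ref(int32_t *dst, const int16_t *src,
                                  intptr_t stride, int shift, int offset,
                                  int size)
    {
        for (int i = 0; i < size; i++)
        {
            for (int j = 0; j < size; j++)
                dst[j] = ((int32_t)src[j] + offset) >> shift;
            src += stride;   /* stride is in int16_t units */
            dst += size;     /* dst rows are stored contiguously */
        }
    }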

> diff -r 77fe0cc583e8 -r 6cdcf1a7fa98 source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp	Wed Aug 27 14:25:17 2014 +0530
> +++ b/source/common/x86/asm-primitives.cpp	Wed Aug 27 12:16:16 2014 -0700
> @@ -1714,6 +1714,10 @@
>          p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_avx2;
>          p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_avx2;
>          p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_avx2;
> +        p.cvt16to32_shr[BLOCK_4x4] = x265_cvt16to32_shr_4_avx2;
> +        p.cvt16to32_shr[BLOCK_8x8] = x265_cvt16to32_shr_8_avx2;
> +        p.cvt16to32_shr[BLOCK_16x16] = x265_cvt16to32_shr_16_avx2;
> +        p.cvt16to32_shr[BLOCK_32x32] = x265_cvt16to32_shr_32_avx2;
>          p.denoiseDct = x265_denoise_dct_avx2;
>      }
>  #endif // if HIGH_BIT_DEPTH
> diff -r 77fe0cc583e8 -r 6cdcf1a7fa98 source/common/x86/blockcopy8.asm
> --- a/source/common/x86/blockcopy8.asm	Wed Aug 27 14:25:17 2014 +0530
> +++ b/source/common/x86/blockcopy8.asm	Wed Aug 27 12:16:16 2014 -0700
> @@ -3437,6 +3437,38 @@
>      RET
>  
>  
> +INIT_YMM avx2
> +cglobal cvt16to32_shr_4, 3,3,4
> +    add             r2d, r2d
> +    movd            xm0, r3m
> +    vpbroadcastd    m1, r4m
> +
> +    ; register alloc
> +    ; r0 - dst
> +    ; r1 - src
> +    ; r2 - stride
> +    ; m0 - shift
> +    ; m1 - dword [offset]
> +
> +    ; Row 0-1
> +    pmovsxwd        xm2, [r1]
> +    pmovsxwd        xm3, [r1 + r2]
> +    vinserti128     m2, m2, xm3, 1
> +    paddd           m2, m1
> +    psrad           m2, xm0
> +    movu            [r0 + 0 * mmsize], m2
> +
> +    ; Row 2-3
> +    lea             r1, [r1 + r2 * 2]
> +    pmovsxwd        xm2, [r1]
> +    pmovsxwd        xm3, [r1 + r2]
> +    vinserti128     m2, m2, xm3, 1
> +    paddd           m2, m1
> +    psrad           m2, xm0
> +    movu            [r0 + 1 * mmsize], m2
> +    RET
> +
> +
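
The 4x4 case packs two 4-coeff rows into one ymm before the add/shift.
Purely for illustration, the Row 0-1 block above in intrinsics terms
(function and variable names are mine, not x265's):

    #include <immintrin.h>
    #include <stdint.h>

    /* Hypothetical rendering of "Row 0-1": widen two 4-coeff rows to
     * dwords, stack them in one ymm, then offset and shift together. */
    static void rows01_sketch(int32_t *dst, const int16_t *src,
                              intptr_t stride, int shift, int offset)
    {
        __m128i lo = _mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i *)src));
        __m128i hi = _mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i *)(src + stride)));
        __m256i v  = _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
        v = _mm256_add_epi32(v, _mm256_set1_epi32(offset));
        v = _mm256_sra_epi32(v, _mm_cvtsi32_si128(shift));
        _mm256_storeu_si256((__m256i *)dst, v);
    }
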
>  ;--------------------------------------------------------------------------------------
>  ; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset);
>  ;--------------------------------------------------------------------------------------
> @@ -3506,6 +3538,55 @@
>      RET
>  
>  
> +INIT_YMM avx2
> +cglobal cvt16to32_shr_8, 3,5,3
> +    add             r2d, r2d
> +    movd            xm0, r3m
> +    vpbroadcastd    m1, r4m
> +    mov             r3d, 8/4
> +    lea             r4, [r2 * 3]
> +
> +    ; register alloc
> +    ; r0 - dst
> +    ; r1 - src
> +    ; r2 - stride
> +    ; r3 - loop counter
> +    ; r4 - stride * 3
> +    ; m0 - shift
> +    ; m1 - dword [offset]
> +
> +.loop:
> +    ; Row 0
> +    pmovsxwd        m2, [r1]
> +    paddd           m2, m1
> +    psrad           m2, xm0
> +    movu            [r0 + 0 * mmsize], m2
> +
> +    ; Row 1
> +    pmovsxwd        m2, [r1 + r2]
> +    paddd           m2, m1
> +    psrad           m2, xm0
> +    movu            [r0 + 1 * mmsize], m2
> +
> +    ; Row 2
> +    pmovsxwd        m2, [r1 + r2 * 2]
> +    paddd           m2, m1
> +    psrad           m2, xm0
> +    movu            [r0 + 2 * mmsize], m2
> +
> +    ; Row 3
> +    pmovsxwd        m2, [r1 + r4]
> +    paddd           m2, m1
> +    psrad           m2, xm0
> +    movu            [r0 + 3 * mmsize], m2
> +
> +    add             r0, 4 * mmsize
> +    lea             r1, [r1 + r2 * 4]
> +    dec             r3d
> +    jnz             .loop
> +    RET
> +
> +
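
Worth noting for readers: pmovsxwd with a 128-bit memory operand widens a
full 8-coeff row per instruction here, and the 16- and 32-wide kernels below
just repeat that step once per 8-coeff chunk. One row step in rough
intrinsics form (illustrative names again):

    #include <immintrin.h>
    #include <stdint.h>

    /* Hypothetical equivalent of one "Row n" block above. */
    static void widen8(int32_t *dst, const int16_t *src, int shift, int offset)
    {
        __m256i v = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i *)src));
        v = _mm256_add_epi32(v, _mm256_set1_epi32(offset));
        v = _mm256_sra_epi32(v, _mm_cvtsi32_si128(shift));
        _mm256_storeu_si256((__m256i *)dst, v);
    }
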
>  ;--------------------------------------------------------------------------------------
>  ; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset);
>  ;--------------------------------------------------------------------------------------
> @@ -3569,6 +3650,72 @@
>      RET
>  
>  
> +INIT_YMM avx2
> +cglobal cvt16to32_shr_16, 3,5,4
> +    add             r2d, r2d
> +    movd            xm0, r3m
> +    vpbroadcastd    m1, r4m
> +    mov             r3d, 16/4
> +    lea             r4, [r2 * 3]
> +
> +    ; register alloc
> +    ; r0 - dst
> +    ; r1 - src
> +    ; r2 - stride
> +    ; r3 - loop counter
> +    ; r4 - stride * 3
> +    ; m0 - shift
> +    ; m1 - dword [offset]
> +
> +.loop:
> +    ; Row 0
> +    pmovsxwd        m2, [r1 + 0 * mmsize/2]
> +    pmovsxwd        m3, [r1 + 1 * mmsize/2]
> +    paddd           m2, m1
> +    paddd           m3, m1
> +    psrad           m2, xm0
> +    psrad           m3, xm0
> +    movu            [r0 + 0 * mmsize], m2
> +    movu            [r0 + 1 * mmsize], m3
> +
> +    ; Row 1
> +    pmovsxwd        m2, [r1 + r2 + 0 * mmsize/2]
> +    pmovsxwd        m3, [r1 + r2 + 1 * mmsize/2]
> +    paddd           m2, m1
> +    paddd           m3, m1
> +    psrad           m2, xm0
> +    psrad           m3, xm0
> +    movu            [r0 + 2 * mmsize], m2
> +    movu            [r0 + 3 * mmsize], m3
> +
> +    add             r0, 4 * mmsize
> +
> +    ; Row 2
> +    pmovsxwd        m2, [r1 + r2 * 2 + 0 * mmsize/2]
> +    pmovsxwd        m3, [r1 + r2 * 2 + 1 * mmsize/2]
> +    paddd           m2, m1
> +    paddd           m3, m1
> +    psrad           m2, xm0
> +    psrad           m3, xm0
> +    movu            [r0 + 0 * mmsize], m2
> +    movu            [r0 + 1 * mmsize], m3
> +
> +    ; Row 3
> +    pmovsxwd        m2, [r1 + r4 + 0 * mmsize/2]
> +    pmovsxwd        m3, [r1 + r4 + 1 * mmsize/2]
> +    paddd           m2, m1
> +    paddd           m3, m1
> +    psrad           m2, xm0
> +    psrad           m3, xm0
> +    movu            [r0 + 2 * mmsize], m2
> +    movu            [r0 + 3 * mmsize], m3
> +
> +    add             r0, 4 * mmsize
> +    lea             r1, [r1 + r2 * 4]
> +    dec             r3d
> +    jnz             .loop
> +    RET
> +
> +
>  ;--------------------------------------------------------------------------------------
>  ; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset);
>  ;--------------------------------------------------------------------------------------
> @@ -3631,6 +3778,66 @@
>      RET
>  
>  
> +INIT_YMM avx2
> +cglobal cvt16to32_shr_32, 3,4,6
> +    add             r2d, r2d
> +    movd            xm0, r3m
> +    vpbroadcastd    m1, r4m
> +    mov             r3d, 32/2
> +
> +    ; register alloc
> +    ; r0 - dst
> +    ; r1 - src
> +    ; r2 - stride
> +    ; r3 - loop counter
> +    ; m0 - shift
> +    ; m1 - dword [offset]
> +
> +.loop:
> +    ; Row 0
> +    pmovsxwd        m2, [r1 + 0 * mmsize/2]
> +    pmovsxwd        m3, [r1 + 1 * mmsize/2]
> +    pmovsxwd        m4, [r1 + 2 * mmsize/2]
> +    pmovsxwd        m5, [r1 + 3 * mmsize/2]
> +    paddd           m2, m1
> +    paddd           m3, m1
> +    paddd           m4, m1
> +    paddd           m5, m1
> +    psrad           m2, xm0
> +    psrad           m3, xm0
> +    psrad           m4, xm0
> +    psrad           m5, xm0
> +    movu            [r0 + 0 * mmsize], m2
> +    movu            [r0 + 1 * mmsize], m3
> +    movu            [r0 + 2 * mmsize], m4
> +    movu            [r0 + 3 * mmsize], m5
> +    add             r0, 4 * mmsize
> +
> +    ; Row 1
> +    pmovsxwd        m2, [r1 + r2 + 0 * mmsize/2]
> +    pmovsxwd        m3, [r1 + r2 + 1 * mmsize/2]
> +    pmovsxwd        m4, [r1 + r2 + 2 * mmsize/2]
> +    pmovsxwd        m5, [r1 + r2 + 3 * mmsize/2]
> +    paddd           m2, m1
> +    paddd           m3, m1
> +    paddd           m4, m1
> +    paddd           m5, m1
> +    psrad           m2, xm0
> +    psrad           m3, xm0
> +    psrad           m4, xm0
> +    psrad           m5, xm0
> +    movu            [r0 + 0 * mmsize], m2
> +    movu            [r0 + 1 * mmsize], m3
> +    movu            [r0 + 2 * mmsize], m4
> +    movu            [r0 + 3 * mmsize], m5
> +    add             r0, 4 * mmsize
> +
> +    lea             r1, [r1 + r2 * 2]
> +    dec             r3d
> +    jnz             .loop
> +    RET
> +
> +
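
Structurally the 32-wide kernel is the widen8() step sketched above applied
four times per row, two rows per loop trip (r3d counts 32/2 = 16 iterations).
The loop shape in C, reusing that hypothetical helper:

    /* Illustrative shape of cvt16to32_shr_32; widen8() is the
     * hypothetical per-chunk helper from the earlier sketch. */
    for (int i = 0; i < 32 / 2; i++)
    {
        for (int row = 0; row < 2; row++)
            for (int chunk = 0; chunk < 4; chunk++)
                widen8(dst + row * 32 + chunk * 8,
                       src + row * stride + chunk * 8, shift, offset);
        src += 2 * stride;
        dst += 2 * 32;
    }
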
>  ;--------------------------------------------------------------------------------------
>  ; void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift)
>  ;--------------------------------------------------------------------------------------
> diff -r 77fe0cc583e8 -r 6cdcf1a7fa98 source/common/x86/blockcopy8.h
> --- a/source/common/x86/blockcopy8.h	Wed Aug 27 14:25:17 2014 +0530
> +++ b/source/common/x86/blockcopy8.h	Wed Aug 27 12:16:16 2014 -0700
> @@ -38,6 +38,10 @@
>  void x265_cvt16to32_shr_8_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
>  void x265_cvt16to32_shr_16_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
>  void x265_cvt16to32_shr_32_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
> +void x265_cvt16to32_shr_4_avx2(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
> +void x265_cvt16to32_shr_8_avx2(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
> +void x265_cvt16to32_shr_16_avx2(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
> +void x265_cvt16to32_shr_32_avx2(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
>  uint32_t x265_cvt16to32_cnt_4_sse4(int32_t * dst, int16_t * src, intptr_t);
>  uint32_t x265_cvt16to32_cnt_8_sse4(int32_t * dst, int16_t * src, intptr_t);
>  uint32_t x265_cvt16to32_cnt_16_sse4(int32_t * dst, int16_t * src, intptr_t);
> 

-- 
Steve Borho

