[x265] [PATCH] asm: avx2 version cvt16to32_shr[]
Steve Borho
steve at borho.org
Wed Aug 27 21:53:33 CEST 2014
On 08/27, Min Chen wrote:
> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1409166976 25200
> # Node ID 6cdcf1a7fa9803898e8f04818865cc150db250ea
> # Parent 77fe0cc583e8ec10275bc1b3c4bb116d5ceb51ac
> asm: avx2 version cvt16to32_shr[]
>
> 4x4     135c ->  105c
> 8x8     375c ->  233c (fully unrolled: 228c)
> 16x16  1333c ->  816c
> 32x32  5278c -> 2690c
This is failing tests on my Mac:

    Testing primitives: AVX2
    cvt16to32_shr failed!
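
For the record, what the testbench checks these kernels against amounts to
the following (a sketch inferred from the prototype and the sse4 versions
below; the function name and the 'size' parameter are mine, not the actual
C primitive in the tree):

    #include <stdint.h>

    /* dst is a contiguous size*size block; src is strided in elements */
    static void cvt16to32_shr_ref(int32_t *dst, const int16_t *src,
                                  intptr_t stride, int shift, int offset,
                                  int size)
    {
        for (int i = 0; i < size; i++)
            for (int j = 0; j < size; j++)
                dst[i * size + j] =
                    ((int32_t)src[i * stride + j] + offset) >> shift;
    }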
> diff -r 77fe0cc583e8 -r 6cdcf1a7fa98 source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp Wed Aug 27 14:25:17 2014 +0530
> +++ b/source/common/x86/asm-primitives.cpp Wed Aug 27 12:16:16 2014 -0700
> @@ -1714,6 +1714,10 @@
> p.cvt32to16_shl[BLOCK_8x8] = x265_cvt32to16_shl_8_avx2;
> p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_avx2;
> p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_avx2;
> + p.cvt16to32_shr[BLOCK_4x4] = x265_cvt16to32_shr_4_avx2;
> + p.cvt16to32_shr[BLOCK_8x8] = x265_cvt16to32_shr_8_avx2;
> + p.cvt16to32_shr[BLOCK_16x16] = x265_cvt16to32_shr_16_avx2;
> + p.cvt16to32_shr[BLOCK_32x32] = x265_cvt16to32_shr_32_avx2;
> p.denoiseDct = x265_denoise_dct_avx2;
> }
> #endif // if HIGH_BIT_DEPTH
> diff -r 77fe0cc583e8 -r 6cdcf1a7fa98 source/common/x86/blockcopy8.asm
> --- a/source/common/x86/blockcopy8.asm Wed Aug 27 14:25:17 2014 +0530
> +++ b/source/common/x86/blockcopy8.asm Wed Aug 27 12:16:16 2014 -0700
> @@ -3437,6 +3437,38 @@
> RET
>
>
> +INIT_YMM avx2
> +cglobal cvt16to32_shr_4, 3,3,4
> + add r2d, r2d
> + movd xm0, r3m
> + vpbroadcastd m1, r4m
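
Possibly the culprit, possibly not: vpbroadcastd only accepts an xmm or
memory source, and if I'm reading x86inc.asm correctly, r4m resolves to a
GPR on non-Windows x64 (the fifth argument arrives in a register there).
If so, a two-step load should be safe on every ABI (untested guess on my
part):

    movd         xm1, r4m
    vpbroadcastd m1, xm1

The same pattern is in the 8, 16 and 32 variants below.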
> +
> + ; register alloc
> + ; r0 - dst
> + ; r1 - src
> + ; r2 - stride
> + ; m0 - shift
> + ; m1 - dword [offset]
> +
> + ; Row 0-1
> + pmovsxwd xm2, [r1]
> + pmovsxwd xm3, [r1 + r2]
> + vinserti128 m2, m2, xm3, 1
> + paddd m2, m1
> + psrad m2, xm0
> + movu [r0 + 0 * mmsize], m2
> +
> + ; Row 2-3
> + lea r1, [r1 + r2 * 2]
> + pmovsxwd xm2, [r1]
> + pmovsxwd xm3, [r1 + r2]
> + vinserti128 m2, m2, xm3, 1
> + paddd m2, m1
> + psrad m2, xm0
> + movu [r0 + 1 * mmsize], m2
> + RET
> +
> +
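For anyone following along: the 4x4 body packs two 4-dword rows into one
ymm so the add/shift/store run once per row pair. In intrinsics terms it
is roughly this (my paraphrase of rows 0-1, not what the assembler emits;
the helper name is mine):

    #include <immintrin.h>
    #include <stdint.h>

    /* rows 0-1: widen two 4-coefficient rows, pack, offset, shift, store */
    static void shr_two_rows(int32_t *dst, const int16_t *src,
                             intptr_t stride, int shift, int offset)
    {
        __m128i lo = _mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i *)src));
        __m128i hi = _mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i *)(src + stride)));
        __m256i v  = _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
        /* add the rounding offset, then arithmetic shift right */
        v = _mm256_sra_epi32(_mm256_add_epi32(v, _mm256_set1_epi32(offset)),
                             _mm_cvtsi32_si128(shift));
        _mm256_storeu_si256((__m256i *)dst, v);
    }
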
> ;--------------------------------------------------------------------------------------
> ; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset);
> ;--------------------------------------------------------------------------------------
> @@ -3506,6 +3538,55 @@
> RET
>
>
> +INIT_YMM avx2
> +cglobal cvt16to32_shr_8, 3,5,3
> + add r2d, r2d
> + movd xm0, r3m
> + vpbroadcastd m1, r4m
> + mov r3d, 8/4
> + lea r4, [r2 * 3]
> +
> + ; register alloc
> + ; r0 - dst
> + ; r1 - src
> + ; r2 - stride
> + ; r3 - loop counter
> + ; r4 - stride * 3
> + ; m0 - shift
> + ; m1 - dword [offset]
> +
> +.loop:
> + ; Row 0
> + pmovsxwd m2, [r1]
> + paddd m2, m1
> + psrad m2, xm0
> + movu [r0 + 0 * mmsize], m2
> +
> + ; Row 1
> + pmovsxwd m2, [r1 + r2]
> + paddd m2, m1
> + psrad m2, xm0
> + movu [r0 + 1 * mmsize], m2
> +
> + ; Row 2
> + pmovsxwd m2, [r1 + r2 * 2]
> + paddd m2, m1
> + psrad m2, xm0
> + movu [r0 + 2 * mmsize], m2
> +
> + ; Row 3
> + pmovsxwd m2, [r1 + r4]
> + paddd m2, m1
> + psrad m2, xm0
> + movu [r0 + 3 * mmsize], m2
> +
> + add r0, 4 * mmsize
> + lea r1, [r1 + r2 * 4]
> + dec r3d
> + jnz .loop
> + RET
> +
> +
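From 8x8 upward the packing step disappears: vpmovsxwd widens eight words
straight into a ymm, so every group of eight coefficients becomes one
load/add/shift/store. Per chunk, roughly (again my paraphrase; the helper
name is mine):

    /* one 8-coefficient chunk: load, sign-extend, offset, shift, store */
    static void shr_chunk(int32_t *dst, const int16_t *src,
                          int shift, int offset)
    {
        __m256i v = _mm256_cvtepi16_epi32(_mm_loadu_si128((const __m128i *)src));
        v = _mm256_sra_epi32(_mm256_add_epi32(v, _mm256_set1_epi32(offset)),
                             _mm_cvtsi32_si128(shift));
        _mm256_storeu_si256((__m256i *)dst, v);
    }

The 16 and 32 variants below process two and four such chunks per row.
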
> ;--------------------------------------------------------------------------------------
> ; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset);
> ;--------------------------------------------------------------------------------------
> @@ -3569,6 +3650,72 @@
> RET
>
>
> +INIT_YMM avx2
> +cglobal cvt16to32_shr_16, 3,5,4
> + add r2d, r2d
> + movd xm0, r3m
> + vpbroadcastd m1, r4m
> + mov r3d, 16/4
> + lea r4, [r2 * 3]
> +
> + ; register alloc
> + ; r0 - dst
> + ; r1 - src
> + ; r2 - stride
> + ; r3 - loop counter
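> + ; r4 - stride * 3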
> + ; m0 - shift
> + ; m1 - dword [offset]
> +
> +.loop:
> + ; Row 0
> + pmovsxwd m2, [r1 + 0 * mmsize/2]
> + pmovsxwd m3, [r1 + 1 * mmsize/2]
> + paddd m2, m1
> + paddd m3, m1
> + psrad m2, xm0
> + psrad m3, xm0
> + movu [r0 + 0 * mmsize], m2
> + movu [r0 + 1 * mmsize], m3
> +
> + ; Row 1
> + pmovsxwd m2, [r1 + r2 + 0 * mmsize/2]
> + pmovsxwd m3, [r1 + r2 + 1 * mmsize/2]
> + paddd m2, m1
> + paddd m3, m1
> + psrad m2, xm0
> + psrad m3, xm0
> + movu [r0 + 2 * mmsize], m2
> + movu [r0 + 3 * mmsize], m3
> +
> + add r0, 4 * mmsize
> +
> + ; Row 2
> + pmovsxwd m2, [r1 + r2 * 2 + 0 * mmsize/2]
> + pmovsxwd m3, [r1 + r2 * 2 + 1 * mmsize/2]
> + paddd m2, m1
> + paddd m3, m1
> + psrad m2, xm0
> + psrad m3, xm0
> + movu [r0 + 0 * mmsize], m2
> + movu [r0 + 1 * mmsize], m3
> +
> + ; Row 3
> + pmovsxwd m2, [r1 + r4 + 0 * mmsize/2]
> + pmovsxwd m3, [r1 + r4 + 1 * mmsize/2]
> + paddd m2, m1
> + paddd m3, m1
> + psrad m2, xm0
> + psrad m3, xm0
> + movu [r0 + 2 * mmsize], m2
> + movu [r0 + 3 * mmsize], m3
> +
> + add r0, 4 * mmsize
> + lea r1, [r1 + r2 * 4]
> + dec r3d
> + jnz .loop
> + RET
> +
> +
> ;--------------------------------------------------------------------------------------
> ; void cvt16to32_shr(int32_t *dst, int16_t *src, intptr_t stride, int shift, int offset);
> ;--------------------------------------------------------------------------------------
> @@ -3631,6 +3778,66 @@
> RET
>
>
> +INIT_YMM avx2
> +cglobal cvt16to32_shr_32, 3,4,6
> + add r2d, r2d
> + movd xm0, r3m
> + vpbroadcastd m1, r4m
> + mov r3d, 32/2
> +
> + ; register alloc
> + ; r0 - dst
> + ; r1 - src
> + ; r2 - stride
> + ; r3 - loop counter
> + ; m0 - shift
> + ; m1 - dword [offset]
> +
> +.loop:
> + ; Row 0
> + pmovsxwd m2, [r1 + 0 * mmsize/2]
> + pmovsxwd m3, [r1 + 1 * mmsize/2]
> + pmovsxwd m4, [r1 + 2 * mmsize/2]
> + pmovsxwd m5, [r1 + 3 * mmsize/2]
> + paddd m2, m1
> + paddd m3, m1
> + paddd m4, m1
> + paddd m5, m1
> + psrad m2, xm0
> + psrad m3, xm0
> + psrad m4, xm0
> + psrad m5, xm0
> + movu [r0 + 0 * mmsize], m2
> + movu [r0 + 1 * mmsize], m3
> + movu [r0 + 2 * mmsize], m4
> + movu [r0 + 3 * mmsize], m5
> + add r0, 4 * mmsize
> +
> + ; Row 1
> + pmovsxwd m2, [r1 + r2 + 0 * mmsize/2]
> + pmovsxwd m3, [r1 + r2 + 1 * mmsize/2]
> + pmovsxwd m4, [r1 + r2 + 2 * mmsize/2]
> + pmovsxwd m5, [r1 + r2 + 3 * mmsize/2]
> + paddd m2, m1
> + paddd m3, m1
> + paddd m4, m1
> + paddd m5, m1
> + psrad m2, xm0
> + psrad m3, xm0
> + psrad m4, xm0
> + psrad m5, xm0
> + movu [r0 + 0 * mmsize], m2
> + movu [r0 + 1 * mmsize], m3
> + movu [r0 + 2 * mmsize], m4
> + movu [r0 + 3 * mmsize], m5
> + add r0, 4 * mmsize
> +
> + lea r1, [r1 + r2 * 2]
> + dec r3d
> + jnz .loop
> + RET
> +
> +
> ;--------------------------------------------------------------------------------------
> ; void convert32to16_shl(int16_t *dst, int32_t *src, intptr_t stride, int shift)
> ;--------------------------------------------------------------------------------------
> diff -r 77fe0cc583e8 -r 6cdcf1a7fa98 source/common/x86/blockcopy8.h
> --- a/source/common/x86/blockcopy8.h Wed Aug 27 14:25:17 2014 +0530
> +++ b/source/common/x86/blockcopy8.h Wed Aug 27 12:16:16 2014 -0700
> @@ -38,6 +38,10 @@
> void x265_cvt16to32_shr_8_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
> void x265_cvt16to32_shr_16_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
> void x265_cvt16to32_shr_32_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
> +void x265_cvt16to32_shr_4_avx2(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
> +void x265_cvt16to32_shr_8_avx2(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
> +void x265_cvt16to32_shr_16_avx2(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
> +void x265_cvt16to32_shr_32_avx2(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
> uint32_t x265_cvt16to32_cnt_4_sse4(int32_t * dst, int16_t * src, intptr_t);
> uint32_t x265_cvt16to32_cnt_8_sse4(int32_t * dst, int16_t * src, intptr_t);
> uint32_t x265_cvt16to32_cnt_16_sse4(int32_t * dst, int16_t * src, intptr_t);
>
--
Steve Borho