[x265] [PATCH] asm: dct8 sse2 1.88x improvement over c code
dave
dtyx265 at gmail.com
Fri Feb 20 17:06:10 CET 2015
On 02/19/2015 04:58 PM, Steve Borho wrote:
> On 02/19, dtyx265 at gmail.com wrote:
>> # HG changeset patch
>> # User David T Yuen <dtyx265 at gmail.com>
>> # Date 1424385856 28800
>> # Node ID 28287b57013e9c43488bfba1570ded5cfb4af16d
>> # Parent 039ea966d5ebccab1de2c3766fb7b4f125d2020a
>> asm: dct8 sse2 1.88x improvement over c code
>>
>> This is backported from dct8 sse4
> it would be helpful for reviewers if you could tell us which CPUs this
> will help and by how much.
This is intended for CPUs whose highest supported instruction set is SSE2
or SSE3 and, until an SSSE3 dct8 primitive is developed, for SSSE3 CPUs as
well. The following result is from testing on my SSE3 system:
dct8x8 1.88x 11202.50 21044.87
> If your CPU does not have SSE4 would the
> encoder use the C reference if this primitive were not present?
Yes, this replaces the C reference on systems with sse2 to ssse3.
>
>> diff -r 039ea966d5eb -r 28287b57013e source/common/x86/asm-primitives.cpp
>> --- a/source/common/x86/asm-primitives.cpp Wed Feb 18 19:04:02 2015 -0600
>> +++ b/source/common/x86/asm-primitives.cpp Thu Feb 19 14:44:16 2015 -0800
>> @@ -872,6 +872,7 @@
>> p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = (pixelcmp_t)x265_pixel_ssd_ss_32x64_sse2;
>>
>> p.cu[BLOCK_4x4].dct = x265_dct4_sse2;
>> + p.cu[BLOCK_8x8].dct = x265_dct8_sse2;
>> p.cu[BLOCK_4x4].idct = x265_idct4_sse2;
>> #if X86_64
>> p.cu[BLOCK_8x8].idct = x265_idct8_sse2;
>> @@ -1080,6 +1081,7 @@
>> p.ssim_end_4 = x265_pixel_ssim_end4_sse2;
>>
>> p.cu[BLOCK_4x4].dct = x265_dct4_sse2;
>> + p.cu[BLOCK_8x8].dct = x265_dct8_sse2;
>> p.cu[BLOCK_4x4].idct = x265_idct4_sse2;
>> #if X86_64
>> p.cu[BLOCK_8x8].idct = x265_idct8_sse2;
>> diff -r 039ea966d5eb -r 28287b57013e source/common/x86/dct8.asm
>> --- a/source/common/x86/dct8.asm Wed Feb 18 19:04:02 2015 -0600
>> +++ b/source/common/x86/dct8.asm Thu Feb 19 14:44:16 2015 -0800
>> @@ -748,6 +748,368 @@
>> movhps [r1 + r2], m1
>> RET
>>
>> +;-------------------------------------------------------
>> +; void dct8(const int16_t* src, int16_t* dst, intptr_t srcStride) - SSE2 8x8 forward DCT (backport of the sse4 dct8); r0 = src, r1 = dst, r2 = srcStride
>> +;-------------------------------------------------------
>> +INIT_XMM sse2
>> +cglobal dct8, 3,6,8,0-16*mmsize ; presumably x86inc's cglobal: 3 args, 6 GPRs, 8 XMM regs, 16*mmsize bytes of stack scratch - confirm against x86inc
>> + ;------------------------
>> + ; Stack Mapping(dword), as written by the pass-1 stores below
>> + ;------------------------
>> + ; Row0[0-3] Row0[4-7]
>> + ; Row1[0-3] Row1[4-7]
>> + ; ...
>> + ; Row7[0-3] Row7[4-7]
>> + ; [0-3] = four dwords from the x == 0 iteration (first four source rows), at rsp + (2*row)*mmsize
>> + ; [4-7] = four dwords from the x == 1 iteration (last four source rows, dword order reversed), at rsp + (2*row+1)*mmsize
>> + ;------------------------
>> +%if BIT_DEPTH == 10
>> + %define DCT_SHIFT1 4
>> + %define DCT_ADD1 [pd_8]
>> +%elif BIT_DEPTH == 8
>> + %define DCT_SHIFT1 2
>> + %define DCT_ADD1 [pd_2]
>> +%else
>> + %error Unsupported BIT_DEPTH!
>> +%endif
>> +%define DCT_ADD2 [pd_256]
>> +%define DCT_SHIFT2 9
>> +
>> + add r2, r2 ; r2 = 2 * srcStride (presumably int16_t elements -> bytes)
>> + lea r3, [r2 * 3] ; r3 = 3 * r2, byte offset of the fourth row
>> + mov r5, rsp ; r5 = pass-1 write cursor into the stack scratch area
>> +%assign x 0
>> +%rep 2
>> + movu m0, [r0] ; pass 1: each %rep iteration loads four source rows of eight int16_t
>> + movu m1, [r0 + r2]
>> + movu m2, [r0 + r2 * 2]
>> + movu m3, [r0 + r3]
>> +
>> + punpcklwd m4, m0, m1 ; word/dword interleaves: transpose the four loaded rows
>> + punpckhwd m0, m1
>> + punpcklwd m5, m2, m3
>> + punpckhwd m2, m3
>> + punpckldq m1, m4, m5 ; m1 = [1 0]
>> + punpckhdq m4, m5 ; m4 = [3 2]
>> + punpckldq m3, m0, m2
>> + punpckhdq m0, m2
>> + pshufd m2, m3, 0x4E ; m2 = [4 5] (0x4E swaps the two qword halves)
>> + pshufd m0, m0, 0x4E ; m0 = [6 7]
>> +
>> + paddw m3, m1, m0 ; butterfly: sums (s*) and differences (d*) of mirrored columns
>> + psubw m1, m0 ; m1 = [d1 d0]
>> + paddw m0, m4, m2
>> + psubw m4, m2 ; m4 = [d3 d2]
>> + punpcklqdq m2, m3, m0 ; m2 = [s2 s0]
>> + punpckhqdq m3, m0
>> + pshufd m3, m3, 0x4E ; m3 = [s1 s3]
>> +
>> + punpcklwd m0, m1, m4 ; m0 = [d2/d0]
>> + punpckhwd m1, m4 ; m1 = [d3/d1]
>> + punpckldq m4, m0, m1 ; m4 = [d3 d1 d2 d0]
>> + punpckhdq m0, m1 ; m0 = [d3 d1 d2 d0]
>> +
>> + ; odd rows (1, 3, 5, 7): built from the differences in m4/m0
>> + lea r4, [tab_dct8_1] ; r4 = coefficient table (defined outside this hunk)
>> + pmaddwd m1, m4, [r4 + 0*16]
>> + pmaddwd m5, m0, [r4 + 0*16]
>> + pshufd m1, m1, 0xD8 ; 0xD8 swaps the two middle dwords
>> + pshufd m5, m5, 0xD8
>> + mova m7, m1
>> + punpckhqdq m7, m5
>> + punpcklqdq m1, m5
>> + paddd m1, m7 ; add the paired partial products
>> + paddd m1, DCT_ADD1 ; add the rounding constant ...
>> + psrad m1, DCT_SHIFT1 ; ... then arithmetic shift right by the pass-1 shift
>> + %if x == 1
>> + pshufd m1, m1, 0x1B ; second half only: reverse the four dwords
>> + %endif
>> + mova [r5 + 1*2*mmsize], m1 ; Row 1
>> +
>> + pmaddwd m1, m4, [r4 + 1*16]
>> + pmaddwd m5, m0, [r4 + 1*16]
>> + pshufd m1, m1, 0xD8
>> + pshufd m5, m5, 0xD8
>> + mova m7, m1
>> + punpckhqdq m7, m5
>> + punpcklqdq m1, m5
>> + paddd m1, m7
>> + paddd m1, DCT_ADD1
>> + psrad m1, DCT_SHIFT1
>> + %if x == 1
>> + pshufd m1, m1, 0x1B
>> + %endif
>> + mova [r5 + 3*2*mmsize], m1 ; Row 3
>> +
>> + pmaddwd m1, m4, [r4 + 2*16]
>> + pmaddwd m5, m0, [r4 + 2*16]
>> + pshufd m1, m1, 0xD8
>> + pshufd m5, m5, 0xD8
>> + mova m7, m1
>> + punpckhqdq m7, m5
>> + punpcklqdq m1, m5
>> + paddd m1, m7
>> + paddd m1, DCT_ADD1
>> + psrad m1, DCT_SHIFT1
>> + %if x == 1
>> + pshufd m1, m1, 0x1B
>> + %endif
>> + mova [r5 + 5*2*mmsize], m1 ; Row 5
>> +
>> + pmaddwd m4, [r4 + 3*16] ; last odd row: m4/m0 are consumed in place
>> + pmaddwd m0, [r4 + 3*16]
>> + pshufd m4, m4, 0xD8
>> + pshufd m0, m0, 0xD8
>> + mova m7, m4
>> + punpckhqdq m7, m0
>> + punpcklqdq m4, m0
>> + paddd m4, m7
>> + paddd m4, DCT_ADD1
>> + psrad m4, DCT_SHIFT1
>> + %if x == 1
>> + pshufd m4, m4, 0x1B
>> + %endif
>> + mova [r5 + 7*2*mmsize], m4; Row 7
>> +
>> + ; even rows (0, 2, 4, 6): built from the sums in m2/m3
>> + lea r4, [tab_dct4] ; r4 = coefficient table (defined outside this hunk)
>> + paddw m0, m2, m3 ; m0 = [EE1 EE0]
>> + pshufd m0, m0, 0xD8
>> + pshuflw m0, m0, 0xD8
>> + pshufhw m0, m0, 0xD8
>> + psubw m2, m3 ; m2 = [EO1 EO0]
>> + pmullw m2, [pw_ppppmmmm] ; presumably +1 in the low four words, -1 in the high four (constant defined elsewhere) - confirm
>> + pshufd m2, m2, 0xD8
>> + pshuflw m2, m2, 0xD8
>> + pshufhw m2, m2, 0xD8
>> + pmaddwd m3, m0, [r4 + 0*16]
>> + paddd m3, DCT_ADD1
>> + psrad m3, DCT_SHIFT1
>> + %if x == 1
>> + pshufd m3, m3, 0x1B
>> + %endif
>> + mova [r5 + 0*2*mmsize], m3 ; Row 0
>> + pmaddwd m0, [r4 + 2*16]
>> + paddd m0, DCT_ADD1
>> + psrad m0, DCT_SHIFT1
>> + %if x == 1
>> + pshufd m0, m0, 0x1B
>> + %endif
>> + mova [r5 + 4*2*mmsize], m0 ; Row 4
>> + pmaddwd m3, m2, [r4 + 1*16]
>> + paddd m3, DCT_ADD1
>> + psrad m3, DCT_SHIFT1
>> + %if x == 1
>> + pshufd m3, m3, 0x1B
>> + %endif
>> + mova [r5 + 2*2*mmsize], m3 ; Row 2
>> + pmaddwd m2, [r4 + 3*16]
>> + paddd m2, DCT_ADD1
>> + psrad m2, DCT_SHIFT1
>> + %if x == 1
>> + pshufd m2, m2, 0x1B
>> + %endif
>> + mova [r5 + 6*2*mmsize], m2 ; Row 6
>> +
>> + %if x != 1
>> + lea r0, [r0 + r2 * 4] ; advance src by four rows for the second iteration
>> + add r5, mmsize ; second iteration writes the odd mmsize slots of each stack row
>> + %endif
>> +%assign x x+1
>> +%endrep
>> +
>> + mov r0, rsp ; r0 = pointer to Low Part
>> + lea r4, [tab_dct8_2] ; r4 = pass-2 coefficient table (defined outside this hunk)
>> +
>> +%assign x 0
>> +%rep 4
>> + mova m0, [r0 + 0*2*mmsize] ; [3 2 1 0] ; pass 2: each %rep iteration reads four mmsize stack slots
>> + mova m1, [r0 + 1*2*mmsize]
>> + paddd m2, m0, [r0 + (0*2+1)*mmsize] ; add the reversed second half (stored in the odd slot)
>> + pshufd m2, m2, 0x9C ; m2 = [s2 s1 s3 s0]
>> + paddd m3, m1, [r0 + (1*2+1)*mmsize]
>> + pshufd m3, m3, 0x9C ; m3 = ^^
>> + psubd m0, [r0 + (0*2+1)*mmsize] ; m0 = [d3 d2 d1 d0]
>> + psubd m1, [r0 + (1*2+1)*mmsize] ; m1 = ^^
>> +
>> + ; even
>> + pshufd m4, m2, 0xD8
>> + pshufd m3, m3, 0xD8
>> + mova m7, m4
>> + punpckhqdq m7, m3
>> + punpcklqdq m4, m3
>> + mova m2, m4
>> + paddd m4, m7 ; m4 = [EE1 EE0 EE1 EE0]
>> + psubd m2, m7 ; m2 = [EO1 EO0 EO1 EO0]
>> +
>> + pslld m4, 6 ; m4 = [64*EE1 64*EE0]
>> + mova m5, m2
>> + pmuludq m5, [r4 + 0*16] ; pmuludq only multiplies dwords 0 and 2 (SSE2 has no pmulld) ...
>> + pshufd m7, m2, 0xF5 ; ... so dwords 1 and 3 are moved into the even positions ...
>> + movu m6, [r4 + 0*16 + 4] ; ... and the table is re-read one dword later to line up their coefficients
>> + pmuludq m7, m6
>> + pshufd m5, m5, 0x88 ; keep the low dword of each 64-bit product
>> + pshufd m7, m7, 0x88
>> + punpckldq m5, m7 ; m5 = [36*EO1 83*EO0]
>> + pshufd m7, m2, 0xF5
>> + pmuludq m2, [r4 + 1*16]
>> + movu m6, [r4 + 1*16 + 4]
>> + pmuludq m7, m6
>> + pshufd m2, m2, 0x88
>> + pshufd m7, m7, 0x88
>> + punpckldq m2, m7 ; m2 = [83*EO1 36*EO0]
>> +
>> + pshufd m3, m4, 0xD8
>> + pshufd m5, m5, 0xD8
>> + mova m7, m3
>> + punpckhqdq m7, m5
>> + punpcklqdq m3, m5
>> + paddd m3, m7 ; m3 = [Row2 Row0]
>> + paddd m3, DCT_ADD2 ; pass-2 rounding constant ...
>> + psrad m3, DCT_SHIFT2 ; ... and pass-2 shift
>> + pshufd m4, m4, 0xD8
>> + pshufd m2, m2, 0xD8
>> + mova m7, m4
>> + punpckhqdq m7, m2
>> + punpcklqdq m4, m2
>> + psubd m4, m7 ; m4 = [Row6 Row4]
>> + paddd m4, DCT_ADD2
>> + psrad m4, DCT_SHIFT2
>> +
>> + packssdw m3, m3 ; narrow dwords to int16_t with signed saturation
>> + movd [r1 + 0*mmsize], m3 ; 4 bytes = two int16_t into output row 0 (rows are mmsize bytes apart)
>> + pshufd m3, m3, 1
>> + movd [r1 + 2*mmsize], m3
>> +
>> + packssdw m4, m4
>> + movd [r1 + 4*mmsize], m4
>> + pshufd m4, m4, 1
>> + movd [r1 + 6*mmsize], m4
>> +
>> + ; odd
>> + mova m2, m0 ; same even/odd-dword pmuludq scheme as above, applied to the differences m0/m1
>> + pmuludq m2, [r4 + 2*16]
>> + pshufd m7, m0, 0xF5
>> + movu m6, [r4 + 2*16 + 4]
>> + pmuludq m7, m6
>> + pshufd m2, m2, 0x88
>> + pshufd m7, m7, 0x88
>> + punpckldq m2, m7
>> + mova m3, m1
>> + pmuludq m3, [r4 + 2*16]
>> + pshufd m7, m1, 0xF5
>> + pmuludq m7, m6 ; m6 still holds the shifted coefficients loaded for m0
>> + pshufd m3, m3, 0x88
>> + pshufd m7, m7, 0x88
>> + punpckldq m3, m7
>> + mova m4, m0
>> + pmuludq m4, [r4 + 3*16]
>> + pshufd m7, m0, 0xF5
>> + movu m6, [r4 + 3*16 + 4]
>> + pmuludq m7, m6
>> + pshufd m4, m4, 0x88
>> + pshufd m7, m7, 0x88
>> + punpckldq m4, m7
>> + mova m5, m1
>> + pmuludq m5, [r4 + 3*16]
>> + pshufd m7, m1, 0xF5
>> + pmuludq m7, m6
>> + pshufd m5, m5, 0x88
>> + pshufd m7, m7, 0x88
>> + punpckldq m5, m7
>> + pshufd m2, m2, 0xD8 ; horizontal add, stage 1: pair up the products
>> + pshufd m3, m3, 0xD8
>> + mova m7, m2
>> + punpckhqdq m7, m3
>> + punpcklqdq m2, m3
>> + paddd m2, m7
>> + pshufd m4, m4, 0xD8
>> + pshufd m5, m5, 0xD8
>> + mova m7, m4
>> + punpckhqdq m7, m5
>> + punpcklqdq m4, m5
>> + paddd m4, m7
>> + pshufd m2, m2, 0xD8 ; horizontal add, stage 2: reduce to one sum per output
>> + pshufd m4, m4, 0xD8
>> + mova m7, m2
>> + punpckhqdq m7, m4
>> + punpcklqdq m2, m4
>> + paddd m2, m7 ; m2 = [Row3 Row1]
>> + paddd m2, DCT_ADD2
>> + psrad m2, DCT_SHIFT2
>> +
>> + packssdw m2, m2
>> + movd [r1 + 1*mmsize], m2
>> + pshufd m2, m2, 1
>> + movd [r1 + 3*mmsize], m2
>> +
>> + mova m2, m0 ; repeat the odd-row computation with the next two coefficient rows
>> + pmuludq m2, [r4 + 4*16]
>> + pshufd m7, m0, 0xF5
>> + movu m6, [r4 + 4*16 + 4]
>> + pmuludq m7, m6
>> + pshufd m2, m2, 0x88
>> + pshufd m7, m7, 0x88
>> + punpckldq m2, m7
>> + mova m3, m1
>> + pmuludq m3, [r4 + 4*16]
>> + pshufd m7, m1, 0xF5
>> + pmuludq m7, m6
>> + pshufd m3, m3, 0x88
>> + pshufd m7, m7, 0x88
>> + punpckldq m3, m7
>> + mova m4, m0
>> + pmuludq m4, [r4 + 5*16]
>> + pshufd m7, m0, 0xF5
>> + movu m6, [r4 + 5*16 + 4]
>> + pmuludq m7, m6
>> + pshufd m4, m4, 0x88
>> + pshufd m7, m7, 0x88
>> + punpckldq m4, m7
>> + mova m5, m1
>> + pmuludq m5, [r4 + 5*16]
>> + pshufd m7, m1, 0xF5
>> + pmuludq m7, m6
>> + pshufd m5, m5, 0x88
>> + pshufd m7, m7, 0x88
>> + punpckldq m5, m7
>> + pshufd m2, m2, 0xD8
>> + pshufd m3, m3, 0xD8
>> + mova m7, m2
>> + punpckhqdq m7, m3
>> + punpcklqdq m2, m3
>> + paddd m2, m7
>> + pshufd m4, m4, 0xD8
>> + pshufd m5, m5, 0xD8
>> + mova m7, m4
>> + punpckhqdq m7, m5
>> + punpcklqdq m4, m5
>> + paddd m4, m7
>> + pshufd m2, m2, 0xD8
>> + pshufd m4, m4, 0xD8
>> + mova m7, m2
>> + punpckhqdq m7, m4
>> + punpcklqdq m2, m4
>> + paddd m2, m7 ; m2 = [Row7 Row5]
>> + paddd m2, DCT_ADD2
>> + psrad m2, DCT_SHIFT2
>> +
>> + packssdw m2, m2
>> + movd [r1 + 5*mmsize], m2
>> + pshufd m2, m2, 1
>> + movd [r1 + 7*mmsize], m2
>> +%if x < 3
>> + add r1, mmsize/4 ; advance dst by 4 bytes (two int16_t columns)
>> + add r0, 2*2*mmsize ; advance to the next four mmsize stack slots
>> +%endif
>> +%assign x x+1
>> +%endrep
>> +
>> + RET
>> +; Release the per-function helper macros defined at the top of dct8 (DCT_*, not IDCT_*) so they do not leak into later code.
>> +%undef DCT_SHIFT1
>> +%undef DCT_ADD1
>> +%undef DCT_SHIFT2
>> +%undef DCT_ADD2
>>
>> ;-------------------------------------------------------
>> ; void dct8(const int16_t* src, int16_t* dst, intptr_t srcStride)
>> diff -r 039ea966d5eb -r 28287b57013e source/common/x86/dct8.h
>> --- a/source/common/x86/dct8.h Wed Feb 18 19:04:02 2015 -0600
>> +++ b/source/common/x86/dct8.h Thu Feb 19 14:44:16 2015 -0800
>> @@ -24,6 +24,7 @@
>> #ifndef X265_DCT8_H
>> #define X265_DCT8_H
>> void x265_dct4_sse2(const int16_t* src, int16_t* dst, intptr_t srcStride);
>> +void x265_dct8_sse2(const int16_t* src, int16_t* dst, intptr_t srcStride);
>> void x265_dst4_ssse3(const int16_t* src, int16_t* dst, intptr_t srcStride);
>> void x265_dct8_sse4(const int16_t* src, int16_t* dst, intptr_t srcStride);
>> void x265_dct4_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
>> _______________________________________________
>> x265-devel mailing list
>> x265-devel at videolan.org
>> https://mailman.videolan.org/listinfo/x265-devel
More information about the x265-devel
mailing list