[x265] [PATCH] idct8 sse2
dave
dtyx265 at gmail.com
Thu Nov 20 06:02:40 CET 2014
I made no changes to how the stack is used so that's all from gcc. I'll
work on it. Perhaps something like what idct8_ssse3 uses?
On 11/19/2014 08:46 PM, chen wrote:
> you never allocate stack space but still use it, e.g. "[rsp - 72]"
>
> At 2014-11-20 11:48:01,dave <dtyx265 at gmail.com> wrote:
>
> How does it fail? Are you getting a segmentation fault? It works
> fine on Debian/gcc, but it is dependent on the stack being aligned
> at 8. I don't have a Windows environment.
>
> I also replaced all the registers and mov instructions with the
> defines of x86inc.asm for x86_64. Perhaps I missed something that
> windows needs.
>
> On 11/19/2014 07:22 PM, chen wrote:
>> The code doesn't pass the testbench on my PC
>> VS2008, Win7 x64
>> At 2014-11-20 11:13:31,dtyx265 at gmail.com wrote:
>> ># HG changeset patch
>> ># User David T Yuen<dtyx265 at gmail.com>
>> ># Date 1416451149 28800
>> ># Node ID 37392ba74268210aafa8123d9f7c12d46a22c152
>> ># Parent d059cfa88f1ac79b319bd8a05bc70704d454f0ba
>> >idct8 sse2
>> >
>> >Based on the output of gcc (Debian 4.7.2-5) produced by the following command
>> >
>> >c++ -S -masm=intel -DX265_ARCH_X86=1 -DX86_64=1 -DHAVE_INT_TYPES_H=1 -D__STDC_LIMIT_MACROS=1 -DHIGH_BIT_DEPTH=0 -O3 -DNDEBUG -I/home/shakezula/Development/x265/source/. -I/home/shakezula/Development/x265/source/Lib -I/home/shakezula/Development/x265/source/common -I/home/shakezula/Development/x265/source/encoder -I/home/shakezula/Development/x265/build/linux -Wall -Wextra -Wshadow -fPIC -ffast-math -mstackrealign -fno-exceptions -Wno-unused-parameter -msse3 -o ~/Development/dct-sse3.asm -c /home/shakezula/Development/x265/source/common/vec/dct-sse3.cpp
>> >
>> >It has been tweaked for better register usage, with fewer values written to the stack and better setup of r2 indexing for writes
>> >
>> >diff -r d059cfa88f1a -r 37392ba74268 source/common/x86/asm-primitives.cpp
>> >--- a/source/common/x86/asm-primitives.cpp Tue Nov 18 14:11:12 2014 -0600
>> >+++ b/source/common/x86/asm-primitives.cpp Wed Nov 19 18:39:09 2014 -0800
>> >@@ -1377,6 +1377,7 @@
>> > p.dct[DCT_4x4] = x265_dct4_sse2;
>> > p.idct[IDCT_4x4] = x265_idct4_sse2;
>> > p.idct[IDST_4x4] = x265_idst4_sse2;
>> >+ p.idct[IDCT_8x8] = x265_idct8_sse2;
>> >
>> > LUMA_SS_FILTERS(_sse2);
>> > }
>> >@@ -1567,6 +1568,7 @@
>> > p.ssim_end_4 = x265_pixel_ssim_end4_sse2;
>> > p.dct[DCT_4x4] = x265_dct4_sse2;
>> > p.idct[IDCT_4x4] = x265_idct4_sse2;
>> >+ p.idct[IDCT_8x8] = x265_idct8_sse2;
>> > p.idct[IDST_4x4] = x265_idst4_sse2;
>> > p.planecopy_sp = x265_downShift_16_sse2;
>> > p.copy_shl[BLOCK_4x4] = x265_copy_shl_4_sse2;
>> >diff -r d059cfa88f1a -r 37392ba74268 source/common/x86/dct8.asm
>> >--- a/source/common/x86/dct8.asm Tue Nov 18 14:11:12 2014 -0600
>> >+++ b/source/common/x86/dct8.asm Wed Nov 19 18:39:09 2014 -0800
>> >@@ -302,6 +302,19 @@
>> >
>> > pb_idct8odd: db 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15
>> >
>> >+tab_idct8: times 4 dw 89, 75
>> >+ times 4 dw 50, 18
>> >+ times 4 dw 75, -18
>> >+ times 4 dw -89, -50
>> >+ times 4 dw 50, -89
>> >+ times 4 dw 18, 75
>> >+ times 4 dw 18, -50
>> >+ times 4 dw 75, -89
>> >+ times 4 dw 64, 64
>> >+ times 4 dw 64, -64
>> >+ times 4 dw 83, 36
>> >+ times 4 dw 36, -83
>> >+
>> > SECTION .text
>> > cextern pd_1
>> > cextern pd_2
>> >@@ -976,6 +989,387 @@
>> > ;-------------------------------------------------------
>> > ; void idct8(int32_t *src, int16_t *dst, intptr_t stride)
>> > ;-------------------------------------------------------
>> >+INIT_XMM sse2
>> >+
>> >+%if BIT_DEPTH == 10
>> >+ %define IDCT_SHIFT 10
>> >+ %define IDCT_ADD pd_512
>> >+%elif BIT_DEPTH == 8
>> >+ %define IDCT_SHIFT 12
>> >+ %define IDCT_ADD pd_2048
>> >+%else
>> >+ %error Unsupported BIT_DEPTH!
>> >+%endif
>> >+
>> >+cglobal idct8, 3,7, 16
>> >+ lea r2, [r2 + r2] ;set r2 to index of 1
>> >+ lea r4, [r2 + r2] ;set r4 to index of 2
>> >+ lea r3, [r4 + r2] ;set r3 to index of 3
>> >+ lea r4, [r4 + r3] ;set r4 to index of 5
>> >+ mova m9, [r0 + 32]
>> >+ packssdw m9, [r0 + 48]
>> >+ movu m1, [r0 + 96]
>> >+ packssdw m1, [r0 + 112]
>> >+ mova m7, m9
>> >+ punpcklwd m7, m1
>> >+ punpckhwd m9, m1
>> >+ mova m14, [tab_idct8]
>> >+ mova m3, m14
>> >+ pmaddwd m14, m7
>> >+ pmaddwd m3, m9
>> >+ mova m0, [r0 + 160]
>> >+ packssdw m0, [r0 + 176]
>> >+ mova m10, [r0 + 224]
>> >+ packssdw m10, [r0 + 240]
>> >+ mova m2, m0
>> >+ punpcklwd m2, m10
>> >+ punpckhwd m0, m10
>> >+ mova m15, [tab_idct8 + 16]
>> >+ mova m11, [tab_idct8 + 16]
>> >+ pmaddwd m15, m2
>> >+ mova m4, [tab_idct8 + 32]
>> >+ pmaddwd m11, m0
>> >+ mova m1, [tab_idct8 + 32]
>> >+ paddd m15, m14
>> >+ mova m5, [tab_idct8 + 64]
>> >+ mova m12, [tab_idct8 + 64]
>> >+ paddd m11, m3
>> >+ mova [rsp - 72], m11
>> >+ mova [rsp - 88], m15
>> >+ pmaddwd m4, m7
>> >+ pmaddwd m1, m9
>> >+ mova m14, [tab_idct8 + 48]
>> >+ mova m3, [tab_idct8 + 48]
>> >+ pmaddwd m14, m2
>> >+ pmaddwd m3, m0
>> >+ paddd m14, m4
>> >+ paddd m3, m1
>> >+ mova [rsp - 40], m3
>> >+ pmaddwd m5, m9
>> >+ pmaddwd m9, [tab_idct8 + 96]
>> >+ mova m6, [tab_idct8 + 80]
>> >+ pmaddwd m12, m7
>> >+ pmaddwd m7, [tab_idct8 + 96]
>> >+ mova m4, [tab_idct8 + 80]
>> >+ pmaddwd m6, m2
>> >+ paddd m6, m12
>> >+ pmaddwd m2, [tab_idct8 + 112]
>> >+ paddd m7, m2
>> >+ mova [rsp - 24], m6
>> >+ pmaddwd m4, m0
>> >+ pmaddwd m0, [tab_idct8 + 112]
>> >+ paddd m9, m0
>> >+ paddd m5, m4
>> >+ mova m6, [r0]
>> >+ packssdw m6, [r0 + 16]
>> >+ mova m0, [r0 + 128]
>> >+ packssdw m0, [r0 + 144]
>> >+ mova m4, m6
>> >+ mova m12, [r0 + 64]
>> >+ punpcklwd m4, m0
>> >+ punpckhwd m6, m0
>> >+ packssdw m12, [r0 + 80]
>> >+ mova m0, [r0 + 192]
>> >+ packssdw m0, [r0 + 208]
>> >+ mova m13, m12
>> >+ mova m8, [tab_idct8 + 128]
>> >+ punpcklwd m13, m0
>> >+ mova m10, [tab_idct8 + 128]
>> >+ punpckhwd m12, m0
>> >+ pmaddwd m8, m4
>> >+ mova m3, m8
>> >+ pmaddwd m4, [tab_idct8 + 144]
>> >+ pmaddwd m10, m6
>> >+ mova m2, [tab_idct8 + 160]
>> >+ mova m1, m10
>> >+ pmaddwd m6, [tab_idct8 + 144]
>> >+ mova m0, [tab_idct8 + 160]
>> >+ pmaddwd m2, m13
>> >+ paddd m3, m2
>> >+ psubd m8, m2
>> >+ mova m2, m6
>> >+ pmaddwd m13, [tab_idct8 + 176]
>> >+ pmaddwd m0, m12
>> >+ paddd m1, m0
>> >+ psubd m10, m0
>> >+ mova m0, m4
>> >+ pmaddwd m12, [tab_idct8 + 176]
>> >+ paddd m3, [pd_64]
>> >+ paddd m1, [pd_64]
>> >+ paddd m8, [pd_64]
>> >+ paddd m10, [pd_64]
>> >+ paddd m0, m13
>> >+ paddd m2, m12
>> >+ paddd m0, [pd_64]
>> >+ paddd m2, [pd_64]
>> >+ psubd m4, m13
>> >+ psubd m6, m12
>> >+ paddd m4, [pd_64]
>> >+ paddd m6, [pd_64]
>> >+ mova m12, m8
>> >+ psubd m8, m7
>> >+ psrad m8, 7
>> >+ paddd m15, m3
>> >+ psubd m3, [rsp - 88]
>> >+ psrad m15, 7
>> >+ paddd m12, m7
>> >+ psrad m12, 7
>> >+ paddd m11, m1
>> >+ mova m13, m14
>> >+ psrad m11, 7
>> >+ packssdw m15, m11
>> >+ psubd m1, [rsp - 72]
>> >+ psrad m1, 7
>> >+ mova m11, [rsp - 40]
>> >+ paddd m14, m0
>> >+ psrad m14, 7
>> >+ psubd m0, m13
>> >+ psrad m0, 7
>> >+ paddd m11, m2
>> >+ mova m13, [rsp - 24]
>> >+ psrad m11, 7
>> >+ packssdw m14, m11
>> >+ mova m11, m6
>> >+ psubd m6, m5
>> >+ paddd m13, m4
>> >+ psrad m13, 7
>> >+ psrad m6, 7
>> >+ paddd m11, m5
>> >+ psrad m11, 7
>> >+ packssdw m13, m11
>> >+ mova m11, m10
>> >+ psubd m4, [rsp - 24]
>> >+ psubd m10, m9
>> >+ psrad m4, 7
>> >+ psrad m10, 7
>> >+ packssdw m4, m6
>> >+ packssdw m8, m10
>> >+ paddd m11, m9
>> >+ psrad m11, 7
>> >+ packssdw m12, m11
>> >+ psubd m2, [rsp - 40]
>> >+ mova m5, m15
>> >+ psrad m2, 7
>> >+ packssdw m0, m2
>> >+ mova m2, m14
>> >+ psrad m3, 7
>> >+ packssdw m3, m1
>> >+ mova m6, m13
>> >+ punpcklwd m5, m8
>> >+ punpcklwd m2, m4
>> >+ mova m1, m12
>> >+ punpcklwd m6, m0
>> >+ punpcklwd m1, m3
>> >+ mova m9, m5
>> >+ punpckhwd m13, m0
>> >+ mova m0, m2
>> >+ punpcklwd m9, m6
>> >+ punpckhwd m5, m6
>> >+ punpcklwd m0, m1
>> >+ punpckhwd m2, m1
>> >+ punpckhwd m15, m8
>> >+ mova m1, m5
>> >+ punpckhwd m14, m4
>> >+ punpckhwd m12, m3
>> >+ mova m6, m9
>> >+ punpckhwd m9, m0
>> >+ punpcklwd m1, m2
>> >+ mova m4, [tab_idct8]
>> >+ punpckhwd m5, m2
>> >+ punpcklwd m6, m0
>> >+ mova m2, m15
>> >+ mova m0, m14
>> >+ mova m7, m9
>> >+ punpcklwd m2, m13
>> >+ punpcklwd m0, m12
>> >+ punpcklwd m7, m5
>> >+ punpckhwd m14, m12
>> >+ mova m10, m2
>> >+ punpckhwd m15, m13
>> >+ punpckhwd m9, m5
>> >+ pmaddwd m4, m7
>> >+ mova m13, m1
>> >+ punpckhwd m2, m0
>> >+ punpcklwd m10, m0
>> >+ mova m0, m15
>> >+ punpckhwd m15, m14
>> >+ mova m12, m1
>> >+ mova m3, [tab_idct8]
>> >+ punpcklwd m0, m14
>> >+ pmaddwd m3, m9
>> >+ mova m11, m2
>> >+ punpckhwd m2, m15
>> >+ punpcklwd m11, m15
>> >+ mova m8, [tab_idct8 + 16]
>> >+ punpcklwd m13, m0
>> >+ punpckhwd m12, m0
>> >+ pmaddwd m8, m11
>> >+ paddd m8, m4
>> >+ mova [rsp - 88], m8
>> >+ mova m4, [tab_idct8 + 32]
>> >+ pmaddwd m4, m7
>> >+ mova m15, [tab_idct8 + 32]
>> >+ mova m5, [tab_idct8 + 16]
>> >+ pmaddwd m15, m9
>> >+ pmaddwd m5, m2
>> >+ paddd m5, m3
>> >+ mova [rsp - 72], m5
>> >+ mova m14, [tab_idct8 + 48]
>> >+ mova m5, [tab_idct8 + 48]
>> >+ pmaddwd m14, m11
>> >+ paddd m14, m4
>> >+ mova [rsp - 56], m14
>> >+ pmaddwd m5, m2
>> >+ paddd m5, m15
>> >+ mova [rsp - 40], m5
>> >+ mova m15, [tab_idct8 + 64]
>> >+ mova m5, [tab_idct8 + 64]
>> >+ pmaddwd m15, m7
>> >+ pmaddwd m7, [tab_idct8 + 96]
>> >+ pmaddwd m5, m9
>> >+ pmaddwd m9, [tab_idct8 + 96]
>> >+ mova m4, [tab_idct8 + 80]
>> >+ pmaddwd m4, m2
>> >+ paddd m5, m4
>> >+ mova m4, m6
>> >+ mova m8, [tab_idct8 + 80]
>> >+ punpckhwd m6, m10
>> >+ pmaddwd m2, [tab_idct8 + 112]
>> >+ punpcklwd m4, m10
>> >+ paddd m9, m2
>> >+ pmaddwd m8, m11
>> >+ mova m10, [tab_idct8 + 128]
>> >+ paddd m8, m15
>> >+ pmaddwd m11, [tab_idct8 + 112]
>> >+ paddd m7, m11
>> >+ mova [rsp - 24], m8
>> >+ pmaddwd m10, m6
>> >+ pmaddwd m6, [tab_idct8 + 144]
>> >+ mova m1, m10
>> >+ mova m8, [tab_idct8 + 128]
>> >+ mova m3, [tab_idct8 + 160]
>> >+ pmaddwd m8, m4
>> >+ pmaddwd m4, [tab_idct8 + 144]
>> >+ mova m0, m8
>> >+ mova m2, [tab_idct8 + 160]
>> >+ pmaddwd m3, m13
>> >+ psubd m8, m3
>> >+ paddd m0, m3
>> >+ mova m3, m6
>> >+ pmaddwd m13, [tab_idct8 + 176]
>> >+ pmaddwd m2, m12
>> >+ paddd m1, m2
>> >+ psubd m10, m2
>> >+ mova m2, m4
>> >+ pmaddwd m12, [tab_idct8 + 176]
>> >+ paddd m0, [IDCT_ADD]
>> >+ paddd m1, [IDCT_ADD]
>> >+ paddd m8, [IDCT_ADD]
>> >+ paddd m10, [IDCT_ADD]
>> >+ paddd m2, m13
>> >+ paddd m3, m12
>> >+ paddd m2, [IDCT_ADD]
>> >+ paddd m3, [IDCT_ADD]
>> >+ psubd m4, m13
>> >+ psubd m6, m12
>> >+ paddd m4, [IDCT_ADD]
>> >+ paddd m6, [IDCT_ADD]
>> >+ mova m15, [rsp - 88]
>> >+ mova m12, m8
>> >+ psubd m8, m7
>> >+ psrad m8, IDCT_SHIFT
>> >+ mova m11, [rsp - 72]
>> >+ paddd m15, m0
>> >+ psrad m15, IDCT_SHIFT
>> >+ psubd m0, [rsp - 88]
>> >+ psrad m0, IDCT_SHIFT
>> >+ paddd m12, m7
>> >+ paddd m11, m1
>> >+ mova m14, [rsp - 56]
>> >+ psrad m11, IDCT_SHIFT
>> >+ packssdw m15, m11
>> >+ psubd m1, [rsp - 72]
>> >+ psrad m1, IDCT_SHIFT
>> >+ mova m11, [rsp - 40]
>> >+ paddd m14, m2
>> >+ psrad m14, IDCT_SHIFT
>> >+ packssdw m0, m1
>> >+ psrad m12, IDCT_SHIFT
>> >+ psubd m2, [rsp - 56]
>> >+ paddd m11, m3
>> >+ mova m13, [rsp - 24]
>> >+ psrad m11, IDCT_SHIFT
>> >+ packssdw m14, m11
>> >+ mova m11, m6
>> >+ psubd m6, m5
>> >+ paddd m13, m4
>> >+ psrad m13, IDCT_SHIFT
>> >+ mova m1, m15
>> >+ paddd m11, m5
>> >+ psrad m11, IDCT_SHIFT
>> >+ packssdw m13, m11
>> >+ mova m11, m10
>> >+ psubd m10, m9
>> >+ psrad m10, IDCT_SHIFT
>> >+ packssdw m8, m10
>> >+ psrad m6, IDCT_SHIFT
>> >+ psubd m4, [rsp - 24]
>> >+ paddd m11, m9
>> >+ psrad m11, IDCT_SHIFT
>> >+ packssdw m12, m11
>> >+ punpcklwd m1, m14
>> >+ mova m5, m13
>> >+ psrad m4, IDCT_SHIFT
>> >+ packssdw m4, m6
>> >+ psubd m3, [rsp - 40]
>> >+ psrad m2, IDCT_SHIFT
>> >+ mova m6, m8
>> >+ psrad m3, IDCT_SHIFT
>> >+ punpcklwd m5, m12
>> >+ packssdw m2, m3
>> >+ punpcklwd m6, m4
>> >+ punpckhwd m8, m4
>> >+ mova m4, m1
>> >+ mova m3, m2
>> >+ punpckhdq m1, m5
>> >+ punpckldq m4, m5
>> >+ punpcklwd m3, m0
>> >+ punpckhwd m2, m0
>> >+ mova m0, m6
>> >+ lea r0, [r4 + r2 * 2] ;set r0 to index of 7
>> >+ movq [r1], m4
>> >+ punpckhwd m15, m14
>> >+ movhps [r1 + r2], m4
>> >+ punpckhdq m0, m3
>> >+ movq [r1 + r2 * 2], m1
>> >+ punpckhwd m13, m12
>> >+ movhps [r1 + r3], m1
>> >+ mova m1, m6
>> >+ punpckldq m1, m3
>> >+ movq [r1 + 8], m1
>> >+ movhps [r1 + r2 + 8], m1
>> >+ movq [r1 + r2 * 2 + 8], m0
>> >+ movhps [r1 + r3 + 8], m0
>> >+ mova m0, m15
>> >+ punpckhdq m15, m13
>> >+ punpckldq m0, m13
>> >+ movq [r1 + r2 * 4], m0
>> >+ movhps [r1 + r4], m0
>> >+ mova m0, m8
>> >+ punpckhdq m8, m2
>> >+ movq [r1 + r3 * 2], m15
>> >+ punpckldq m0, m2
>> >+ movhps [r1 + r0], m15
>> >+ movq [r1 + r2 * 4 + 8], m0
>> >+ movhps [r1 + r4 + 8], m0
>> >+ movq [r1 + r3 * 2 + 8], m8
>> >+ movhps [r1 + r0 + 8], m8
>> >+ RET
>> >+%undef IDCT_SHIFT
>> >+
>> >+;-------------------------------------------------------
>> >+; void idct8(int32_t *src, int16_t *dst, intptr_t stride)
>> >+;-------------------------------------------------------
>> > INIT_XMM ssse3
>> >
>> > cglobal patial_butterfly_inverse_internal_pass1
>> >diff -r d059cfa88f1a -r 37392ba74268 source/common/x86/dct8.h
>> >--- a/source/common/x86/dct8.h Tue Nov 18 14:11:12 2014 -0600
>> >+++ b/source/common/x86/dct8.h Wed Nov 19 18:39:09 2014 -0800
>> >@@ -35,6 +35,7 @@
>> > void x265_idst4_sse2(int32_t *src, int16_t *dst, intptr_t stride);
>> > void x265_idct4_sse2(int32_t *src, int16_t *dst, intptr_t stride);
>> > void x265_idct4_avx2(int32_t *src, int16_t *dst, intptr_t stride);
>> >+void x265_idct8_sse2(int32_t *src, int16_t *dst, intptr_t stride);
>> > void x265_idct8_ssse3(int32_t *src, int16_t *dst, intptr_t stride);
>> > void x265_idct8_avx2(int32_t *src, int16_t *dst, intptr_t stride);
>> > void x265_idct16_avx2(int32_t *src, int16_t *dst, intptr_t stride);
>> >_______________________________________________
>> >x265-devel mailing list
>> >x265-devel at videolan.org
>> >https://mailman.videolan.org/listinfo/x265-devel
>>
>>
>> _______________________________________________
>> x265-devel mailing list
>> x265-devel at videolan.org
>> https://mailman.videolan.org/listinfo/x265-devel
>
>
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20141119/75fa140b/attachment-0001.html>
More information about the x265-devel
mailing list