<html>
<head>
<meta content="text/html; charset=windows-1252"
http-equiv="Content-Type">
</head>
<body text="#000000" bgcolor="#FFFFFF">
<div class="moz-cite-prefix">Sorry for the slow reply, I ran into
some unrelated technical difficulties...<br>
<br>
On 12/09/2014 09:56 AM, chen wrote:<br>
</div>
<blockquote
cite="mid:1bd46af6.fa8.14a3031ba0c.Coremail.chenm003@163.com"
type="cite">
<div
style="line-height:1.7;color:#000000;font-size:14px;font-family:arial">
<div> </div>
<pre>
At 2014-12-09 12:21:22,<a class="moz-txt-link-abbreviated" href="mailto:dtyx265@gmail.com">dtyx265@gmail.com</a> wrote:
># HG changeset patch
># User David T Yuen &lt;<a moz-do-not-send="true" href="mailto:dtyx265@gmail.com">dtyx265@gmail.com</a>>
># Date 1418098810 28800
># Node ID 39dfcbf07ae468ca9090e2dabb350cc193060229
># Parent 53f7efef5ebda6d5ff03e868f2b702c385d72ddd
>asm: idct[8x8] sse2 12232 -> 3500 over c code 3550 -> 3500 over intrinsic
>
>--- a/source/common/x86/dct8.asm Sat Dec 06 17:17:59 2014 +0900
>+++ b/source/common/x86/dct8.asm Mon Dec 08 20:20:10 2014 -0800
>@@ -974,6 +974,386 @@
> RET
>
> ;-------------------------------------------------------
>+; void idct8(int32_t* src, int16_t* dst, intptr_t dstStride)
>+;-------------------------------------------------------
>+INIT_XMM sse2
>+%if ARCH_X86_64
>+%if BIT_DEPTH == 10
>+ %define IDCT_SHIFT 10
>+ %define IDCT_ADD pd_512
>+%elif BIT_DEPTH == 8
>+ %define IDCT_SHIFT 12
>+ %define IDCT_ADD pd_2048
>+%else
>+ %error Unsupported BIT_DEPTH!
>+%endif
>+
>+cglobal idct8, 3,6, 16
</pre>
<pre>cglobal's fourth parameter is the stack space</pre>
</div>
</blockquote>
Understood....<br>
<blockquote
cite="mid:1bd46af6.fa8.14a3031ba0c.Coremail.chenm003@163.com"
type="cite">
<div
style="line-height:1.7;color:#000000;font-size:14px;font-family:arial">
<pre> </pre>
<pre>>+ mov r5, rsp
>+ sub rsp, 5 * mmsize
>+ and rsp, ~(16-1)
>+ mova m9, [r0 + 1 * mmsize]
>+ mova m1, [r0 + 3 * mmsize]
>+ mova m7, m9
>+ punpcklwd m7, m1
>+ punpckhwd m9, m1
>+ mova m14, [tab_idct8_3]
>+ mova m3, m14
>+ pmaddwd m14, m7
>+ pmaddwd m3, m9
>+ mova m0, [r0 + 5 * mmsize]
>+ mova m10, [r0 + 7 * mmsize]
>+ mova m2, m0
>+ punpcklwd m2, m10
>+ punpckhwd m0, m10
>+ mova m15, [tab_idct8_3 + 1 * mmsize]
>+ mova m11, [tab_idct8_3 + 1 * mmsize]
>+ pmaddwd m15, m2
>+ mova m4, [tab_idct8_3 + 2 * mmsize]
>+ pmaddwd m11, m0
>+ mova m1, [tab_idct8_3 + 2 * mmsize]
>+ paddd m15, m14
>+ mova m5, [tab_idct8_3 + 4 * mmsize]
>+ mova m12, [tab_idct8_3 + 4 * mmsize]
>+ paddd m11, m3
>+ mova [rsp + 0 * mmsize], m11
>+ mova [rsp + 1 * mmsize], m15
>+ pmaddwd m4, m7
>+ pmaddwd m1, m9
>+ mova m14, [tab_idct8_3 + 3 * mmsize]
>+ mova m3, [tab_idct8_3 + 3 * mmsize]
>+ pmaddwd m14, m2
>+ pmaddwd m3, m0
>+ paddd m14, m4
>+ paddd m3, m1
>+ mova [rsp + 2 * mmsize], m3
>+ pmaddwd m5, m9
>+ pmaddwd m9, [tab_idct8_3 + 6 * mmsize]
>+ mova m6, [tab_idct8_3 + 5 * mmsize]
>+ pmaddwd m12, m7
>+ pmaddwd m7, [tab_idct8_3 + 6 * mmsize]
>+ mova m4, [tab_idct8_3 + 5 * mmsize]
>+ pmaddwd m6, m2
>+ paddd m6, m12
>+ pmaddwd m2, [tab_idct8_3 + 7 * mmsize]
>+ paddd m7, m2
</pre>
<pre>We can reduce the number of registers used; here is an example: m2 is free now</pre>
<pre>>+ mova [rsp + 3 * mmsize], m6
>+ pmaddwd m4, m0
>+ pmaddwd m0, [tab_idct8_3 + 7 * mmsize]
>+ paddd m9, m0
>+ paddd m5, m4
>+ mova m6, [r0 + 0 * mmsize]
</pre>
<pre>m6 can be replaced by m2</pre>
</div>
</blockquote>
I don't see how to get this to work. m2 and m6 are reused and I
don't see any other free registers to replace them with. I have
already made as many improvements of the type you are suggesting as
I could find.<br>
<blockquote
cite="mid:1bd46af6.fa8.14a3031ba0c.Coremail.chenm003@163.com"
type="cite">
<div
style="line-height:1.7;color:#000000;font-size:14px;font-family:arial">
<pre> </pre>
<pre>>+ mova m0, [r0 + 4 * mmsize]
>+ mova m4, m6
>+ punpcklwd m4, m0
>+ punpckhwd m6, m0
>+ mova m12, [r0 + 2 * mmsize]
>+ mova m0, [r0 + 6 * mmsize]
>+ mova m13, m12
>+ mova m8, [tab_dct4]
>+ punpcklwd m13, m0
>+ mova m10, [tab_dct4]
>+ punpckhwd m12, m0
>+ pmaddwd m8, m4
>+ mova m3, m8
>+ pmaddwd m4, [tab_dct4 + 2 * mmsize]
>+ pmaddwd m10, m6
>+ mova m2, [tab_dct4 + 1 * mmsize]
>+ mova m1, m10
>+ pmaddwd m6, [tab_dct4 + 2 * mmsize]
>+ mova m0, [tab_dct4 + 1 * mmsize]
>+ pmaddwd m2, m13
>+ paddd m3, m2
>+ psubd m8, m2
>+ mova m2, m6
>+ pmaddwd m13, [tab_dct4 + 3 * mmsize]
>+ pmaddwd m0, m12
>+ paddd m1, m0
>+ psubd m10, m0
>+ mova m0, m4
>+ pmaddwd m12, [tab_dct4 + 3 * mmsize]
>+ paddd m3, [pd_64]
>+ paddd m1, [pd_64]
>+ paddd m8, [pd_64]
>+ paddd m10, [pd_64]
>+ paddd m0, m13
>+ paddd m2, m12
>+ paddd m0, [pd_64]
>+ paddd m2, [pd_64]
>+ psubd m4, m13
>+ psubd m6, m12
>+ paddd m4, [pd_64]
>+ paddd m6, [pd_64]
>+ mova m12, m8
>+ psubd m8, m7
>+ psrad m8, 7
>+ paddd m15, m3
>+ psubd m3, [rsp + 1 * mmsize]
>+ psrad m15, 7
>+ paddd m12, m7
>+ psrad m12, 7
>+ paddd m11, m1
>+ mova m13, m14
>+ psrad m11, 7
>+ packssdw m15, m11
>+ psubd m1, [rsp + 0 * mmsize]
>+ psrad m1, 7
>+ mova m11, [rsp + 2 * mmsize]
>+ paddd m14, m0
>+ psrad m14, 7
>+ psubd m0, m13
>+ psrad m0, 7
>+ paddd m11, m2
>+ mova m13, [rsp + 3 * mmsize]
>+ psrad m11, 7
>+ packssdw m14, m11
>+ mova m11, m6
>+ psubd m6, m5
>+ paddd m13, m4
>+ psrad m13, 7
>+ psrad m6, 7
>+ paddd m11, m5
>+ psrad m11, 7
>+ packssdw m13, m11
>+ mova m11, m10
>+ psubd m4, [rsp + 3 * mmsize]
>+ psubd m10, m9
>+ psrad m4, 7
>+ psrad m10, 7
>+ packssdw m4, m6
>+ packssdw m8, m10
>+ paddd m11, m9
>+ psrad m11, 7
>+ packssdw m12, m11
>+ psubd m2, [rsp + 2 * mmsize]
>+ mova m5, m15
>+ psrad m2, 7
>+ packssdw m0, m2
>+ mova m2, m14
>+ psrad m3, 7
>+ packssdw m3, m1
>+ mova m6, m13
>+ punpcklwd m5, m8
>+ punpcklwd m2, m4
>+ mova m1, m12
>+ punpcklwd m6, m0
>+ punpcklwd m1, m3
>+ mova m9, m5
>+ punpckhwd m13, m0
>+ mova m0, m2
>+ punpcklwd m9, m6
>+ punpckhwd m5, m6
>+ punpcklwd m0, m1
>+ punpckhwd m2, m1
>+ punpckhwd m15, m8
>+ mova m1, m5
>+ punpckhwd m14, m4
>+ punpckhwd m12, m3
>+ mova m6, m9
>+ punpckhwd m9, m0
>+ punpcklwd m1, m2
>+ mova m4, [tab_idct8_3 + 0 * mmsize]
>+ punpckhwd m5, m2
>+ punpcklwd m6, m0
>+ mova m2, m15
>+ mova m0, m14
>+ mova m7, m9
>+ punpcklwd m2, m13
>+ punpcklwd m0, m12
>+ punpcklwd m7, m5
>+ punpckhwd m14, m12
>+ mova m10, m2
>+ punpckhwd m15, m13
>+ punpckhwd m9, m5
>+ pmaddwd m4, m7
>+ mova m13, m1
>+ punpckhwd m2, m0
>+ punpcklwd m10, m0
>+ mova m0, m15
>+ punpckhwd m15, m14
>+ mova m12, m1
>+ mova m3, [tab_idct8_3 + 0 * mmsize]
>+ punpcklwd m0, m14
>+ pmaddwd m3, m9
>+ mova m11, m2
>+ punpckhwd m2, m15
>+ punpcklwd m11, m15
>+ mova m8, [tab_idct8_3 + 1 * mmsize]
>+ punpcklwd m13, m0
>+ punpckhwd m12, m0
>+ pmaddwd m8, m11
>+ paddd m8, m4
>+ mova [rsp + 4 * mmsize], m8
>+ mova m4, [tab_idct8_3 + 2 * mmsize]
>+ pmaddwd m4, m7
>+ mova m15, [tab_idct8_3 + 2 * mmsize]
>+ mova m5, [tab_idct8_3 + 1 * mmsize]
>+ pmaddwd m15, m9
>+ pmaddwd m5, m2
>+ paddd m5, m3
>+ mova [rsp + 3 * mmsize], m5
>+ mova m14, [tab_idct8_3 + 3 * mmsize]
>+ mova m5, [tab_idct8_3 + 3 * mmsize]
>+ pmaddwd m14, m11
>+ paddd m14, m4
>+ mova [rsp + 2 * mmsize], m14
>+ pmaddwd m5, m2
>+ paddd m5, m15
>+ mova [rsp + 1 * mmsize], m5
>+ mova m15, [tab_idct8_3 + 4 * mmsize]
>+ mova m5, [tab_idct8_3 + 4 * mmsize]
>+ pmaddwd m15, m7
>+ pmaddwd m7, [tab_idct8_3 + 6 * mmsize]
>+ pmaddwd m5, m9
>+ pmaddwd m9, [tab_idct8_3 + 6 * mmsize]
>+ mova m4, [tab_idct8_3 + 5 * mmsize]
>+ pmaddwd m4, m2
>+ paddd m5, m4
>+ mova m4, m6
>+ mova m8, [tab_idct8_3 + 5 * mmsize]
>+ punpckhwd m6, m10
>+ pmaddwd m2, [tab_idct8_3 + 7 * mmsize]
>+ punpcklwd m4, m10
>+ paddd m9, m2
>+ pmaddwd m8, m11
>+ mova m10, [tab_dct4]
>+ paddd m8, m15
>+ pmaddwd m11, [tab_idct8_3 + 7 * mmsize]
>+ paddd m7, m11
>+ mova [rsp + 0 * mmsize], m8
>+ pmaddwd m10, m6
>+ pmaddwd m6, [tab_dct4 + 2 * mmsize]
>+ mova m1, m10
>+ mova m8, [tab_dct4]
>+ mova m3, [tab_dct4 + 1 * mmsize]
>+ pmaddwd m8, m4
>+ pmaddwd m4, [tab_dct4 + 2 * mmsize]
>+ mova m0, m8
>+ mova m2, [tab_dct4 + 1 * mmsize]
>+ pmaddwd m3, m13
>+ psubd m8, m3
>+ paddd m0, m3
>+ mova m3, m6
>+ pmaddwd m13, [tab_dct4 + 3 * mmsize]
>+ pmaddwd m2, m12
>+ paddd m1, m2
>+ psubd m10, m2
>+ mova m2, m4
>+ pmaddwd m12, [tab_dct4 + 3 * mmsize]
>+ paddd m0, [IDCT_ADD]
>+ paddd m1, [IDCT_ADD]
>+ paddd m8, [IDCT_ADD]
>+ paddd m10, [IDCT_ADD]
>+ paddd m2, m13
>+ paddd m3, m12
>+ paddd m2, [IDCT_ADD]
>+ paddd m3, [IDCT_ADD]
>+ psubd m4, m13
>+ psubd m6, m12
>+ paddd m4, [IDCT_ADD]
>+ paddd m6, [IDCT_ADD]
>+ mova m15, [rsp + 4 * mmsize]
>+ mova m12, m8
>+ psubd m8, m7
>+ psrad m8, IDCT_SHIFT
>+ mova m11, [rsp + 3 * mmsize]
>+ paddd m15, m0
>+ psrad m15, IDCT_SHIFT
>+ psubd m0, [rsp + 4 * mmsize]
>+ psrad m0, IDCT_SHIFT
>+ paddd m12, m7
>+ paddd m11, m1
>+ mova m14, [rsp + 2 * mmsize]
>+ psrad m11, IDCT_SHIFT
>+ packssdw m15, m11
>+ psubd m1, [rsp + 3 * mmsize]
>+ psrad m1, IDCT_SHIFT
>+ mova m11, [rsp + 1 * mmsize]
>+ paddd m14, m2
>+ psrad m14, IDCT_SHIFT
>+ packssdw m0, m1
>+ psrad m12, IDCT_SHIFT
>+ psubd m2, [rsp + 2 * mmsize]
>+ paddd m11, m3
>+ mova m13, [rsp + 0 * mmsize]
>+ psrad m11, IDCT_SHIFT
>+ packssdw m14, m11
>+ mova m11, m6
>+ psubd m6, m5
>+ paddd m13, m4
>+ psrad m13, IDCT_SHIFT
>+ mova m1, m15
>+ paddd m11, m5
>+ psrad m11, IDCT_SHIFT
>+ packssdw m13, m11
>+ mova m11, m10
>+ psubd m10, m9
>+ psrad m10, IDCT_SHIFT
>+ packssdw m8, m10
>+ psrad m6, IDCT_SHIFT
>+ psubd m4, [rsp + 0 * mmsize]
>+ paddd m11, m9
>+ psrad m11, IDCT_SHIFT
>+ packssdw m12, m11
>+ punpcklwd m1, m14
>+ mova m5, m13
>+ psrad m4, IDCT_SHIFT
>+ packssdw m4, m6
>+ psubd m3, [rsp + 1 * mmsize]
>+ psrad m2, IDCT_SHIFT
>+ mova m6, m8
>+ psrad m3, IDCT_SHIFT
>+ punpcklwd m5, m12
>+ packssdw m2, m3
>+ punpcklwd m6, m4
>+ punpckhwd m8, m4
>+ mova m4, m1
>+ mova m3, m2
>+ punpckhdq m1, m5
>+ punpckldq m4, m5
>+ punpcklwd m3, m0
>+ punpckhwd m2, m0
>+ mova m0, m6
>+ lea r2, [r2 + r2]
>+ lea r4, [r2 + r2]
>+ lea r3, [r4 + r2]
>+ lea r4, [r4 + r3]
>+ lea r0, [r4 + r2 * 2]
>+ movq [r1], m4
>+ punpckhwd m15, m14
>+ movhps [r1 + r2], m4
>+ punpckhdq m0, m3
>+ movq [r1 + r2 * 2], m1
>+ punpckhwd m13, m12
>+ movhps [r1 + r3], m1
>+ mova m1, m6
>+ punpckldq m1, m3
>+ movq [r1 + 8], m1
>+ movhps [r1 + r2 + 8], m1
>+ movq [r1 + r2 * 2 + 8], m0
>+ movhps [r1 + r3 + 8], m0
>+ mova m0, m15
>+ punpckhdq m15, m13
>+ punpckldq m0, m13
>+ movq [r1 + r2 * 4], m0
>+ movhps [r1 + r4], m0
>+ mova m0, m8
>+ punpckhdq m8, m2
>+ movq [r1 + r3 * 2], m15
>+ punpckldq m0, m2
>+ movhps [r1 + r0], m15
>+ movq [r1 + r2 * 4 + 8], m0
>+ movhps [r1 + r4 + 8], m0
>+ movq [r1 + r3 * 2 + 8], m8
>+ movhps [r1 + r0 + 8], m8
>+ mov rsp, r5
>+ RET
>+
>+%undef IDCT_SHIFT
>+%undef IDCT_ADD
>+%endif
>+
>+;-------------------------------------------------------
> ; void idct8(const int16_t* src, int16_t* dst, intptr_t dstStride)
> ;-------------------------------------------------------
> INIT_XMM ssse3
>diff -r 53f7efef5ebd -r 39dfcbf07ae4 source/common/x86/dct8.h
>--- a/source/common/x86/dct8.h Sat Dec 06 17:17:59 2014 +0900
>+++ b/source/common/x86/dct8.h Mon Dec 08 20:20:10 2014 -0800
>@@ -34,6 +34,9 @@
> void x265_idst4_sse2(const int16_t* src, int16_t* dst, intptr_t dstStride);
> void x265_idct4_sse2(const int16_t* src, int16_t* dst, intptr_t dstStride);
> void x265_idct4_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride);
>+#if X86_64
>+void x265_idct8_sse2(const int16_t* src, int16_t* dst, intptr_t dstStride);
>+#endif
> void x265_idct8_ssse3(const int16_t* src, int16_t* dst, intptr_t dstStride);
> void x265_idct8_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride);
> void x265_idct16_avx2(const int16_t* src, int16_t* dst, intptr_t dstStride);
>_______________________________________________
>x265-devel mailing list
><a class="moz-txt-link-abbreviated" href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a>
><a class="moz-txt-link-freetext" href="https://mailman.videolan.org/listinfo/x265-devel">https://mailman.videolan.org/listinfo/x265-devel</a>
</pre>
</div>
<br>
<fieldset class="mimeAttachmentHeader"></fieldset>
<br>
<pre wrap="">_______________________________________________
x265-devel mailing list
<a class="moz-txt-link-abbreviated" href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a>
<a class="moz-txt-link-freetext" href="https://mailman.videolan.org/listinfo/x265-devel">https://mailman.videolan.org/listinfo/x265-devel</a>
</pre>
</blockquote>
<br>
</body>
</html>