<html xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40"><head><meta http-equiv="Content-Type" content="text/html; charset=us-ascii"><meta name="Generator" content="Microsoft Word 14 (filtered medium)"><title>Re: [x265] primitives for RExt</title><style><!--
/* Font Definitions */
@font-face
{font-family:Wingdings;
panose-1:5 0 0 0 0 0 0 0 0 0;}
@font-face
{font-family:SimSun;
panose-1:2 1 6 0 3 1 1 1 1 1;}
@font-face
{font-family:"MS Gothic";
panose-1:2 11 6 9 7 2 5 8 2 4;}
@font-face
{font-family:"MS Gothic";
panose-1:2 11 6 9 7 2 5 8 2 4;}
@font-face
{font-family:Tahoma;
panose-1:2 11 6 4 3 5 4 4 2 4;}
@font-face
{font-family:"\@SimSun";
panose-1:2 1 6 0 3 1 1 1 1 1;}
@font-face
{font-family:"MS Gothic";
panose-1:2 11 6 9 7 2 5 8 2 4;}
/* Style Definitions */
p.MsoNormal, li.MsoNormal, div.MsoNormal
{margin:0mm;
margin-bottom:.0001pt;
font-size:12.0pt;
font-family:SimSun;
mso-fareast-language:ZH-CN;}
a:link, span.MsoHyperlink
{mso-style-priority:99;
color:blue;
text-decoration:underline;}
a:visited, span.MsoHyperlinkFollowed
{mso-style-priority:99;
color:purple;
text-decoration:underline;}
pre
{mso-style-priority:99;
mso-style-link:"HTML \66F8\5F0F\4ED8\304D \(\6587\5B57\)";
margin:0mm;
margin-bottom:.0001pt;
font-size:12.0pt;
font-family:SimSun;
mso-fareast-language:ZH-CN;}
span.HTML
{mso-style-name:"HTML \66F8\5F0F\4ED8\304D \(\6587\5B57\)";
mso-style-priority:99;
mso-style-link:"HTML \66F8\5F0F\4ED8\304D";
font-family:"Courier New";
mso-fareast-language:ZH-CN;}
span.19
{mso-style-type:personal-reply;
font-family:"Arial","sans-serif";
color:#1F497D;}
.MsoChpDefault
{mso-style-type:export-only;
font-family:"Arial","sans-serif";}
@page WordSection1
{size:612.0pt 792.0pt;
margin:99.25pt 30.0mm 30.0mm 30.0mm;}
div.WordSection1
{page:WordSection1;}
/* List Definitions */
@list l0
{mso-list-id:1167555139;
mso-list-type:hybrid;
mso-list-template-ids:-1577561744 -1557901972 67698699 67698701 67698689 67698699 67698701 67698689 67698699 67698701;}
@list l0:level1
{mso-level-number-format:bullet;
mso-level-text:\F0D8;
mso-level-tab-stop:none;
mso-level-number-position:left;
margin-left:18.0pt;
text-indent:-18.0pt;
font-family:Wingdings;
mso-fareast-font-family:"MS Gothic";
mso-bidi-font-family:SimSun;}
@list l0:level2
{mso-level-number-format:bullet;
mso-level-text:\F0D8;
mso-level-tab-stop:none;
mso-level-number-position:left;
margin-left:42.0pt;
text-indent:-21.0pt;
font-family:Wingdings;}
@list l0:level3
{mso-level-number-format:bullet;
mso-level-text:\F0B2;
mso-level-tab-stop:none;
mso-level-number-position:left;
margin-left:63.0pt;
text-indent:-21.0pt;
font-family:Wingdings;}
@list l0:level4
{mso-level-number-format:bullet;
mso-level-text:\F06C;
mso-level-tab-stop:none;
mso-level-number-position:left;
margin-left:84.0pt;
text-indent:-21.0pt;
font-family:Wingdings;}
@list l0:level5
{mso-level-number-format:bullet;
mso-level-text:\F0D8;
mso-level-tab-stop:none;
mso-level-number-position:left;
margin-left:105.0pt;
text-indent:-21.0pt;
font-family:Wingdings;}
@list l0:level6
{mso-level-number-format:bullet;
mso-level-text:\F0B2;
mso-level-tab-stop:none;
mso-level-number-position:left;
margin-left:126.0pt;
text-indent:-21.0pt;
font-family:Wingdings;}
@list l0:level7
{mso-level-number-format:bullet;
mso-level-text:\F06C;
mso-level-tab-stop:none;
mso-level-number-position:left;
margin-left:147.0pt;
text-indent:-21.0pt;
font-family:Wingdings;}
@list l0:level8
{mso-level-number-format:bullet;
mso-level-text:\F0D8;
mso-level-tab-stop:none;
mso-level-number-position:left;
margin-left:168.0pt;
text-indent:-21.0pt;
font-family:Wingdings;}
@list l0:level9
{mso-level-number-format:bullet;
mso-level-text:\F0B2;
mso-level-tab-stop:none;
mso-level-number-position:left;
margin-left:189.0pt;
text-indent:-21.0pt;
font-family:Wingdings;}
ol
{margin-bottom:0mm;}
ul
{margin-bottom:0mm;}
--></style><!--[if gte mso 9]><xml>
<o:shapedefaults v:ext="edit" spidmax="1026">
<v:textbox inset="5.85pt,.7pt,5.85pt,.7pt" />
</o:shapedefaults></xml><![endif]--><!--[if gte mso 9]><xml>
<o:shapelayout v:ext="edit">
<o:idmap v:ext="edit" data="1" />
</o:shapelayout></xml><![endif]--></head><body lang=JA link=blue vlink=purple><div class=WordSection1><pre><span lang=EN-US style='color:black;mso-fareast-language:JA'>></span><span lang=EN-US style='color:black'>>- mov byte [rsp], %2/4<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black;mso-fareast-language:JA'>></span><span lang=EN-US style='color:black'>>+ mov dword [rsp], %2/4<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black;mso-fareast-language:JA'>></span><span lang=EN-US style='color:black'>Why dword? byte is enough for dynamic range<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'><o:p> </o:p></span></pre><p class=MsoNormal><span lang=EN-US style='font-size:10.0pt;font-family:"Arial","sans-serif";color:#1F497D;mso-fareast-language:JA'>partial write needs read-modify-write.<o:p></o:p></span></p><p class=MsoNormal><span lang=EN-US style='font-size:10.0pt;font-family:"Arial","sans-serif";color:#1F497D;mso-fareast-language:JA'><o:p> </o:p></span></p><p class=MsoNormal><span lang=EN-US style='font-size:10.0pt;font-family:"Arial","sans-serif";color:#1F497D;mso-fareast-language:JA'><o:p> </o:p></span></p><pre><span lang=EN-US style='color:black;mso-fareast-language:JA'>></span><span lang=EN-US style='color:black'>>+cglobal pixel_sub_ps_2x%2, 6, 7, 4, dest, destride, src0, scr1, srcStride0, srcStride1<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black;mso-fareast-language:JA'><o:p> </o:p></span></pre><pre><span lang=EN-US style='color:black;mso-fareast-language:JA'>></span><span lang=EN-US style='color:black'>pinsrw have 2 uops, movd to load 4 bytes and drop unused is better.<o:p></o:p></span></pre><p class=MsoNormal><span lang=EN-US style='font-size:10.0pt;font-family:"Arial","sans-serif";color:#1F497D;mso-fareast-language:JA'><o:p> </o:p></span></p><p class=MsoNormal><span lang=EN-US 
style='font-size:10.0pt;font-family:"Arial","sans-serif";color:#1F497D;mso-fareast-language:JA'>thanks.<o:p></o:p></span></p><p class=MsoNormal><span lang=EN-US style='font-size:10.0pt;font-family:"Arial","sans-serif";color:#1F497D;mso-fareast-language:JA'>this function is not used, and will be removed.<o:p></o:p></span></p><p class=MsoNormal><span lang=EN-US style='font-size:10.0pt;font-family:"Arial","sans-serif";color:#1F497D;mso-fareast-language:JA'><o:p>&nbsp;</o:p></span></p><p class=MsoNormal><span lang=EN-US style='font-size:10.0pt;font-family:"Arial","sans-serif";color:#1F497D;mso-fareast-language:JA'><o:p>&nbsp;</o:p></span></p><p class=MsoNormal><span lang=EN-US style='font-size:10.0pt;font-family:"Arial","sans-serif";color:#1F497D;mso-fareast-language:JA'><o:p>&nbsp;</o:p></span></p><p class=MsoNormal><span lang=EN-US style='font-size:10.0pt;font-family:"Arial","sans-serif";color:#1F497D'><o:p>&nbsp;</o:p></span></p><div style='border:none;border-left:solid blue 1.5pt;padding:0mm 0mm 0mm 4.0pt'><div><div style='border:none;border-top:solid #B5C4DF 1.0pt;padding:3.0pt 0mm 0mm 0mm'><p class=MsoNormal><b><span lang=EN-US style='font-size:10.0pt;font-family:"Tahoma","sans-serif";mso-fareast-language:JA'>From:</span></b><span lang=EN-US style='font-size:10.0pt;font-family:"Tahoma","sans-serif";mso-fareast-language:JA'> x265-devel [mailto:x265-devel-bounces@videolan.org] <b>On Behalf Of </b>chen<br><b>Sent:</b> Wednesday, August 06, 2014 3:29 AM<br><b>To:</b> Development for x265<br><b>Subject:</b> Re: [x265] primitives for RExt<o:p></o:p></span></p></div></div><p class=MsoNormal><span lang=EN-US><o:p>&nbsp;</o:p></span></p><div><div><p class=MsoNormal><span lang=EN-US style='font-size:10.5pt;font-family:"Arial","sans-serif";color:black'>&nbsp;<o:p></o:p></span></p></div><pre><span lang=EN-US style='color:black'><br>At 2014-08-05 20:48:50,"Satoshi Nakagawa" &lt;<a href="mailto:nakagawa424@oki.com">nakagawa424@oki.com</a>&gt; wrote:<o:p></o:p></span></pre><pre><span lang=EN-US 
style='color:black'>># HG changeset patch<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>># User Satoshi Nakagawa &lt;<a href="mailto:nakagawa424@oki.com">nakagawa424@oki.com</a>&gt;<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>># Date 1407242513 -32400<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>># Tue Aug 05 21:41:53 2014 +0900<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>># Node ID 770c40d768d55e68e76c485d5dc61d014257e789<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>># Parent 0d4723a0080cff763ff20ab9c516c6e082496a0b<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>primitives for RExt<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>><o:p>&nbsp;</o:p></span></pre><pre><span lang=EN-US style='color:black'>>@@ -1494,7 +1599,7 @@<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>> ;-----------------------------------------------------------------------------------------------------------------<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>> %macro FILTER_VER_CHROMA_SS 4<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>> INIT_XMM sse2<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>-cglobal interp_4tap_vert_%3_%1x%2, 5, 7, %4 ,0-1<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+cglobal interp_4tap_vert_%3_%1x%2, 5, 7, %4 ,0-gprsize<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>> <o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>> add r1d, r1d<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>> add r3d, r3d<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>@@ -1508,7 +1613,7 @@<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>> lea r6, [tab_ChromaCoeffV + r4]<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>> %endif<o:p></o:p></span></pre><pre><span lang=EN-US 
style='color:black'>> <o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>- mov byte [rsp], %2/4<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+ mov dword [rsp], %2/4<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>Why dword? byte is enough for dynamic range<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'> <o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>diff -r 0d4723a0080c -r 770c40d768d5 source/common/x86/pixel-util8.asm<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>--- a/source/common/x86/pixel-util8.asm Tue Aug 05 01:05:47 2014 -0500<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+++ b/source/common/x86/pixel-util8.asm Tue Aug 05 21:41:53 2014 +0900<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>@@ -2878,6 +2878,61 @@<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>> RET<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>> <o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>> ;-----------------------------------------------------------------------------<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+; void pixel_sub_ps_2x%2(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1)<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+;-----------------------------------------------------------------------------<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+%macro PIXEL_SUB_PS_W2_H2 2<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+%if HIGH_BIT_DEPTH<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+INIT_XMM sse2<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+cglobal pixel_sub_ps_2x%2, 6, 7, 4, dest, destride, src0, scr1, srcStride0, srcStride1<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+ add r1, 
r1<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+ add r4, r4<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+ add r5, r5<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+ mov r6d, %2/2<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+.loop:<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+ movd m0, [r2]<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+ movd m1, [r3]<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+ movd m2, [r2 + r4]<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+ movd m3, [r3 + r5]<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+ dec r6d<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+ lea r2, [r2 + r4 * 2]<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+ lea r3, [r3 + r5 * 2]<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+ psubw m0, m1<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+ psubw m2, m3<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+ movd [r0], m0<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+ movd [r0 + r1], m2<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+ lea r0, [r0 + 2 * r1]<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+ jnz .loop<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+ RET<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+%else<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+INIT_XMM sse4<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+cglobal pixel_sub_ps_2x%2, 6, 7, 4, dest, destride, src0, scr1, srcStride0, srcStride1<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+ add r1, r1<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+ mov r6d, %2/2<o:p></o:p></span></pre><pre><span lang=EN-US 
style='color:black'>>+.loop:<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+ pinsrw m0, [r2], 0<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+ pinsrw m1, [r3], 0<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+ pinsrw m2, [r2 + r4], 0<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+ pinsrw m3, [r3 + r5], 0<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>pinsrw have 2 uops, movd to load 4 bytes and drop unused is better.<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+ dec r6d<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+ lea r2, [r2 + r4 * 2]<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+ lea r3, [r3 + r5 * 2]<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+ pmovzxbw m0, m0<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+ pmovzxbw m1, m1<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+ pmovzxbw m2, m2<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+ pmovzxbw m3, m3<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+ psubw m0, m1<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+ psubw m2, m3<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+ movd [r0], m0<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+ movd [r0 + r1], m2<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+ lea r0, [r0 + r1 * 2]<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+ jnz .loop<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+ RET<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+%endif<o:p></o:p></span></pre><pre><span lang=EN-US style='color:black'>>+%endmacro<o:p></o:p></span></pre></div></div></div></body></html>