<html>
<head>
<meta content="text/html; charset=windows-1252"
http-equiv="Content-Type">
</head>
<body text="#000000" bgcolor="#FFFFFF">
<div class="moz-cite-prefix">On 04/11/2015 12:34 AM, chen wrote:<br>
</div>
<blockquote
cite="mid:5aed571b.66f.14ca766843f.Coremail.chenm003@163.com"
type="cite">
<div
style="line-height:1.7;color:#000000;font-size:14px;font-family:arial">
<div>add data comment increment readable<br>
</div>
</div>
</blockquote>
Can you explain a little more about this?<br>
<blockquote
cite="mid:5aed571b.66f.14ca766843f.Coremail.chenm003@163.com"
type="cite">
<div
style="line-height:1.7;color:#000000;font-size:14px;font-family:arial">
<div>Some suggestions inline below.</div>
</div>
</blockquote>
responses below<br>
<blockquote
cite="mid:5aed571b.66f.14ca766843f.Coremail.chenm003@163.com"
type="cite">
<div
style="line-height:1.7;color:#000000;font-size:14px;font-family:arial">
<pre>
At 2015-04-11 09:58:38,<a class="moz-txt-link-abbreviated" href="mailto:dtyx265@gmail.com">dtyx265@gmail.com</a> wrote:
># HG changeset patch
># User David T Yuen <a class="moz-txt-link-rfc2396E" href="mailto:dtyx265@gmail.com">&lt;dtyx265@gmail.com&gt;</a>
># Date 1428717487 25200
># Node ID c40653978caea4a4bf8940ae3b0e8db74bbe07d7
># Parent ee76a15fa312ac59549965821d9cbff03237226f
>asm: intra pred all_angs_pred_4x4 sse2
>
>This replaces c code and is backported from sse4
>The processing of modes 10 and 26 were merged and moved to after mode 2
>
>64-bit
>
>./test/TestBench --testbench intrapred | grep intra_allangs4x4
>intra_allangs4x4 9.99x 6449.98 64435.56
>
>32-bit
>
>./test/TestBench --testbench intrapred | grep intra_allangs4x4
>intra_allangs4x4 13.31x 6512.49 86709.86
>
>diff -r ee76a15fa312 -r c40653978cae source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp Fri Apr 10 10:24:55 2015 -0500
>+++ b/source/common/x86/asm-primitives.cpp Fri Apr 10 18:58:07 2015 -0700
>@@ -1259,6 +1259,8 @@
> p.cu[BLOCK_4x4].intra_pred[32] = x265_intra_pred_ang4_4_sse2;
> p.cu[BLOCK_4x4].intra_pred[33] = x265_intra_pred_ang4_3_sse2;
>
>+ p.cu[BLOCK_4x4].intra_pred_allangs = x265_all_angs_pred_4x4_sse2;
>+
> p.cu[BLOCK_4x4].calcresidual = x265_getResidual4_sse2;
> p.cu[BLOCK_8x8].calcresidual = x265_getResidual8_sse2;
>
>diff -r ee76a15fa312 -r c40653978cae source/common/x86/const-a.asm
>--- a/source/common/x86/const-a.asm Fri Apr 10 10:24:55 2015 -0500
>+++ b/source/common/x86/const-a.asm Fri Apr 10 18:58:07 2015 -0700
>@@ -53,6 +53,10 @@
> const pb_shuf8x8c, times 1 db 0, 0, 0, 0, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6
> const pb_movemask, times 16 db 0x00
> times 16 db 0xFF
>+const pb_0000000000000F0F, times 2 db 0xff, 0x00
The constant name is wrong: it does not match the data it defines.
</pre>
</div>
</blockquote>
I was trying to keep it short but if you prefer
pb_00000000000000000000000000FF00FF I can do that. I can do the same
for the other constants.<br>
<blockquote
cite="mid:5aed571b.66f.14ca766843f.Coremail.chenm003@163.com"
type="cite">
<div
style="line-height:1.7;color:#000000;font-size:14px;font-family:arial">
<pre>
>+ times 14 db 0x00</pre>
</div>
</blockquote>
Also, I should have made this 12, not 14<br>
<blockquote
cite="mid:5aed571b.66f.14ca766843f.Coremail.chenm003@163.com"
type="cite">
<div
style="line-height:1.7;color:#000000;font-size:14px;font-family:arial">
<pre>
>+const pb_000000000000000F, db 0xff
>+ times 15 db 0x00
>
> ;; 16-bit constants
>
>@@ -94,6 +98,8 @@
> const multiH2, times 1 dw 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
> const pw_planar16_mul, times 1 dw 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
> const pw_planar32_mul, times 1 dw 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16
>+const pw_FFFFFFF0, dw 0x00
>+ times 7 dw 0xff
>
>
> ;; 32-bit constants
>diff -r ee76a15fa312 -r c40653978cae source/common/x86/intrapred.h
>--- a/source/common/x86/intrapred.h Fri Apr 10 10:24:55 2015 -0500
>+++ b/source/common/x86/intrapred.h Fri Apr 10 18:58:07 2015 -0700
>@@ -275,6 +275,7 @@
> void x265_intra_pred_ang32_23_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
> void x265_intra_pred_ang32_22_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
> void x265_intra_pred_ang32_21_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
>+void x265_all_angs_pred_4x4_sse2(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
> void x265_all_angs_pred_4x4_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
> void x265_all_angs_pred_8x8_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
> void x265_all_angs_pred_16x16_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
>diff -r ee76a15fa312 -r c40653978cae source/common/x86/intrapred8_allangs.asm
>--- a/source/common/x86/intrapred8_allangs.asm Fri Apr 10 10:24:55 2015 -0500
>+++ b/source/common/x86/intrapred8_allangs.asm Fri Apr 10 18:58:07 2015 -0700
>@@ -34,10 +34,17 @@
>
> ; common constant with intrapred8.asm
> cextern ang_table
>+cextern pw_ang_table
> cextern tab_S1
> cextern tab_S2
> cextern tab_Si
>
>+; constants from const-a.asm
>+cextern pw_16
>+cextern pb_000000000000000F
>+cextern pb_0000000000000F0F
>+cextern pw_FFFFFFF0
>+
>
> ;-----------------------------------------------------------------------------
> ; void all_angs_pred_4x4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
>@@ -23006,3 +23013,780 @@
> palignr m4, m2, m1, 14
> movu [r0 + 2111 * 16], m4
> RET
>+
>+;-----------------------------------------------------------------------------
>+; void all_angs_pred_4x4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
>+;-----------------------------------------------------------------------------
>+INIT_XMM sse2
>+cglobal all_angs_pred_4x4, 4, 4, 8
>+
>+; mode 2
>+
>+ movh m6, [r1 + 9]
>+ movh m2, m6
Use mova here: a register-to-register mova maps to register renaming.
</pre>
</div>
</blockquote>
Will do.<br>
<blockquote
cite="mid:5aed571b.66f.14ca766843f.Coremail.chenm003@163.com"
type="cite">
<div
style="line-height:1.7;color:#000000;font-size:14px;font-family:arial">
<pre>
>+ psrldq m2, 1
>+ movd [r0], m2
>+ psrldq m2, 1
>+ movd [r0 + 4], m2
>+ psrldq m2, 1
>+ movd [r0 + 8], m2
>+ psrldq m2, 1
>+ movd [r0 + 12], m2
>+
>+; mode 10/26
>+
>+ pxor m7, m7
>+ pshufd m5, m6, 0
>+ movu [r0 + 128], m5 ;mode 10
>+
>+ movd m4, [r1 + 1]
>+ pshufd m4, m4, 0
>+ movu [r0 + 384], m4 ;mode 26
>+
>+ movd m1, [r1]
>+ punpcklbw m1, m7
>+ pshuflw m1, m1, 0x00
>+ punpcklqdq m1, m1
>+
>+ punpckldq m4, m5
>+ punpcklbw m4, m7
>+ pshuflw m2, m4, 0x00
>+ pshufhw m2, m2, 0x00
>+
>+ psubw m4, m1
>+ psraw m4, 1
>+
>+ pshufd m2, m2, q1032
>+ paddw m4, m2
>+ packuswb m4, m4
>+
>+%if ARCH_X86_64
>+ movq r2, m4
>+
>+ mov [r0 + 128], r2b ;mode 10
>+ shr r2, 8
>+ mov [r0 + 132], r2b
>+ shr r2, 8
>+ mov [r0 + 136], r2b
>+ shr r2, 8
>+ mov [r0 + 140], r2b
>+ shr r2, 8
>+ mov [r0 + 384], r2b ;mode 26
>+ shr r2d, 8
>+ mov [r0 + 388], r2b
>+ shr r2d, 8
>+ mov [r0 + 392], r2b
>+ shr r2d, 8
>+ mov [r0 + 396], r2b
>+
>+%else
>+ movd r2d, m4
>+
>+ mov [r0 + 128], r2b ;mode 10
>+ shr r2d, 8
>+ mov [r0 + 132], r2b
>+ shr r2d, 8
>+ mov [r0 + 136], r2b
>+ shr r2d, 8
>+ mov [r0 + 140], r2b
>+
>+ psrldq m4, 4
>+ movd r2d, m4
>+
>+ mov [r0 + 384], r2b ;mode 26
>+ shr r2d, 8
>+ mov [r0 + 388], r2b
>+ shr r2d, 8
>+ mov [r0 + 392], r2b
>+ shr r2d, 8
>+ mov [r0 + 396], r2b
>+%endif
>+
>+; mode 3
>+
>+ mova m2, [pw_16]
>+ lea r3, [pw_ang_table]
>+
>+ punpcklbw m6, m6
>+ psrldq m6, 1
>+ movh m1, m6
If we keep MOVH here, we can avoid the memory operand in modes 11, 13, 15, 17, etc.
>+ psrldq m6, 2
>+ movh m0, m6
>+ psrldq m6, 2
>+ movh m3, m6
>+ psrldq m6, 2
>+ punpcklbw m1, m7
>+ punpcklbw m0, m7
>+ punpcklbw m3, m7
>+ punpcklbw m6, m7
>+
>+ mova m7, [r3 + 20 * 16]
An offset of 128 or more will generate a 4-byte displacement in the instruction encoding.
</pre>
</div>
</blockquote>
I will adjust r3 and use r2<br>
<blockquote
cite="mid:5aed571b.66f.14ca766843f.Coremail.chenm003@163.com"
type="cite">
<div
style="line-height:1.7;color:#000000;font-size:14px;font-family:arial">
<pre>
>+ pmaddwd m5, m1, [r3 + 26 * 16]
>+ pmaddwd m4, m0, m7
>+
>+ packssdw m5, m4
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ pmaddwd m4, m3, [r3 + 14 * 16]
>+ pmaddwd m6, [r3 + 8 * 16]
>+
>+ packssdw m4, m6
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ packuswb m5, m4
>+ mova [r0 + 16], m5
>+ movd [r0 + 68], m5 ;mode 6 row 1
>+ psrldq m5, 4
>+ movd [r0 + 76], m5 ;mode 6 row 3
>+
>+; mode 4
>+
>+ pmaddwd m4, m0, [r3 + 31 * 16]
>+ pmaddwd m6, m3, m7
>+
>+ packssdw m4, m6
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ pmaddwd m5, m1, [r3 + 21 * 16]
>+ pmaddwd m6, m0, [r3 + 10 * 16]
>+
>+ packssdw m5, m6
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ packuswb m5, m4
>+ mova [r0 + 32], m5
>+
>+; mode 5
>+
>+ pmaddwd m5, m1, [r3 + 17 * 16]
>+ pmaddwd m6, m0, [r3 + 2 * 16]
>+
>+ packssdw m5, m6
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ pmaddwd m4, m0, [r3 + 19 * 16]
>+ pmaddwd m3, [r3 + 4 * 16]
>+
>+ packssdw m4, m3
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ packuswb m5, m4
>+ mova [r0 + 48], m5
>+
>+; mode 6
>+
>+ pmaddwd m5, m1, [r3 + 13 * 16]
>+ pmaddwd m6, m0, [r3 + 7 * 16]
>+
>+ packssdw m5, m6
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ packuswb m5, m6
>+ movd [r0 + 64], m5
>+ psrldq m5, 4
>+ movd [r0 + 72], m5
>+
>+; mode 7
>+
>+ pmaddwd m5, m1, [r3 + 9 * 16]
>+ pmaddwd m6, m1, [r3 + 18 * 16]
>+
>+ packssdw m5, m6
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ mova m3, [r3 + 27 * 16]
>+ pmaddwd m4, m1, m3
>+ pmaddwd m0, [r3 + 4 * 16]
>+
>+ packssdw m4, m0
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ packuswb m5, m4
>+ mova [r0 + 80], m5
>+
>+; mode 8
>+
>+ mova m0, [r3 + 5 * 16]
>+ pmaddwd m5, m1, m0
>+ pmaddwd m6, m1, [r3 + 10 * 16]
>+
>+ packssdw m5, m6
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ pmaddwd m4, m1, [r3 + 15 * 16]
>+ pmaddwd m7, m1
>+
>+ packssdw m4, m7
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ packuswb m5, m4
>+ mova [r0 + 96], m5
>+
>+; mode 9
>+
>+ pmaddwd m5, m1, [r3 + 2 * 16]
>+ pmaddwd m6, m1, [r3 + 4 * 16]
>+
>+ packssdw m5, m6
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ pmaddwd m4, m1, [r3 + 6 * 16]
>+ pmaddwd m6, m1, [r3 + 8 * 16]
>+
>+ packssdw m4, m6
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ packuswb m5, m4
>+ mova [r0 + 112], m5
>+
>+; mode 11
>+
>+ movd m5, [r1]
>+ punpcklwd m5, m1
>+ pand m5, [pb_0000000000000F0F]
You just want the lowest 2 words, and the high QWords of both m1 and m5 are zero, so this can be replaced by PSHUFD.
</pre>
</div>
</blockquote>
The high QWord of m1 is not zero but has needed values. This code
changes m1 from DCCBBAA9 to CBBAA990 where each character is the r1
index of the neighboring pixel expanded to 16 bits.<br>
<blockquote
cite="mid:5aed571b.66f.14ca766843f.Coremail.chenm003@163.com"
type="cite">
<div
style="line-height:1.7;color:#000000;font-size:14px;font-family:arial">
<pre>
>+ pslldq m1, 4
>+ por m1, m5
>+
>+ pmaddwd m5, m1, [r3 + 30 * 16]
>+ pmaddwd m6, m1, [r3 + 28 * 16]
>+
>+ packssdw m5, m6
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ pmaddwd m4, m1, [r3 + 26 * 16]
>+ pmaddwd m6, m1, [r3 + 24 * 16]
>+
>+ packssdw m4, m6
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ packuswb m5, m4
>+ mova [r0 + 144], m5
>+
>+; mode 12
>+
>+ pmaddwd m3, m1
>+ pmaddwd m6, m1, [r3 + 22 * 16]
>+
>+ packssdw m3, m6
>+ paddw m3, m2
>+ psraw m3, 5
>+
>+ pmaddwd m4, m1, [r3 + 17 * 16]
>+ pmaddwd m6, m1, [r3 + 12 * 16]
>+
>+ packssdw m4, m6
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ packuswb m3, m4
>+ mova [r0 + 160], m3
>+
>+; mode 13
>+
>+ mova m3, m1
>+ movd m7, [r1 + 4]
>+ punpcklwd m7, m1
>+ pand m7, [pb_0000000000000F0F]
>+ pslldq m3, 4
>+ por m3, m7
>+
>+ pmaddwd m5, m1, [r3 + 23 * 16]
>+ pmaddwd m6, m1, [r3 + 14 * 16]
>+
>+ packssdw m5, m6
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ pmaddwd m4, m1, m0
>+ pmaddwd m6, m3, [r3 + 28 * 16]
>+
>+ packssdw m4, m6
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ packuswb m5, m4
>+ mova [r0 + 176], m5
>+
>+; mode 14
>+
>+ pmaddwd m5, m1, [r3 + 19 * 16]
>+ pmaddwd m6, m1, [r3 + 6 * 16]
>+
>+ packssdw m5, m6
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ movd m6, [r1 + 2]
>+ pand m3, [pw_FFFFFFF0]
>+ pand m6, [pb_000000000000000F]
>+ por m3, m6
>+
>+ pmaddwd m4, m3, [r3 + 25 * 16]
>+ pmaddwd m6, m3, [r3 + 12 * 16]
>+
>+ packssdw m4, m6
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ packuswb m5, m4
>+ mova [r0 + 192], m5
>+ psrldq m5, 4
>+ movd [r0 + 240], m5 ;mode 17 row 0
>+
>+; mode 15
>+
>+ pmaddwd m5, m1, [r3 + 15 * 16]
>+ pmaddwd m6, m3, [r3 + 30 * 16]
>+
>+ packssdw m5, m6
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ pmaddwd m6, m3, [r3 + 13 * 16]
>+
>+ mova m0, m3
>+ punpcklwd m7, m3
>+ pslldq m0, 4
>+ pand m7, [pb_0000000000000F0F]
>+ por m0, m7
>+
>+ pmaddwd m4, m0, [r3 + 28 * 16]
>+
>+ packssdw m6, m4
>+ paddw m6, m2
>+ psraw m6, 5
>+
>+ packuswb m5, m6
>+ mova [r0 + 208], m5
>+
>+; mode 16
>+
>+ pmaddwd m5, m1, [r3 + 11 * 16]
>+ pmaddwd m6, m3, [r3 + 22 * 16]
>+
>+ packssdw m5, m6
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ pmaddwd m3, [r3 + 1 * 16]
>+
>+ movd m6, [r1 + 3]
>+ pand m0, [pw_FFFFFFF0]
>+ pand m6, [pb_000000000000000F]
>+ por m0, m6
>+
>+ pmaddwd m0, [r3 + 12 * 16]
>+ packssdw m3, m0
>+ paddw m3, m2
>+ psraw m3, 5
>+
>+ packuswb m5, m3
>+ mova [r0 + 224], m5
>+
>+; mode 17
>+
>+ movd m4, [r1 + 1]
>+ punpcklwd m4, m1
>+ pand m4, [pb_0000000000000F0F]
>+ pslldq m1, 4
>+ por m1, m4
>+
>+ pmaddwd m6, m1, [r3 + 12 * 16]
>+
>+ packssdw m6, m6
>+ paddw m6, m2
>+ psraw m6, 5
>+
>+ movh m5, [r1 + 2]
>+ punpcklwd m5, m1
>+ pand m5, [pb_0000000000000F0F]
>+ pslldq m1, 4
>+ por m1, m5
>+
>+ pmaddwd m4, m1, [r3 + 18 * 16]
>+
>+ punpcklwd m7, m1
>+ pand m7, [pb_0000000000000F0F]
>+ pslldq m1, 4
>+ por m1, m7
>+
>+ pmaddwd m1, [r3 + 24 * 16]
>+ packssdw m4, m1
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ packuswb m6, m4
>+ movd [r0 + 244], m6
>+ psrldq m6, 8
>+ movh [r0 + 248], m6
>+
>+; mode 18
>+
>+ movh m1, [r1]
>+ movd [r0 + 256], m1
>+
>+ movh m3, [r1 + 2]
>+ punpcklqdq m3, m1
>+ psrldq m3, 7
>+ movd [r0 + 260], m3
>+
>+ movh m4, [r1 + 3]
>+ punpcklqdq m4, m3
>+ psrldq m4, 7
>+ movd [r0 + 264], m4
>+
>+ movh m0, [r1 + 4]
>+ punpcklqdq m0, m4
>+ psrldq m0, 7
>+ movd [r0 + 268], m0
>+
>+; mode 19
>+
>+ pxor m7, m7
>+ punpcklbw m4, m3
>+ punpcklbw m3, m1
>+ punpcklbw m1, m1
>+ punpcklbw m4, m7
>+ punpcklbw m3, m7
>+ psrldq m1, 1
>+ punpcklbw m1, m7
>+
>+ pmaddwd m6, m1, [r3 + 6 * 16]
>+ pmaddwd m7, m3, [r3 + 12 * 16]
>+
>+ packssdw m6, m7
>+ paddw m6, m2
>+ psraw m6, 5
>+
>+ pmaddwd m5, m4, [r3 + 18 * 16]
>+
>+ movd m7, [r1 + 12]
>+ punpcklwd m7, m4
>+ pand m7, [pb_0000000000000F0F]
>+ pslldq m4, 4
>+ por m4, m7
>+
>+ pmaddwd m4, [r3 + 24 * 16]
>+ packssdw m5, m4
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ packuswb m6, m5
>+ mova [r0 + 272], m6
>+ movd [r0 + 324], m6 ;mode 22 row 1
>+
>+; mode 20
>+
>+ pmaddwd m5, m1, [r3 + 11 * 16]
>+
>+ movd m4, [r1 + 10]
>+ pand m3, [pw_FFFFFFF0]
>+ pand m4, [pb_000000000000000F]
>+ por m3, m4
>+
>+ pmaddwd m6, m3, [r3 + 22 * 16]
>+
>+ packssdw m5, m6
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ pmaddwd m4, m3, [r3 + 1 * 16]
>+
>+ punpcklwd m0, m3
>+ pand m0, [pb_0000000000000F0F]
>+ mova m6, m3
>+ pslldq m6, 4
>+ por m0, m6
>+
>+ pmaddwd m6, m0, [r3 + 12 * 16]
>+
>+ packssdw m4, m6
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ packuswb m5, m4
>+ mova [r0 + 288], m5
>+
>+; mode 21
>+
>+ pmaddwd m4, m1, [r3 + 15 * 16]
>+ pmaddwd m6, m3, [r3 + 30 * 16]
>+
>+ packssdw m4, m6
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ pmaddwd m5, m3, [r3 + 13 * 16]
>+
>+ pand m0, [pw_FFFFFFF0]
>+ pand m7, [pb_000000000000000F]
>+ por m0, m7
>+
>+ pmaddwd m0, [r3 + 28 * 16]
>+ packssdw m5, m0
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ packuswb m4, m5
>+ mova [r0 + 304], m4
>+
>+; mode 22
>+
>+ pmaddwd m4, m1, [r3 + 19 * 16]
>+ packssdw m4, m4
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ mova m0, [r3 + 12 * 16]
>+ pmaddwd m5, m3, [r3 + 25 * 16]
>+ pmaddwd m6, m3, m0
>+
>+ packssdw m5, m6
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ packuswb m4, m5
>+ movd [r0 + 320], m4
>+ psrldq m4, 8
>+ movh [r0 + 328], m4
>+
>+; mode 23
>+
>+ pmaddwd m4, m1, [r3 + 23 * 16]
>+ pmaddwd m5, m1, [r3 + 14 * 16]
>+
>+ packssdw m4, m5
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ pmaddwd m6, m1, [r3 + 5 * 16]
>+
>+ pand m3, [pw_FFFFFFF0]
>+ por m3, m7
>+
>+ pmaddwd m3, [r3 + 28 * 16]
>+ packssdw m6, m3
>+ paddw m6, m2
>+ psraw m6, 5
>+
>+ packuswb m4, m6
>+ mova [r0 + 336], m4
>+
>+; mode 24
>+
>+ pmaddwd m4, m1, [r3 + 27 * 16]
>+ pmaddwd m5, m1, [r3 + 22 * 16]
>+
>+ packssdw m4, m5
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ pmaddwd m6, m1, [r3 + 17 * 16]
>+ pmaddwd m0, m1
>+
>+ packssdw m6, m0
>+ paddw m6, m2
>+ psraw m6, 5
>+
>+ packuswb m4, m6
>+ mova [r0 + 352], m4
>+
>+; mode 25
>+
>+ pmaddwd m4, m1, [r3 + 30 * 16]
>+ pmaddwd m5, m1, [r3 + 28 * 16]
>+
>+ packssdw m4, m5
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ pmaddwd m6, m1, [r3 + 26 * 16]
>+ pmaddwd m1, [r3 + 24 * 16]
>+
>+ packssdw m6, m1
>+ paddw m6, m2
>+ psraw m6, 5
>+
>+ packuswb m4, m6
>+ mova [r0 + 368], m4
>+
>+; mode 27
>+
>+ movh m0, [r1 + 1]
>+ pxor m7, m7
>+ punpcklbw m0, m0
>+ psrldq m0, 1
>+ movh m1, m0
>+ psrldq m0, 2
>+ movh m3, m0
>+ psrldq m0, 2
>+ punpcklbw m1, m7
>+ punpcklbw m3, m7
>+ punpcklbw m0, m7
>+
>+ mova m7, [r3 + 4 * 16]
>+
>+ pmaddwd m4, m1, [r3 + 2 * 16]
>+ pmaddwd m5, m1, m7
>+
>+ packssdw m4, m5
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ pmaddwd m6, m1, [r3 + 6 * 16]
>+ pmaddwd m5, m1, [r3 + 8 * 16]
>+
>+ packssdw m6, m5
>+ paddw m6, m2
>+ psraw m6, 5
>+
>+ packuswb m4, m6
>+ mova [r0 + 400], m4
>+
>+; mode 28
>+
>+ pmaddwd m4, m1, [r3 + 5 * 16]
>+ pmaddwd m5, m1, [r3 + 10 * 16]
>+
>+ packssdw m4, m5
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ pmaddwd m6, m1, [r3 + 15 * 16]
>+ pmaddwd m5, m1, [r3 + 20 * 16]
>+
>+ packssdw m6, m5
>+ paddw m6, m2
>+ psraw m6, 5
>+
>+ packuswb m4, m6
>+ mova [r0 + 416], m4
>+
>+; mode 29
>+
>+ pmaddwd m4, m1, [r3 + 9 * 16]
>+ pmaddwd m6, m1, [r3 + 18 * 16]
>+
>+ packssdw m4, m6
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ pmaddwd m6, m1, [r3 + 27 * 16]
>+ pmaddwd m5, m3, m7
>+
>+ packssdw m6, m5
>+ paddw m6, m2
>+ psraw m6, 5
>+
>+ packuswb m4, m6
>+ mova [r0 + 432], m4
>+
>+; mode 30
>+
>+ pmaddwd m4, m1, [r3 + 13 * 16]
>+ pmaddwd m5, m1, [r3 + 26 * 16]
>+
>+ packssdw m4, m5
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ pmaddwd m6, m3, [r3 + 7 * 16]
>+ pmaddwd m5, m3, [r3 + 20 * 16]
>+
>+ packssdw m6, m5
>+ paddw m6, m2
>+ psraw m6, 5
>+
>+ packuswb m4, m6
>+ mova [r0 + 448], m4
>+ psrldq m4, 4
>+ movh [r0 + 496], m4 ;mode 33 row 0
>+ psrldq m4, 8
>+ movd [r0 + 500], m4 ;mode 33 row 1
>+
>+; mode 31
>+
>+ pmaddwd m4, m1, [r3 + 17 * 16]
>+ pmaddwd m5, m3, [r3 + 2 * 16]
>+
>+ packssdw m4, m5
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ pmaddwd m6, m3, [r3 + 19 * 16]
>+ pmaddwd m7, m0;, [r3 + 4 * 16]
>+
>+ packssdw m6, m7
>+ paddw m6, m2
>+ psraw m6, 5
>+
>+ packuswb m4, m6
>+ mova [r0 + 464], m4
>+
>+; mode 32
>+
>+ pmaddwd m1, [r3 + 21 * 16]
>+ pmaddwd m5, m3, [r3 + 10 * 16]
>+
>+ packssdw m1, m5
>+ paddw m1, m2
>+ psraw m1, 5
>+
>+ pmaddwd m3, [r3 + 31 * 16]
>+ pmaddwd m5, m0, [r3 + 20 * 16]
>+ packssdw m3, m5
>+ paddw m3, m2
>+ psraw m3, 5
>+
>+ packuswb m1, m3
>+ mova [r0 + 480], m1
>+
>+; mode 33
>+
>+ pmaddwd m0, [r3 + 14 * 16]
>+ pxor m7, m7
>+ movh m4, [r1 + 4]
>+ punpcklbw m4, m4
>+ psrldq m4, 1
>+ punpcklbw m4, m7
>+
>+ pmaddwd m4, [r3 + 8 * 16]
>+
>+ packssdw m0, m4
>+ paddw m0, m2
>+ psraw m0, 5
>+
>+ packuswb m0, m0
>+ movh [r0 + 504], m0
>+
>+; mode 34
>+
>+ movh m7, [r1 + 2]
>+ movd [r0 + 512], m7
>+
>+ psrldq m7, 1
>+ movd [r0 + 516], m7
>+
>+ psrldq m7, 1
>+ movd [r0 + 520], m7
>+
>+ psrldq m7, 1
>+ movd [r0 + 524], m7
>+
>+RET
>\ No newline at end of file
>_______________________________________________
>x265-devel mailing list
><a class="moz-txt-link-abbreviated" href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a>
><a class="moz-txt-link-freetext" href="https://mailman.videolan.org/listinfo/x265-devel">https://mailman.videolan.org/listinfo/x265-devel</a>
</pre>
</div>
<br>
<fieldset class="mimeAttachmentHeader"></fieldset>
<br>
<pre wrap="">_______________________________________________
x265-devel mailing list
<a class="moz-txt-link-abbreviated" href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a>
<a class="moz-txt-link-freetext" href="https://mailman.videolan.org/listinfo/x265-devel">https://mailman.videolan.org/listinfo/x265-devel</a>
</pre>
</blockquote>
<br>
</body>
</html>