[x265] [PATCH] asm: intra pred all_angs_pred_4x4 sse2
chen
chenm003 at 163.com
Sat Apr 11 09:34:15 CEST 2015
Please add comments to the data (constants) to improve readability.
Some suggestions are inline below.
At 2015-04-11 09:58:38,dtyx265 at gmail.com wrote:
># HG changeset patch
># User David T Yuen <dtyx265 at gmail.com>
># Date 1428717487 25200
># Node ID c40653978caea4a4bf8940ae3b0e8db74bbe07d7
># Parent ee76a15fa312ac59549965821d9cbff03237226f
>asm: intra pred all_angs_pred_4x4 sse2
>
>This replaces c code and is backported from sse4
>The processing of modes 10 and 26 were merged and moved to after mode 2
>
>64-bit
>
>./test/TestBench --testbench intrapred | grep intra_allangs4x4
>intra_allangs4x4 9.99x 6449.98 64435.56
>
>32-bit
>
>./test/TestBench --testbench intrapred | grep intra_allangs4x4
>intra_allangs4x4 13.31x 6512.49 86709.86
>
>diff -r ee76a15fa312 -r c40653978cae source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp Fri Apr 10 10:24:55 2015 -0500
>+++ b/source/common/x86/asm-primitives.cpp Fri Apr 10 18:58:07 2015 -0700
>@@ -1259,6 +1259,8 @@
> p.cu[BLOCK_4x4].intra_pred[32] = x265_intra_pred_ang4_4_sse2;
> p.cu[BLOCK_4x4].intra_pred[33] = x265_intra_pred_ang4_3_sse2;
>
>+ p.cu[BLOCK_4x4].intra_pred_allangs = x265_all_angs_pred_4x4_sse2;
>+
> p.cu[BLOCK_4x4].calcresidual = x265_getResidual4_sse2;
> p.cu[BLOCK_8x8].calcresidual = x265_getResidual8_sse2;
>
>diff -r ee76a15fa312 -r c40653978cae source/common/x86/const-a.asm
>--- a/source/common/x86/const-a.asm Fri Apr 10 10:24:55 2015 -0500
>+++ b/source/common/x86/const-a.asm Fri Apr 10 18:58:07 2015 -0700
>@@ -53,6 +53,10 @@
> const pb_shuf8x8c, times 1 db 0, 0, 0, 0, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6
> const pb_movemask, times 16 db 0x00
> times 16 db 0xFF
>+const pb_0000000000000F0F, times 2 db 0xff, 0x00
This constant is misnamed.
>+ times 14 db 0x00
>+const pb_000000000000000F, db 0xff
>+ times 15 db 0x00
>
> ;; 16-bit constants
>
>@@ -94,6 +98,8 @@
> const multiH2, times 1 dw 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
> const pw_planar16_mul, times 1 dw 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
> const pw_planar32_mul, times 1 dw 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16
>+const pw_FFFFFFF0, dw 0x00
>+ times 7 dw 0xff
>
>
> ;; 32-bit constants
>diff -r ee76a15fa312 -r c40653978cae source/common/x86/intrapred.h
>--- a/source/common/x86/intrapred.h Fri Apr 10 10:24:55 2015 -0500
>+++ b/source/common/x86/intrapred.h Fri Apr 10 18:58:07 2015 -0700
>@@ -275,6 +275,7 @@
> void x265_intra_pred_ang32_23_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
> void x265_intra_pred_ang32_22_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
> void x265_intra_pred_ang32_21_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
>+void x265_all_angs_pred_4x4_sse2(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
> void x265_all_angs_pred_4x4_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
> void x265_all_angs_pred_8x8_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
> void x265_all_angs_pred_16x16_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
>diff -r ee76a15fa312 -r c40653978cae source/common/x86/intrapred8_allangs.asm
>--- a/source/common/x86/intrapred8_allangs.asm Fri Apr 10 10:24:55 2015 -0500
>+++ b/source/common/x86/intrapred8_allangs.asm Fri Apr 10 18:58:07 2015 -0700
>@@ -34,10 +34,17 @@
>
> ; common constant with intrapred8.asm
> cextern ang_table
>+cextern pw_ang_table
> cextern tab_S1
> cextern tab_S2
> cextern tab_Si
>
>+; constants from const-a.asm
>+cextern pw_16
>+cextern pb_000000000000000F
>+cextern pb_0000000000000F0F
>+cextern pw_FFFFFFF0
>+
>
> ;-----------------------------------------------------------------------------
> ; void all_angs_pred_4x4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
>@@ -23006,3 +23013,780 @@
> palignr m4, m2, m1, 14
> movu [r0 + 2111 * 16], m4
> RET
>+
>+;-----------------------------------------------------------------------------
>+; void all_angs_pred_4x4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
>+;-----------------------------------------------------------------------------
>+INIT_XMM sse2
>+cglobal all_angs_pred_4x4, 4, 4, 8
>+
>+; mode 2
>+
>+ movh m6, [r1 + 9]
>+ movh m2, m6
Use mova here: a register-to-register mova is handled by register renaming.
>+ psrldq m2, 1
>+ movd [r0], m2
>+ psrldq m2, 1
>+ movd [r0 + 4], m2
>+ psrldq m2, 1
>+ movd [r0 + 8], m2
>+ psrldq m2, 1
>+ movd [r0 + 12], m2
>+
>+; mode 10/26
>+
>+ pxor m7, m7
>+ pshufd m5, m6, 0
>+ movu [r0 + 128], m5 ;mode 10
>+
>+ movd m4, [r1 + 1]
>+ pshufd m4, m4, 0
>+ movu [r0 + 384], m4 ;mode 26
>+
>+ movd m1, [r1]
>+ punpcklbw m1, m7
>+ pshuflw m1, m1, 0x00
>+ punpcklqdq m1, m1
>+
>+ punpckldq m4, m5
>+ punpcklbw m4, m7
>+ pshuflw m2, m4, 0x00
>+ pshufhw m2, m2, 0x00
>+
>+ psubw m4, m1
>+ psraw m4, 1
>+
>+ pshufd m2, m2, q1032
>+ paddw m4, m2
>+ packuswb m4, m4
>+
>+%if ARCH_X86_64
>+ movq r2, m4
>+
>+ mov [r0 + 128], r2b ;mode 10
>+ shr r2, 8
>+ mov [r0 + 132], r2b
>+ shr r2, 8
>+ mov [r0 + 136], r2b
>+ shr r2, 8
>+ mov [r0 + 140], r2b
>+ shr r2, 8
>+ mov [r0 + 384], r2b ;mode 26
>+ shr r2d, 8
>+ mov [r0 + 388], r2b
>+ shr r2d, 8
>+ mov [r0 + 392], r2b
>+ shr r2d, 8
>+ mov [r0 + 396], r2b
>+
>+%else
>+ movd r2d, m4
>+
>+ mov [r0 + 128], r2b ;mode 10
>+ shr r2d, 8
>+ mov [r0 + 132], r2b
>+ shr r2d, 8
>+ mov [r0 + 136], r2b
>+ shr r2d, 8
>+ mov [r0 + 140], r2b
>+
>+ psrldq m4, 4
>+ movd r2d, m4
>+
>+ mov [r0 + 384], r2b ;mode 26
>+ shr r2d, 8
>+ mov [r0 + 388], r2b
>+ shr r2d, 8
>+ mov [r0 + 392], r2b
>+ shr r2d, 8
>+ mov [r0 + 396], r2b
>+%endif
>+
>+; mode 3
>+
>+ mova m2, [pw_16]
>+ lea r3, [pw_ang_table]
>+
>+ punpcklbw m6, m6
>+ psrldq m6, 1
>+ movh m1, m6
If we keep MOVH here, we can avoid the memory operand in modes 11, 13, 15, 17, etc.
>+ psrldq m6, 2
>+ movh m0, m6
>+ psrldq m6, 2
>+ movh m3, m6
>+ psrldq m6, 2
>+ punpcklbw m1, m7
>+ punpcklbw m0, m7
>+ punpcklbw m3, m7
>+ punpcklbw m6, m7
>+
>+ mova m7, [r3 + 20 * 16]
An offset of 128 or more is encoded with a 4-byte displacement.
>+ pmaddwd m5, m1, [r3 + 26 * 16]
>+ pmaddwd m4, m0, m7
>+
>+ packssdw m5, m4
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ pmaddwd m4, m3, [r3 + 14 * 16]
>+ pmaddwd m6, [r3 + 8 * 16]
>+
>+ packssdw m4, m6
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ packuswb m5, m4
>+ mova [r0 + 16], m5
>+ movd [r0 + 68], m5 ;mode 6 row 1
>+ psrldq m5, 4
>+ movd [r0 + 76], m5 ;mode 6 row 3
>+
>+; mode 4
>+
>+ pmaddwd m4, m0, [r3 + 31 * 16]
>+ pmaddwd m6, m3, m7
>+
>+ packssdw m4, m6
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ pmaddwd m5, m1, [r3 + 21 * 16]
>+ pmaddwd m6, m0, [r3 + 10 * 16]
>+
>+ packssdw m5, m6
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ packuswb m5, m4
>+ mova [r0 + 32], m5
>+
>+; mode 5
>+
>+ pmaddwd m5, m1, [r3 + 17 * 16]
>+ pmaddwd m6, m0, [r3 + 2 * 16]
>+
>+ packssdw m5, m6
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ pmaddwd m4, m0, [r3 + 19 * 16]
>+ pmaddwd m3, [r3 + 4 * 16]
>+
>+ packssdw m4, m3
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ packuswb m5, m4
>+ mova [r0 + 48], m5
>+
>+; mode 6
>+
>+ pmaddwd m5, m1, [r3 + 13 * 16]
>+ pmaddwd m6, m0, [r3 + 7 * 16]
>+
>+ packssdw m5, m6
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ packuswb m5, m6
>+ movd [r0 + 64], m5
>+ psrldq m5, 4
>+ movd [r0 + 72], m5
>+
>+; mode 7
>+
>+ pmaddwd m5, m1, [r3 + 9 * 16]
>+ pmaddwd m6, m1, [r3 + 18 * 16]
>+
>+ packssdw m5, m6
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ mova m3, [r3 + 27 * 16]
>+ pmaddwd m4, m1, m3
>+ pmaddwd m0, [r3 + 4 * 16]
>+
>+ packssdw m4, m0
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ packuswb m5, m4
>+ mova [r0 + 80], m5
>+
>+; mode 8
>+
>+ mova m0, [r3 + 5 * 16]
>+ pmaddwd m5, m1, m0
>+ pmaddwd m6, m1, [r3 + 10 * 16]
>+
>+ packssdw m5, m6
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ pmaddwd m4, m1, [r3 + 15 * 16]
>+ pmaddwd m7, m1
>+
>+ packssdw m4, m7
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ packuswb m5, m4
>+ mova [r0 + 96], m5
>+
>+; mode 9
>+
>+ pmaddwd m5, m1, [r3 + 2 * 16]
>+ pmaddwd m6, m1, [r3 + 4 * 16]
>+
>+ packssdw m5, m6
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ pmaddwd m4, m1, [r3 + 6 * 16]
>+ pmaddwd m6, m1, [r3 + 8 * 16]
>+
>+ packssdw m4, m6
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ packuswb m5, m4
>+ mova [r0 + 112], m5
>+
>+; mode 11
>+
>+ movd m5, [r1]
>+ punpcklwd m5, m1
>+ pand m5, [pb_0000000000000F0F]
You only need the lowest 2 words here, and the high qwords of both m1 and m5 are zero, so this can be replaced by PSHUFD.
>+ pslldq m1, 4
>+ por m1, m5
>+
>+ pmaddwd m5, m1, [r3 + 30 * 16]
>+ pmaddwd m6, m1, [r3 + 28 * 16]
>+
>+ packssdw m5, m6
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ pmaddwd m4, m1, [r3 + 26 * 16]
>+ pmaddwd m6, m1, [r3 + 24 * 16]
>+
>+ packssdw m4, m6
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ packuswb m5, m4
>+ mova [r0 + 144], m5
>+
>+; mode 12
>+
>+ pmaddwd m3, m1
>+ pmaddwd m6, m1, [r3 + 22 * 16]
>+
>+ packssdw m3, m6
>+ paddw m3, m2
>+ psraw m3, 5
>+
>+ pmaddwd m4, m1, [r3 + 17 * 16]
>+ pmaddwd m6, m1, [r3 + 12 * 16]
>+
>+ packssdw m4, m6
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ packuswb m3, m4
>+ mova [r0 + 160], m3
>+
>+; mode 13
>+
>+ mova m3, m1
>+ movd m7, [r1 + 4]
>+ punpcklwd m7, m1
>+ pand m7, [pb_0000000000000F0F]
>+ pslldq m3, 4
>+ por m3, m7
>+
>+ pmaddwd m5, m1, [r3 + 23 * 16]
>+ pmaddwd m6, m1, [r3 + 14 * 16]
>+
>+ packssdw m5, m6
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ pmaddwd m4, m1, m0
>+ pmaddwd m6, m3, [r3 + 28 * 16]
>+
>+ packssdw m4, m6
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ packuswb m5, m4
>+ mova [r0 + 176], m5
>+
>+; mode 14
>+
>+ pmaddwd m5, m1, [r3 + 19 * 16]
>+ pmaddwd m6, m1, [r3 + 6 * 16]
>+
>+ packssdw m5, m6
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ movd m6, [r1 + 2]
>+ pand m3, [pw_FFFFFFF0]
>+ pand m6, [pb_000000000000000F]
>+ por m3, m6
>+
>+ pmaddwd m4, m3, [r3 + 25 * 16]
>+ pmaddwd m6, m3, [r3 + 12 * 16]
>+
>+ packssdw m4, m6
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ packuswb m5, m4
>+ mova [r0 + 192], m5
>+ psrldq m5, 4
>+ movd [r0 + 240], m5 ;mode 17 row 0
>+
>+; mode 15
>+
>+ pmaddwd m5, m1, [r3 + 15 * 16]
>+ pmaddwd m6, m3, [r3 + 30 * 16]
>+
>+ packssdw m5, m6
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ pmaddwd m6, m3, [r3 + 13 * 16]
>+
>+ mova m0, m3
>+ punpcklwd m7, m3
>+ pslldq m0, 4
>+ pand m7, [pb_0000000000000F0F]
>+ por m0, m7
>+
>+ pmaddwd m4, m0, [r3 + 28 * 16]
>+
>+ packssdw m6, m4
>+ paddw m6, m2
>+ psraw m6, 5
>+
>+ packuswb m5, m6
>+ mova [r0 + 208], m5
>+
>+; mode 16
>+
>+ pmaddwd m5, m1, [r3 + 11 * 16]
>+ pmaddwd m6, m3, [r3 + 22 * 16]
>+
>+ packssdw m5, m6
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ pmaddwd m3, [r3 + 1 * 16]
>+
>+ movd m6, [r1 + 3]
>+ pand m0, [pw_FFFFFFF0]
>+ pand m6, [pb_000000000000000F]
>+ por m0, m6
>+
>+ pmaddwd m0, [r3 + 12 * 16]
>+ packssdw m3, m0
>+ paddw m3, m2
>+ psraw m3, 5
>+
>+ packuswb m5, m3
>+ mova [r0 + 224], m5
>+
>+; mode 17
>+
>+ movd m4, [r1 + 1]
>+ punpcklwd m4, m1
>+ pand m4, [pb_0000000000000F0F]
>+ pslldq m1, 4
>+ por m1, m4
>+
>+ pmaddwd m6, m1, [r3 + 12 * 16]
>+
>+ packssdw m6, m6
>+ paddw m6, m2
>+ psraw m6, 5
>+
>+ movh m5, [r1 + 2]
>+ punpcklwd m5, m1
>+ pand m5, [pb_0000000000000F0F]
>+ pslldq m1, 4
>+ por m1, m5
>+
>+ pmaddwd m4, m1, [r3 + 18 * 16]
>+
>+ punpcklwd m7, m1
>+ pand m7, [pb_0000000000000F0F]
>+ pslldq m1, 4
>+ por m1, m7
>+
>+ pmaddwd m1, [r3 + 24 * 16]
>+ packssdw m4, m1
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ packuswb m6, m4
>+ movd [r0 + 244], m6
>+ psrldq m6, 8
>+ movh [r0 + 248], m6
>+
>+; mode 18
>+
>+ movh m1, [r1]
>+ movd [r0 + 256], m1
>+
>+ movh m3, [r1 + 2]
>+ punpcklqdq m3, m1
>+ psrldq m3, 7
>+ movd [r0 + 260], m3
>+
>+ movh m4, [r1 + 3]
>+ punpcklqdq m4, m3
>+ psrldq m4, 7
>+ movd [r0 + 264], m4
>+
>+ movh m0, [r1 + 4]
>+ punpcklqdq m0, m4
>+ psrldq m0, 7
>+ movd [r0 + 268], m0
>+
>+; mode 19
>+
>+ pxor m7, m7
>+ punpcklbw m4, m3
>+ punpcklbw m3, m1
>+ punpcklbw m1, m1
>+ punpcklbw m4, m7
>+ punpcklbw m3, m7
>+ psrldq m1, 1
>+ punpcklbw m1, m7
>+
>+ pmaddwd m6, m1, [r3 + 6 * 16]
>+ pmaddwd m7, m3, [r3 + 12 * 16]
>+
>+ packssdw m6, m7
>+ paddw m6, m2
>+ psraw m6, 5
>+
>+ pmaddwd m5, m4, [r3 + 18 * 16]
>+
>+ movd m7, [r1 + 12]
>+ punpcklwd m7, m4
>+ pand m7, [pb_0000000000000F0F]
>+ pslldq m4, 4
>+ por m4, m7
>+
>+ pmaddwd m4, [r3 + 24 * 16]
>+ packssdw m5, m4
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ packuswb m6, m5
>+ mova [r0 + 272], m6
>+ movd [r0 + 324], m6 ;mode 22 row 1
>+
>+; mode 20
>+
>+ pmaddwd m5, m1, [r3 + 11 * 16]
>+
>+ movd m4, [r1 + 10]
>+ pand m3, [pw_FFFFFFF0]
>+ pand m4, [pb_000000000000000F]
>+ por m3, m4
>+
>+ pmaddwd m6, m3, [r3 + 22 * 16]
>+
>+ packssdw m5, m6
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ pmaddwd m4, m3, [r3 + 1 * 16]
>+
>+ punpcklwd m0, m3
>+ pand m0, [pb_0000000000000F0F]
>+ mova m6, m3
>+ pslldq m6, 4
>+ por m0, m6
>+
>+ pmaddwd m6, m0, [r3 + 12 * 16]
>+
>+ packssdw m4, m6
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ packuswb m5, m4
>+ mova [r0 + 288], m5
>+
>+; mode 21
>+
>+ pmaddwd m4, m1, [r3 + 15 * 16]
>+ pmaddwd m6, m3, [r3 + 30 * 16]
>+
>+ packssdw m4, m6
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ pmaddwd m5, m3, [r3 + 13 * 16]
>+
>+ pand m0, [pw_FFFFFFF0]
>+ pand m7, [pb_000000000000000F]
>+ por m0, m7
>+
>+ pmaddwd m0, [r3 + 28 * 16]
>+ packssdw m5, m0
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ packuswb m4, m5
>+ mova [r0 + 304], m4
>+
>+; mode 22
>+
>+ pmaddwd m4, m1, [r3 + 19 * 16]
>+ packssdw m4, m4
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ mova m0, [r3 + 12 * 16]
>+ pmaddwd m5, m3, [r3 + 25 * 16]
>+ pmaddwd m6, m3, m0
>+
>+ packssdw m5, m6
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ packuswb m4, m5
>+ movd [r0 + 320], m4
>+ psrldq m4, 8
>+ movh [r0 + 328], m4
>+
>+; mode 23
>+
>+ pmaddwd m4, m1, [r3 + 23 * 16]
>+ pmaddwd m5, m1, [r3 + 14 * 16]
>+
>+ packssdw m4, m5
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ pmaddwd m6, m1, [r3 + 5 * 16]
>+
>+ pand m3, [pw_FFFFFFF0]
>+ por m3, m7
>+
>+ pmaddwd m3, [r3 + 28 * 16]
>+ packssdw m6, m3
>+ paddw m6, m2
>+ psraw m6, 5
>+
>+ packuswb m4, m6
>+ mova [r0 + 336], m4
>+
>+; mode 24
>+
>+ pmaddwd m4, m1, [r3 + 27 * 16]
>+ pmaddwd m5, m1, [r3 + 22 * 16]
>+
>+ packssdw m4, m5
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ pmaddwd m6, m1, [r3 + 17 * 16]
>+ pmaddwd m0, m1
>+
>+ packssdw m6, m0
>+ paddw m6, m2
>+ psraw m6, 5
>+
>+ packuswb m4, m6
>+ mova [r0 + 352], m4
>+
>+; mode 25
>+
>+ pmaddwd m4, m1, [r3 + 30 * 16]
>+ pmaddwd m5, m1, [r3 + 28 * 16]
>+
>+ packssdw m4, m5
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ pmaddwd m6, m1, [r3 + 26 * 16]
>+ pmaddwd m1, [r3 + 24 * 16]
>+
>+ packssdw m6, m1
>+ paddw m6, m2
>+ psraw m6, 5
>+
>+ packuswb m4, m6
>+ mova [r0 + 368], m4
>+
>+; mode 27
>+
>+ movh m0, [r1 + 1]
>+ pxor m7, m7
>+ punpcklbw m0, m0
>+ psrldq m0, 1
>+ movh m1, m0
>+ psrldq m0, 2
>+ movh m3, m0
>+ psrldq m0, 2
>+ punpcklbw m1, m7
>+ punpcklbw m3, m7
>+ punpcklbw m0, m7
>+
>+ mova m7, [r3 + 4 * 16]
>+
>+ pmaddwd m4, m1, [r3 + 2 * 16]
>+ pmaddwd m5, m1, m7
>+
>+ packssdw m4, m5
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ pmaddwd m6, m1, [r3 + 6 * 16]
>+ pmaddwd m5, m1, [r3 + 8 * 16]
>+
>+ packssdw m6, m5
>+ paddw m6, m2
>+ psraw m6, 5
>+
>+ packuswb m4, m6
>+ mova [r0 + 400], m4
>+
>+; mode 28
>+
>+ pmaddwd m4, m1, [r3 + 5 * 16]
>+ pmaddwd m5, m1, [r3 + 10 * 16]
>+
>+ packssdw m4, m5
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ pmaddwd m6, m1, [r3 + 15 * 16]
>+ pmaddwd m5, m1, [r3 + 20 * 16]
>+
>+ packssdw m6, m5
>+ paddw m6, m2
>+ psraw m6, 5
>+
>+ packuswb m4, m6
>+ mova [r0 + 416], m4
>+
>+; mode 29
>+
>+ pmaddwd m4, m1, [r3 + 9 * 16]
>+ pmaddwd m6, m1, [r3 + 18 * 16]
>+
>+ packssdw m4, m6
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ pmaddwd m6, m1, [r3 + 27 * 16]
>+ pmaddwd m5, m3, m7
>+
>+ packssdw m6, m5
>+ paddw m6, m2
>+ psraw m6, 5
>+
>+ packuswb m4, m6
>+ mova [r0 + 432], m4
>+
>+; mode 30
>+
>+ pmaddwd m4, m1, [r3 + 13 * 16]
>+ pmaddwd m5, m1, [r3 + 26 * 16]
>+
>+ packssdw m4, m5
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ pmaddwd m6, m3, [r3 + 7 * 16]
>+ pmaddwd m5, m3, [r3 + 20 * 16]
>+
>+ packssdw m6, m5
>+ paddw m6, m2
>+ psraw m6, 5
>+
>+ packuswb m4, m6
>+ mova [r0 + 448], m4
>+ psrldq m4, 4
>+ movh [r0 + 496], m4 ;mode 33 row 0
>+ psrldq m4, 8
>+ movd [r0 + 500], m4 ;mode 33 row 1
>+
>+; mode 31
>+
>+ pmaddwd m4, m1, [r3 + 17 * 16]
>+ pmaddwd m5, m3, [r3 + 2 * 16]
>+
>+ packssdw m4, m5
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ pmaddwd m6, m3, [r3 + 19 * 16]
>+ pmaddwd m7, m0;, [r3 + 4 * 16]
>+
>+ packssdw m6, m7
>+ paddw m6, m2
>+ psraw m6, 5
>+
>+ packuswb m4, m6
>+ mova [r0 + 464], m4
>+
>+; mode 32
>+
>+ pmaddwd m1, [r3 + 21 * 16]
>+ pmaddwd m5, m3, [r3 + 10 * 16]
>+
>+ packssdw m1, m5
>+ paddw m1, m2
>+ psraw m1, 5
>+
>+ pmaddwd m3, [r3 + 31 * 16]
>+ pmaddwd m5, m0, [r3 + 20 * 16]
>+ packssdw m3, m5
>+ paddw m3, m2
>+ psraw m3, 5
>+
>+ packuswb m1, m3
>+ mova [r0 + 480], m1
>+
>+; mode 33
>+
>+ pmaddwd m0, [r3 + 14 * 16]
>+ pxor m7, m7
>+ movh m4, [r1 + 4]
>+ punpcklbw m4, m4
>+ psrldq m4, 1
>+ punpcklbw m4, m7
>+
>+ pmaddwd m4, [r3 + 8 * 16]
>+
>+ packssdw m0, m4
>+ paddw m0, m2
>+ psraw m0, 5
>+
>+ packuswb m0, m0
>+ movh [r0 + 504], m0
>+
>+; mode 34
>+
>+ movh m7, [r1 + 2]
>+ movd [r0 + 512], m7
>+
>+ psrldq m7, 1
>+ movd [r0 + 516], m7
>+
>+ psrldq m7, 1
>+ movd [r0 + 520], m7
>+
>+ psrldq m7, 1
>+ movd [r0 + 524], m7
>+
>+RET
>\ No newline at end of file
>_______________________________________________
>x265-devel mailing list
>x265-devel at videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150411/3d369a90/attachment-0001.html>
More information about the x265-devel
mailing list