[x265] [PATCH] asm: intra pred all_angs_pred_4x4 sse2
chen
chenm003 at 163.com
Tue Apr 14 05:30:12 CEST 2015
right
At 2015-04-14 09:38:55,dtyx265 at gmail.com wrote:
># HG changeset patch
># User David T Yuen <dtyx265 at gmail.com>
># Date 1428959599 25200
># Node ID f241399b3494455e4a40b8fcf693e4029b68c347
># Parent 4cccf22b00ee188a72c8dc3896d7dc1613d855ad
>asm: intra pred all_angs_pred_4x4 sse2
>
>This replaces the C code and is backported from the sse4 version.
>The processing of modes 10 and 26 was merged and moved to after mode 2
>
>The new constants are declared with shortened names
>
>64-bit
>
>./test/TestBench --testbench intrapred | grep intra_allangs4x4
>intra_allangs4x4 9.89x 6434.99 63671.87
>
>32-bit
>
>./test/TestBench --testbench intrapred | grep intra_allangs4x4
>intra_allangs4x4 13.38x 6497.50 86943.55
>
>diff -r 4cccf22b00ee -r f241399b3494 source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp Fri Apr 10 18:15:38 2015 -0500
>+++ b/source/common/x86/asm-primitives.cpp Mon Apr 13 14:13:19 2015 -0700
>@@ -1259,6 +1259,8 @@
> p.cu[BLOCK_4x4].intra_pred[32] = x265_intra_pred_ang4_4_sse2;
> p.cu[BLOCK_4x4].intra_pred[33] = x265_intra_pred_ang4_3_sse2;
>
>+ p.cu[BLOCK_4x4].intra_pred_allangs = x265_all_angs_pred_4x4_sse2;
>+
> p.cu[BLOCK_4x4].calcresidual = x265_getResidual4_sse2;
> p.cu[BLOCK_8x8].calcresidual = x265_getResidual8_sse2;
>
>diff -r 4cccf22b00ee -r f241399b3494 source/common/x86/const-a.asm
>--- a/source/common/x86/const-a.asm Fri Apr 10 18:15:38 2015 -0500
>+++ b/source/common/x86/const-a.asm Mon Apr 13 14:13:19 2015 -0700
>@@ -53,6 +53,10 @@
> const pb_shuf8x8c, times 1 db 0, 0, 0, 0, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6
> const pb_movemask, times 16 db 0x00
> times 16 db 0xFF
>+const pb_0000000000000F0F, times 2 db 0xff, 0x00
>+ times 12 db 0x00
>+const pb_000000000000000F, db 0xff
>+ times 15 db 0x00
>
> ;; 16-bit constants
>
>@@ -94,6 +98,8 @@
> const multiH2, times 1 dw 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
> const pw_planar16_mul, times 1 dw 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
> const pw_planar32_mul, times 1 dw 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16
>+const pw_FFFFFFFFFFFFFFF0, dw 0x00
>+ times 7 dw 0xff
>
>
> ;; 32-bit constants
>diff -r 4cccf22b00ee -r f241399b3494 source/common/x86/intrapred.h
>--- a/source/common/x86/intrapred.h Fri Apr 10 18:15:38 2015 -0500
>+++ b/source/common/x86/intrapred.h Mon Apr 13 14:13:19 2015 -0700
>@@ -277,6 +277,7 @@
> void x265_intra_pred_ang32_23_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
> void x265_intra_pred_ang32_22_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
> void x265_intra_pred_ang32_21_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
>+void x265_all_angs_pred_4x4_sse2(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
> void x265_all_angs_pred_4x4_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
> void x265_all_angs_pred_8x8_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
> void x265_all_angs_pred_16x16_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
>diff -r 4cccf22b00ee -r f241399b3494 source/common/x86/intrapred8_allangs.asm
>--- a/source/common/x86/intrapred8_allangs.asm Fri Apr 10 18:15:38 2015 -0500
>+++ b/source/common/x86/intrapred8_allangs.asm Mon Apr 13 14:13:19 2015 -0700
>@@ -34,9 +34,14 @@
>
> ; common constant with intrapred8.asm
> cextern ang_table
>+cextern pw_ang_table
> cextern tab_S1
> cextern tab_S2
> cextern tab_Si
>+cextern pw_16
>+cextern pb_000000000000000F
>+cextern pb_0000000000000F0F
>+cextern pw_FFFFFFFFFFFFFFF0
>
>
> ;-----------------------------------------------------------------------------
>@@ -23006,3 +23011,780 @@
> palignr m4, m2, m1, 14
> movu [r0 + 2111 * 16], m4
> RET
>+
>+;-----------------------------------------------------------------------------
>+; void all_angs_pred_4x4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
>+;-----------------------------------------------------------------------------
>+INIT_XMM sse2
>+cglobal all_angs_pred_4x4, 4, 4, 8
>+
>+; mode 2
>+
>+ movh m6, [r1 + 9]
>+ mova m2, m6
>+ psrldq m2, 1
>+ movd [r0], m2 ;byte[A, B, C, D]
>+ psrldq m2, 1
>+ movd [r0 + 4], m2 ;byte[B, C, D, E]
>+ psrldq m2, 1
>+ movd [r0 + 8], m2 ;byte[C, D, E, F]
>+ psrldq m2, 1
>+ movd [r0 + 12], m2 ;byte[D, E, F, G]
>+
>+; mode 10/26
>+
>+ pxor m7, m7
>+ pshufd m5, m6, 0
>+ mova [r0 + 128], m5 ;mode 10 byte[9, A, B, C, 9, A, B, C, 9, A, B, C, 9, A, B, C]
>+
>+ movd m4, [r1 + 1]
>+ pshufd m4, m4, 0
>+ mova [r0 + 384], m4 ;mode 26 byte[1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4]
>+
>+ movd m1, [r1]
>+ punpcklbw m1, m7
>+ pshuflw m1, m1, 0x00
>+ punpcklqdq m1, m1 ;m1 = word[0, 0, 0, 0, 0, 0, 0, 0]
>+
>+ punpckldq m4, m5
>+ punpcklbw m4, m7 ;m4 = word[1, 2, 3, 4, 9, A, B, C]
>+ pshuflw m2, m4, 0x00
>+ pshufhw m2, m2, 0x00 ;m2 = word[1, 1, 1, 1, 9, 9, 9, 9]
>+
>+ psubw m4, m1
>+ psraw m4, 1
>+
>+ pshufd m2, m2, q1032 ;m2 = word[9, 9, 9, 9, 1, 1, 1, 1]
>+ paddw m4, m2
>+ packuswb m4, m4
>+
>+%if ARCH_X86_64
>+ movq r2, m4
>+
>+ mov [r0 + 128], r2b ;mode 10
>+ shr r2, 8
>+ mov [r0 + 132], r2b
>+ shr r2, 8
>+ mov [r0 + 136], r2b
>+ shr r2, 8
>+ mov [r0 + 140], r2b
>+ shr r2, 8
>+ mov [r0 + 384], r2b ;mode 26
>+ shr r2d, 8
>+ mov [r0 + 388], r2b
>+ shr r2d, 8
>+ mov [r0 + 392], r2b
>+ shr r2d, 8
>+ mov [r0 + 396], r2b
>+
>+%else
>+ movd r2d, m4
>+
>+ mov [r0 + 128], r2b ;mode 10
>+ shr r2d, 8
>+ mov [r0 + 132], r2b
>+ shr r2d, 8
>+ mov [r0 + 136], r2b
>+ shr r2d, 8
>+ mov [r0 + 140], r2b
>+
>+ psrldq m4, 4
>+ movd r2d, m4
>+
>+ mov [r0 + 384], r2b ;mode 26
>+ shr r2d, 8
>+ mov [r0 + 388], r2b
>+ shr r2d, 8
>+ mov [r0 + 392], r2b
>+ shr r2d, 8
>+ mov [r0 + 396], r2b
>+%endif
>+
>+; mode 3
>+
>+ mova m2, [pw_16]
>+ lea r3, [pw_ang_table + 7 * 16]
>+ lea r2, [pw_ang_table + 23 * 16]
>+ punpcklbw m6, m6
>+ psrldq m6, 1
>+ movh m1, m6
>+ psrldq m6, 2
>+ movh m0, m6
>+ psrldq m6, 2
>+ movh m3, m6
>+ psrldq m6, 2
>+ punpcklbw m1, m7 ;m1 = word[9, A, A, B, B, C, C, D]
>+ punpcklbw m0, m7 ;m0 = word[A, B, B, C, C, D, D, E]
>+ punpcklbw m3, m7 ;m3 = word[B, C, C, D, D, E, E, F]
>+ punpcklbw m6, m7 ;m6 = word[C, D, D, E, E, F, F, G]
>+
>+ mova m7, [r2 - 3 * 16]
>+
>+ pmaddwd m5, m1, [r2 + 3 * 16]
>+ pmaddwd m4, m0, m7
>+
>+ packssdw m5, m4
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ pmaddwd m4, m3, [r3 + 7 * 16]
>+ pmaddwd m6, [r3 + 1 * 16]
>+
>+ packssdw m4, m6
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ packuswb m5, m4
>+ mova [r0 + 16], m5
>+ movd [r0 + 68], m5 ;mode 6 row 1
>+ psrldq m5, 4
>+ movd [r0 + 76], m5 ;mode 6 row 3
>+
>+; mode 4
>+
>+ pmaddwd m4, m0, [r2 + 8 * 16]
>+ pmaddwd m6, m3, m7
>+
>+ packssdw m4, m6
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ pmaddwd m5, m1, [r2 - 2 * 16]
>+ pmaddwd m6, m0, [r3 + 3 * 16]
>+
>+ packssdw m5, m6
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ packuswb m5, m4
>+ mova [r0 + 32], m5
>+
>+; mode 5
>+
>+ pmaddwd m5, m1, [r2 - 6 * 16]
>+ pmaddwd m6, m0, [r3 - 5 * 16]
>+
>+ packssdw m5, m6
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ pmaddwd m4, m0, [r2 - 4 * 16]
>+ pmaddwd m3, [r3 - 3 * 16]
>+
>+ packssdw m4, m3
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ packuswb m5, m4
>+ mova [r0 + 48], m5
>+
>+; mode 6
>+
>+ pmaddwd m5, m1, [r3 + 6 * 16]
>+ pmaddwd m6, m0, [r3 + 0 * 16]
>+
>+ packssdw m5, m6
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ packuswb m5, m6
>+ movd [r0 + 64], m5
>+ psrldq m5, 4
>+ movd [r0 + 72], m5
>+
>+; mode 7
>+
>+ pmaddwd m5, m1, [r3 + 2 * 16]
>+ pmaddwd m6, m1, [r2 - 5 * 16]
>+
>+ packssdw m5, m6
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ mova m3, [r2 + 4 * 16]
>+ pmaddwd m4, m1, m3
>+ pmaddwd m0, [r3 - 3 * 16]
>+
>+ packssdw m4, m0
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ packuswb m5, m4
>+ mova [r0 + 80], m5
>+
>+; mode 8
>+
>+ mova m0, [r3 - 2 * 16]
>+ pmaddwd m5, m1, m0
>+ pmaddwd m6, m1, [r3 + 3 * 16]
>+
>+ packssdw m5, m6
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ pmaddwd m4, m1, [r3 + 8 * 16]
>+ pmaddwd m7, m1
>+
>+ packssdw m4, m7
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ packuswb m5, m4
>+ mova [r0 + 96], m5
>+
>+; mode 9
>+
>+ pmaddwd m5, m1, [r3 - 5 * 16]
>+ pmaddwd m6, m1, [r3 - 3 * 16]
>+
>+ packssdw m5, m6
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ pmaddwd m4, m1, [r3 - 1 * 16]
>+ pmaddwd m6, m1, [r3 + 1 * 16]
>+
>+ packssdw m4, m6
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ packuswb m5, m4
>+ mova [r0 + 112], m5
>+
>+; mode 11
>+
>+ movd m5, [r1]
>+ punpcklwd m5, m1
>+ pand m5, [pb_0000000000000F0F]
>+ pslldq m1, 4
>+ por m1, m5 ;m1 = word[0, 9, 9, A, A, B, B, C]
>+
>+ pmaddwd m5, m1, [r2 + 7 * 16]
>+ pmaddwd m6, m1, [r2 + 5 * 16]
>+
>+ packssdw m5, m6
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ pmaddwd m4, m1, [r2 + 3 * 16]
>+ pmaddwd m6, m1, [r2 + 1 * 16]
>+
>+ packssdw m4, m6
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ packuswb m5, m4
>+ mova [r0 + 144], m5
>+
>+; mode 12
>+
>+ pmaddwd m3, m1
>+ pmaddwd m6, m1, [r2 - 1 * 16]
>+
>+ packssdw m3, m6
>+ paddw m3, m2
>+ psraw m3, 5
>+
>+ pmaddwd m4, m1, [r2 - 6 * 16]
>+ pmaddwd m6, m1, [r3 + 5 * 16]
>+
>+ packssdw m4, m6
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ packuswb m3, m4
>+ mova [r0 + 160], m3
>+
>+; mode 13
>+
>+ mova m3, m1
>+ movd m7, [r1 + 4]
>+ punpcklwd m7, m1
>+ pand m7, [pb_0000000000000F0F]
>+ pslldq m3, 4
>+ por m3, m7 ;m3 = word[4, 0, 0, 9, 9, A, A, B]
>+
>+ pmaddwd m5, m1, [r2 + 0 * 16]
>+ pmaddwd m6, m1, [r3 + 7 * 16]
>+
>+ packssdw m5, m6
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ pmaddwd m4, m1, m0
>+ pmaddwd m6, m3, [r2 + 5 * 16]
>+
>+ packssdw m4, m6
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ packuswb m5, m4
>+ mova [r0 + 176], m5
>+
>+; mode 14
>+
>+ pmaddwd m5, m1, [r2 - 4 * 16]
>+ pmaddwd m6, m1, [r3 - 1 * 16]
>+
>+ packssdw m5, m6
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ movd m6, [r1 + 2]
>+ pand m3, [pw_FFFFFFFFFFFFFFF0]
>+ pand m6, [pb_000000000000000F]
>+ por m3, m6 ;m3 = word[2, 0, 0, 9, 9, A, A, B]
>+
>+ pmaddwd m4, m3, [r2 + 2 * 16]
>+ pmaddwd m6, m3, [r3 + 5 * 16]
>+
>+ packssdw m4, m6
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ packuswb m5, m4
>+ mova [r0 + 192], m5
>+ psrldq m5, 4
>+ movd [r0 + 240], m5 ;mode 17 row 0
>+
>+; mode 15
>+
>+ pmaddwd m5, m1, [r3 + 8 * 16]
>+ pmaddwd m6, m3, [r2 + 7 * 16]
>+
>+ packssdw m5, m6
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ pmaddwd m6, m3, [r3 + 6 * 16]
>+
>+ mova m0, m3
>+ punpcklwd m7, m3
>+ pslldq m0, 4
>+ pand m7, [pb_0000000000000F0F]
>+ por m0, m7 ;m0 = word[4, 2, 2, 0, 0, 9, 9, A]
>+
>+ pmaddwd m4, m0, [r2 + 5 * 16]
>+
>+ packssdw m6, m4
>+ paddw m6, m2
>+ psraw m6, 5
>+
>+ packuswb m5, m6
>+ mova [r0 + 208], m5
>+
>+; mode 16
>+
>+ pmaddwd m5, m1, [r3 + 4 * 16]
>+ pmaddwd m6, m3, [r2 - 1 * 16]
>+
>+ packssdw m5, m6
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ pmaddwd m3, [r3 - 6 * 16]
>+
>+ movd m6, [r1 + 3]
>+ pand m0, [pw_FFFFFFFFFFFFFFF0]
>+ pand m6, [pb_000000000000000F]
>+ por m0, m6 ;m0 = word[3, 2, 2, 0, 0, 9, 9, A]
>+
>+ pmaddwd m0, [r3 + 5 * 16]
>+ packssdw m3, m0
>+ paddw m3, m2
>+ psraw m3, 5
>+
>+ packuswb m5, m3
>+ mova [r0 + 224], m5
>+
>+; mode 17
>+
>+ movd m4, [r1 + 1]
>+ punpcklwd m4, m1
>+ pand m4, [pb_0000000000000F0F]
>+ pslldq m1, 4
>+ por m1, m4 ;m1 = word[1, 0, 0, 9, 9, A, A, B]
>+
>+ pmaddwd m6, m1, [r3 + 5 * 16]
>+
>+ packssdw m6, m6
>+ paddw m6, m2
>+ psraw m6, 5
>+
>+ movd m5, [r1 + 2]
>+ punpcklwd m5, m1
>+ pand m5, [pb_0000000000000F0F]
>+ pslldq m1, 4
>+ por m1, m5 ;m1 = word[2, 1, 1, 0, 0, 9, 9, A]
>+
>+ pmaddwd m4, m1, [r2 - 5 * 16]
>+
>+ punpcklwd m7, m1
>+ pand m7, [pb_0000000000000F0F]
>+ pslldq m1, 4
>+ por m1, m7 ;m1 = word[4, 2, 2, 1, 1, 0, 0, 9]
>+
>+ pmaddwd m1, [r2 + 1 * 16]
>+ packssdw m4, m1
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ packuswb m6, m4
>+ movd [r0 + 244], m6
>+ psrldq m6, 8
>+ movh [r0 + 248], m6
>+
>+; mode 18
>+
>+ movh m1, [r1]
>+ movd [r0 + 256], m1 ;byte[0, 1, 2, 3]
>+
>+ movh m3, [r1 + 2]
>+ punpcklqdq m3, m1
>+ psrldq m3, 7
>+ movd [r0 + 260], m3 ;byte[9, 0, 1, 2]
>+
>+ movh m4, [r1 + 3]
>+ punpcklqdq m4, m3
>+ psrldq m4, 7
>+ movd [r0 + 264], m4 ;byte[A, 9, 0, 1]
>+
>+ movh m0, [r1 + 4]
>+ punpcklqdq m0, m4
>+ psrldq m0, 7
>+ movd [r0 + 268], m0 ;byte[B, A, 9, 0]
>+
>+; mode 19
>+
>+ pxor m7, m7
>+ punpcklbw m4, m3
>+ punpcklbw m3, m1
>+ punpcklbw m1, m1
>+ punpcklbw m4, m7 ;m4 = word[A, 9, 9, 0, 0, 1, 1, 2]
>+ punpcklbw m3, m7 ;m3 = word[9, 0, 0, 1, 1, 2, 2, 3]
>+ psrldq m1, 1
>+ punpcklbw m1, m7 ;m1 = word[0, 1, 1, 2, 2, 3, 3, 4]
>+
>+ pmaddwd m6, m1, [r3 - 1 * 16]
>+ pmaddwd m7, m3, [r3 + 5 * 16]
>+
>+ packssdw m6, m7
>+ paddw m6, m2
>+ psraw m6, 5
>+
>+ pmaddwd m5, m4, [r2 - 5 * 16]
>+
>+ movd m7, [r1 + 12]
>+ punpcklwd m7, m4
>+ pand m7, [pb_0000000000000F0F]
>+ pslldq m4, 4
>+ por m4, m7 ;m4 = word[C, A, A, 9, 9, 0, 0, 1]
>+
>+ pmaddwd m4, [r2 + 1 * 16]
>+ packssdw m5, m4
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ packuswb m6, m5
>+ mova [r0 + 272], m6
>+ movd [r0 + 324], m6 ;mode 22 row 1
>+
>+; mode 20
>+
>+ pmaddwd m5, m1, [r3 + 4 * 16]
>+
>+ movd m4, [r1 + 10]
>+ pand m3, [pw_FFFFFFFFFFFFFFF0]
>+ pand m4, [pb_000000000000000F]
>+ por m3, m4 ;m3 = word[A, 0, 0, 1, 1, 2, 2, 3]
>+
>+ pmaddwd m6, m3, [r2 - 1 * 16]
>+
>+ packssdw m5, m6
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ pmaddwd m4, m3, [r3 - 6 * 16]
>+
>+ punpcklwd m0, m3
>+ pand m0, [pb_0000000000000F0F]
>+ mova m6, m3
>+ pslldq m6, 4
>+ por m0, m6 ;m0 = word[B, A, A, 0, 0, 1, 1, 2]
>+
>+ pmaddwd m6, m0, [r3 + 5 * 16]
>+
>+ packssdw m4, m6
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ packuswb m5, m4
>+ mova [r0 + 288], m5
>+
>+; mode 21
>+
>+ pmaddwd m4, m1, [r3 + 8 * 16]
>+ pmaddwd m6, m3, [r2 + 7 * 16]
>+
>+ packssdw m4, m6
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ pmaddwd m5, m3, [r3 + 6 * 16]
>+
>+ pand m0, [pw_FFFFFFFFFFFFFFF0]
>+ pand m7, [pb_000000000000000F]
>+ por m0, m7 ;m0 = word[C, A, A, 0, 0, 1, 1, 2]
>+
>+ pmaddwd m0, [r2 + 5 * 16]
>+ packssdw m5, m0
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ packuswb m4, m5
>+ mova [r0 + 304], m4
>+
>+; mode 22
>+
>+ pmaddwd m4, m1, [r2 - 4 * 16]
>+ packssdw m4, m4
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ mova m0, [r3 + 5 * 16]
>+ pmaddwd m5, m3, [r2 + 2 * 16]
>+ pmaddwd m6, m3, m0
>+
>+ packssdw m5, m6
>+ paddw m5, m2
>+ psraw m5, 5
>+
>+ packuswb m4, m5
>+ movd [r0 + 320], m4
>+ psrldq m4, 8
>+ movh [r0 + 328], m4
>+
>+; mode 23
>+
>+ pmaddwd m4, m1, [r2 + 0 * 16]
>+ pmaddwd m5, m1, [r3 + 7 * 16]
>+
>+ packssdw m4, m5
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ pmaddwd m6, m1, [r3 - 2 * 16]
>+
>+ pand m3, [pw_FFFFFFFFFFFFFFF0]
>+ por m3, m7 ;m3 = word[C, 0, 0, 1, 1, 2, 2, 3]
>+
>+ pmaddwd m3, [r2 + 5 * 16]
>+ packssdw m6, m3
>+ paddw m6, m2
>+ psraw m6, 5
>+
>+ packuswb m4, m6
>+ mova [r0 + 336], m4
>+
>+; mode 24
>+
>+ pmaddwd m4, m1, [r2 + 4 * 16]
>+ pmaddwd m5, m1, [r2 - 1 * 16]
>+
>+ packssdw m4, m5
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ pmaddwd m6, m1, [r2 - 6 * 16]
>+ pmaddwd m0, m1
>+
>+ packssdw m6, m0
>+ paddw m6, m2
>+ psraw m6, 5
>+
>+ packuswb m4, m6
>+ mova [r0 + 352], m4
>+
>+; mode 25
>+
>+ pmaddwd m4, m1, [r2 + 7 * 16]
>+ pmaddwd m5, m1, [r2 + 5 * 16]
>+
>+ packssdw m4, m5
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ pmaddwd m6, m1, [r2 + 3 * 16]
>+ pmaddwd m1, [r2 + 1 * 16]
>+
>+ packssdw m6, m1
>+ paddw m6, m2
>+ psraw m6, 5
>+
>+ packuswb m4, m6
>+ mova [r0 + 368], m4
>+
>+; mode 27
>+
>+ movh m0, [r1 + 1]
>+ pxor m7, m7
>+ punpcklbw m0, m0
>+ psrldq m0, 1
>+ movh m1, m0
>+ psrldq m0, 2
>+ movh m3, m0
>+ psrldq m0, 2
>+ punpcklbw m1, m7 ;m1 = word[1, 2, 2, 3, 3, 4, 4, 5]
>+ punpcklbw m3, m7 ;m3 = word[2, 3, 3, 4, 4, 5, 5, 6]
>+ punpcklbw m0, m7 ;m0 = word[3, 4, 4, 5, 5, 6, 6, 7]
>+
>+ mova m7, [r3 - 3 * 16]
>+
>+ pmaddwd m4, m1, [r3 - 5 * 16]
>+ pmaddwd m5, m1, m7
>+
>+ packssdw m4, m5
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ pmaddwd m6, m1, [r3 - 1 * 16]
>+ pmaddwd m5, m1, [r3 + 1 * 16]
>+
>+ packssdw m6, m5
>+ paddw m6, m2
>+ psraw m6, 5
>+
>+ packuswb m4, m6
>+ mova [r0 + 400], m4
>+
>+; mode 28
>+
>+ pmaddwd m4, m1, [r3 - 2 * 16]
>+ pmaddwd m5, m1, [r3 + 3 * 16]
>+
>+ packssdw m4, m5
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ pmaddwd m6, m1, [r3 + 8 * 16]
>+ pmaddwd m5, m1, [r2 - 3 * 16]
>+
>+ packssdw m6, m5
>+ paddw m6, m2
>+ psraw m6, 5
>+
>+ packuswb m4, m6
>+ mova [r0 + 416], m4
>+
>+; mode 29
>+
>+ pmaddwd m4, m1, [r3 + 2 * 16]
>+ pmaddwd m6, m1, [r2 - 5 * 16]
>+
>+ packssdw m4, m6
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ pmaddwd m6, m1, [r2 + 4 * 16]
>+ pmaddwd m5, m3, m7
>+
>+ packssdw m6, m5
>+ paddw m6, m2
>+ psraw m6, 5
>+
>+ packuswb m4, m6
>+ mova [r0 + 432], m4
>+
>+; mode 30
>+
>+ pmaddwd m4, m1, [r3 + 6 * 16]
>+ pmaddwd m5, m1, [r2 + 3 * 16]
>+
>+ packssdw m4, m5
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ pmaddwd m6, m3, [r3 + 0 * 16]
>+ pmaddwd m5, m3, [r2 - 3 * 16]
>+
>+ packssdw m6, m5
>+ paddw m6, m2
>+ psraw m6, 5
>+
>+ packuswb m4, m6
>+ mova [r0 + 448], m4
>+ psrldq m4, 4
>+ movh [r0 + 496], m4 ;mode 33 row 0
>+ psrldq m4, 8
>+ movd [r0 + 500], m4 ;mode 33 row 1
>+
>+; mode 31
>+
>+ pmaddwd m4, m1, [r2 - 6 * 16]
>+ pmaddwd m5, m3, [r3 - 5 * 16]
>+
>+ packssdw m4, m5
>+ paddw m4, m2
>+ psraw m4, 5
>+
>+ pmaddwd m6, m3, [r2 - 4 * 16]
>+ pmaddwd m7, m0
>+
>+ packssdw m6, m7
>+ paddw m6, m2
>+ psraw m6, 5
>+
>+ packuswb m4, m6
>+ mova [r0 + 464], m4
>+
>+; mode 32
>+
>+ pmaddwd m1, [r2 - 2 * 16]
>+ pmaddwd m5, m3, [r3 + 3 * 16]
>+
>+ packssdw m1, m5
>+ paddw m1, m2
>+ psraw m1, 5
>+
>+ pmaddwd m3, [r2 + 8 * 16]
>+ pmaddwd m5, m0, [r2 - 3 * 16]
>+ packssdw m3, m5
>+ paddw m3, m2
>+ psraw m3, 5
>+
>+ packuswb m1, m3
>+ mova [r0 + 480], m1
>+
>+; mode 33
>+
>+ pmaddwd m0, [r3 + 7 * 16]
>+ pxor m7, m7
>+ movh m4, [r1 + 4]
>+ punpcklbw m4, m4
>+ psrldq m4, 1
>+ punpcklbw m4, m7
>+
>+ pmaddwd m4, [r3 + 1 * 16]
>+
>+ packssdw m0, m4
>+ paddw m0, m2
>+ psraw m0, 5
>+
>+ packuswb m0, m0
>+ movh [r0 + 504], m0
>+
>+; mode 34
>+
>+ movh m7, [r1 + 2]
>+ movd [r0 + 512], m7 ;byte[2, 3, 4, 5]
>+
>+ psrldq m7, 1
>+ movd [r0 + 516], m7 ;byte[3, 4, 5, 6]
>+
>+ psrldq m7, 1
>+ movd [r0 + 520], m7 ;byte[4, 5, 6, 7]
>+
>+ psrldq m7, 1
>+ movd [r0 + 524], m7 ;byte[5, 6, 7, 8]
>+
>+RET
>_______________________________________________
>x265-devel mailing list
>x265-devel at videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150414/320f0e95/attachment-0001.html>
More information about the x265-devel
mailing list