[x265] [PATCH] asm: intra pred all_angs_pred_4x4 sse2
dtyx265 at gmail.com
dtyx265 at gmail.com
Sat Apr 11 03:58:38 CEST 2015
# HG changeset patch
# User David T Yuen <dtyx265 at gmail.com>
# Date 1428717487 25200
# Node ID c40653978caea4a4bf8940ae3b0e8db74bbe07d7
# Parent ee76a15fa312ac59549965821d9cbff03237226f
asm: intra pred all_angs_pred_4x4 sse2
This replaces the C code and is backported from the sse4 version.
The processing of modes 10 and 26 was merged and moved to directly after mode 2.
64-bit
./test/TestBench --testbench intrapred | grep intra_allangs4x4
intra_allangs4x4 9.99x 6449.98 64435.56
32-bit
./test/TestBench --testbench intrapred | grep intra_allangs4x4
intra_allangs4x4 13.31x 6512.49 86709.86
diff -r ee76a15fa312 -r c40653978cae source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Fri Apr 10 10:24:55 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp Fri Apr 10 18:58:07 2015 -0700
@@ -1259,6 +1259,8 @@
p.cu[BLOCK_4x4].intra_pred[32] = x265_intra_pred_ang4_4_sse2;
p.cu[BLOCK_4x4].intra_pred[33] = x265_intra_pred_ang4_3_sse2;
+ p.cu[BLOCK_4x4].intra_pred_allangs = x265_all_angs_pred_4x4_sse2;
+
p.cu[BLOCK_4x4].calcresidual = x265_getResidual4_sse2;
p.cu[BLOCK_8x8].calcresidual = x265_getResidual8_sse2;
diff -r ee76a15fa312 -r c40653978cae source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Fri Apr 10 10:24:55 2015 -0500
+++ b/source/common/x86/const-a.asm Fri Apr 10 18:58:07 2015 -0700
@@ -53,6 +53,10 @@
const pb_shuf8x8c, times 1 db 0, 0, 0, 0, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6
const pb_movemask, times 16 db 0x00
times 16 db 0xFF
+const pb_0000000000000F0F, times 2 db 0xff, 0x00 ; 16-byte mask with bytes 0 and 2 set: TIMES repeats the whole db list, so this line emits ff 00 ff 00 (4 bytes)
+ times 12 db 0x00 ; zero padding up to exactly 16 bytes (4 + 12); a longer pad would shift every following constant off its 16-byte boundary
+const pb_000000000000000F, db 0xff ; 16-byte mask that keeps only byte 0
+ times 15 db 0x00 ; bytes 1..15 are zero (1 + 15 = 16 bytes)
;; 16-bit constants
@@ -94,6 +98,8 @@
const multiH2, times 1 dw 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
const pw_planar16_mul, times 1 dw 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
const pw_planar32_mul, times 1 dw 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16
+const pw_FFFFFFF0, dw 0x00 ; word 0 = 0: pand with this mask clears the lowest word
+ times 7 dw 0xff ; words 1..7 = 0x00ff: pand keeps only the low byte of each of the other seven words
;; 32-bit constants
diff -r ee76a15fa312 -r c40653978cae source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h Fri Apr 10 10:24:55 2015 -0500
+++ b/source/common/x86/intrapred.h Fri Apr 10 18:58:07 2015 -0700
@@ -275,6 +275,7 @@
void x265_intra_pred_ang32_23_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
void x265_intra_pred_ang32_22_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
void x265_intra_pred_ang32_21_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
+void x265_all_angs_pred_4x4_sse2(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma); // SSE2 variant; same signature as the sse4 declaration on the next line
void x265_all_angs_pred_4x4_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
void x265_all_angs_pred_8x8_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
void x265_all_angs_pred_16x16_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
diff -r ee76a15fa312 -r c40653978cae source/common/x86/intrapred8_allangs.asm
--- a/source/common/x86/intrapred8_allangs.asm Fri Apr 10 10:24:55 2015 -0500
+++ b/source/common/x86/intrapred8_allangs.asm Fri Apr 10 18:58:07 2015 -0700
@@ -34,10 +34,17 @@
; common constant with intrapred8.asm
cextern ang_table
+cextern pw_ang_table
cextern tab_S1
cextern tab_S2
cextern tab_Si
+; constants from const-a.asm
+cextern pw_16
+cextern pb_000000000000000F
+cextern pb_0000000000000F0F
+cextern pw_FFFFFFF0
+
;-----------------------------------------------------------------------------
; void all_angs_pred_4x4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
@@ -23006,3 +23013,780 @@
palignr m4, m2, m1, 14
movu [r0 + 2111 * 16], m4
RET
+
+;-----------------------------------------------------------------------------
+; void all_angs_pred_4x4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal all_angs_pred_4x4, 4, 4, 8 ; per the prototype comment: r0 = dest, r1 = refPix, r2 = filtPix, r3 = bLuma; r2 and r3 are overwritten below without ever being read
+
+; mode 2: four 4-byte rows from refPix[10..], each row one byte further along; written to dest + 0 (every mode m is written at dest + (m - 2) * 16)
+
+ movh m6, [r1 + 9] ; m6 = refPix[9..16]; stays live until the mode 3 setup (presumably the left-neighbour samples - TODO confirm buffer layout)
+ movh m2, m6
+ psrldq m2, 1 ; drop refPix[9]: row 0 starts at refPix[10]
+ movd [r0], m2
+ psrldq m2, 1
+ movd [r0 + 4], m2
+ psrldq m2, 1
+ movd [r0 + 8], m2
+ psrldq m2, 1
+ movd [r0 + 12], m2
+
+; mode 10/26: broadcast one dword to all four rows, then overwrite the first byte of every row with an edge-corrected value
+
+ pxor m7, m7 ; zero register for byte -> word unpacking
+ pshufd m5, m6, 0 ; every dword = refPix[9..12]
+ movu [r0 + 128], m5 ;mode 10: all four rows = refPix[9..12]
+
+ movd m4, [r1 + 1]
+ pshufd m4, m4, 0 ; every dword = refPix[1..4]
+ movu [r0 + 384], m4 ;mode 26: all four rows = refPix[1..4]
+
+ movd m1, [r1]
+ punpcklbw m1, m7
+ pshuflw m1, m1, 0x00
+ punpcklqdq m1, m1 ; m1 = refPix[0] replicated into all eight words
+
+ punpckldq m4, m5
+ punpcklbw m4, m7 ; m4 words = refPix[1..4], refPix[9..12]
+ pshuflw m2, m4, 0x00
+ pshufhw m2, m2, 0x00 ; m2 words = refPix[1] x4, refPix[9] x4
+
+ psubw m4, m1
+ psraw m4, 1 ; (sample - refPix[0]) >> 1, arithmetic shift
+
+ pshufd m2, m2, q1032 ; swap qwords so refPix[9] pairs with the refPix[1..4] deltas and refPix[1] with the refPix[9..12] deltas
+ paddw m4, m2
+ packuswb m4, m4 ; saturate to bytes: bytes 0-3 go to mode 10, bytes 4-7 to mode 26
+
+%if ARCH_X86_64
+ movq r2, m4 ; all eight result bytes in one GPR (r2 / filtPix is not needed)
+
+ mov [r0 + 128], r2b ;mode 10: first byte of rows 0..3
+ shr r2, 8
+ mov [r0 + 132], r2b
+ shr r2, 8
+ mov [r0 + 136], r2b
+ shr r2, 8
+ mov [r0 + 140], r2b
+ shr r2, 8 ; after four 8-bit shifts the mode 26 bytes sit in the low 32 bits
+ mov [r0 + 384], r2b ;mode 26: first byte of rows 0..3
+ shr r2d, 8
+ mov [r0 + 388], r2b
+ shr r2d, 8
+ mov [r0 + 392], r2b
+ shr r2d, 8
+ mov [r0 + 396], r2b
+
+%else
+ movd r2d, m4 ; 32-bit build: fetch the two groups of four bytes separately
+
+ mov [r0 + 128], r2b ;mode 10: first byte of rows 0..3
+ shr r2d, 8
+ mov [r0 + 132], r2b
+ shr r2d, 8
+ mov [r0 + 136], r2b
+ shr r2d, 8
+ mov [r0 + 140], r2b
+
+ psrldq m4, 4
+ movd r2d, m4
+
+ mov [r0 + 384], r2b ;mode 26: first byte of rows 0..3
+ shr r2d, 8
+ mov [r0 + 388], r2b
+ shr r2d, 8
+ mov [r0 + 392], r2b
+ shr r2d, 8
+ mov [r0 + 396], r2b
+%endif
+
+; mode 3 (also sets up the sample pairs shared by modes 3-9: each row is pmaddwd(pairs, table row), + 16, >> 5)
+
+ mova m2, [pw_16] ; m2 = rounding term added before every >> 5; live until the end of the function
+ lea r3, [pw_ang_table] ; r3 = weight table base (bLuma is discarded unread); presumably row f holds the word pair (32 - f, f) - TODO confirm in intrapred8.asm
+
+ punpcklbw m6, m6 ; duplicate every byte of refPix[9..16]
+ psrldq m6, 1 ; -> adjacent byte pairs (p0,p1)(p1,p2)... with pN = refPix[9 + N]
+ movh m1, m6 ; m1 = four pairs starting at refPix[9]
+ psrldq m6, 2
+ movh m0, m6 ; m0 = four pairs starting at refPix[10]
+ psrldq m6, 2
+ movh m3, m6 ; m3 = four pairs starting at refPix[11]
+ psrldq m6, 2 ; m6 = pairs starting at refPix[12]
+ punpcklbw m1, m7 ; widen all four pair vectors to words
+ punpcklbw m0, m7
+ punpcklbw m3, m7
+ punpcklbw m6, m7
+
+ mova m7, [r3 + 20 * 16] ; table row 20 kept in m7; reused by mode 4 and mode 8
+
+ pmaddwd m5, m1, [r3 + 26 * 16]
+ pmaddwd m4, m0, m7
+
+ packssdw m5, m4 ; rows 0-1
+ paddw m5, m2
+ psraw m5, 5
+
+ pmaddwd m4, m3, [r3 + 14 * 16]
+ pmaddwd m6, [r3 + 8 * 16]
+
+ packssdw m4, m6 ; rows 2-3
+ paddw m4, m2
+ psraw m4, 5
+
+ packuswb m5, m4
+ mova [r0 + 16], m5 ; aligned store: assumes dest is 16-byte aligned
+ movd [r0 + 68], m5 ;mode 6 row 1 (same pairs and weight as mode 3 row 0)
+ psrldq m5, 4
+ movd [r0 + 76], m5 ;mode 6 row 3 (same as mode 3 row 1)
+
+; mode 4
+
+ pmaddwd m4, m0, [r3 + 31 * 16]
+ pmaddwd m6, m3, m7
+
+ packssdw m4, m6 ; rows 2-3 are computed first
+ paddw m4, m2
+ psraw m4, 5
+
+ pmaddwd m5, m1, [r3 + 21 * 16]
+ pmaddwd m6, m0, [r3 + 10 * 16]
+
+ packssdw m5, m6 ; rows 0-1
+ paddw m5, m2
+ psraw m5, 5
+
+ packuswb m5, m4
+ mova [r0 + 32], m5
+
+; mode 5
+
+ pmaddwd m5, m1, [r3 + 17 * 16]
+ pmaddwd m6, m0, [r3 + 2 * 16]
+
+ packssdw m5, m6 ; rows 0-1
+ paddw m5, m2
+ psraw m5, 5
+
+ pmaddwd m4, m0, [r3 + 19 * 16]
+ pmaddwd m3, [r3 + 4 * 16] ; destroys the refPix[11] pairs; m3 is reloaded with a table row in mode 7
+
+ packssdw m4, m3 ; rows 2-3
+ paddw m4, m2
+ psraw m4, 5
+
+ packuswb m5, m4
+ mova [r0 + 48], m5
+
+; mode 6: only rows 0 and 2 are computed here; rows 1 and 3 were already stored during mode 3
+
+ pmaddwd m5, m1, [r3 + 13 * 16]
+ pmaddwd m6, m0, [r3 + 7 * 16]
+
+ packssdw m5, m6 ; row 0, row 2
+ paddw m5, m2
+ psraw m5, 5
+
+ packuswb m5, m6 ; only the low 8 bytes (from m5) are used; the m6 half is ignored
+ movd [r0 + 64], m5 ; row 0
+ psrldq m5, 4
+ movd [r0 + 72], m5 ; row 2
+
+; mode 7
+
+ pmaddwd m5, m1, [r3 + 9 * 16]
+ pmaddwd m6, m1, [r3 + 18 * 16]
+
+ packssdw m5, m6 ; rows 0-1
+ paddw m5, m2
+ psraw m5, 5
+
+ mova m3, [r3 + 27 * 16] ; table row 27 kept in m3; consumed by mode 12
+ pmaddwd m4, m1, m3
+ pmaddwd m0, [r3 + 4 * 16] ; last use of the refPix[10] pairs
+
+ packssdw m4, m0 ; rows 2-3
+ paddw m4, m2
+ psraw m4, 5
+
+ packuswb m5, m4
+ mova [r0 + 80], m5
+
+; mode 8
+
+ mova m0, [r3 + 5 * 16] ; table row 5 kept in m0; reused by mode 13
+ pmaddwd m5, m1, m0
+ pmaddwd m6, m1, [r3 + 10 * 16]
+
+ packssdw m5, m6 ; rows 0-1
+ paddw m5, m2
+ psraw m5, 5
+
+ pmaddwd m4, m1, [r3 + 15 * 16]
+ pmaddwd m7, m1 ; m7 held table row 20 (loaded in mode 3); consumed here
+
+ packssdw m4, m7 ; rows 2-3
+ paddw m4, m2
+ psraw m4, 5
+
+ packuswb m5, m4
+ mova [r0 + 96], m5
+
+; mode 9
+
+ pmaddwd m5, m1, [r3 + 2 * 16]
+ pmaddwd m6, m1, [r3 + 4 * 16]
+
+ packssdw m5, m6 ; rows 0-1
+ paddw m5, m2
+ psraw m5, 5
+
+ pmaddwd m4, m1, [r3 + 6 * 16]
+ pmaddwd m6, m1, [r3 + 8 * 16]
+
+ packssdw m4, m6 ; rows 2-3
+ paddw m4, m2
+ psraw m4, 5
+
+ packuswb m5, m4
+ mova [r0 + 112], m5
+
+; mode 11: from here on the pair vector is extended at the front, one new pair per step
+
+ movd m5, [r1]
+ punpcklwd m5, m1
+ pand m5, [pb_0000000000000F0F] ; keep byte 0 (refPix[0]) and byte 2 (low byte of m1 word 0) -> new leading pair
+ pslldq m1, 4 ; shift the existing pairs up by one pair
+ por m1, m5 ; m1 = pairs starting with (refPix[0], refPix[9])
+
+ pmaddwd m5, m1, [r3 + 30 * 16]
+ pmaddwd m6, m1, [r3 + 28 * 16]
+
+ packssdw m5, m6 ; rows 0-1
+ paddw m5, m2
+ psraw m5, 5
+
+ pmaddwd m4, m1, [r3 + 26 * 16]
+ pmaddwd m6, m1, [r3 + 24 * 16]
+
+ packssdw m4, m6 ; rows 2-3
+ paddw m4, m2
+ psraw m4, 5
+
+ packuswb m5, m4
+ mova [r0 + 144], m5
+
+; mode 12
+
+ pmaddwd m3, m1 ; m3 held table row 27 (loaded in mode 7)
+ pmaddwd m6, m1, [r3 + 22 * 16]
+
+ packssdw m3, m6 ; rows 0-1
+ paddw m3, m2
+ psraw m3, 5
+
+ pmaddwd m4, m1, [r3 + 17 * 16]
+ pmaddwd m6, m1, [r3 + 12 * 16]
+
+ packssdw m4, m6 ; rows 2-3
+ paddw m4, m2
+ psraw m4, 5
+
+ packuswb m3, m4
+ mova [r0 + 160], m3
+
+; mode 13
+
+ mova m3, m1
+ movd m7, [r1 + 4]
+ punpcklwd m7, m1
+ pand m7, [pb_0000000000000F0F] ; m7 words 0-1 = refPix[4], refPix[0]; m7 is reused by modes 15 and 17
+ pslldq m3, 4
+ por m3, m7 ; m3 = m1 pairs with (refPix[4], refPix[0]) put in front
+
+ pmaddwd m5, m1, [r3 + 23 * 16]
+ pmaddwd m6, m1, [r3 + 14 * 16]
+
+ packssdw m5, m6 ; rows 0-1
+ paddw m5, m2
+ psraw m5, 5
+
+ pmaddwd m4, m1, m0 ; m0 = table row 5 (loaded in mode 8)
+ pmaddwd m6, m3, [r3 + 28 * 16]
+
+ packssdw m4, m6 ; rows 2-3
+ paddw m4, m2
+ psraw m4, 5
+
+ packuswb m5, m4
+ mova [r0 + 176], m5
+
+; mode 14
+
+ pmaddwd m5, m1, [r3 + 19 * 16]
+ pmaddwd m6, m1, [r3 + 6 * 16]
+
+ packssdw m5, m6 ; rows 0-1
+ paddw m5, m2
+ psraw m5, 5
+
+ movd m6, [r1 + 2]
+ pand m3, [pw_FFFFFFF0] ; clear word 0 of m3 ...
+ pand m6, [pb_000000000000000F]
+ por m3, m6 ; ... and replace it with refPix[2]
+
+ pmaddwd m4, m3, [r3 + 25 * 16]
+ pmaddwd m6, m3, [r3 + 12 * 16]
+
+ packssdw m4, m6 ; rows 2-3
+ paddw m4, m2
+ psraw m4, 5
+
+ packuswb m5, m4
+ mova [r0 + 192], m5
+ psrldq m5, 4
+ movd [r0 + 240], m5 ;mode 17 row 0 (identical to mode 14 row 1)
+
+; mode 15
+
+ pmaddwd m5, m1, [r3 + 15 * 16]
+ pmaddwd m6, m3, [r3 + 30 * 16]
+
+ packssdw m5, m6 ; rows 0-1
+ paddw m5, m2
+ psraw m5, 5
+
+ pmaddwd m6, m3, [r3 + 13 * 16]
+
+ mova m0, m3
+ punpcklwd m7, m3 ; m7 word 0 is still refPix[4] from mode 13
+ pslldq m0, 4
+ pand m7, [pb_0000000000000F0F] ; m7 words 0-1 = refPix[4], refPix[2]
+ por m0, m7 ; m0 = m3 pairs with (refPix[4], refPix[2]) put in front
+
+ pmaddwd m4, m0, [r3 + 28 * 16]
+
+ packssdw m6, m4 ; rows 2-3
+ paddw m6, m2
+ psraw m6, 5
+
+ packuswb m5, m6
+ mova [r0 + 208], m5
+
+; mode 16
+
+ pmaddwd m5, m1, [r3 + 11 * 16]
+ pmaddwd m6, m3, [r3 + 22 * 16]
+
+ packssdw m5, m6 ; rows 0-1
+ paddw m5, m2
+ psraw m5, 5
+
+ pmaddwd m3, [r3 + 1 * 16]
+
+ movd m6, [r1 + 3]
+ pand m0, [pw_FFFFFFF0]
+ pand m6, [pb_000000000000000F]
+ por m0, m6 ; word 0 of m0 replaced by refPix[3]
+
+ pmaddwd m0, [r3 + 12 * 16]
+ packssdw m3, m0 ; rows 2-3
+ paddw m3, m2
+ psraw m3, 5
+
+ packuswb m5, m3
+ mova [r0 + 224], m5
+
+; mode 17: row 0 was stored during mode 14; rows 1-3 each prepend one more pair to m1
+
+ movd m4, [r1 + 1]
+ punpcklwd m4, m1
+ pand m4, [pb_0000000000000F0F]
+ pslldq m1, 4
+ por m1, m4 ; leading pair = (refPix[1], refPix[0])
+
+ pmaddwd m6, m1, [r3 + 12 * 16]
+
+ packssdw m6, m6 ; row 1 (duplicated in both halves)
+ paddw m6, m2
+ psraw m6, 5
+
+ movh m5, [r1 + 2] ; loads 8 bytes, but only byte 0 survives the mask below
+ punpcklwd m5, m1
+ pand m5, [pb_0000000000000F0F]
+ pslldq m1, 4
+ por m1, m5 ; leading pair = (refPix[2], refPix[1])
+
+ pmaddwd m4, m1, [r3 + 18 * 16]
+
+ punpcklwd m7, m1 ; m7 word 0 is still refPix[4] (mode 13 / mode 15)
+ pand m7, [pb_0000000000000F0F]
+ pslldq m1, 4
+ por m1, m7 ; leading pair = (refPix[4], refPix[2])
+
+ pmaddwd m1, [r3 + 24 * 16]
+ packssdw m4, m1 ; rows 2-3
+ paddw m4, m2
+ psraw m4, 5
+
+ packuswb m6, m4
+ movd [r0 + 244], m6 ; row 1
+ psrldq m6, 8
+ movh [r0 + 248], m6 ; rows 2-3
+
+; mode 18: every row is the previous row shifted right by one byte, with one more byte from refPix[9..] inserted in front
+
+ movh m1, [r1]
+ movd [r0 + 256], m1 ; row 0 = refPix[0..3]
+
+ movh m3, [r1 + 2]
+ punpcklqdq m3, m1
+ psrldq m3, 7 ; byte 0 = refPix[9], followed by refPix[0..]
+ movd [r0 + 260], m3
+
+ movh m4, [r1 + 3]
+ punpcklqdq m4, m3
+ psrldq m4, 7 ; byte 0 = refPix[10], followed by the previous row
+ movd [r0 + 264], m4
+
+ movh m0, [r1 + 4]
+ punpcklqdq m0, m4
+ psrldq m0, 7 ; byte 0 = refPix[11], followed by the previous row; m0 is reused by mode 20
+ movd [r0 + 268], m0
+
+; mode 19 (builds the pair vectors m1 / m3 / m4 used by modes 19-25 from the mode 18 rows)
+
+ pxor m7, m7
+ punpcklbw m4, m3 ; adjacent byte pairs starting with (refPix[10], refPix[9])
+ punpcklbw m3, m1 ; adjacent byte pairs starting with (refPix[9], refPix[0])
+ punpcklbw m1, m1
+ punpcklbw m4, m7
+ punpcklbw m3, m7
+ psrldq m1, 1 ; adjacent byte pairs starting with (refPix[0], refPix[1])
+ punpcklbw m1, m7
+
+ pmaddwd m6, m1, [r3 + 6 * 16]
+ pmaddwd m7, m3, [r3 + 12 * 16]
+
+ packssdw m6, m7 ; rows 0-1
+ paddw m6, m2
+ psraw m6, 5
+
+ pmaddwd m5, m4, [r3 + 18 * 16]
+
+ movd m7, [r1 + 12]
+ punpcklwd m7, m4
+ pand m7, [pb_0000000000000F0F] ; m7 words 0-1 = refPix[12], refPix[10]; m7 is reused by modes 21 and 23
+ pslldq m4, 4
+ por m4, m7
+
+ pmaddwd m4, [r3 + 24 * 16]
+ packssdw m5, m4 ; rows 2-3
+ paddw m5, m2
+ psraw m5, 5
+
+ packuswb m6, m5
+ mova [r0 + 272], m6
+ movd [r0 + 324], m6 ;mode 22 row 1 (identical to mode 19 row 0)
+
+; mode 20
+
+ pmaddwd m5, m1, [r3 + 11 * 16]
+
+ movd m4, [r1 + 10]
+ pand m3, [pw_FFFFFFF0]
+ pand m4, [pb_000000000000000F]
+ por m3, m4 ; word 0 of m3 replaced by refPix[10]
+
+ pmaddwd m6, m3, [r3 + 22 * 16]
+
+ packssdw m5, m6 ; rows 0-1
+ paddw m5, m2
+ psraw m5, 5
+
+ pmaddwd m4, m3, [r3 + 1 * 16]
+
+ punpcklwd m0, m3 ; m0 byte 0 is refPix[11] (mode 18 row 3)
+ pand m0, [pb_0000000000000F0F] ; m0 words 0-1 = refPix[11], refPix[10]
+ mova m6, m3
+ pslldq m6, 4
+ por m0, m6 ; m0 = m3 pairs with (refPix[11], refPix[10]) put in front
+
+ pmaddwd m6, m0, [r3 + 12 * 16]
+
+ packssdw m4, m6 ; rows 2-3
+ paddw m4, m2
+ psraw m4, 5
+
+ packuswb m5, m4
+ mova [r0 + 288], m5
+
+; mode 21
+
+ pmaddwd m4, m1, [r3 + 15 * 16]
+ pmaddwd m6, m3, [r3 + 30 * 16]
+
+ packssdw m4, m6 ; rows 0-1
+ paddw m4, m2
+ psraw m4, 5
+
+ pmaddwd m5, m3, [r3 + 13 * 16]
+
+ pand m0, [pw_FFFFFFF0]
+ pand m7, [pb_000000000000000F] ; m7 reduced to refPix[12] in byte 0 only; reused by mode 23
+ por m0, m7 ; word 0 of m0 replaced by refPix[12]
+
+ pmaddwd m0, [r3 + 28 * 16]
+ packssdw m5, m0 ; rows 2-3
+ paddw m5, m2
+ psraw m5, 5
+
+ packuswb m4, m5
+ mova [r0 + 304], m4
+
+; mode 22: row 1 was stored during mode 19
+
+ pmaddwd m4, m1, [r3 + 19 * 16]
+ packssdw m4, m4 ; row 0 (duplicated in both halves)
+ paddw m4, m2
+ psraw m4, 5
+
+ mova m0, [r3 + 12 * 16] ; table row 12 kept in m0; reused by mode 24
+ pmaddwd m5, m3, [r3 + 25 * 16]
+ pmaddwd m6, m3, m0
+
+ packssdw m5, m6 ; rows 2-3
+ paddw m5, m2
+ psraw m5, 5
+
+ packuswb m4, m5
+ movd [r0 + 320], m4 ; row 0
+ psrldq m4, 8
+ movh [r0 + 328], m4 ; rows 2-3
+
+; mode 23
+
+ pmaddwd m4, m1, [r3 + 23 * 16]
+ pmaddwd m5, m1, [r3 + 14 * 16]
+
+ packssdw m4, m5 ; rows 0-1
+ paddw m4, m2
+ psraw m4, 5
+
+ pmaddwd m6, m1, [r3 + 5 * 16]
+
+ pand m3, [pw_FFFFFFF0]
+ por m3, m7 ; word 0 of m3 replaced by refPix[12] (m7 was masked in mode 21)
+
+ pmaddwd m3, [r3 + 28 * 16]
+ packssdw m6, m3 ; rows 2-3
+ paddw m6, m2
+ psraw m6, 5
+
+ packuswb m4, m6
+ mova [r0 + 336], m4
+
+; mode 24
+
+ pmaddwd m4, m1, [r3 + 27 * 16]
+ pmaddwd m5, m1, [r3 + 22 * 16]
+
+ packssdw m4, m5 ; rows 0-1
+ paddw m4, m2
+ psraw m4, 5
+
+ pmaddwd m6, m1, [r3 + 17 * 16]
+ pmaddwd m0, m1 ; m0 = table row 12 (loaded in mode 22)
+
+ packssdw m6, m0 ; rows 2-3
+ paddw m6, m2
+ psraw m6, 5
+
+ packuswb m4, m6
+ mova [r0 + 352], m4
+
+; mode 25
+
+ pmaddwd m4, m1, [r3 + 30 * 16]
+ pmaddwd m5, m1, [r3 + 28 * 16]
+
+ packssdw m4, m5 ; rows 0-1
+ paddw m4, m2
+ psraw m4, 5
+
+ pmaddwd m6, m1, [r3 + 26 * 16]
+ pmaddwd m1, [r3 + 24 * 16] ; last use of the (refPix[0], refPix[1]) pairs
+
+ packssdw m6, m1 ; rows 2-3
+ paddw m6, m2
+ psraw m6, 5
+
+ packuswb m4, m6
+ mova [r0 + 368], m4
+
+; mode 27 (builds the pair vectors m1 / m3 / m0 from refPix[1..8], shared by modes 27-33)
+
+ movh m0, [r1 + 1]
+ pxor m7, m7
+ punpcklbw m0, m0
+ psrldq m0, 1 ; adjacent byte pairs (refPix[1], refPix[2]), (refPix[2], refPix[3]), ...
+ movh m1, m0 ; m1 = four pairs starting at refPix[1]
+ psrldq m0, 2
+ movh m3, m0 ; m3 = four pairs starting at refPix[2]
+ psrldq m0, 2 ; m0 = pairs starting at refPix[3]
+ punpcklbw m1, m7
+ punpcklbw m3, m7
+ punpcklbw m0, m7
+
+ mova m7, [r3 + 4 * 16] ; table row 4 kept in m7; reused by modes 29 and 31
+
+ pmaddwd m4, m1, [r3 + 2 * 16]
+ pmaddwd m5, m1, m7
+
+ packssdw m4, m5 ; rows 0-1
+ paddw m4, m2
+ psraw m4, 5
+
+ pmaddwd m6, m1, [r3 + 6 * 16]
+ pmaddwd m5, m1, [r3 + 8 * 16]
+
+ packssdw m6, m5 ; rows 2-3
+ paddw m6, m2
+ psraw m6, 5
+
+ packuswb m4, m6
+ mova [r0 + 400], m4
+
+; mode 28
+
+ pmaddwd m4, m1, [r3 + 5 * 16]
+ pmaddwd m5, m1, [r3 + 10 * 16]
+
+ packssdw m4, m5 ; rows 0-1
+ paddw m4, m2
+ psraw m4, 5
+
+ pmaddwd m6, m1, [r3 + 15 * 16]
+ pmaddwd m5, m1, [r3 + 20 * 16]
+
+ packssdw m6, m5 ; rows 2-3
+ paddw m6, m2
+ psraw m6, 5
+
+ packuswb m4, m6
+ mova [r0 + 416], m4
+
+; mode 29
+
+ pmaddwd m4, m1, [r3 + 9 * 16]
+ pmaddwd m6, m1, [r3 + 18 * 16]
+
+ packssdw m4, m6 ; rows 0-1
+ paddw m4, m2
+ psraw m4, 5
+
+ pmaddwd m6, m1, [r3 + 27 * 16]
+ pmaddwd m5, m3, m7 ; m7 = table row 4
+
+ packssdw m6, m5 ; rows 2-3
+ paddw m6, m2
+ psraw m6, 5
+
+ packuswb m4, m6
+ mova [r0 + 432], m4
+
+; mode 30
+
+ pmaddwd m4, m1, [r3 + 13 * 16]
+ pmaddwd m5, m1, [r3 + 26 * 16]
+
+ packssdw m4, m5 ; rows 0-1
+ paddw m4, m2
+ psraw m4, 5
+
+ pmaddwd m6, m3, [r3 + 7 * 16]
+ pmaddwd m5, m3, [r3 + 20 * 16]
+
+ packssdw m6, m5 ; rows 2-3
+ paddw m6, m2
+ psraw m6, 5
+
+ packuswb m4, m6
+ mova [r0 + 448], m4
+ psrldq m4, 4
+ movh [r0 + 496], m4 ;mode 33 row 0 (= mode 30 row 1); this 8-byte store also writes bytes 500-503, which the next store overwrites
+ psrldq m4, 8
+ movd [r0 + 500], m4 ;mode 33 row 1 (= mode 30 row 3)
+
+; mode 31
+
+ pmaddwd m4, m1, [r3 + 17 * 16]
+ pmaddwd m5, m3, [r3 + 2 * 16]
+
+ packssdw m4, m5 ; rows 0-1
+ paddw m4, m2
+ psraw m4, 5
+
+ pmaddwd m6, m3, [r3 + 19 * 16]
+ pmaddwd m7, m0; m7 already holds [r3 + 4 * 16] (loaded in the mode 27 setup), so no memory operand is needed
+
+ packssdw m6, m7 ; rows 2-3
+ paddw m6, m2
+ psraw m6, 5
+
+ packuswb m4, m6
+ mova [r0 + 464], m4
+
+; mode 32
+
+ pmaddwd m1, [r3 + 21 * 16] ; last use of the refPix[1] pairs
+ pmaddwd m5, m3, [r3 + 10 * 16]
+
+ packssdw m1, m5 ; rows 0-1
+ paddw m1, m2
+ psraw m1, 5
+
+ pmaddwd m3, [r3 + 31 * 16] ; last use of the refPix[2] pairs
+ pmaddwd m5, m0, [r3 + 20 * 16]
+ packssdw m3, m5 ; rows 2-3
+ paddw m3, m2
+ psraw m3, 5
+
+ packuswb m1, m3
+ mova [r0 + 480], m1
+
+; mode 33: rows 0-1 were stored during mode 30; only rows 2-3 are computed here
+
+ pmaddwd m0, [r3 + 14 * 16] ; row 2 from the refPix[3] pairs
+ pxor m7, m7
+ movh m4, [r1 + 4]
+ punpcklbw m4, m4
+ psrldq m4, 1
+ punpcklbw m4, m7 ; four pairs starting at refPix[4], widened to words
+
+ pmaddwd m4, [r3 + 8 * 16] ; row 3
+
+ packssdw m0, m4
+ paddw m0, m2
+ psraw m0, 5
+
+ packuswb m0, m0
+ movh [r0 + 504], m0 ; rows 2-3
+
+; mode 34: four 4-byte rows from refPix[2..], each row one byte further along
+
+ movh m7, [r1 + 2]
+ movd [r0 + 512], m7
+
+ psrldq m7, 1
+ movd [r0 + 516], m7
+
+ psrldq m7, 1
+ movd [r0 + 520], m7
+
+ psrldq m7, 1
+ movd [r0 + 524], m7
+
+RET
\ No newline at end of file
More information about the x265-devel
mailing list