[x265] [PATCH] asm: intra pred all_angs_pred_4x4 sse2
dtyx265 at gmail.com
dtyx265 at gmail.com
Mon Apr 13 03:11:49 CEST 2015
# HG changeset patch
# User David T Yuen <dtyx265 at gmail.com>
# Date 1428887468 25200
# Node ID 364b13ff264fc26358879d872817c962303e2150
# Parent 4cccf22b00ee188a72c8dc3896d7dc1613d855ad
asm: intra pred all_angs_pred_4x4 sse2
This replaces the C code and is backported from the SSE4 version.
The processing of modes 10 and 26 was merged and moved to after mode 2.
64-bit
./test/TestBench --testbench intrapred | grep intra_allangs4x4
intra_allangs4x4 9.89x 6434.99 63671.87
32-bit
./test/TestBench --testbench intrapred | grep intra_allangs4x4
intra_allangs4x4 13.38x 6497.50 86943.55
diff -r 4cccf22b00ee -r 364b13ff264f source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Fri Apr 10 18:15:38 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp Sun Apr 12 18:11:08 2015 -0700
@@ -1259,6 +1259,8 @@
p.cu[BLOCK_4x4].intra_pred[32] = x265_intra_pred_ang4_4_sse2;
p.cu[BLOCK_4x4].intra_pred[33] = x265_intra_pred_ang4_3_sse2;
+ p.cu[BLOCK_4x4].intra_pred_allangs = x265_all_angs_pred_4x4_sse2;
+
p.cu[BLOCK_4x4].calcresidual = x265_getResidual4_sse2;
p.cu[BLOCK_8x8].calcresidual = x265_getResidual8_sse2;
diff -r 4cccf22b00ee -r 364b13ff264f source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Fri Apr 10 18:15:38 2015 -0500
+++ b/source/common/x86/const-a.asm Sun Apr 12 18:11:08 2015 -0700
@@ -53,6 +53,10 @@
const pb_shuf8x8c, times 1 db 0, 0, 0, 0, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6
const pb_movemask, times 16 db 0x00
times 16 db 0xFF
+const pb_...000FF00FF, times 2 db 0xff, 0x00 ;bytes ff 00 ff 00: AND mask keeping the low byte of words 0 and 1
+ times 12 db 0x00 ;remaining 12 bytes of the mask are zero
+const pb_...000FF, db 0xff ;AND mask keeping byte 0 only
+ times 15 db 0x00 ;remaining 15 bytes of the mask are zero
;; 16-bit constants
@@ -94,6 +98,8 @@
const multiH2, times 1 dw 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
const pw_planar16_mul, times 1 dw 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
const pw_planar32_mul, times 1 dw 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16
+const pw_...FFF00, dw 0x00 ;AND mask: word 0 is cleared
+ times 7 dw 0xff ;words 1-7 keep their low byte (0x00ff each)
;; 32-bit constants
diff -r 4cccf22b00ee -r 364b13ff264f source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h Fri Apr 10 18:15:38 2015 -0500
+++ b/source/common/x86/intrapred.h Sun Apr 12 18:11:08 2015 -0700
@@ -277,6 +277,7 @@
void x265_intra_pred_ang32_23_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
void x265_intra_pred_ang32_22_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
void x265_intra_pred_ang32_21_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
+void x265_all_angs_pred_4x4_sse2(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
void x265_all_angs_pred_4x4_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
void x265_all_angs_pred_8x8_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
void x265_all_angs_pred_16x16_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
diff -r 4cccf22b00ee -r 364b13ff264f source/common/x86/intrapred8_allangs.asm
--- a/source/common/x86/intrapred8_allangs.asm Fri Apr 10 18:15:38 2015 -0500
+++ b/source/common/x86/intrapred8_allangs.asm Sun Apr 12 18:11:08 2015 -0700
@@ -34,9 +34,14 @@
; common constant with intrapred8.asm
cextern ang_table
+cextern pw_ang_table
cextern tab_S1
cextern tab_S2
cextern tab_Si
+cextern pw_16
+cextern pb_...000FF
+cextern pb_...000FF00FF
+cextern pw_...FFF00
;-----------------------------------------------------------------------------
@@ -23006,3 +23011,780 @@
palignr m4, m2, m1, 14
movu [r0 + 2111 * 16], m4
RET
+
+;-----------------------------------------------------------------------------
+; void all_angs_pred_4x4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal all_angs_pred_4x4, 4, 4, 8
+; Writes the 4x4 result of every mode 2..34 to dest (r0); mode m occupies the 16 bytes at r0 + (m - 2) * 16. Only refPix (r1) is read: r2 (filtPix) and r3 (bLuma) are overwritten before any use.
+; mode 2
+; Comment notation: 0-9 and A-G name the bytes refPix[0]..refPix[16] (A = 10 ... G = 16); byte[]/word[] lists go from the lowest lane upwards.
+ movh m6, [r1 + 9] ;m6 = byte[9, A, B, C, D, E, F, G], kept for modes 10 and 3
+ mova m2, m6
+ psrldq m2, 1
+ movd [r0], m2 ;byte[A, B, C, D]
+ psrldq m2, 1
+ movd [r0 + 4], m2 ;byte[B, C, D, E]
+ psrldq m2, 1
+ movd [r0 + 8], m2 ;byte[C, D, E, F]
+ psrldq m2, 1
+ movd [r0 + 12], m2 ;byte[D, E, F, G]
+
+; mode 10/26
+; Both blocks are first filled with a repeated dword, then byte 0 of each row is replaced by the values computed below.
+ pxor m7, m7 ;m7 = 0, used to widen bytes to words
+ pshufd m5, m6, 0
+ mova [r0 + 128], m5 ;mode 10 byte[9, A, B, C, 9, A, B, C, 9, A, B, C, 9, A, B, C]
+
+ movd m4, [r1 + 1]
+ pshufd m4, m4, 0
+ mova [r0 + 384], m4 ;mode 26 byte[1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4]
+
+ movd m1, [r1]
+ punpcklbw m1, m7
+ pshuflw m1, m1, 0x00
+ punpcklqdq m1, m1 ;m1 = word[0, 0, 0, 0, 0, 0, 0, 0]
+
+ punpckldq m4, m5 ;m4 = byte[1, 2, 3, 4, 9, A, B, C, ...]
+ punpcklbw m4, m7 ;m4 = word[1, 2, 3, 4, 9, A, B, C]
+ pshuflw m2, m4, 0x00
+ pshufhw m2, m2, 0x00 ;m2 = word[1, 1, 1, 1, 9, 9, 9, 9]
+
+ psubw m4, m1 ;subtract refPix[0] from every lane
+ psraw m4, 1 ;arithmetic halving of the signed difference
+
+ pshufd m2, m2, q1032 ;m2 = word[9, 9, 9, 9, 1, 1, 1, 1]
+ paddw m4, m2 ;lanes 0-3: 9 + ((1..4 - 0) >> 1), lanes 4-7: 1 + ((9..C - 0) >> 1)
+ packuswb m4, m4 ;saturate to unsigned bytes: bytes 0-3 for mode 10, bytes 4-7 for mode 26
+
+%if ARCH_X86_64
+ movq r2, m4 ;all 8 result bytes in one GPR (filtPix pointer is discarded)
+
+ mov [r0 + 128], r2b ;mode 10
+ shr r2, 8
+ mov [r0 + 132], r2b
+ shr r2, 8
+ mov [r0 + 136], r2b
+ shr r2, 8
+ mov [r0 + 140], r2b
+ shr r2, 8 ;bytes 4-7 are now in the low dword, so 32-bit shifts suffice below
+ mov [r0 + 384], r2b ;mode 26
+ shr r2d, 8
+ mov [r0 + 388], r2b
+ shr r2d, 8
+ mov [r0 + 392], r2b
+ shr r2d, 8
+ mov [r0 + 396], r2b
+
+%else
+ movd r2d, m4 ;result bytes 0-3 (filtPix pointer is discarded)
+
+ mov [r0 + 128], r2b ;mode 10
+ shr r2d, 8
+ mov [r0 + 132], r2b
+ shr r2d, 8
+ mov [r0 + 136], r2b
+ shr r2d, 8
+ mov [r0 + 140], r2b
+
+ psrldq m4, 4
+ movd r2d, m4 ;result bytes 4-7
+
+ mov [r0 + 384], r2b ;mode 26
+ shr r2d, 8
+ mov [r0 + 388], r2b
+ shr r2d, 8
+ mov [r0 + 392], r2b
+ shr r2d, 8
+ mov [r0 + 396], r2b
+%endif
+
+; mode 3
+; NOTE(review): pw_ang_table and pw_16 are defined outside this view; presumably each 16-byte table row holds the word weight pairs of one fraction and pw_16 is the rounding term for the >> 5 below - confirm.
+ mova m2, [pw_16] ;m2 stays live to the end: added before every psraw 5
+ lea r3, [pw_ang_table + 7 * 16] ;[r3 + k * 16] addresses table row 7 + k (bLuma is discarded)
+ lea r2, [pw_ang_table + 23 * 16] ;[r2 + k * 16] addresses table row 23 + k
+ punpcklbw m6, m6 ;m6 = byte[9, 9, A, A, ..., G, G]
+ psrldq m6, 1 ;m6 = byte[9, A, A, B, B, C, ...]: adjacent reference pairs
+ movh m1, m6
+ psrldq m6, 2
+ movh m0, m6
+ psrldq m6, 2
+ movh m3, m6
+ psrldq m6, 2
+ punpcklbw m1, m7 ;m1 = word[9, A, A, B, B, C, C, D]
+ punpcklbw m0, m7 ;m0 = word[A, B, B, C, C, D, D, E]
+ punpcklbw m3, m7 ;m3 = word[B, C, C, D, D, E, E, F]
+ punpcklbw m6, m7 ;m6 = word[C, D, D, E, E, F, F, G]
+
+ mova m7, [r2 - 3 * 16] ;table row 20, kept in m7 for modes 4 and 8
+
+ pmaddwd m5, m1, [r2 + 3 * 16] ;table row 26
+ pmaddwd m4, m0, m7 ;table row 20
+
+ packssdw m5, m4
+ paddw m5, m2
+ psraw m5, 5
+
+ pmaddwd m4, m3, [r3 + 7 * 16] ;table row 14
+ pmaddwd m6, [r3 + 1 * 16] ;table row 8
+
+ packssdw m4, m6
+ paddw m4, m2
+ psraw m4, 5
+
+ packuswb m5, m4
+ mova [r0 + 16], m5
+ movd [r0 + 68], m5 ;mode 6 row 1
+ psrldq m5, 4
+ movd [r0 + 76], m5 ;mode 6 row 3
+
+; mode 4
+
+ pmaddwd m4, m0, [r2 + 8 * 16] ;table row 31
+ pmaddwd m6, m3, m7 ;table row 20
+
+ packssdw m4, m6
+ paddw m4, m2
+ psraw m4, 5
+
+ pmaddwd m5, m1, [r2 - 2 * 16] ;table row 21
+ pmaddwd m6, m0, [r3 + 3 * 16] ;table row 10
+
+ packssdw m5, m6
+ paddw m5, m2
+ psraw m5, 5
+
+ packuswb m5, m4
+ mova [r0 + 32], m5
+
+; mode 5
+
+ pmaddwd m5, m1, [r2 - 6 * 16] ;table row 17
+ pmaddwd m6, m0, [r3 - 5 * 16] ;table row 2
+
+ packssdw m5, m6
+ paddw m5, m2
+ psraw m5, 5
+
+ pmaddwd m4, m0, [r2 - 4 * 16] ;table row 19
+ pmaddwd m3, [r3 - 3 * 16] ;table row 4
+
+ packssdw m4, m3
+ paddw m4, m2
+ psraw m4, 5
+
+ packuswb m5, m4
+ mova [r0 + 48], m5
+
+; mode 6
+
+ pmaddwd m5, m1, [r3 + 6 * 16] ;table row 13
+ pmaddwd m6, m0, [r3 + 0 * 16] ;table row 7
+
+ packssdw m5, m6
+ paddw m5, m2
+ psraw m5, 5
+
+ packuswb m5, m6 ;only the low 8 bytes (from m5) are stored; the m6 half is ignored
+ movd [r0 + 64], m5 ;row 0 (rows 1 and 3 were stored during mode 3)
+ psrldq m5, 4
+ movd [r0 + 72], m5 ;row 2
+
+; mode 7
+
+ pmaddwd m5, m1, [r3 + 2 * 16] ;table row 9
+ pmaddwd m6, m1, [r2 - 5 * 16] ;table row 18
+
+ packssdw m5, m6
+ paddw m5, m2
+ psraw m5, 5
+
+ mova m3, [r2 + 4 * 16] ;table row 27, kept in m3 for mode 12
+ pmaddwd m4, m1, m3
+ pmaddwd m0, [r3 - 3 * 16] ;table row 4
+
+ packssdw m4, m0
+ paddw m4, m2
+ psraw m4, 5
+
+ packuswb m5, m4
+ mova [r0 + 80], m5
+
+; mode 8
+
+ mova m0, [r3 - 2 * 16] ;table row 5, kept in m0 for mode 13
+ pmaddwd m5, m1, m0
+ pmaddwd m6, m1, [r3 + 3 * 16] ;table row 10
+
+ packssdw m5, m6
+ paddw m5, m2
+ psraw m5, 5
+
+ pmaddwd m4, m1, [r3 + 8 * 16] ;table row 15
+ pmaddwd m7, m1 ;m7 still holds table row 20
+
+ packssdw m4, m7
+ paddw m4, m2
+ psraw m4, 5
+
+ packuswb m5, m4
+ mova [r0 + 96], m5
+
+; mode 9
+
+ pmaddwd m5, m1, [r3 - 5 * 16] ;table row 2
+ pmaddwd m6, m1, [r3 - 3 * 16] ;table row 4
+
+ packssdw m5, m6
+ paddw m5, m2
+ psraw m5, 5
+
+ pmaddwd m4, m1, [r3 - 1 * 16] ;table row 6
+ pmaddwd m6, m1, [r3 + 1 * 16] ;table row 8
+
+ packssdw m4, m6
+ paddw m4, m2
+ psraw m4, 5
+
+ packuswb m5, m4
+ mova [r0 + 112], m5
+
+; mode 11
+
+ movd m5, [r1]
+ punpcklwd m5, m1
+ pand m5, [pb_...000FF00FF] ;keep word[0, 9], clear everything else
+ pslldq m1, 4 ;shift the pair list up by two words
+ por m1, m5 ;m1 = word[0, 9, 9, A, A, B, B, C]
+
+ pmaddwd m5, m1, [r2 + 7 * 16] ;table row 30
+ pmaddwd m6, m1, [r2 + 5 * 16] ;table row 28
+
+ packssdw m5, m6
+ paddw m5, m2
+ psraw m5, 5
+
+ pmaddwd m4, m1, [r2 + 3 * 16] ;table row 26
+ pmaddwd m6, m1, [r2 + 1 * 16] ;table row 24
+
+ packssdw m4, m6
+ paddw m4, m2
+ psraw m4, 5
+
+ packuswb m5, m4
+ mova [r0 + 144], m5
+
+; mode 12
+
+ pmaddwd m3, m1 ;m3 still holds table row 27
+ pmaddwd m6, m1, [r2 - 1 * 16] ;table row 22
+
+ packssdw m3, m6
+ paddw m3, m2
+ psraw m3, 5
+
+ pmaddwd m4, m1, [r2 - 6 * 16] ;table row 17
+ pmaddwd m6, m1, [r3 + 5 * 16] ;table row 12
+
+ packssdw m4, m6
+ paddw m4, m2
+ psraw m4, 5
+
+ packuswb m3, m4
+ mova [r0 + 160], m3
+
+; mode 13
+
+ mova m3, m1
+ movd m7, [r1 + 4]
+ punpcklwd m7, m1
+ pand m7, [pb_...000FF00FF] ;m7 = word[4, 0, 0, ...]; reused by modes 15 and 17
+ pslldq m3, 4
+ por m3, m7 ;m3 = word[4, 0, 0, 9, 9, A, A, B]
+
+ pmaddwd m5, m1, [r2 + 0 * 16] ;table row 23
+ pmaddwd m6, m1, [r3 + 7 * 16] ;table row 14
+
+ packssdw m5, m6
+ paddw m5, m2
+ psraw m5, 5
+
+ pmaddwd m4, m1, m0 ;m0 still holds table row 5
+ pmaddwd m6, m3, [r2 + 5 * 16] ;table row 28
+
+ packssdw m4, m6
+ paddw m4, m2
+ psraw m4, 5
+
+ packuswb m5, m4
+ mova [r0 + 176], m5
+
+; mode 14
+
+ pmaddwd m5, m1, [r2 - 4 * 16] ;table row 19
+ pmaddwd m6, m1, [r3 - 1 * 16] ;table row 6
+
+ packssdw m5, m6
+ paddw m5, m2
+ psraw m5, 5
+
+ movd m6, [r1 + 2]
+ pand m3, [pw_...FFF00] ;clear word 0 of m3
+ pand m6, [pb_...000FF] ;keep only byte 2 of refPix
+ por m3, m6 ;m3 = word[2, 0, 0, 9, 9, A, A, B]
+
+ pmaddwd m4, m3, [r2 + 2 * 16] ;table row 25
+ pmaddwd m6, m3, [r3 + 5 * 16] ;table row 12
+
+ packssdw m4, m6
+ paddw m4, m2
+ psraw m4, 5
+
+ packuswb m5, m4
+ mova [r0 + 192], m5
+ psrldq m5, 4
+ movd [r0 + 240], m5 ;mode 17 row 0
+
+; mode 15
+
+ pmaddwd m5, m1, [r3 + 8 * 16] ;table row 15
+ pmaddwd m6, m3, [r2 + 7 * 16] ;table row 30
+
+ packssdw m5, m6
+ paddw m5, m2
+ psraw m5, 5
+
+ pmaddwd m6, m3, [r3 + 6 * 16] ;table row 13
+
+ mova m0, m3
+ punpcklwd m7, m3 ;m7 word 0 is still 4 (from mode 13); word 1 becomes 2
+ pslldq m0, 4
+ pand m7, [pb_...000FF00FF]
+ por m0, m7 ;m0 = word[4, 2, 2, 0, 0, 9, 9, A]
+
+ pmaddwd m4, m0, [r2 + 5 * 16] ;table row 28
+
+ packssdw m6, m4
+ paddw m6, m2
+ psraw m6, 5
+
+ packuswb m5, m6
+ mova [r0 + 208], m5
+
+; mode 16
+
+ pmaddwd m5, m1, [r3 + 4 * 16] ;table row 11
+ pmaddwd m6, m3, [r2 - 1 * 16] ;table row 22
+
+ packssdw m5, m6
+ paddw m5, m2
+ psraw m5, 5
+
+ pmaddwd m3, [r3 - 6 * 16] ;table row 1
+
+ movd m6, [r1 + 3]
+ pand m0, [pw_...FFF00]
+ pand m6, [pb_...000FF]
+ por m0, m6 ;m0 = word[3, 2, 2, 0, 0, 9, 9, A]
+
+ pmaddwd m0, [r3 + 5 * 16] ;table row 12
+ packssdw m3, m0
+ paddw m3, m2
+ psraw m3, 5
+
+ packuswb m5, m3
+ mova [r0 + 224], m5
+
+; mode 17
+
+ movd m4, [r1 + 1]
+ punpcklwd m4, m1
+ pand m4, [pb_...000FF00FF]
+ pslldq m1, 4
+ por m1, m4 ;m1 = word[1, 0, 0, 9, 9, A, A, B]
+
+ pmaddwd m6, m1, [r3 + 5 * 16] ;table row 12
+
+ packssdw m6, m6 ;same four words in both halves; only one copy is stored
+ paddw m6, m2
+ psraw m6, 5
+
+ movd m5, [r1 + 2]
+ punpcklwd m5, m1
+ pand m5, [pb_...000FF00FF]
+ pslldq m1, 4
+ por m1, m5 ;m1 = word[2, 1, 1, 0, 0, 9, 9, A]
+
+ pmaddwd m4, m1, [r2 - 5 * 16] ;table row 18
+
+ punpcklwd m7, m1 ;m7 word 0 is still 4; word 1 becomes 2
+ pand m7, [pb_...000FF00FF]
+ pslldq m1, 4
+ por m1, m7 ;m1 = word[4, 2, 2, 1, 1, 0, 0, 9]
+
+ pmaddwd m1, [r2 + 1 * 16] ;table row 24
+ packssdw m4, m1
+ paddw m4, m2
+ psraw m4, 5
+
+ packuswb m6, m4
+ movd [r0 + 244], m6 ;row 1 (row 0 was stored during mode 14)
+ psrldq m6, 8
+ movh [r0 + 248], m6 ;rows 2 and 3
+
+; mode 18
+
+ movh m1, [r1]
+ movd [r0 + 256], m1 ;byte[0, 1, 2, 3]
+
+ movh m3, [r1 + 2]
+ punpcklqdq m3, m1
+ psrldq m3, 7
+ movd [r0 + 260], m3 ;byte[9, 0, 1, 2]
+
+ movh m4, [r1 + 3]
+ punpcklqdq m4, m3
+ psrldq m4, 7
+ movd [r0 + 264], m4 ;byte[A, 9, 0, 1]
+
+ movh m0, [r1 + 4]
+ punpcklqdq m0, m4
+ psrldq m0, 7
+ movd [r0 + 268], m0 ;byte[B, A, 9, 0]; m0 is reused by mode 20
+
+; mode 19
+
+ pxor m7, m7
+ punpcklbw m4, m3
+ punpcklbw m3, m1
+ punpcklbw m1, m1
+ punpcklbw m4, m7 ;m4 = word[A, 9, 9, 0, 0, 1, 1, 2]
+ punpcklbw m3, m7 ;m3 = word[9, 0, 0, 1, 1, 2, 2, 3]
+ psrldq m1, 1
+ punpcklbw m1, m7 ;m1 = word[0, 1, 1, 2, 2, 3, 3, 4]
+
+ pmaddwd m6, m1, [r3 - 1 * 16] ;table row 6
+ pmaddwd m7, m3, [r3 + 5 * 16] ;table row 12
+
+ packssdw m6, m7
+ paddw m6, m2
+ psraw m6, 5
+
+ pmaddwd m5, m4, [r2 - 5 * 16] ;table row 18
+
+ movd m7, [r1 + 12]
+ punpcklwd m7, m4
+ pand m7, [pb_...000FF00FF] ;m7 = word[C, A, 0, ...]; word 0 is reused by modes 21 and 23
+ pslldq m4, 4
+ por m4, m7 ;m4 = word[C, A, A, 9, 9, 0, 0, 1]
+
+ pmaddwd m4, [r2 + 1 * 16] ;table row 24
+ packssdw m5, m4
+ paddw m5, m2
+ psraw m5, 5
+
+ packuswb m6, m5
+ mova [r0 + 272], m6
+ movd [r0 + 324], m6 ;mode 22 row 1
+
+; mode 20
+
+ pmaddwd m5, m1, [r3 + 4 * 16] ;table row 11
+
+ movd m4, [r1 + 10]
+ pand m3, [pw_...FFF00]
+ pand m4, [pb_...000FF]
+ por m3, m4 ;m3 = word[A, 0, 0, 1, 1, 2, 2, 3]
+
+ pmaddwd m6, m3, [r2 - 1 * 16] ;table row 22
+
+ packssdw m5, m6
+ paddw m5, m2
+ psraw m5, 5
+
+ pmaddwd m4, m3, [r3 - 6 * 16] ;table row 1
+
+ punpcklwd m0, m3 ;m0 still holds byte[B, A, 9, 0, ...] from mode 18
+ pand m0, [pb_...000FF00FF]
+ mova m6, m3
+ pslldq m6, 4
+ por m0, m6 ;m0 = word[B, A, A, 0, 0, 1, 1, 2]
+
+ pmaddwd m6, m0, [r3 + 5 * 16] ;table row 12
+
+ packssdw m4, m6
+ paddw m4, m2
+ psraw m4, 5
+
+ packuswb m5, m4
+ mova [r0 + 288], m5
+
+; mode 21
+
+ pmaddwd m4, m1, [r3 + 8 * 16] ;table row 15
+ pmaddwd m6, m3, [r2 + 7 * 16] ;table row 30
+
+ packssdw m4, m6
+ paddw m4, m2
+ psraw m4, 5
+
+ pmaddwd m5, m3, [r3 + 6 * 16] ;table row 13
+
+ pand m0, [pw_...FFF00]
+ pand m7, [pb_...000FF] ;m7 = byte C only (loaded during mode 19); reused by mode 23
+ por m0, m7 ;m0 = word[C, A, A, 0, 0, 1, 1, 2]
+
+ pmaddwd m0, [r2 + 5 * 16] ;table row 28
+ packssdw m5, m0
+ paddw m5, m2
+ psraw m5, 5
+
+ packuswb m4, m5
+ mova [r0 + 304], m4
+
+; mode 22
+
+ pmaddwd m4, m1, [r2 - 4 * 16] ;table row 19
+ packssdw m4, m4 ;same four words in both halves; only one copy is stored
+ paddw m4, m2
+ psraw m4, 5
+
+ mova m0, [r3 + 5 * 16] ;table row 12, kept in m0 for mode 24
+ pmaddwd m5, m3, [r2 + 2 * 16] ;table row 25
+ pmaddwd m6, m3, m0
+
+ packssdw m5, m6
+ paddw m5, m2
+ psraw m5, 5
+
+ packuswb m4, m5
+ movd [r0 + 320], m4 ;row 0 (row 1 was stored during mode 19)
+ psrldq m4, 8
+ movh [r0 + 328], m4 ;rows 2 and 3
+
+; mode 23
+
+ pmaddwd m4, m1, [r2 + 0 * 16] ;table row 23
+ pmaddwd m5, m1, [r3 + 7 * 16] ;table row 14
+
+ packssdw m4, m5
+ paddw m4, m2
+ psraw m4, 5
+
+ pmaddwd m6, m1, [r3 - 2 * 16] ;table row 5
+
+ pand m3, [pw_...FFF00]
+ por m3, m7 ;m3 = word[C, 0, 0, 1, 1, 2, 2, 3]
+
+ pmaddwd m3, [r2 + 5 * 16] ;table row 28
+ packssdw m6, m3
+ paddw m6, m2
+ psraw m6, 5
+
+ packuswb m4, m6
+ mova [r0 + 336], m4
+
+; mode 24
+
+ pmaddwd m4, m1, [r2 + 4 * 16] ;table row 27
+ pmaddwd m5, m1, [r2 - 1 * 16] ;table row 22
+
+ packssdw m4, m5
+ paddw m4, m2
+ psraw m4, 5
+
+ pmaddwd m6, m1, [r2 - 6 * 16] ;table row 17
+ pmaddwd m0, m1 ;m0 still holds table row 12
+
+ packssdw m6, m0
+ paddw m6, m2
+ psraw m6, 5
+
+ packuswb m4, m6
+ mova [r0 + 352], m4
+
+; mode 25
+
+ pmaddwd m4, m1, [r2 + 7 * 16] ;table row 30
+ pmaddwd m5, m1, [r2 + 5 * 16] ;table row 28
+
+ packssdw m4, m5
+ paddw m4, m2
+ psraw m4, 5
+
+ pmaddwd m6, m1, [r2 + 3 * 16] ;table row 26
+ pmaddwd m1, [r2 + 1 * 16] ;table row 24
+
+ packssdw m6, m1
+ paddw m6, m2
+ psraw m6, 5
+
+ packuswb m4, m6
+ mova [r0 + 368], m4
+
+; mode 27
+
+ movh m0, [r1 + 1] ;m0 = byte[1, 2, 3, 4, 5, 6, 7, 8]
+ pxor m7, m7
+ punpcklbw m0, m0
+ psrldq m0, 1 ;m0 = byte[1, 2, 2, 3, 3, 4, ...]: adjacent reference pairs
+ movh m1, m0
+ psrldq m0, 2
+ movh m3, m0
+ psrldq m0, 2
+ punpcklbw m1, m7 ;m1 = word[1, 2, 2, 3, 3, 4, 4, 5]
+ punpcklbw m3, m7 ;m3 = word[2, 3, 3, 4, 4, 5, 5, 6]
+ punpcklbw m0, m7 ;m0 = word[3, 4, 4, 5, 5, 6, 6, 7]
+
+ mova m7, [r3 - 3 * 16] ;table row 4, kept in m7 for modes 29 and 31
+
+ pmaddwd m4, m1, [r3 - 5 * 16] ;table row 2
+ pmaddwd m5, m1, m7
+
+ packssdw m4, m5
+ paddw m4, m2
+ psraw m4, 5
+
+ pmaddwd m6, m1, [r3 - 1 * 16] ;table row 6
+ pmaddwd m5, m1, [r3 + 1 * 16] ;table row 8
+
+ packssdw m6, m5
+ paddw m6, m2
+ psraw m6, 5
+
+ packuswb m4, m6
+ mova [r0 + 400], m4
+
+; mode 28
+
+ pmaddwd m4, m1, [r3 - 2 * 16] ;table row 5
+ pmaddwd m5, m1, [r3 + 3 * 16] ;table row 10
+
+ packssdw m4, m5
+ paddw m4, m2
+ psraw m4, 5
+
+ pmaddwd m6, m1, [r3 + 8 * 16] ;table row 15
+ pmaddwd m5, m1, [r2 - 3 * 16] ;table row 20
+
+ packssdw m6, m5
+ paddw m6, m2
+ psraw m6, 5
+
+ packuswb m4, m6
+ mova [r0 + 416], m4
+
+; mode 29
+
+ pmaddwd m4, m1, [r3 + 2 * 16] ;table row 9
+ pmaddwd m6, m1, [r2 - 5 * 16] ;table row 18
+
+ packssdw m4, m6
+ paddw m4, m2
+ psraw m4, 5
+
+ pmaddwd m6, m1, [r2 + 4 * 16] ;table row 27
+ pmaddwd m5, m3, m7 ;m7 still holds table row 4
+
+ packssdw m6, m5
+ paddw m6, m2
+ psraw m6, 5
+
+ packuswb m4, m6
+ mova [r0 + 432], m4
+
+; mode 30
+
+ pmaddwd m4, m1, [r3 + 6 * 16] ;table row 13
+ pmaddwd m5, m1, [r2 + 3 * 16] ;table row 26
+
+ packssdw m4, m5
+ paddw m4, m2
+ psraw m4, 5
+
+ pmaddwd m6, m3, [r3 + 0 * 16] ;table row 7
+ pmaddwd m5, m3, [r2 - 3 * 16] ;table row 20
+
+ packssdw m6, m5
+ paddw m6, m2
+ psraw m6, 5
+
+ packuswb m4, m6
+ mova [r0 + 448], m4
+ psrldq m4, 4
+ movh [r0 + 496], m4 ;mode 33 row 0 (8-byte store; bytes 500..503 are rewritten just below)
+ psrldq m4, 8
+ movd [r0 + 500], m4 ;mode 33 row 1
+
+; mode 31
+
+ pmaddwd m4, m1, [r2 - 6 * 16] ;table row 17
+ pmaddwd m5, m3, [r3 - 5 * 16] ;table row 2
+
+ packssdw m4, m5
+ paddw m4, m2
+ psraw m4, 5
+
+ pmaddwd m6, m3, [r2 - 4 * 16] ;table row 19
+ pmaddwd m7, m0 ;m7 still holds table row 4
+
+ packssdw m6, m7
+ paddw m6, m2
+ psraw m6, 5
+
+ packuswb m4, m6
+ mova [r0 + 464], m4
+
+; mode 32
+
+ pmaddwd m1, [r2 - 2 * 16] ;table row 21
+ pmaddwd m5, m3, [r3 + 3 * 16] ;table row 10
+
+ packssdw m1, m5
+ paddw m1, m2
+ psraw m1, 5
+
+ pmaddwd m3, [r2 + 8 * 16] ;table row 31
+ pmaddwd m5, m0, [r2 - 3 * 16] ;table row 20
+ packssdw m3, m5
+ paddw m3, m2
+ psraw m3, 5
+
+ packuswb m1, m3
+ mova [r0 + 480], m1
+
+; mode 33
+
+ pmaddwd m0, [r3 + 7 * 16] ;table row 14
+ pxor m7, m7
+ movh m4, [r1 + 4]
+ punpcklbw m4, m4
+ psrldq m4, 1
+ punpcklbw m4, m7 ;m4 = word[4, 5, 5, 6, 6, 7, 7, 8]
+
+ pmaddwd m4, [r3 + 1 * 16] ;table row 8
+
+ packssdw m0, m4
+ paddw m0, m2
+ psraw m0, 5
+
+ packuswb m0, m0
+ movh [r0 + 504], m0 ;rows 2 and 3 (rows 0 and 1 were stored during mode 30)
+
+; mode 34
+
+ movh m7, [r1 + 2]
+ movd [r0 + 512], m7 ;byte[2, 3, 4, 5]
+
+ psrldq m7, 1
+ movd [r0 + 516], m7 ;byte[3, 4, 5, 6]
+
+ psrldq m7, 1
+ movd [r0 + 520], m7 ;byte[4, 5, 6, 7]
+
+ psrldq m7, 1
+ movd [r0 + 524], m7 ;byte[5, 6, 7, 8]
+
+RET
\ No newline at end of file
More information about the x265-devel
mailing list