[x265] [PATCH] asm: intra_allangs4x4 improved by ~61% over SSE4
praveen at multicorewareinc.com
praveen at multicorewareinc.com
Fri Apr 17 14:00:13 CEST 2015
# HG changeset patch
# User Praveen Tiwari <praveen at multicorewareinc.com>
# Date 1429268008 -19800
# Fri Apr 17 16:23:28 2015 +0530
# Node ID c8ea565afa9a8e7934ada36f76a0bb79f34d59b2
# Parent 7be1172ec816298c32f588908e1b6f0fa214d349
asm: intra_allangs4x4 improved by ~61% over SSE4
AVX2:
intra_allangs4x4 31.17x 1070.01 33353.50
SSE4:
intra_allangs4x4 12.04x 2746.58 33061.69
diff -r 7be1172ec816 -r c8ea565afa9a source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Apr 16 11:38:32 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Fri Apr 17 16:23:28 2015 +0530
@@ -1909,6 +1909,9 @@
p.cu[BLOCK_32x32].intra_pred[21] = x265_intra_pred_ang32_21_avx2;
p.cu[BLOCK_32x32].intra_pred[18] = x265_intra_pred_ang32_18_avx2;
+ // all_angs primitives
+ p.cu[BLOCK_4x4].intra_pred_allangs = x265_all_angs_pred_4x4_avx2;
+
// copy_sp primitives
p.cu[BLOCK_16x16].copy_sp = x265_blockcopy_sp_16x16_avx2;
p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_sp = x265_blockcopy_sp_16x16_avx2;
diff -r 7be1172ec816 -r c8ea565afa9a source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h Thu Apr 16 11:38:32 2015 +0530
+++ b/source/common/x86/intrapred.h Fri Apr 17 16:23:28 2015 +0530
@@ -283,4 +283,5 @@
void x265_all_angs_pred_8x8_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
void x265_all_angs_pred_16x16_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
void x265_all_angs_pred_32x32_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
+void x265_all_angs_pred_4x4_avx2(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma); // AVX2 4x4 variant; same signature as the SSE4 all_angs prototypes above
#endif // ifndef X265_INTRAPRED_H
diff -r 7be1172ec816 -r c8ea565afa9a source/common/x86/intrapred8_allangs.asm
--- a/source/common/x86/intrapred8_allangs.asm Thu Apr 16 11:38:32 2015 +0530
+++ b/source/common/x86/intrapred8_allangs.asm Fri Apr 17 16:23:28 2015 +0530
@@ -27,6 +27,64 @@
SECTION_RODATA 32
+all_ang4_shuff: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 ; pshufb index rows for all_angs_pred_4x4 (avx2), 32 bytes (one mmsize) per row, indices select bytes of the 16 reference bytes broadcast in m0; row 0: mode 2
+ db 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 3, 4, 4, 5, 5, 6, 6, 7 ; row 1: mode 3 (adjacent byte pairs feed pmaddubsw)
+ db 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6 ; row 2: modes 4, 5
+ db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5 ; row 3: mode 6
+ db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 1, 2, 2, 3, 3, 4, 4, 5 ; row 4: mode 7
+ db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4 ; row 5: modes 8, 9
+ db 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 ; row 6: mode 10 (shuffle only, low 16 bytes used)
+ db 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12 ; row 7: modes 11, 12 (from here m0 holds refPix[0..15], so indices are direct refPix offsets)
+ db 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 4, 0, 0, 9, 9, 10, 10, 11 ; row 8: mode 13
+ db 0, 9, 9, 10, 10, 11, 11, 12, 0, 9, 9, 10, 10, 11, 11, 12, 2, 0, 0, 9, 9, 10, 10, 11, 2, 0, 0, 9, 9, 10, 10, 11 ; row 9: mode 14
+ db 0, 9, 9, 10, 10, 11, 11, 12, 2, 0, 0, 9, 9, 10, 10, 11, 2, 0, 0, 9, 9, 10, 10, 11, 4, 2, 2, 0, 0, 9, 9, 10 ; row 10: mode 15
+ db 0, 9, 9, 10, 10, 11, 11, 12, 2, 0, 0, 9, 9, 10, 10, 11, 2, 0, 0, 9, 9, 10, 10, 11, 3, 2, 2, 0, 0, 9, 9, 10 ; row 11: mode 16
+ db 0, 9, 9, 10, 10, 11, 11, 12, 1, 0, 0, 9, 9, 10, 10, 11, 2, 1, 1, 0, 0, 9, 9, 10, 4, 2, 2, 1, 1, 0, 0, 9 ; row 12: mode 17
+ db 0, 1, 2, 3, 9, 0, 1, 2, 10, 9, 0, 1, 11, 10, 9, 0, 0, 1, 2, 3, 9, 0, 1, 2, 10, 9, 0, 1, 11, 10, 9, 0 ; row 13: mode 18 (shuffle only)
+ db 0, 1, 1, 2, 2, 3, 3, 4, 9, 0, 0, 1, 1, 2, 2, 3, 10, 9, 9, 0, 0, 1, 1, 2, 12, 10, 10, 9, 9, 0, 0, 1 ; row 14: mode 19
+ db 0, 1, 1, 2, 2, 3, 3, 4, 10, 0, 0, 1, 1, 2, 2, 3, 10, 0, 0, 1, 1, 2, 2, 3, 11, 10, 10, 0, 0, 1, 1, 2 ; row 15: mode 20
+ db 0, 1, 1, 2, 2, 3, 3, 4, 10, 0, 0, 1, 1, 2, 2, 3, 10, 0, 0, 1, 1, 2, 2, 3, 12, 10, 10, 0, 0, 1, 1, 2 ; row 16: mode 21
+ db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 10, 0, 0, 1, 1, 2, 2, 3, 10, 0, 0, 1, 1, 2, 2, 3 ; row 17: mode 22
+ db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 12, 0, 0, 1, 1, 2, 2, 3 ; row 18: mode 23
+ db 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4, 0, 1, 1, 2, 2, 3, 3, 4 ; row 19: modes 24, 25
+ db 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4 ; row 20: mode 26 (shuffle only, low 16 bytes used)
+ db 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5 ; row 21: modes 27, 28
+ db 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6 ; row 22: mode 29
+ db 1, 2, 2, 3, 3, 4, 4, 5, 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 2, 3, 3, 4, 4, 5, 5, 6 ; row 23: mode 30
+ db 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 2, 3, 3, 4, 4, 5, 5, 6, 3, 4, 4, 5, 5, 6, 6, 7 ; row 24: modes 31, 32
+ db 1, 2, 2, 3, 3, 4, 4, 5, 2, 3, 3, 4, 4, 5, 5, 6, 3, 4, 4, 5, 5, 6, 6, 7, 4, 5, 5, 6, 6, 7, 7, 8 ; row 25: mode 33
+ db 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8 ; row 26: mode 34 (shuffle only, low 16 bytes used)
+
+all_ang4: db 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 18, 14, 18, 14, 18, 14, 18, 14, 24, 8, 24, 8, 24, 8, 24, 8 ; pmaddubsw weight pairs for all_angs_pred_4x4 (avx2): 32 bytes per row, 8 bytes per output row of 4 pixels, every pair sums to 32; row 0: mode 3
+ db 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10, 1, 31, 1, 31, 1, 31, 1, 31, 12, 20, 12, 20, 12, 20, 12, 20 ; row 1: mode 4
+ db 15, 17, 15, 17, 15, 17, 15, 17, 30, 2, 30, 2, 30, 2, 30, 2, 13, 19, 13, 19, 13, 19, 13, 19, 28, 4, 28, 4, 28, 4, 28, 4 ; row 2: mode 5
+ db 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20 ; row 3: mode 6
+ db 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4 ; row 4: mode 7
+ db 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20 ; row 5: mode 8
+ db 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8 ; row 6: mode 9
+ db 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24 ; row 7: mode 11
+ db 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12 ; row 8: mode 12
+ db 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 27, 5, 27, 5, 27, 5, 27, 5, 4, 28, 4, 28, 4, 28, 4, 28 ; row 9: mode 13
+ db 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12 ; row 10: mode 14
+ db 17, 15, 17, 15, 17, 15, 17, 15, 2, 30, 2, 30, 2, 30, 2, 30, 19, 13, 19, 13, 19, 13, 19, 13, 4, 28, 4, 28, 4, 28, 4, 28 ; row 11: mode 15
+ db 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22, 31, 1, 31, 1, 31, 1, 31, 1, 20, 12, 20, 12, 20, 12, 20, 12 ; row 12: mode 16
+ db 26, 6, 26, 6, 26, 6, 26, 6, 20, 12, 20, 12, 20, 12, 20, 12, 14, 18, 14, 18, 14, 18, 14, 18, 8, 24, 8, 24, 8, 24, 8, 24 ; row 13: mode 17
+ db 26, 6, 26, 6, 26, 6, 26, 6, 20, 12, 20, 12, 20, 12, 20, 12, 14, 18, 14, 18, 14, 18, 14, 18, 8, 24, 8, 24, 8, 24, 8, 24 ; row 14: mode 19 (same weights as row 13)
+ db 21, 11, 21, 11, 21, 11, 21, 11, 10, 22, 10, 22, 10, 22, 10, 22, 31, 1, 31, 1, 31, 1, 31, 1, 20, 12, 20, 12, 20, 12, 20, 12 ; row 15: mode 20
+ db 17, 15, 17, 15, 17, 15, 17, 15, 2, 30, 2, 30, 2, 30, 2, 30, 19, 13, 19, 13, 19, 13, 19, 13, 4, 28, 4, 28, 4, 28, 4, 28 ; row 16: mode 21
+ db 13, 19, 13, 19, 13, 19, 13, 19, 26, 6, 26, 6, 26, 6, 26, 6, 7, 25, 7, 25, 7, 25, 7, 25, 20, 12, 20, 12, 20, 12, 20, 12 ; row 17: mode 22
+ db 9, 23, 9, 23, 9, 23, 9, 23, 18, 14, 18, 14, 18, 14, 18, 14, 27, 5, 27, 5, 27, 5, 27, 5, 4, 28, 4, 28, 4, 28, 4, 28 ; row 18: mode 23
+ db 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12 ; row 19: mode 24
+ db 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24 ; row 20: mode 25
+ db 30, 2, 30, 2, 30, 2, 30, 2, 28, 4, 28, 4, 28, 4, 28, 4, 26, 6, 26, 6, 26, 6, 26, 6, 24, 8, 24, 8, 24, 8, 24, 8 ; row 21: mode 27
+ db 27, 5, 27, 5, 27, 5, 27, 5, 22, 10, 22, 10, 22, 10, 22, 10, 17, 15, 17, 15, 17, 15, 17, 15, 12, 20, 12, 20, 12, 20, 12, 20 ; row 22: mode 28
+ db 23, 9, 23, 9, 23, 9, 23, 9, 14, 18, 14, 18, 14, 18, 14, 18, 5, 27, 5, 27, 5, 27, 5, 27, 28, 4, 28, 4, 28, 4, 28, 4 ; row 23: mode 29
+ db 19, 13, 19, 13, 19, 13, 19, 13, 6, 26, 6, 26, 6, 26, 6, 26, 25, 7, 25, 7, 25, 7, 25, 7, 12, 20, 12, 20, 12, 20, 12, 20 ; row 24: mode 30
+ db 15, 17, 15, 17, 15, 17, 15, 17, 30, 2, 30, 2, 30, 2, 30, 2, 13, 19, 13, 19, 13, 19, 13, 19, 28, 4, 28, 4, 28, 4, 28, 4 ; row 25: mode 31
+ db 11, 21, 11, 21, 11, 21, 11, 21, 22, 10, 22, 10, 22, 10, 22, 10, 1, 31, 1, 31, 1, 31, 1, 31, 12, 20, 12, 20, 12, 20, 12, 20 ; row 26: mode 32
+ db 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 18, 14, 18, 14, 18, 14, 18, 14, 24, 8, 24, 8, 24, 8, 24, 8 ; row 27: mode 33 (same weights as row 0)
+
+
SECTION .text
; global constant
@@ -23012,6 +23070,324 @@
movu [r0 + 2111 * 16], m4
RET
+
+;-----------------------------------------------------------------------------
+; void all_angs_pred_4x4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
+;-----------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal all_angs_pred_4x4, 4, 4, 6 ; x86inc: 4 args, 4 GPRs, 6 vector regs (m0-m5 are used below)
+; Writes modes 2..34 as 16-byte blocks at dest + (mode - 2) * 16. r0 = dest, r1 = refPix; filtPix and bLuma are never read (r2/r3 become table cursors).
+ mova m5, [pw_1024] ; defined outside this hunk; presumably words of 1024, making pmulhrsw a rounded >> 5 -- TODO confirm
+ lea r2, [all_ang4] ; r2 = weight-pair table cursor (row 0)
+ lea r3, [all_ang4_shuff] ; r3 = shuffle-index table cursor (row 0)
+
+; mode 2: shuffle only, no interpolation
+
+ vbroadcasti128 m0, [r1 + 9] ; m0 = refPix[9..24] in both lanes (presumably the left-neighbour run -- TODO confirm layout)
+ mova xm1, xm0
+ psrldq xm1, 1 ; drop one byte so index 0 selects refPix[10]
+ pshufb xm1, [r3]
+ movu [r0], xm1 ; mode 2 block = dest[0..15]
+
+; mode 3: 2-tap interpolation; kept in m1 until mode 4 is ready so both are stored together
+
+ pshufb m1, m0, [r3 + 1 * mmsize] ; gather adjacent reference byte pairs
+ pmaddubsw m1, [r2] ; pair * weights -> 16 words (4 rows x 4 pixels)
+ pmulhrsw m1, m5 ; scale the weighted sums back to pixel range
+
+; mode 4
+
+ pshufb m2, m0, [r3 + 2 * mmsize]
+ pmaddubsw m2, [r2 + 1 * mmsize]
+ pmulhrsw m2, m5
+ packuswb m1, m2 ; per lane: 8 bytes of m1 then 8 bytes of m2
+ vpermq m1, m1, 11011000b ; qword order 0,2,1,3 -> low lane = mode 3, high lane = mode 4
+ movu [r0 + (3 - 2) * 16], m1 ; 32-byte store covers modes 3 and 4
+
+; mode 5 (reuses shuffle row 2 from mode 4)
+
+ pshufb m1, m0, [r3 + 2 * mmsize]
+ pmaddubsw m1, [r2 + 2 * mmsize]
+ pmulhrsw m1, m5
+
+; mode 6
+
+ pshufb m2, m0, [r3 + 3 * mmsize]
+ pmaddubsw m2, [r2 + 3 * mmsize]
+ pmulhrsw m2, m5
+ packuswb m1, m2
+ vpermq m1, m1, 11011000b
+ movu [r0 + (5 - 2) * 16], m1 ; modes 5 and 6
+
+ add r3, 4 * mmsize ; shuffle cursor -> row 4
+ add r2, 4 * mmsize ; weight cursor -> row 4
+
+; mode 7
+
+ pshufb m1, m0, [r3 + 0 * mmsize]
+ pmaddubsw m1, [r2 + 0 * mmsize]
+ pmulhrsw m1, m5
+
+; mode 8
+
+ pshufb m2, m0, [r3 + 1 * mmsize]
+ pmaddubsw m2, [r2 + 1 * mmsize]
+ pmulhrsw m2, m5
+ packuswb m1, m2
+ vpermq m1, m1, 11011000b
+ movu [r0 + (7 - 2) * 16], m1 ; modes 7 and 8
+
+; mode 9 (reuses shuffle row 5 from mode 8); stored alone because mode 10 is not interpolated
+
+ pshufb m1, m0, [r3 + 1 * mmsize]
+ pmaddubsw m1, [r2 + 2 * mmsize]
+ pmulhrsw m1, m5
+ packuswb m1, m1
+ vpermq m1, m1, 11011000b ; low lane now holds all 16 bytes of mode 9
+ movu [r0 + (9 - 2) * 16], xm1
+
+; mode 10: shuffle only, then a 4-byte fix-up computed from refPix[0..4] and refPix[9]
+
+ pshufb xm1, xm0, [r3 + 2 * mmsize]
+ movu [r0 + (10 - 2) * 16], xm1
+; NOTE(review): the fix-up below runs unconditionally; looks like HEVC edge smoothing for the horizontal mode -- confirm bLuma can be ignored for 4x4
+ pxor xm1, xm1 ; zero: unpack filler and all-zero shuffle mask (byte-0 broadcast)
+ movd xm2, [r1 + 1] ; refPix[1..4]
+ pshufd xm3, xm2, 0
+ punpcklbw xm3, xm1 ; refPix[1..4] widened to words
+ pinsrb xm2, [r1], 0 ; byte 0 = refPix[0]
+ pshufb xm4, xm2, xm1 ; broadcast refPix[0]
+ punpcklbw xm4, xm1
+ psubw xm3, xm4 ; refPix[1 + i] - refPix[0]
+ psraw xm3, 1 ; arithmetic >> 1 keeps the sign
+ pshufb xm4, xm0, xm1 ; broadcast byte 0 of m0 = refPix[9]
+ punpcklbw xm4, xm1
+ paddw xm3, xm4 ; refPix[9] + ((refPix[1 + i] - refPix[0]) >> 1)
+ packuswb xm3, xm1 ; saturate to 0..255
+
+ pextrb [r0 + 128], xm3, 0 ; 128 = (10 - 2) * 16: overwrite bytes 0, 4, 8, 12 of the mode 10 block
+ pextrb [r0 + 132], xm3, 1
+ pextrb [r0 + 136], xm3, 2
+ pextrb [r0 + 140], xm3, 3
+
+; mode 11
+
+ vbroadcasti128 m0, [r1] ; m0 = refPix[0..15] in both lanes for all remaining modes
+ pshufb m1, m0, [r3 + 3 * mmsize] ; shuffle row 7
+ pmaddubsw m1, [r2 + 3 * mmsize] ; weight row 7
+ pmulhrsw m1, m5
+
+; mode 12 (reuses shuffle row 7 from mode 11)
+
+ add r2, 4 * mmsize ; weight cursor -> row 8
+
+ pshufb m2, m0, [r3 + 3 * mmsize]
+ pmaddubsw m2, [r2 + 0 * mmsize]
+ pmulhrsw m2, m5
+ packuswb m1, m2
+ vpermq m1, m1, 11011000b
+ movu [r0 + (11 - 2) * 16], m1 ; modes 11 and 12
+
+; mode 13
+
+ add r3, 4 * mmsize ; shuffle cursor -> row 8
+
+ pshufb m1, m0, [r3 + 0 * mmsize]
+ pmaddubsw m1, [r2 + 1 * mmsize]
+ pmulhrsw m1, m5
+
+; mode 14
+
+ pshufb m2, m0, [r3 + 1 * mmsize]
+ pmaddubsw m2, [r2 + 2 * mmsize]
+ pmulhrsw m2, m5
+ packuswb m1, m2
+ vpermq m1, m1, 11011000b
+ movu [r0 + (13 - 2) * 16], m1 ; modes 13 and 14
+
+; mode 15
+
+ pshufb m1, m0, [r3 + 2 * mmsize]
+ pmaddubsw m1, [r2 + 3 * mmsize]
+ pmulhrsw m1, m5
+
+; mode 16
+
+ add r2, 4 * mmsize ; weight cursor -> row 12
+
+ pshufb m2, m0, [r3 + 3 * mmsize]
+ pmaddubsw m2, [r2 + 0 * mmsize]
+ pmulhrsw m2, m5
+ packuswb m1, m2
+ vpermq m1, m1, 11011000b
+ movu [r0 + (15 - 2) * 16], m1 ; modes 15 and 16
+
+; mode 17: packed into the low lane; mode 18 fills the high lane below
+
+ add r3, 4 * mmsize ; shuffle cursor -> row 12
+
+ pshufb m1, m0, [r3 + 0 * mmsize]
+ pmaddubsw m1, [r2 + 1 * mmsize]
+ pmulhrsw m1, m5
+ packuswb m1, m1
+ vpermq m1, m1, 11011000b ; low lane = 16 bytes of mode 17
+
+; mode 18: shuffle only
+
+ pshufb m2, m0, [r3 + 1 * mmsize]
+ vinserti128 m1, m1, xm2, 1 ; high lane = mode 18
+ movu [r0 + (17 - 2) * 16], m1 ; modes 17 and 18
+
+; mode 19
+
+ pshufb m1, m0, [r3 + 2 * mmsize]
+ pmaddubsw m1, [r2 + 2 * mmsize]
+ pmulhrsw m1, m5
+
+; mode 20
+
+ pshufb m2, m0, [r3 + 3 * mmsize]
+ pmaddubsw m2, [r2 + 3 * mmsize]
+ pmulhrsw m2, m5
+ packuswb m1, m2
+ vpermq m1, m1, 11011000b
+ movu [r0 + (19 - 2) * 16], m1 ; modes 19 and 20
+
+; mode 21
+
+ add r2, 4 * mmsize ; weight cursor -> row 16
+ add r3, 4 * mmsize ; shuffle cursor -> row 16
+
+ pshufb m1, m0, [r3 + 0 * mmsize]
+ pmaddubsw m1, [r2 + 0 * mmsize]
+ pmulhrsw m1, m5
+
+; mode 22
+
+ pshufb m2, m0, [r3 + 1 * mmsize]
+ pmaddubsw m2, [r2 + 1 * mmsize]
+ pmulhrsw m2, m5
+ packuswb m1, m2
+ vpermq m1, m1, 11011000b
+ movu [r0 + (21 - 2) * 16], m1 ; modes 21 and 22
+
+; mode 23
+
+ pshufb m1, m0, [r3 + 2 * mmsize]
+ pmaddubsw m1, [r2 + 2 * mmsize]
+ pmulhrsw m1, m5
+
+; mode 24
+
+ pshufb m2, m0, [r3 + 3 * mmsize]
+ pmaddubsw m2, [r2 + 3 * mmsize]
+ pmulhrsw m2, m5
+ packuswb m1, m2
+ vpermq m1, m1, 11011000b
+ movu [r0 + (23 - 2) * 16], m1 ; modes 23 and 24
+
+; mode 25 (reuses shuffle row 19 from mode 24); stored alone because mode 26 is not interpolated
+
+ add r2, 4 * mmsize ; weight cursor -> row 20
+
+ pshufb m1, m0, [r3 + 3 * mmsize]
+ pmaddubsw m1, [r2 + 0 * mmsize]
+ pmulhrsw m1, m5
+ packuswb m1, m1
+ vpermq m1, m1, 11011000b
+ movu [r0 + (25 - 2) * 16], xm1
+
+; mode 26: shuffle only, then a 4-byte fix-up mirroring mode 10 with the two reference runs swapped
+
+ add r3, 4 * mmsize ; shuffle cursor -> row 20
+
+ pshufb xm1, xm0, [r3 + 0 * mmsize]
+ movu [r0 + (26 - 2) * 16], xm1
+; NOTE(review): applied unconditionally, same as the mode 10 fix-up -- confirm against the C reference
+ pxor xm1, xm1 ; zero: unpack filler and all-zero shuffle mask
+ movd xm2, [r1 + 9] ; refPix[9..12]
+ pshufd xm3, xm2, 0
+ punpcklbw xm3, xm1 ; refPix[9..12] widened to words
+ pinsrb xm4, [r1 + 0], 0 ; only byte 0 matters; the next pshufb broadcasts it
+ pshufb xm4, xm1 ; broadcast refPix[0]
+ punpcklbw xm4, xm1
+ psubw xm3, xm4 ; refPix[9 + i] - refPix[0]
+ psraw xm3, 1
+ psrldq xm2, xm0, 1 ; byte 0 = refPix[1]
+ pshufb xm2, xm1 ; broadcast refPix[1]
+ punpcklbw xm2, xm1
+ paddw xm3, xm2 ; refPix[1] + ((refPix[9 + i] - refPix[0]) >> 1)
+ packuswb xm3, xm1 ; saturate to 0..255
+
+ pextrb [r0 + 384], xm3, 0 ; 384 = (26 - 2) * 16: overwrite bytes 0, 4, 8, 12 of the mode 26 block
+ pextrb [r0 + 388], xm3, 1
+ pextrb [r0 + 392], xm3, 2
+ pextrb [r0 + 396], xm3, 3
+
+; mode 27
+
+ pshufb m1, m0, [r3 + 1 * mmsize]
+ pmaddubsw m1, [r2 + 1 * mmsize]
+ pmulhrsw m1, m5
+
+; mode 28 (reuses shuffle row 21 from mode 27)
+
+ pshufb m2, m0, [r3 + 1 * mmsize]
+ pmaddubsw m2, [r2 + 2 * mmsize]
+ pmulhrsw m2, m5
+ packuswb m1, m2
+ vpermq m1, m1, 11011000b
+ movu [r0 + (27 - 2) * 16], m1 ; modes 27 and 28
+
+; mode 29
+
+ pshufb m1, m0, [r3 + 2 * mmsize]
+ pmaddubsw m1, [r2 + 3 * mmsize]
+ pmulhrsw m1, m5
+
+; mode 30
+
+ add r2, 4 * mmsize ; weight cursor -> row 24
+
+ pshufb m2, m0, [r3 + 3 * mmsize]
+ pmaddubsw m2, [r2 + 0 * mmsize]
+ pmulhrsw m2, m5
+ packuswb m1, m2
+ vpermq m1, m1, 11011000b
+ movu [r0 + (29 - 2) * 16], m1 ; modes 29 and 30
+
+; mode 31
+
+ add r3, 4 * mmsize ; shuffle cursor -> row 24
+
+ pshufb m1, m0, [r3 + 0 * mmsize]
+ pmaddubsw m1, [r2 + 1 * mmsize]
+ pmulhrsw m1, m5
+
+; mode 32 (reuses shuffle row 24 from mode 31)
+
+ pshufb m2, m0, [r3 + 0 * mmsize]
+ pmaddubsw m2, [r2 + 2 * mmsize]
+ pmulhrsw m2, m5
+ packuswb m1, m2
+ vpermq m1, m1, 11011000b
+ movu [r0 + (31 - 2) * 16], m1 ; modes 31 and 32
+
+; mode 33: packed into the low lane; mode 34 fills the high lane below
+
+ pshufb m1, m0, [r3 + 1 * mmsize]
+ pmaddubsw m1, [r2 + 3 * mmsize]
+ pmulhrsw m1, m5
+ packuswb m1, m2 ; m2 is stale mode 32 data; its bytes end up in the high lane and are replaced below
+ vpermq m1, m1, 11011000b ; low lane = 16 bytes of mode 33
+
+; mode 34: shuffle only
+
+ pshufb m0, [r3 + 2 * mmsize]
+ vinserti128 m1, m1, xm0, 1 ; high lane = mode 34
+ movu [r0 + (33 - 2) * 16], m1 ; modes 33 and 34; last byte written is dest[527]
+ RET
+
;-----------------------------------------------------------------------------
; void all_angs_pred_4x4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
;-----------------------------------------------------------------------------
More information about the x265-devel
mailing list