[x265] [PATCH 20 of 29] 16bpp: cleanup intra_ang8x8, intra_ang16x16 and intra_ang32x32 older asm code
dnyaneshwar at multicorewareinc.com
Tue Jan 13 08:11:28 CET 2015
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1420528688 -19800
# Tue Jan 06 12:48:08 2015 +0530
# Node ID 063d9417105a345b169129c3bb2a259d6c7e8b06
# Parent 10b8acec46ea4cdc43023b95e379588e86438601
16bpp: cleanup intra_ang8x8, intra_ang16x16 and intra_ang32x32 older asm code
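For context (not part of the patch): the interpolating kernels removed below all evaluate the standard HEVC angular blend — one pmaddwd against a (32 - fract, fract) coefficient pair loaded from ang_table, a pd_16 rounding add, and a 5-bit right shift per sample — and the asm-primitives.cpp hunk simply drops the old SETUP_INTRA_ANG bindings so only the intra_pred_new entries remain. A minimal C sketch of that per-sample math, using an illustrative helper name rather than the x265 API:

    #include <stdint.h>

    /* 2-tap angular interpolation performed by the removed intra_ang
     * kernels (16bpp): blend two adjacent reference samples with the
     * fractional weight, round with +16 and shift right by 5. */
    static inline uint16_t ang_sample(const uint16_t *ref, int idx, int fract)
    {
        return (uint16_t)(((32 - fract) * ref[idx] + fract * ref[idx + 1] + 16) >> 5);
    }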
diff -r 10b8acec46ea -r 063d9417105a source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Jan 13 11:38:04 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Jan 06 12:48:08 2015 +0530
@@ -953,11 +953,6 @@
p.intra_pred_new[mode][BLOCK_16x16] = x265_intra_pred_ang16_ ## fno ## _new_ ## cpu; \
p.intra_pred_new[mode][BLOCK_32x32] = x265_intra_pred_ang32_ ## fno ## _new_ ## cpu;
-#define SETUP_INTRA_ANG(mode, fno, cpu) \
- p.intra_pred[mode][BLOCK_8x8] = x265_intra_pred_ang8_ ## fno ## _ ## cpu; \
- p.intra_pred[mode][BLOCK_16x16] = x265_intra_pred_ang16_ ## fno ## _ ## cpu; \
- p.intra_pred[mode][BLOCK_32x32] = x265_intra_pred_ang32_ ## fno ## _ ## cpu;
-
#define SETUP_INTRA_ANG_HIGH(mode, fno, cpu) \
p.intra_pred_new[mode][BLOCK_8x8] = x265_intra_pred_ang8_ ## fno ## _new_ ## cpu; \
p.intra_pred_new[mode][BLOCK_16x16] = x265_intra_pred_ang16_ ## fno ## _new_ ## cpu; \
@@ -1021,21 +1016,6 @@
SETUP_INTRA_ANG_COMMON_NEW(18, 18, cpu);
#define INTRA_ANG_SSE4_HIGH(cpu) \
- SETUP_INTRA_ANG(19, 19, cpu); \
- SETUP_INTRA_ANG(20, 20, cpu); \
- SETUP_INTRA_ANG(21, 21, cpu); \
- SETUP_INTRA_ANG(22, 22, cpu); \
- SETUP_INTRA_ANG(23, 23, cpu); \
- SETUP_INTRA_ANG(24, 24, cpu); \
- SETUP_INTRA_ANG(25, 25, cpu); \
- SETUP_INTRA_ANG(26, 26, cpu); \
- SETUP_INTRA_ANG(27, 27, cpu); \
- SETUP_INTRA_ANG(28, 28, cpu); \
- SETUP_INTRA_ANG(29, 29, cpu); \
- SETUP_INTRA_ANG(30, 30, cpu); \
- SETUP_INTRA_ANG(31, 31, cpu); \
- SETUP_INTRA_ANG(32, 32, cpu); \
- SETUP_INTRA_ANG(33, 33, cpu); \
SETUP_INTRA_ANG4(19, 17, cpu); \
SETUP_INTRA_ANG4(20, 16, cpu); \
SETUP_INTRA_ANG4(21, 15, cpu); \
diff -r 10b8acec46ea -r 063d9417105a source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Tue Jan 13 11:38:04 2015 +0530
+++ b/source/common/x86/intrapred16.asm Tue Jan 06 12:48:08 2015 +0530
@@ -1277,3660 +1277,6 @@
movh [r0], m0
RET
-;-----------------------------------------------------------------------------
-; void intraPredAng8(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;-----------------------------------------------------------------------------
-INIT_XMM ssse3
-cglobal intra_pred_ang8_2, 3,4,3
- cmp r4m, byte 34
- cmove r2, r3mp
- add r1, r1
- lea r3, [r1 * 3]
- movu m0, [r2 + 4]
- movu m1, [r2 + 20]
- movu [r0], m0
- palignr m2, m1, m0, 2
- movu [r0 + r1], m2
- palignr m2, m1, m0, 4
- movu [r0 + r1 * 2], m2
- palignr m2, m1, m0, 6
- movu [r0 + r3], m2
- lea r0, [r0 + r1 * 4]
- palignr m2, m1, m0, 8
- movu [r0], m2
- palignr m2, m1, m0, 10
- movu [r0 + r1], m2
- palignr m2, m1, m0, 12
- movu [r0 + r1 * 2], m2
- palignr m1, m0, 14
- movu [r0 + r3], m1
- RET
-
-INIT_XMM sse4
-cglobal intra_pred_ang8_3, 3,5,8
- lea r3, [ang_table + 14 * 16]
- add r1, r1
-
- movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
- movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
- palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2]
- psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10]
-
- punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1]
- punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5]
- punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9]
- punpckhwd m1, m4 ; [x 16 16 15 15 14 14 13]
-
- mova m4, m3
- pmaddwd m4, [r3 + 12 * 16] ; [26]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r3 + 12 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- palignr m2, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
- pmaddwd m2, [r3 + 6 * 16] ; [20]
- paddd m2, [pd_16]
- psrld m2, 5
- palignr m6, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
- pmaddwd m6, [r3 + 6 * 16]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m2, m6
-
- palignr m6, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
- pmaddwd m6, [r3] ; [14]
- paddd m6, [pd_16]
- psrld m6, 5
- palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
- pmaddwd m7, [r3]
- paddd m7, [pd_16]
- psrld m7, 5
- packusdw m6, m7
-
- palignr m7, m0, m3, 12 ; [8 7 7 6 6 5 5 4]
- pmaddwd m7, [r3 - 6 * 16] ; [ 8]
- paddd m7, [pd_16]
- psrld m7, 5
- palignr m3, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
- pmaddwd m3, [r3 - 6 * 16]
- paddd m3, [pd_16]
- psrld m3, 5
- packusdw m7, m3
-
- punpckhwd m3, m4, m2
- punpcklwd m4, m2
- punpckhwd m2, m6, m7
- punpcklwd m6, m7
-
- punpckldq m7, m4, m6
- punpckhdq m4, m6
- punpckldq m6, m3, m2
- punpckhdq m3, m2
-
- lea r4, [r1 * 3]
- movh [r0], m7
- movhps [r0 + r1], m7
- movh [r0 + r1 * 2], m4
- movhps [r0 + r4], m4
- lea r2, [r0 + r1 * 4]
- movh [r2], m6
- movhps [r2 + r1], m6
- movh [r2 + r1 * 2], m3
- movhps [r2 + r4], m3
-
- mova m4, m0
- pmaddwd m4, [r3 - 12 * 16] ; [ 2]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m5
- pmaddwd m2, [r3 - 12 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- mova m2, m0
- pmaddwd m2, [r3 + 14 * 16] ; [28]
- paddd m2, [pd_16]
- psrld m2, 5
- mova m6, m5
- pmaddwd m6, [r3 + 14 * 16]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m2, m6
-
- palignr m6, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
- pmaddwd m6, [r3 + 8 * 16] ; [22]
- paddd m6, [pd_16]
- psrld m6, 5
- palignr m7, m1, m5, 4 ; [14 13 13 12 12 11 11 10]
- pmaddwd m7, [r3 + 8 * 16]
- paddd m7, [pd_16]
- psrld m7, 5
- packusdw m6, m7
-
- palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
- pmaddwd m7, [r3 + 2 * 16] ; [16]
- paddd m7, [pd_16]
- psrld m7, 5
- palignr m1, m5, 8 ; [15 14 14 13 13 12 12 11]
- pmaddwd m1, [r3 + 2 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m7, m1
-
- punpckhwd m3, m4, m2
- punpcklwd m4, m2
- punpckhwd m2, m6, m7
- punpcklwd m6, m7
-
- punpckldq m7, m4, m6
- punpckhdq m4, m6
- punpckldq m6, m3, m2
- punpckhdq m3, m2
-
- movh [r0 + 8], m7
- movhps [r0 + r1 + 8], m7
- movh [r0 + r1 * 2 + 8], m4
- movhps [r0 + r4 + 8], m4
- lea r0, [r0 + r1 * 4]
- movh [r0 + 8], m6
- movhps [r0 + r1 + 8], m6
- movh [r0 + r1 * 2 + 8], m3
- movhps [r0 + r4 + 8], m3
-
- RET
-
-cglobal intra_pred_ang8_4, 3,6,8
- lea r3, [ang_table + 19 * 16]
- add r1, r1
-
- movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
- movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
- palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2]
- psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10]
-
- punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1]
- punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5]
- punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9]
-
- mova m4, m3
- pmaddwd m4, [r3 + 2 * 16] ; [21]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r3 + 2 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- palignr m2, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
- mova m6, m2
- pmaddwd m2, [r3 - 9 * 16] ; [10]
- paddd m2, [pd_16]
- psrld m2, 5
- palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
- mova m7, m1
- pmaddwd m1, [r3 - 9 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m2, m1
-
- pmaddwd m6, [r3 + 12 * 16] ; [31]
- paddd m6, [pd_16]
- psrld m6, 5
- pmaddwd m7, [r3 + 12 * 16]
- paddd m7, [pd_16]
- psrld m7, 5
- packusdw m6, m7
-
- palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
- pmaddwd m7, [r3 + 1 * 16] ; [20]
- paddd m7, [pd_16]
- psrld m7, 5
- palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
- pmaddwd m1, [r3 + 1 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m7, m1
-
- punpckhwd m1, m4, m2
- punpcklwd m4, m2
- punpckhwd m2, m6, m7
- punpcklwd m6, m7
-
- punpckldq m7, m4, m6
- punpckhdq m4, m6
- punpckldq m6, m1, m2
- punpckhdq m1, m2
-
- lea r4, [r1 * 3]
- movh [r0], m7
- movhps [r0 + r1], m7
- movh [r0 + r1 * 2], m4
- movhps [r0 + r4], m4
- lea r5, [r0 + r1 * 4]
- movh [r5], m6
- movhps [r5 + r1], m6
- movh [r5 + r1 * 2], m1
- movhps [r5 + r4], m1
-
- palignr m4, m0, m3, 12 ; [8 7 7 6 6 5 5 4]
- mova m2, m4
- pmaddwd m4, [r3 - 10 * 16] ; [ 9]
- paddd m4, [pd_16]
- psrld m4, 5
- palignr m3, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
- mova m6, m3
- pmaddwd m3, [r3 - 10 * 16]
- paddd m3, [pd_16]
- psrld m3, 5
- packusdw m4, m3
-
- pmaddwd m2, [r3 + 11 * 16] ; [30]
- paddd m2, [pd_16]
- psrld m2, 5
- pmaddwd m6, [r3 + 11 * 16]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m2, m6
-
- mova m6, m0
- pmaddwd m6, [r3] ; [19]
- paddd m6, [pd_16]
- psrld m6, 5
- mova m7, m5
- pmaddwd m7, [r3]
- paddd m7, [pd_16]
- psrld m7, 5
- packusdw m6, m7
-
- movh m1, [r2 + 26] ; [16 15 14 13]
- palignr m7, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
- pmaddwd m7, [r3 - 11 * 16] ; [8]
- paddd m7, [pd_16]
- psrld m7, 5
- palignr m1, m5, 4 ; [14 13 13 12 12 11 11 10]
- pmaddwd m1, [r3 - 11 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m7, m1
-
- punpckhwd m3, m4, m2
- punpcklwd m4, m2
- punpckhwd m2, m6, m7
- punpcklwd m6, m7
-
- punpckldq m7, m4, m6
- punpckhdq m4, m6
- punpckldq m6, m3, m2
- punpckhdq m3, m2
-
- movh [r0 + 8], m7
- movhps [r0 + r1 + 8], m7
- movh [r0 + r1 * 2 + 8], m4
- movhps [r0 + r4 + 8], m4
- lea r0, [r0 + r1 * 4]
- movh [r0 + 8], m6
- movhps [r0 + r1 + 8], m6
- movh [r0 + r1 * 2 + 8], m3
- movhps [r0 + r4 + 8], m3
-
- RET
-
-cglobal intra_pred_ang8_5, 3,5,8
- lea r3, [ang_table + 13 * 16]
- add r1, r1
-
- movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
- movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
- palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2]
- psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10]
-
- punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1]
- punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5]
- punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9]
-
- mova m4, m3
- pmaddwd m4, [r3 + 4 * 16] ; [17]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r3 + 4 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- palignr m2, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
- mova m6, m2
- pmaddwd m2, [r3 - 11 * 16] ; [2]
- paddd m2, [pd_16]
- psrld m2, 5
- palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
- mova m7, m1
- pmaddwd m1, [r3 - 11 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m2, m1
-
- pmaddwd m6, [r3 + 6 * 16] ; [19]
- paddd m6, [pd_16]
- psrld m6, 5
- pmaddwd m7, [r3 + 6 * 16]
- paddd m7, [pd_16]
- psrld m7, 5
- packusdw m6, m7
-
- palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
- pmaddwd m7, [r3 - 9 * 16] ; [4]
- paddd m7, [pd_16]
- psrld m7, 5
- palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
- pmaddwd m1, [r3 - 9 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m7, m1
-
- punpckhwd m1, m4, m2
- punpcklwd m4, m2
- punpckhwd m2, m6, m7
- punpcklwd m6, m7
-
- punpckldq m7, m4, m6
- punpckhdq m4, m6
- punpckldq m6, m1, m2
- punpckhdq m1, m2
-
- lea r4, [r1 * 3]
- movh [r0], m7
- movhps [r0 + r1], m7
- movh [r0 + r1 * 2], m4
- movhps [r0 + r4], m4
- lea r2, [r0 + r1 * 4]
- movh [r2], m6
- movhps [r2 + r1], m6
- movh [r2 + r1 * 2], m1
- movhps [r2 + r4], m1
-
- palignr m4, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
- pmaddwd m4, [r3 + 8 * 16] ; [21]
- paddd m4, [pd_16]
- psrld m4, 5
- palignr m2, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
- pmaddwd m2, [r3 + 8 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- palignr m2, m0, m3, 12 ; [8 7 7 6 6 5 5 4]
- mova m6, m2
- pmaddwd m2, [r3 - 7 * 16] ; [6]
- paddd m2, [pd_16]
- psrld m2, 5
- palignr m1, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
- mova m7, m1
- pmaddwd m1, [r3 - 7 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m2, m1
-
- pmaddwd m6, [r3 + 10 * 16] ; [23]
- paddd m6, [pd_16]
- psrld m6, 5
- pmaddwd m7, [r3 + 10 * 16]
- paddd m7, [pd_16]
- psrld m7, 5
- packusdw m6, m7
-
- mova m7, m0
- pmaddwd m7, [r3 - 5 * 16] ; [8]
- paddd m7, [pd_16]
- psrld m7, 5
- mova m1, m5
- pmaddwd m1, [r3 - 5 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m7, m1
-
- punpckhwd m3, m4, m2
- punpcklwd m4, m2
- punpckhwd m2, m6, m7
- punpcklwd m6, m7
-
- punpckldq m7, m4, m6
- punpckhdq m4, m6
- punpckldq m6, m3, m2
- punpckhdq m3, m2
-
- movh [r0 + 8], m7
- movhps [r0 + r1 + 8], m7
- movh [r0 + r1 * 2 + 8], m4
- movhps [r0 + r4 + 8], m4
- lea r0, [r0 + r1 * 4]
- movh [r0 + 8], m6
- movhps [r0 + r1 + 8], m6
- movh [r0 + r1 * 2 + 8], m3
- movhps [r0 + r4 + 8], m3
-
- RET
-
-cglobal intra_pred_ang8_6, 3,5,8
- lea r3, [ang_table + 14 * 16]
- add r1, r1
-
- movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
- movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
- palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2]
- psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10]
-
- punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1]
- punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5]
- punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9]
-
- mova m4, m3
- pmaddwd m4, [r3 - 1 * 16] ; [13]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r3 - 1 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- mova m2, m3
- pmaddwd m2, [r3 + 12 * 16] ; [26]
- paddd m2, [pd_16]
- psrld m2, 5
- mova m1, m0
- pmaddwd m1, [r3 + 12 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m2, m1
-
- palignr m6, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
- mova m7, m6
- pmaddwd m6, [r3 - 7 * 16] ; [7]
- paddd m6, [pd_16]
- psrld m6, 5
- palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
- pmaddwd m1, [r3 - 7 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- pmaddwd m7, [r3 + 6 * 16] ; [20]
- paddd m7, [pd_16]
- psrld m7, 5
- palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
- pmaddwd m1, [r3 + 6 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m7, m1
-
- punpckhwd m1, m4, m2
- punpcklwd m4, m2
- punpckhwd m2, m6, m7
- punpcklwd m6, m7
-
- punpckldq m7, m4, m6
- punpckhdq m4, m6
- punpckldq m6, m1, m2
- punpckhdq m1, m2
-
- lea r4, [r1 * 3]
- movh [r0], m7
- movhps [r0 + r1], m7
- movh [r0 + r1 * 2], m4
- movhps [r0 + r4], m4
- lea r2, [r0 + r1 * 4]
- movh [r2], m6
- movhps [r2 + r1], m6
- movh [r2 + r1 * 2], m1
- movhps [r2 + r4], m1
-
- palignr m4, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
- mova m6, m4
- pmaddwd m4, [r3 - 13 * 16] ; [1]
- paddd m4, [pd_16]
- psrld m4, 5
- palignr m2, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
- mova m7, m2
- pmaddwd m2, [r3 - 13 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- pmaddwd m2, m6, [r3] ; [14]
- paddd m2, [pd_16]
- psrld m2, 5
- pmaddwd m1, m7, [r3]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m2, m1
-
- pmaddwd m6, [r3 + 13 * 16] ; [27]
- paddd m6, [pd_16]
- psrld m6, 5
- pmaddwd m7, [r3 + 13 * 16]
- paddd m7, [pd_16]
- psrld m7, 5
- packusdw m6, m7
-
- palignr m7, m0, m3, 12 ; [8 7 7 6 6 5 5 4]
- pmaddwd m7, [r3 - 6 * 16] ; [8]
- paddd m7, [pd_16]
- psrld m7, 5
- palignr m5, m0, 12 ; [12 11 11 10 10 9 9 8]
- pmaddwd m5, [r3 - 6 * 16]
- paddd m5, [pd_16]
- psrld m5, 5
- packusdw m7, m5
-
- punpckhwd m3, m4, m2
- punpcklwd m4, m2
- punpckhwd m2, m6, m7
- punpcklwd m6, m7
-
- punpckldq m7, m4, m6
- punpckhdq m4, m6
- punpckldq m6, m3, m2
- punpckhdq m3, m2
-
- movh [r0 + 8], m7
- movhps [r0 + r1 + 8], m7
- movh [r0 + r1 * 2 + 8], m4
- movhps [r0 + r4 + 8], m4
- lea r0, [r0 + r1 * 4]
- movh [r0 + 8], m6
- movhps [r0 + r1 + 8], m6
- movh [r0 + r1 * 2 + 8], m3
- movhps [r0 + r4 + 8], m3
-
- RET
-
-cglobal intra_pred_ang8_7, 3,5,8
- lea r3, [ang_table + 18 * 16]
- add r1, r1
-
- movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
- movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
- palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2]
- psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10]
-
- punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1]
- punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5]
- punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9]
-
- mova m4, m3
- pmaddwd m4, [r3 - 9 * 16] ; [9]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r3 - 9 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- mova m2, m3
- pmaddwd m2, [r3] ; [18]
- paddd m2, [pd_16]
- psrld m2, 5
- mova m1, m0
- pmaddwd m1, [r3]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m2, m1
-
- mova m6, m3
- pmaddwd m6, [r3 + 9 * 16] ; [27]
- paddd m6, [pd_16]
- psrld m6, 5
- mova m1, m0
- pmaddwd m1, [r3 + 9 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- palignr m7, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
- pmaddwd m7, [r3 - 14 * 16] ; [4]
- paddd m7, [pd_16]
- psrld m7, 5
- palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
- pmaddwd m1, [r3 - 14 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m7, m1
-
- punpckhwd m1, m4, m2
- punpcklwd m4, m2
- punpckhwd m2, m6, m7
- punpcklwd m6, m7
-
- punpckldq m7, m4, m6
- punpckhdq m4, m6
- punpckldq m6, m1, m2
- punpckhdq m1, m2
-
- lea r4, [r1 * 3]
- movh [r0], m7
- movhps [r0 + r1], m7
- movh [r0 + r1 * 2], m4
- movhps [r0 + r4], m4
- lea r2, [r0 + r1 * 4]
- movh [r2], m6
- movhps [r2 + r1], m6
- movh [r2 + r1 * 2], m1
- movhps [r2 + r4], m1
-
- palignr m4, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
- mova m6, m4
- pmaddwd m4, [r3 - 5 * 16] ; [13]
- paddd m4, [pd_16]
- psrld m4, 5
- palignr m2, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
- mova m7, m2
- pmaddwd m2, [r3 - 5 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- pmaddwd m2, m6, [r3 + 4 * 16] ; [22]
- paddd m2, [pd_16]
- psrld m2, 5
- pmaddwd m1, m7, [r3 + 4 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m2, m1
-
- pmaddwd m6, [r3 + 13 * 16] ; [31]
- paddd m6, [pd_16]
- psrld m6, 5
- pmaddwd m7, [r3 + 13 * 16]
- paddd m7, [pd_16]
- psrld m7, 5
- packusdw m6, m7
-
- palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
- pmaddwd m7, [r3 - 10 * 16] ; [8]
- paddd m7, [pd_16]
- psrld m7, 5
- palignr m5, m0, 8 ; [11 10 10 9 9 8 8 7]
- pmaddwd m5, [r3 - 10 * 16]
- paddd m5, [pd_16]
- psrld m5, 5
- packusdw m7, m5
-
- punpckhwd m3, m4, m2
- punpcklwd m4, m2
- punpckhwd m2, m6, m7
- punpcklwd m6, m7
-
- punpckldq m7, m4, m6
- punpckhdq m4, m6
- punpckldq m6, m3, m2
- punpckhdq m3, m2
-
- movh [r0 + 8], m7
- movhps [r0 + r1 + 8], m7
- movh [r0 + r1 * 2 + 8], m4
- movhps [r0 + r4 + 8], m4
- lea r0, [r0 + r1 * 4]
- movh [r0 + 8], m6
- movhps [r0 + r1 + 8], m6
- movh [r0 + r1 * 2 + 8], m3
- movhps [r0 + r4 + 8], m3
-
- RET
-
-cglobal intra_pred_ang8_8, 3,6,7
- lea r3, [ang_table + 17 * 16]
- add r1, r1
-
- movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
- movu m1, [r2 + 4] ; [9 8 7 6 5 4 3 2]
-
- punpcklwd m3, m0, m1 ; [5 4 4 3 3 2 2 1]
- punpckhwd m0, m1 ; [9 8 8 7 7 6 6 5]
-
- mova m4, m3
- pmaddwd m4, [r3 - 12 * 16] ; [5]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r3 - 12 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- mova m2, m3
- pmaddwd m2, [r3 - 7 * 16] ; [10]
- paddd m2, [pd_16]
- psrld m2, 5
- mova m1, m0
- pmaddwd m1, [r3 - 7 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m2, m1
-
- mova m6, m3
- pmaddwd m6, [r3 - 2 * 16] ; [15]
- paddd m6, [pd_16]
- psrld m6, 5
- mova m1, m0
- pmaddwd m1, [r3 - 2 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- mova m5, m3
- pmaddwd m5, [r3 + 3 * 16] ; [20]
- paddd m5, [pd_16]
- psrld m5, 5
- mova m1, m0
- pmaddwd m1, [r3 + 3 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m5, m1
-
- punpckhwd m1, m4, m2
- punpcklwd m4, m2
- punpckhwd m2, m6, m5
- punpcklwd m6, m5
-
- punpckldq m5, m4, m6
- punpckhdq m4, m6
- punpckldq m6, m1, m2
- punpckhdq m1, m2
-
- lea r4, [r1 * 3]
- movh [r0], m5
- movhps [r0 + r1], m5
- movh [r0 + r1 * 2], m4
- movhps [r0 + r4], m4
- lea r5, [r0 + r1 * 4]
- movh [r5], m6
- movhps [r5 + r1], m6
- movh [r5 + r1 * 2], m1
- movhps [r5 + r4], m1
-
- mova m4, m3
- pmaddwd m4, [r3 + 8 * 16] ; [25]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r3 + 8 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- mova m2, m3
- pmaddwd m2, [r3 + 13 * 16] ; [30]
- paddd m2, [pd_16]
- psrld m2, 5
- mova m1, m0
- pmaddwd m1, [r3 + 13 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m2, m1
-
- movh m1, [r2 + 18] ; [12 11 10 9]
-
- palignr m6, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
- mova m5, m6
- pmaddwd m6, [r3 - 14 * 16] ; [3]
- paddd m6, [pd_16]
- psrld m6, 5
- palignr m1, m0, 4 ; [10 9 9 8 8 7 7 6]
- mova m3, m1
- pmaddwd m1, [r3 - 14 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- pmaddwd m5, [r3 - 9 * 16] ; [8]
- paddd m5, [pd_16]
- psrld m5, 5
- pmaddwd m3, [r3 - 9 * 16]
- paddd m3, [pd_16]
- psrld m3, 5
- packusdw m5, m3
-
- punpckhwd m3, m4, m2
- punpcklwd m4, m2
- punpckhwd m2, m6, m5
- punpcklwd m6, m5
-
- punpckldq m5, m4, m6
- punpckhdq m4, m6
- punpckldq m6, m3, m2
- punpckhdq m3, m2
-
- movh [r0 + 8], m5
- movhps [r0 + r1 + 8], m5
- movh [r0 + r1 * 2 + 8], m4
- movhps [r0 + r4 + 8], m4
- lea r0, [r0 + r1 * 4]
- movh [r0 + 8], m6
- movhps [r0 + r1 + 8], m6
- movh [r0 + r1 * 2 + 8], m3
- movhps [r0 + r4 + 8], m3
-
- RET
-
-cglobal intra_pred_ang8_9, 3,5,7
- lea r3, [ang_table + 9 * 16]
- add r1, r1
-
- movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
- movu m1, [r2 + 4] ; [9 8 7 6 5 4 3 2]
-
- punpcklwd m3, m0, m1 ; [5 4 4 3 3 2 2 1]
- punpckhwd m0, m1 ; [9 8 8 7 7 6 6 5]
-
- mova m4, m3
- pmaddwd m4, [r3 - 7 * 16] ; [2]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r3 - 7 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- mova m2, m3
- pmaddwd m2, [r3 - 5 * 16] ; [4]
- paddd m2, [pd_16]
- psrld m2, 5
- mova m1, m0
- pmaddwd m1, [r3 - 5 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m2, m1
-
- mova m6, m3
- pmaddwd m6, [r3 - 3 * 16] ; [6]
- paddd m6, [pd_16]
- psrld m6, 5
- mova m1, m0
- pmaddwd m1, [r3 - 3 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- mova m5, m3
- pmaddwd m5, [r3 - 1 * 16] ; [8]
- paddd m5, [pd_16]
- psrld m5, 5
- mova m1, m0
- pmaddwd m1, [r3 - 1 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m5, m1
-
- punpckhwd m1, m4, m2
- punpcklwd m4, m2
- punpckhwd m2, m6, m5
- punpcklwd m6, m5
-
- punpckldq m5, m4, m6
- punpckhdq m4, m6
- punpckldq m6, m1, m2
- punpckhdq m1, m2
-
- lea r4, [r1 * 3]
- movh [r0], m5
- movhps [r0 + r1], m5
- movh [r0 + r1 * 2], m4
- movhps [r0 + r4], m4
- lea r2, [r0 + r1 * 4]
- movh [r2], m6
- movhps [r2 + r1], m6
- movh [r2 + r1 * 2], m1
- movhps [r2 + r4], m1
-
- mova m4, m3
- pmaddwd m4, [r3 + 1 * 16] ; [10]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r3 + 1 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- mova m2, m3
- pmaddwd m2, [r3 + 3 * 16] ; [12]
- paddd m2, [pd_16]
- psrld m2, 5
- mova m1, m0
- pmaddwd m1, [r3 + 3 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m2, m1
-
- mova m6, m3
- pmaddwd m6, [r3 + 5 * 16] ; [14]
- paddd m6, [pd_16]
- psrld m6, 5
- mova m5, m0
- pmaddwd m5, [r3 + 5 * 16]
- paddd m5, [pd_16]
- psrld m5, 5
- packusdw m6, m5
-
- pmaddwd m3, [r3 + 7 * 16] ; [16]
- paddd m3, [pd_16]
- psrld m3, 5
- pmaddwd m0, [r3 + 7 * 16]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m3, m0
-
- punpckhwd m5, m4, m2
- punpcklwd m4, m2
- punpckhwd m2, m6, m3
- punpcklwd m6, m3
-
- punpckldq m3, m4, m6
- punpckhdq m4, m6
- punpckldq m6, m5, m2
- punpckhdq m5, m2
-
- movh [r0 + 8], m3
- movhps [r0 + r1 + 8], m3
- movh [r0 + r1 * 2 + 8], m4
- movhps [r0 + r4 + 8], m4
- lea r0, [r0 + r1 * 4]
- movh [r0 + 8], m6
- movhps [r0 + r1 + 8], m6
- movh [r0 + r1 * 2 + 8], m5
- movhps [r0 + r4 + 8], m5
-
- RET
-
-cglobal intra_pred_ang8_10, 4,5,3
- movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
- pshufb m0, m1, [pw_unpackwdq] ; [1 1 1 1 1 1 1 1]
- add r1, r1
- lea r4, [r1 * 3]
-
- psrldq m1, 2
- pshufb m2, m1, [pw_unpackwdq] ; [2 2 2 2 2 2 2 2]
- movu [r0 + r1], m2
- psrldq m1, 2
- pshufb m2, m1, [pw_unpackwdq] ; [3 3 3 3 3 3 3 3]
- movu [r0 + r1 * 2], m2
- psrldq m1, 2
- pshufb m2, m1, [pw_unpackwdq] ; [4 4 4 4 4 4 4 4]
- movu [r0 + r4], m2
-
- lea r2, [r0 + r1 *4]
- psrldq m1, 2
- pshufb m2, m1, [pw_unpackwdq] ; [5 5 5 5 5 5 5 5]
- movu [r2], m2
- psrldq m1, 2
- pshufb m2, m1, [pw_unpackwdq] ; [6 6 6 6 6 6 6 6]
- movu [r2 + r1], m2
- psrldq m1, 2
- pshufb m2, m1, [pw_unpackwdq] ; [7 7 7 7 7 7 7 7]
- movu [r2 + r1 * 2], m2
- psrldq m1, 2
- pshufb m2, m1, [pw_unpackwdq] ; [8 8 8 8 8 8 8 8]
- movu [r2 + r4], m2
-
- cmp r5m, byte 0
- jz .quit
-
- ; filter
-
- movh m1, [r3] ; [3 2 1 0]
- pshufb m2, m1, [pw_unpackwdq] ; [0 0 0 0 0 0 0 0]
- movu m1, [r3 + 2] ; [8 7 6 5 4 3 2 1]
- psubw m1, m2
- psraw m1, 1
- paddw m0, m1
- pxor m1, m1
- pmaxsw m0, m1
- pminsw m0, [pw_1023]
-
-.quit:
- movu [r0], m0
- RET
-
-cglobal intra_pred_ang8_11, 3,5,7
- lea r3, [ang_table + 23 * 16]
- add r1, r1
-
- movu m0, [r2] ; [7 6 5 4 3 2 1 0]
- movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
-
- punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
- punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
-
- mova m4, m3
- pmaddwd m4, [r3 + 7 * 16] ; [30]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r3 + 7 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- mova m2, m3
- pmaddwd m2, [r3 + 5 * 16] ; [28]
- paddd m2, [pd_16]
- psrld m2, 5
- mova m1, m0
- pmaddwd m1, [r3 + 5 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m2, m1
-
- mova m6, m3
- pmaddwd m6, [r3 + 3 * 16] ; [26]
- paddd m6, [pd_16]
- psrld m6, 5
- mova m1, m0
- pmaddwd m1, [r3 + 3 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- mova m5, m3
- pmaddwd m5, [r3 + 1 * 16] ; [24]
- paddd m5, [pd_16]
- psrld m5, 5
- mova m1, m0
- pmaddwd m1, [r3 + 1 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m5, m1
-
- punpckhwd m1, m4, m2
- punpcklwd m4, m2
- punpckhwd m2, m6, m5
- punpcklwd m6, m5
-
- punpckldq m5, m4, m6
- punpckhdq m4, m6
- punpckldq m6, m1, m2
- punpckhdq m1, m2
-
- lea r4, [r1 * 3]
- movh [r0], m5
- movhps [r0 + r1], m5
- movh [r0 + r1 * 2], m4
- movhps [r0 + r4], m4
- lea r2, [r0 + r1 * 4]
- movh [r2], m6
- movhps [r2 + r1], m6
- movh [r2 + r1 * 2], m1
- movhps [r2 + r4], m1
-
- mova m4, m3
- pmaddwd m4, [r3 - 1 * 16] ; [22]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r3 - 1 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- mova m2, m3
- pmaddwd m2, [r3 - 3 * 16] ; [20]
- paddd m2, [pd_16]
- psrld m2, 5
- mova m1, m0
- pmaddwd m1, [r3 - 3 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m2, m1
-
- mova m6, m3
- pmaddwd m6, [r3 - 5 * 16] ; [18]
- paddd m6, [pd_16]
- psrld m6, 5
- mova m5, m0
- pmaddwd m5, [r3 - 5 * 16]
- paddd m5, [pd_16]
- psrld m5, 5
- packusdw m6, m5
-
- pmaddwd m3, [r3 - 7 * 16] ; [16]
- paddd m3, [pd_16]
- psrld m3, 5
- pmaddwd m0, [r3 - 7 * 16]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m3, m0
-
- punpckhwd m5, m4, m2
- punpcklwd m4, m2
- punpckhwd m2, m6, m3
- punpcklwd m6, m3
-
- punpckldq m3, m4, m6
- punpckhdq m4, m6
- punpckldq m6, m5, m2
- punpckhdq m5, m2
-
- movh [r0 + 8], m3
- movhps [r0 + r1 + 8], m3
- movh [r0 + r1 * 2 + 8], m4
- movhps [r0 + r4 + 8], m4
- lea r0, [r0 + r1 * 4]
- movh [r0 + 8], m6
- movhps [r0 + r1 + 8], m6
- movh [r0 + r1 * 2 + 8], m5
- movhps [r0 + r4 + 8], m5
-
- RET
-
-cglobal intra_pred_ang8_12, 4,6,7
- lea r5, [ang_table + 16 * 16]
- add r1, r1
-
- movu m0, [r2] ; [7 6 5 4 3 2 1 0]
- movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
-
- punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
- punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
-
- mova m4, m3
- pmaddwd m4, [r5 + 11 * 16] ; [27]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r5 + 11 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- mova m2, m3
- pmaddwd m2, [r5 + 6 * 16] ; [22]
- paddd m2, [pd_16]
- psrld m2, 5
- mova m1, m0
- pmaddwd m1, [r5 + 6 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m2, m1
-
- mova m6, m3
- pmaddwd m6, [r5 + 1 * 16] ; [17]
- paddd m6, [pd_16]
- psrld m6, 5
- mova m1, m0
- pmaddwd m1, [r5 + 1 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- mova m5, m3
- pmaddwd m5, [r5 - 4 * 16] ; [12]
- paddd m5, [pd_16]
- psrld m5, 5
- mova m1, m0
- pmaddwd m1, [r5 - 4 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m5, m1
-
- punpckhwd m1, m4, m2
- punpcklwd m4, m2
- punpckhwd m2, m6, m5
- punpcklwd m6, m5
-
- punpckldq m5, m4, m6
- punpckhdq m4, m6
- punpckldq m6, m1, m2
- punpckhdq m1, m2
-
- lea r4, [r1 * 3]
- movh [r0], m5
- movhps [r0 + r1], m5
- movh [r0 + r1 * 2], m4
- movhps [r0 + r4], m4
- lea r2, [r0 + r1 * 4]
- movh [r2], m6
- movhps [r2 + r1], m6
- movh [r2 + r1 * 2], m1
- movhps [r2 + r4], m1
-
- mova m4, m3
- pmaddwd m4, [r5 - 9 * 16] ; [7]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r5 - 9 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- mova m2, m3
- pmaddwd m2, [r5 - 14 * 16] ; [2]
- paddd m2, [pd_16]
- psrld m2, 5
- mova m1, m0
- pmaddwd m1, [r5 - 14 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m2, m1
-
- palignr m0, m3, 12
- movu m1, [r3]
- pshufb m1, [pw_ang8_12]
- palignr m3, m1, 12
-
- mova m6, m3
- pmaddwd m6, [r5 + 13 * 16] ; [29]
- paddd m6, [pd_16]
- psrld m6, 5
- mova m5, m0
- pmaddwd m5, [r5 + 13 * 16]
- paddd m5, [pd_16]
- psrld m5, 5
- packusdw m6, m5
-
- pmaddwd m3, [r5 + 8 * 16] ; [24]
- paddd m3, [pd_16]
- psrld m3, 5
- pmaddwd m0, [r5 + 8 * 16]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m3, m0
-
- punpckhwd m5, m4, m2
- punpcklwd m4, m2
- punpckhwd m2, m6, m3
- punpcklwd m6, m3
-
- punpckldq m3, m4, m6
- punpckhdq m4, m6
- punpckldq m6, m5, m2
- punpckhdq m5, m2
-
- movh [r0 + 8], m3
- movhps [r0 + r1 + 8], m3
- movh [r0 + r1 * 2 + 8], m4
- movhps [r0 + r4 + 8], m4
- lea r0, [r0 + r1 * 4]
- movh [r0 + 8], m6
- movhps [r0 + r1 + 8], m6
- movh [r0 + r1 * 2 + 8], m5
- movhps [r0 + r4 + 8], m5
-
- RET
-
-cglobal intra_pred_ang8_13, 4,6,8
- lea r5, [ang_table + 14 * 16]
- add r1, r1
-
- movu m0, [r2] ; [7 6 5 4 3 2 1 0]
- movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
-
- punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
- punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
-
- mova m4, m3
- pmaddwd m4, [r5 + 9 * 16] ; [23]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r5 + 9 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- mova m2, m3
- pmaddwd m2, [r5] ; [14]
- paddd m2, [pd_16]
- psrld m2, 5
- mova m1, m0
- pmaddwd m1, [r5]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m2, m1
-
- mova m6, m3
- pmaddwd m6, [r5 - 9 * 16] ; [5]
- paddd m6, [pd_16]
- psrld m6, 5
- mova m1, m0
- pmaddwd m1, [r5 - 9 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- palignr m0, m3, 12
- movu m1, [r3]
- pshufb m1, [pw_ang8_13]
- palignr m3, m1, 12
-
- mova m5, m3
- pmaddwd m5, [r5 + 14 * 16] ; [28]
- paddd m5, [pd_16]
- psrld m5, 5
- mova m7, m0
- pmaddwd m7, [r5 + 14 * 16]
- paddd m7, [pd_16]
- psrld m7, 5
- packusdw m5, m7
-
- punpckhwd m7, m4, m2
- punpcklwd m4, m2
- punpckhwd m2, m6, m5
- punpcklwd m6, m5
-
- punpckldq m5, m4, m6
- punpckhdq m4, m6
- punpckldq m6, m7, m2
- punpckhdq m7, m2
-
- lea r4, [r1 * 3]
- movh [r0], m5
- movhps [r0 + r1], m5
- movh [r0 + r1 * 2], m4
- movhps [r0 + r4], m4
- lea r2, [r0 + r1 * 4]
- movh [r2], m6
- movhps [r2 + r1], m6
- movh [r2 + r1 * 2], m7
- movhps [r2 + r4], m7
-
- mova m4, m3
- pmaddwd m4, [r5 + 5 * 16] ; [19]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r5 + 5 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- mova m2, m3
- pmaddwd m2, [r5 - 4 * 16] ; [10]
- paddd m2, [pd_16]
- psrld m2, 5
- mova m5, m0
- pmaddwd m5, [r5 - 4 * 16]
- paddd m5, [pd_16]
- psrld m5, 5
- packusdw m2, m5
-
- mova m6, m3
- pmaddwd m6, [r5 - 13 * 16] ; [1]
- paddd m6, [pd_16]
- psrld m6, 5
- mova m5, m0
- pmaddwd m5, [r5 - 13 * 16]
- paddd m5, [pd_16]
- psrld m5, 5
- packusdw m6, m5
-
- pslldq m1, 2
- palignr m0, m3, 12
- palignr m3, m1, 12
-
- pmaddwd m3, [r5 + 10 * 16] ; [24]
- paddd m3, [pd_16]
- psrld m3, 5
- pmaddwd m0, [r5 + 10 * 16]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m3, m0
-
- punpckhwd m5, m4, m2
- punpcklwd m4, m2
- punpckhwd m2, m6, m3
- punpcklwd m6, m3
-
- punpckldq m3, m4, m6
- punpckhdq m4, m6
- punpckldq m6, m5, m2
- punpckhdq m5, m2
-
- movh [r0 + 8], m3
- movhps [r0 + r1 + 8], m3
- movh [r0 + r1 * 2 + 8], m4
- movhps [r0 + r4 + 8], m4
- lea r0, [r0 + r1 * 4]
- movh [r0 + 8], m6
- movhps [r0 + r1 + 8], m6
- movh [r0 + r1 * 2 + 8], m5
- movhps [r0 + r4 + 8], m5
-
- RET
-
-cglobal intra_pred_ang8_14, 4,6,8
- lea r5, [ang_table + 18 * 16]
- add r1, r1
-
- movu m0, [r2] ; [7 6 5 4 3 2 1 0]
- movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
-
- punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
- punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
-
- mova m4, m3
- pmaddwd m4, [r5 + 1 * 16] ; [19]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r5 + 1 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- mova m2, m3
- pmaddwd m2, [r5 - 12 * 16] ; [6]
- paddd m2, [pd_16]
- psrld m2, 5
- mova m1, m0
- pmaddwd m1, [r5 - 12 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m2, m1
-
- palignr m0, m3, 12
- movu m1, [r3]
- pshufb m1, [pw_ang8_14]
- palignr m3, m1, 12
-
- mova m6, m3
- pmaddwd m6, [r5 + 7 * 16] ; [25]
- paddd m6, [pd_16]
- psrld m6, 5
- mova m5, m0
- pmaddwd m5, [r5 + 7 * 16]
- paddd m5, [pd_16]
- psrld m5, 5
- packusdw m6, m5
-
- mova m5, m3
- pmaddwd m5, [r5 - 6 * 16] ; [12]
- paddd m5, [pd_16]
- psrld m5, 5
- mova m7, m0
- pmaddwd m7, [r5 - 6 * 16]
- paddd m7, [pd_16]
- psrld m7, 5
- packusdw m5, m7
-
- punpckhwd m7, m4, m2
- punpcklwd m4, m2
- punpckhwd m2, m6, m5
- punpcklwd m6, m5
-
- punpckldq m5, m4, m6
- punpckhdq m4, m6
- punpckldq m6, m7, m2
- punpckhdq m7, m2
-
- lea r4, [r1 * 3]
- movh [r0], m5
- movhps [r0 + r1], m5
- movh [r0 + r1 * 2], m4
- movhps [r0 + r4], m4
- lea r2, [r0 + r1 * 4]
- movh [r2], m6
- movhps [r2 + r1], m6
- movh [r2 + r1 * 2], m7
- movhps [r2 + r4], m7
-
- pslldq m1, 2
- palignr m0, m3, 12
- palignr m3, m1, 12
-
- mova m4, m3
- pmaddwd m4, [r5 + 13 * 16] ; [31]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r5 + 13 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- mova m2, m3
- pmaddwd m2, [r5] ; [18]
- paddd m2, [pd_16]
- psrld m2, 5
- mova m5, m0
- pmaddwd m5, [r5]
- paddd m5, [pd_16]
- psrld m5, 5
- packusdw m2, m5
-
- mova m6, m3
- pmaddwd m6, [r5 - 13 * 16] ; [5]
- paddd m6, [pd_16]
- psrld m6, 5
- mova m5, m0
- pmaddwd m5, [r5 - 13 * 16]
- paddd m5, [pd_16]
- psrld m5, 5
- packusdw m6, m5
-
- pslldq m1, 2
- palignr m0, m3, 12
- palignr m3, m1, 12
-
- pmaddwd m3, [r5 + 6 * 16] ; [24]
- paddd m3, [pd_16]
- psrld m3, 5
- pmaddwd m0, [r5 + 6 * 16]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m3, m0
-
- punpckhwd m5, m4, m2
- punpcklwd m4, m2
- punpckhwd m2, m6, m3
- punpcklwd m6, m3
-
- punpckldq m3, m4, m6
- punpckhdq m4, m6
- punpckldq m6, m5, m2
- punpckhdq m5, m2
-
- movh [r0 + 8], m3
- movhps [r0 + r1 + 8], m3
- movh [r0 + r1 * 2 + 8], m4
- movhps [r0 + r4 + 8], m4
- lea r0, [r0 + r1 * 4]
- movh [r0 + 8], m6
- movhps [r0 + r1 + 8], m6
- movh [r0 + r1 * 2 + 8], m5
- movhps [r0 + r4 + 8], m5
-
- RET
-
-cglobal intra_pred_ang8_15, 4,6,8
- lea r5, [ang_table + 20 * 16]
- add r1, r1
-
- movu m0, [r2] ; [7 6 5 4 3 2 1 0]
- movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
-
- punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
- punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
-
- mova m4, m3
- pmaddwd m4, [r5 - 5 * 16] ; [15]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r5 - 5 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- palignr m0, m3, 12
- movu m1, [r3]
- pshufb m1, [pw_ang8_15]
- palignr m3, m1, 12
-
- mova m2, m3
- pmaddwd m2, [r5 + 10 * 16] ; [30]
- paddd m2, [pd_16]
- psrld m2, 5
- mova m5, m0
- pmaddwd m5, [r5 + 10 * 16]
- paddd m5, [pd_16]
- psrld m5, 5
- packusdw m2, m5
-
- mova m6, m3
- pmaddwd m6, [r5 - 7 * 16] ; [13]
- paddd m6, [pd_16]
- psrld m6, 5
- mova m5, m0
- pmaddwd m5, [r5 - 7 * 16]
- paddd m5, [pd_16]
- psrld m5, 5
- packusdw m6, m5
-
- pslldq m1, 2
- palignr m0, m3, 12
- palignr m3, m1, 12
-
- mova m5, m3
- pmaddwd m5, [r5 + 8 * 16] ; [28]
- paddd m5, [pd_16]
- psrld m5, 5
- mova m7, m0
- pmaddwd m7, [r5 + 8 * 16]
- paddd m7, [pd_16]
- psrld m7, 5
- packusdw m5, m7
-
- punpckhwd m7, m4, m2
- punpcklwd m4, m2
- punpckhwd m2, m6, m5
- punpcklwd m6, m5
-
- punpckldq m5, m4, m6
- punpckhdq m4, m6
- punpckldq m6, m7, m2
- punpckhdq m7, m2
-
- lea r4, [r1 * 3]
- movh [r0], m5
- movhps [r0 + r1], m5
- movh [r0 + r1 * 2], m4
- movhps [r0 + r4], m4
- lea r2, [r0 + r1 * 4]
- movh [r2], m6
- movhps [r2 + r1], m6
- movh [r2 + r1 * 2], m7
- movhps [r2 + r4], m7
-
- mova m4, m3
- pmaddwd m4, [r5 - 9 * 16] ; [11]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r5 - 9 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- pslldq m1, 2
- palignr m0, m3, 12
- palignr m3, m1, 12
-
- mova m2, m3
- pmaddwd m2, [r5 + 6 * 16] ; [26]
- paddd m2, [pd_16]
- psrld m2, 5
- mova m5, m0
- pmaddwd m5, [r5 + 6 * 16]
- paddd m5, [pd_16]
- psrld m5, 5
- packusdw m2, m5
-
- mova m6, m3
- pmaddwd m6, [r5 - 11 * 16] ; [9]
- paddd m6, [pd_16]
- psrld m6, 5
- mova m5, m0
- pmaddwd m5, [r5 - 11 * 16]
- paddd m5, [pd_16]
- psrld m5, 5
- packusdw m6, m5
-
- pslldq m1, 2
- palignr m0, m3, 12
- palignr m3, m1, 12
- pinsrw m3, [r3 + 16], 0
-
- pmaddwd m3, [r5 + 4 * 16] ; [24]
- paddd m3, [pd_16]
- psrld m3, 5
- pmaddwd m0, [r5 + 4 * 16]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m3, m0
-
- punpckhwd m5, m4, m2
- punpcklwd m4, m2
- punpckhwd m2, m6, m3
- punpcklwd m6, m3
-
- punpckldq m3, m4, m6
- punpckhdq m4, m6
- punpckldq m6, m5, m2
- punpckhdq m5, m2
-
- movh [r0 + 8], m3
- movhps [r0 + r1 + 8], m3
- movh [r0 + r1 * 2 + 8], m4
- movhps [r0 + r4 + 8], m4
- lea r0, [r0 + r1 * 4]
- movh [r0 + 8], m6
- movhps [r0 + r1 + 8], m6
- movh [r0 + r1 * 2 + 8], m5
- movhps [r0 + r4 + 8], m5
-
- RET
-
-cglobal intra_pred_ang8_16, 4,6,8
- lea r5, [ang_table + 13 * 16]
- add r1, r1
-
- movu m0, [r2] ; [7 6 5 4 3 2 1 0]
- movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
-
- punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
- punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
-
- mova m4, m3
- pmaddwd m4, [r5 - 2 * 16] ; [11]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r5 - 2 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- palignr m0, m3, 12
- movu m1, [r3]
- pshufb m1, [pw_ang8_16]
- palignr m3, m1, 12
-
- mova m2, m3
- pmaddwd m2, [r5 + 9 * 16] ; [22]
- paddd m2, [pd_16]
- psrld m2, 5
- mova m5, m0
- pmaddwd m5, [r5 + 9 * 16]
- paddd m5, [pd_16]
- psrld m5, 5
- packusdw m2, m5
-
- mova m6, m3
- pmaddwd m6, [r5 - 12 * 16] ; [1]
- paddd m6, [pd_16]
- psrld m6, 5
- mova m5, m0
- pmaddwd m5, [r5 - 12 * 16]
- paddd m5, [pd_16]
- psrld m5, 5
- packusdw m6, m5
-
- pslldq m1, 2
- palignr m0, m3, 12
- palignr m3, m1, 12
-
- mova m5, m3
- pmaddwd m5, [r5 - 1 * 16] ; [12]
- paddd m5, [pd_16]
- psrld m5, 5
- mova m7, m0
- pmaddwd m7, [r5 - 1 * 16]
- paddd m7, [pd_16]
- psrld m7, 5
- packusdw m5, m7
-
- punpckhwd m7, m4, m2
- punpcklwd m4, m2
- punpckhwd m2, m6, m5
- punpcklwd m6, m5
-
- punpckldq m5, m4, m6
- punpckhdq m4, m6
- punpckldq m6, m7, m2
- punpckhdq m7, m2
-
- lea r4, [r1 * 3]
- movh [r0], m5
- movhps [r0 + r1], m5
- movh [r0 + r1 * 2], m4
- movhps [r0 + r4], m4
- lea r2, [r0 + r1 * 4]
- movh [r2], m6
- movhps [r2 + r1], m6
- movh [r2 + r1 * 2], m7
- movhps [r2 + r4], m7
-
- pslldq m1, 2
- palignr m0, m3, 12
- palignr m3, m1, 12
-
- mova m4, m3
- pmaddwd m4, [r5 + 10 * 16] ; [23]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r5 + 10 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- mova m2, m3
- pmaddwd m2, [r5 - 11 * 16] ; [2]
- paddd m2, [pd_16]
- psrld m2, 5
- mova m5, m0
- pmaddwd m5, [r5 - 11 * 16]
- paddd m5, [pd_16]
- psrld m5, 5
- packusdw m2, m5
-
- pslldq m1, 2
- palignr m0, m3, 12
- palignr m3, m1, 12
-
- mova m6, m3
- pmaddwd m6, [r5] ; [13]
- paddd m6, [pd_16]
- psrld m6, 5
- mova m5, m0
- pmaddwd m5, [r5]
- paddd m5, [pd_16]
- psrld m5, 5
- packusdw m6, m5
-
- pslldq m1, 2
- palignr m0, m3, 12
- palignr m3, m1, 12
- pinsrw m3, [r3 + 16], 0
-
- pmaddwd m3, [r5 + 11 * 16] ; [24]
- paddd m3, [pd_16]
- psrld m3, 5
- pmaddwd m0, [r5 + 11 * 16]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m3, m0
-
- punpckhwd m5, m4, m2
- punpcklwd m4, m2
- punpckhwd m2, m6, m3
- punpcklwd m6, m3
-
- punpckldq m3, m4, m6
- punpckhdq m4, m6
- punpckldq m6, m5, m2
- punpckhdq m5, m2
-
- movh [r0 + 8], m3
- movhps [r0 + r1 + 8], m3
- movh [r0 + r1 * 2 + 8], m4
- movhps [r0 + r4 + 8], m4
- lea r0, [r0 + r1 * 4]
- movh [r0 + 8], m6
- movhps [r0 + r1 + 8], m6
- movh [r0 + r1 * 2 + 8], m5
- movhps [r0 + r4 + 8], m5
-
- RET
-
-cglobal intra_pred_ang8_17, 4,6,8
- lea r5, [ang_table + 17 * 16]
- add r1, r1
-
- movu m0, [r2] ; [7 6 5 4 3 2 1 0]
- movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
-
- punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
- punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
-
- mova m4, m3
- pmaddwd m4, [r5 - 11 * 16] ; [6]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r5 - 11 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- palignr m0, m3, 12
- movu m1, [r3]
- pshufb m1, [pw_ang8_17]
- palignr m3, m1, 12
-
- mova m2, m3
- pmaddwd m2, [r5 - 5 * 16] ; [12]
- paddd m2, [pd_16]
- psrld m2, 5
- mova m5, m0
- pmaddwd m5, [r5 - 5 * 16]
- paddd m5, [pd_16]
- psrld m5, 5
- packusdw m2, m5
-
- pslldq m1, 2
- palignr m0, m3, 12
- palignr m3, m1, 12
-
- mova m6, m3
- pmaddwd m6, [r5 + 1 * 16] ; [18]
- paddd m6, [pd_16]
- psrld m6, 5
- mova m5, m0
- pmaddwd m5, [r5 + 1 * 16]
- paddd m5, [pd_16]
- psrld m5, 5
- packusdw m6, m5
-
- pslldq m1, 2
- palignr m0, m3, 12
- palignr m3, m1, 12
-
- mova m5, m3
- pmaddwd m5, [r5 + 7 * 16] ; [24]
- paddd m5, [pd_16]
- psrld m5, 5
- mova m7, m0
- pmaddwd m7, [r5 + 7 * 16]
- paddd m7, [pd_16]
- psrld m7, 5
- packusdw m5, m7
-
- punpckhwd m7, m4, m2
- punpcklwd m4, m2
- punpckhwd m2, m6, m5
- punpcklwd m6, m5
-
- punpckldq m5, m4, m6
- punpckhdq m4, m6
- punpckldq m6, m7, m2
- punpckhdq m7, m2
-
- lea r4, [r1 * 3]
- movh [r0], m5
- movhps [r0 + r1], m5
- movh [r0 + r1 * 2], m4
- movhps [r0 + r4], m4
- lea r2, [r0 + r1 * 4]
- movh [r2], m6
- movhps [r2 + r1], m6
- movh [r2 + r1 * 2], m7
- movhps [r2 + r4], m7
-
- pslldq m1, 2
- palignr m0, m3, 12
- palignr m3, m1, 12
-
- mova m4, m3
- pmaddwd m4, [r5 + 13 * 16] ; [30]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r5 + 13 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- mova m2, m3
- pmaddwd m2, [r5 - 13 * 16] ; [4]
- paddd m2, [pd_16]
- psrld m2, 5
- mova m5, m0
- pmaddwd m5, [r5 - 13 * 16]
- paddd m5, [pd_16]
- psrld m5, 5
- packusdw m2, m5
-
- pslldq m1, 2
- palignr m0, m3, 12
- palignr m3, m1, 12
-
- mova m6, m3
- pmaddwd m6, [r5 - 7 * 16] ; [10]
- paddd m6, [pd_16]
- psrld m6, 5
- mova m5, m0
- pmaddwd m5, [r5 - 7 * 16]
- paddd m5, [pd_16]
- psrld m5, 5
- packusdw m6, m5
-
- pslldq m1, 2
- palignr m0, m3, 12
- palignr m3, m1, 12
-
- pmaddwd m3, [r5 - 1 * 16] ; [16]
- paddd m3, [pd_16]
- psrld m3, 5
- pmaddwd m0, [r5 - 1 * 16]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m3, m0
-
- punpckhwd m5, m4, m2
- punpcklwd m4, m2
- punpckhwd m2, m6, m3
- punpcklwd m6, m3
-
- punpckldq m3, m4, m6
- punpckhdq m4, m6
- punpckldq m6, m5, m2
- punpckhdq m5, m2
-
- movh [r0 + 8], m3
- movhps [r0 + r1 + 8], m3
- movh [r0 + r1 * 2 + 8], m4
- movhps [r0 + r4 + 8], m4
- lea r0, [r0 + r1 * 4]
- movh [r0 + 8], m6
- movhps [r0 + r1 + 8], m6
- movh [r0 + r1 * 2 + 8], m5
- movhps [r0 + r4 + 8], m5
-
- RET
-
-cglobal intra_pred_ang8_18, 4,5,3
- add r1, r1
- lea r4, [r1 * 3]
- movu m1, [r3]
- movu m0, [r2 + 2]
- pshufb m0, [pw_swap16]
- movu [r0], m1
- palignr m2, m1, m0, 14
- movu [r0 + r1], m2
- palignr m2, m1, m0, 12
- movu [r0 + r1 * 2], m2
- palignr m2, m1, m0, 10
- movu [r0 + r4], m2
- lea r0, [r0 + r1 * 4]
- palignr m2, m1, m0, 8
- movu [r0], m2
- palignr m2, m1, m0, 6
- movu [r0 + r1], m2
- palignr m2, m1, m0, 4
- movu [r0 + r1 * 2], m2
- palignr m1, m0, 2
- movu [r0 + r4], m1
- RET
-
-cglobal intra_pred_ang8_19, 4,6,8
- lea r5, [ang_table + 17 * 16]
- add r1, r1
-
- movu m0, [r3] ; [7 6 5 4 3 2 1 0]
- movu m1, [r3 + 2] ; [8 7 6 5 4 3 2 1]
-
- punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
- punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
-
- mova m4, m3
- pmaddwd m4, [r5 - 11 * 16] ; [6]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r5 - 11 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- palignr m0, m3, 12
- movu m1, [r2]
- pshufb m1, [pw_ang8_17]
- palignr m3, m1, 12
-
- mova m2, m3
- pmaddwd m2, [r5 - 5 * 16] ; [12]
- paddd m2, [pd_16]
- psrld m2, 5
- mova m5, m0
- pmaddwd m5, [r5 - 5 * 16]
- paddd m5, [pd_16]
- psrld m5, 5
- packusdw m2, m5
-
- pslldq m1, 2
- palignr m0, m3, 12
- palignr m3, m1, 12
-
- mova m6, m3
- pmaddwd m6, [r5 + 1 * 16] ; [18]
- paddd m6, [pd_16]
- psrld m6, 5
- mova m5, m0
- pmaddwd m5, [r5 + 1 * 16]
- paddd m5, [pd_16]
- psrld m5, 5
- packusdw m6, m5
-
- pslldq m1, 2
- palignr m0, m3, 12
- palignr m3, m1, 12
-
- mova m5, m3
- pmaddwd m5, [r5 + 7 * 16] ; [24]
- paddd m5, [pd_16]
- psrld m5, 5
- mova m7, m0
- pmaddwd m7, [r5 + 7 * 16]
- paddd m7, [pd_16]
- psrld m7, 5
- packusdw m5, m7
-
- lea r4, [r1 * 3]
- movu [r0], m4
- movu [r0 + r1], m2
- movu [r0 + r1 * 2], m6
- movu [r0 + r4], m5
-
- pslldq m1, 2
- palignr m0, m3, 12
- palignr m3, m1, 12
-
- mova m4, m3
- pmaddwd m4, [r5 + 13 * 16] ; [30]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r5 + 13 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- mova m2, m3
- pmaddwd m2, [r5 - 13 * 16] ; [4]
- paddd m2, [pd_16]
- psrld m2, 5
- mova m5, m0
- pmaddwd m5, [r5 - 13 * 16]
- paddd m5, [pd_16]
- psrld m5, 5
- packusdw m2, m5
-
- pslldq m1, 2
- palignr m0, m3, 12
- palignr m3, m1, 12
-
- mova m6, m3
- pmaddwd m6, [r5 - 7 * 16] ; [10]
- paddd m6, [pd_16]
- psrld m6, 5
- mova m5, m0
- pmaddwd m5, [r5 - 7 * 16]
- paddd m5, [pd_16]
- psrld m5, 5
- packusdw m6, m5
-
- pslldq m1, 2
- palignr m0, m3, 12
- palignr m3, m1, 12
-
- pmaddwd m3, [r5 - 1 * 16] ; [16]
- paddd m3, [pd_16]
- psrld m3, 5
- pmaddwd m0, [r5 - 1 * 16]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m3, m0
-
- lea r0, [r0 + r1 * 4]
- movu [r0], m4
- movu [r0 + r1], m2
- movu [r0 + r1 * 2], m6
- movu [r0 + r4], m3
-
- RET
-
-cglobal intra_pred_ang8_20, 4,6,8
- lea r5, [ang_table + 13 * 16]
- add r1, r1
-
- movu m0, [r3] ; [7 6 5 4 3 2 1 0]
- movu m1, [r3 + 2] ; [8 7 6 5 4 3 2 1]
-
- punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
- punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
-
- mova m4, m3
- pmaddwd m4, [r5 - 2 * 16] ; [11]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r5 - 2 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- palignr m0, m3, 12
- movu m1, [r2]
- pshufb m1, [pw_ang8_16]
- palignr m3, m1, 12
-
- mova m2, m3
- pmaddwd m2, [r5 + 9 * 16] ; [22]
- paddd m2, [pd_16]
- psrld m2, 5
- mova m5, m0
- pmaddwd m5, [r5 + 9 * 16]
- paddd m5, [pd_16]
- psrld m5, 5
- packusdw m2, m5
-
- mova m6, m3
- pmaddwd m6, [r5 - 12 * 16] ; [1]
- paddd m6, [pd_16]
- psrld m6, 5
- mova m5, m0
- pmaddwd m5, [r5 - 12 * 16]
- paddd m5, [pd_16]
- psrld m5, 5
- packusdw m6, m5
-
- pslldq m1, 2
- palignr m0, m3, 12
- palignr m3, m1, 12
-
- mova m5, m3
- pmaddwd m5, [r5 - 1 * 16] ; [12]
- paddd m5, [pd_16]
- psrld m5, 5
- mova m7, m0
- pmaddwd m7, [r5 - 1 * 16]
- paddd m7, [pd_16]
- psrld m7, 5
- packusdw m5, m7
-
- lea r4, [r1 * 3]
- movu [r0], m4
- movu [r0 + r1], m2
- movu [r0 + r1 * 2], m6
- movu [r0 + r4], m5
-
- pslldq m1, 2
- palignr m0, m3, 12
- palignr m3, m1, 12
-
- mova m4, m3
- pmaddwd m4, [r5 + 10 * 16] ; [23]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r5 + 10 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- mova m2, m3
- pmaddwd m2, [r5 - 11 * 16] ; [2]
- paddd m2, [pd_16]
- psrld m2, 5
- mova m5, m0
- pmaddwd m5, [r5 - 11 * 16]
- paddd m5, [pd_16]
- psrld m5, 5
- packusdw m2, m5
-
- pslldq m1, 2
- palignr m0, m3, 12
- palignr m3, m1, 12
-
- mova m6, m3
- pmaddwd m6, [r5] ; [13]
- paddd m6, [pd_16]
- psrld m6, 5
- mova m5, m0
- pmaddwd m5, [r5]
- paddd m5, [pd_16]
- psrld m5, 5
- packusdw m6, m5
-
- pslldq m1, 2
- palignr m0, m3, 12
- palignr m3, m1, 12
- pinsrw m3, [r2 + 16], 0
-
- pmaddwd m3, [r5 + 11 * 16] ; [24]
- paddd m3, [pd_16]
- psrld m3, 5
- pmaddwd m0, [r5 + 11 * 16]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m3, m0
-
- lea r0, [r0 + r1 * 4]
- movu [r0], m4
- movu [r0 + r1], m2
- movu [r0 + r1 * 2], m6
- movu [r0 + r4], m3
-
- RET
-
-cglobal intra_pred_ang8_21, 4,6,8
- lea r5, [ang_table + 20 * 16]
- add r1, r1
-
- movu m0, [r3] ; [7 6 5 4 3 2 1 0]
- movu m1, [r3 + 2] ; [8 7 6 5 4 3 2 1]
-
- punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
- punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
-
- mova m4, m3
- pmaddwd m4, [r5 - 5 * 16] ; [15]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r5 - 5 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- palignr m0, m3, 12
- movu m1, [r2]
- pshufb m1, [pw_ang8_15]
- palignr m3, m1, 12
-
- mova m2, m3
- pmaddwd m2, [r5 + 10 * 16] ; [30]
- paddd m2, [pd_16]
- psrld m2, 5
- mova m5, m0
- pmaddwd m5, [r5 + 10 * 16]
- paddd m5, [pd_16]
- psrld m5, 5
- packusdw m2, m5
-
- mova m6, m3
- pmaddwd m6, [r5 - 7 * 16] ; [13]
- paddd m6, [pd_16]
- psrld m6, 5
- mova m5, m0
- pmaddwd m5, [r5 - 7 * 16]
- paddd m5, [pd_16]
- psrld m5, 5
- packusdw m6, m5
-
- pslldq m1, 2
- palignr m0, m3, 12
- palignr m3, m1, 12
-
- mova m5, m3
- pmaddwd m5, [r5 + 8 * 16] ; [28]
- paddd m5, [pd_16]
- psrld m5, 5
- mova m7, m0
- pmaddwd m7, [r5 + 8 * 16]
- paddd m7, [pd_16]
- psrld m7, 5
- packusdw m5, m7
-
- lea r4, [r1 * 3]
- movu [r0], m4
- movu [r0 + r1], m2
- movu [r0 + r1 * 2], m6
- movu [r0 + r4], m5
-
- mova m4, m3
- pmaddwd m4, [r5 - 9 * 16] ; [11]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r5 - 9 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- pslldq m1, 2
- palignr m0, m3, 12
- palignr m3, m1, 12
-
- mova m2, m3
- pmaddwd m2, [r5 + 6 * 16] ; [26]
- paddd m2, [pd_16]
- psrld m2, 5
- mova m5, m0
- pmaddwd m5, [r5 + 6 * 16]
- paddd m5, [pd_16]
- psrld m5, 5
- packusdw m2, m5
-
- mova m6, m3
- pmaddwd m6, [r5 - 11 * 16] ; [9]
- paddd m6, [pd_16]
- psrld m6, 5
- mova m5, m0
- pmaddwd m5, [r5 - 11 * 16]
- paddd m5, [pd_16]
- psrld m5, 5
- packusdw m6, m5
-
- pslldq m1, 2
- palignr m0, m3, 12
- palignr m3, m1, 12
- pinsrw m3, [r2 + 16], 0
-
- pmaddwd m3, [r5 + 4 * 16] ; [24]
- paddd m3, [pd_16]
- psrld m3, 5
- pmaddwd m0, [r5 + 4 * 16]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m3, m0
-
- lea r0, [r0 + r1 * 4]
- movu [r0], m4
- movu [r0 + r1], m2
- movu [r0 + r1 * 2], m6
- movu [r0 + r4], m3
-
- RET
-
-cglobal intra_pred_ang8_22, 4,6,8
- lea r5, [ang_table + 18 * 16]
- add r1, r1
-
- movu m0, [r3] ; [7 6 5 4 3 2 1 0]
- movu m1, [r3 + 2] ; [8 7 6 5 4 3 2 1]
-
- punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
- punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
-
- mova m4, m3
- pmaddwd m4, [r5 + 1 * 16] ; [19]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r5 + 1 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- mova m2, m3
- pmaddwd m2, [r5 - 12 * 16] ; [6]
- paddd m2, [pd_16]
- psrld m2, 5
- mova m1, m0
- pmaddwd m1, [r5 - 12 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m2, m1
-
- palignr m0, m3, 12
- movu m1, [r2]
- pshufb m1, [pw_ang8_14]
- palignr m3, m1, 12
-
- mova m6, m3
- pmaddwd m6, [r5 + 7 * 16] ; [25]
- paddd m6, [pd_16]
- psrld m6, 5
- mova m5, m0
- pmaddwd m5, [r5 + 7 * 16]
- paddd m5, [pd_16]
- psrld m5, 5
- packusdw m6, m5
-
- mova m5, m3
- pmaddwd m5, [r5 - 6 * 16] ; [12]
- paddd m5, [pd_16]
- psrld m5, 5
- mova m7, m0
- pmaddwd m7, [r5 - 6 * 16]
- paddd m7, [pd_16]
- psrld m7, 5
- packusdw m5, m7
-
- lea r4, [r1 * 3]
- movu [r0], m4
- movu [r0 + r1], m2
- movu [r0 + r1 * 2], m6
- movu [r0 + r4], m5
-
- pslldq m1, 2
- palignr m0, m3, 12
- palignr m3, m1, 12
-
- mova m4, m3
- pmaddwd m4, [r5 + 13 * 16] ; [31]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r5 + 13 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- mova m2, m3
- pmaddwd m2, [r5] ; [18]
- paddd m2, [pd_16]
- psrld m2, 5
- mova m5, m0
- pmaddwd m5, [r5]
- paddd m5, [pd_16]
- psrld m5, 5
- packusdw m2, m5
-
- mova m6, m3
- pmaddwd m6, [r5 - 13 * 16] ; [5]
- paddd m6, [pd_16]
- psrld m6, 5
- mova m5, m0
- pmaddwd m5, [r5 - 13 * 16]
- paddd m5, [pd_16]
- psrld m5, 5
- packusdw m6, m5
-
- pslldq m1, 2
- palignr m0, m3, 12
- palignr m3, m1, 12
-
- pmaddwd m3, [r5 + 6 * 16] ; [24]
- paddd m3, [pd_16]
- psrld m3, 5
- pmaddwd m0, [r5 + 6 * 16]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m3, m0
-
- lea r0, [r0 + r1 * 4]
- movu [r0], m4
- movu [r0 + r1], m2
- movu [r0 + r1 * 2], m6
- movu [r0 + r4], m3
-
- RET
-
-cglobal intra_pred_ang8_23, 4,6,8
- lea r5, [ang_table + 14 * 16]
- add r1, r1
-
- movu m0, [r3] ; [7 6 5 4 3 2 1 0]
- movu m1, [r3 + 2] ; [8 7 6 5 4 3 2 1]
-
- punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
- punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
-
- mova m4, m3
- pmaddwd m4, [r5 + 9 * 16] ; [23]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r5 + 9 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- mova m2, m3
- pmaddwd m2, [r5] ; [14]
- paddd m2, [pd_16]
- psrld m2, 5
- mova m1, m0
- pmaddwd m1, [r5]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m2, m1
-
- mova m6, m3
- pmaddwd m6, [r5 - 9 * 16] ; [5]
- paddd m6, [pd_16]
- psrld m6, 5
- mova m1, m0
- pmaddwd m1, [r5 - 9 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- palignr m0, m3, 12
- movu m1, [r2]
- pshufb m1, [pw_ang8_13]
- palignr m3, m1, 12
-
- mova m5, m3
- pmaddwd m5, [r5 + 14 * 16] ; [28]
- paddd m5, [pd_16]
- psrld m5, 5
- mova m7, m0
- pmaddwd m7, [r5 + 14 * 16]
- paddd m7, [pd_16]
- psrld m7, 5
- packusdw m5, m7
-
- lea r4, [r1 * 3]
- movu [r0], m4
- movu [r0 + r1], m2
- movu [r0 + r1 * 2], m6
- movu [r0 + r4], m5
-
- mova m4, m3
- pmaddwd m4, [r5 + 5 * 16] ; [19]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r5 + 5 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- mova m2, m3
- pmaddwd m2, [r5 - 4 * 16] ; [10]
- paddd m2, [pd_16]
- psrld m2, 5
- mova m5, m0
- pmaddwd m5, [r5 - 4 * 16]
- paddd m5, [pd_16]
- psrld m5, 5
- packusdw m2, m5
-
- mova m6, m3
- pmaddwd m6, [r5 - 13 * 16] ; [1]
- paddd m6, [pd_16]
- psrld m6, 5
- mova m5, m0
- pmaddwd m5, [r5 - 13 * 16]
- paddd m5, [pd_16]
- psrld m5, 5
- packusdw m6, m5
-
- pslldq m1, 2
- palignr m0, m3, 12
- palignr m3, m1, 12
-
- pmaddwd m3, [r5 + 10 * 16] ; [24]
- paddd m3, [pd_16]
- psrld m3, 5
- pmaddwd m0, [r5 + 10 * 16]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m3, m0
-
- lea r0, [r0 + r1 * 4]
- movu [r0], m4
- movu [r0 + r1], m2
- movu [r0 + r1 * 2], m6
- movu [r0 + r4], m3
-
- RET
-
-cglobal intra_pred_ang8_24, 4,6,7
- lea r5, [ang_table + 16 * 16]
- add r1, r1
-
- movu m0, [r3] ; [7 6 5 4 3 2 1 0]
- movu m1, [r3 + 2] ; [8 7 6 5 4 3 2 1]
-
- punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
- punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
-
- mova m4, m3
- pmaddwd m4, [r5 + 11 * 16] ; [27]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r5 + 11 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- mova m2, m3
- pmaddwd m2, [r5 + 6 * 16] ; [22]
- paddd m2, [pd_16]
- psrld m2, 5
- mova m1, m0
- pmaddwd m1, [r5 + 6 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m2, m1
-
- mova m6, m3
- pmaddwd m6, [r5 + 1 * 16] ; [17]
- paddd m6, [pd_16]
- psrld m6, 5
- mova m1, m0
- pmaddwd m1, [r5 + 1 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- mova m5, m3
- pmaddwd m5, [r5 - 4 * 16] ; [12]
- paddd m5, [pd_16]
- psrld m5, 5
- mova m1, m0
- pmaddwd m1, [r5 - 4 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m5, m1
-
- lea r4, [r1 * 3]
- movu [r0], m4
- movu [r0 + r1], m2
- movu [r0 + r1 * 2], m6
- movu [r0 + r4], m5
-
- mova m4, m3
- pmaddwd m4, [r5 - 9 * 16] ; [7]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r5 - 9 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- mova m2, m3
- pmaddwd m2, [r5 - 14 * 16] ; [2]
- paddd m2, [pd_16]
- psrld m2, 5
- mova m1, m0
- pmaddwd m1, [r5 - 14 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m2, m1
-
- palignr m0, m3, 12
- movu m1, [r2]
- pshufb m1, [pw_ang8_12]
- palignr m3, m1, 12
-
- mova m6, m3
- pmaddwd m6, [r5 + 13 * 16] ; [29]
- paddd m6, [pd_16]
- psrld m6, 5
- mova m5, m0
- pmaddwd m5, [r5 + 13 * 16]
- paddd m5, [pd_16]
- psrld m5, 5
- packusdw m6, m5
-
- pmaddwd m3, [r5 + 8 * 16] ; [24]
- paddd m3, [pd_16]
- psrld m3, 5
- pmaddwd m0, [r5 + 8 * 16]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m3, m0
-
- lea r0, [r0 + r1 * 4]
- movu [r0], m4
- movu [r0 + r1], m2
- movu [r0 + r1 * 2], m6
- movu [r0 + r4], m3
-
- RET
-
-cglobal intra_pred_ang8_25, 3,5,7
- mov r2, r3mp
- lea r3, [ang_table + 23 * 16]
- add r1, r1
-
- movu m0, [r2] ; [7 6 5 4 3 2 1 0]
- movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
-
- punpcklwd m3, m0, m1 ; [4 3 3 2 2 1 1 0]
- punpckhwd m0, m1 ; [8 7 7 6 6 5 5 4]
-
- mova m4, m3
- pmaddwd m4, [r3 + 7 * 16] ; [30]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r3 + 7 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- mova m2, m3
- pmaddwd m2, [r3 + 5 * 16] ; [28]
- paddd m2, [pd_16]
- psrld m2, 5
- mova m1, m0
- pmaddwd m1, [r3 + 5 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m2, m1
-
- mova m6, m3
- pmaddwd m6, [r3 + 3 * 16] ; [26]
- paddd m6, [pd_16]
- psrld m6, 5
- mova m1, m0
- pmaddwd m1, [r3 + 3 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- mova m5, m3
- pmaddwd m5, [r3 + 1 * 16] ; [24]
- paddd m5, [pd_16]
- psrld m5, 5
- mova m1, m0
- pmaddwd m1, [r3 + 1 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m5, m1
-
- lea r4, [r1 * 3]
- movu [r0], m4
- movu [r0 + r1], m2
- movu [r0 + r1 * 2], m6
- movu [r0 + r4], m5
-
- mova m4, m3
- pmaddwd m4, [r3 - 1 * 16] ; [22]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r3 - 1 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- mova m2, m3
- pmaddwd m2, [r3 - 3 * 16] ; [20]
- paddd m2, [pd_16]
- psrld m2, 5
- mova m1, m0
- pmaddwd m1, [r3 - 3 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m2, m1
-
- mova m6, m3
- pmaddwd m6, [r3 - 5 * 16] ; [18]
- paddd m6, [pd_16]
- psrld m6, 5
- mova m5, m0
- pmaddwd m5, [r3 - 5 * 16]
- paddd m5, [pd_16]
- psrld m5, 5
- packusdw m6, m5
-
- pmaddwd m3, [r3 - 7 * 16] ; [16]
- paddd m3, [pd_16]
- psrld m3, 5
- pmaddwd m0, [r3 - 7 * 16]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m3, m0
-
- lea r0, [r0 + r1 * 4]
- movu [r0], m4
- movu [r0 + r1], m2
- movu [r0 + r1 * 2], m6
- movu [r0 + r4], m3
-
- RET
-
-cglobal intra_pred_ang8_26, 4,5,3
- movu m0, [r3 + 2] ; [8 7 6 5 4 3 2 1]
- add r1, r1
- lea r4, [r1 * 3]
-
- movu [r0], m0
- movu [r0 + r1], m0
- movu [r0 + r1 * 2], m0
- movu [r0 + r4], m0
-
- lea r3, [r0 + r1 *4]
- movu [r3], m0
- movu [r3 + r1], m0
- movu [r3 + r1 * 2], m0
- movu [r3 + r4], m0
-
- cmp r5m, byte 0
- jz .quit
-
- ; filter
-
- pshufb m0, [pw_unpackwdq]
- movh m1, [r2] ; [3 2 1 0]
- pshufb m2, m1, [pw_unpackwdq] ; [0 0 0 0 0 0 0 0]
- movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
- psubw m1, m2
- psraw m1, 1
- paddw m0, m1
- pxor m1, m1
- pmaxsw m0, m1
- pminsw m0, [pw_1023]
- pextrw [r0], m0, 0
- pextrw [r0 + r1], m0, 1
- pextrw [r0 + r1 * 2], m0, 2
- pextrw [r0 + r4], m0, 3
- pextrw [r3], m0, 4
- pextrw [r3 + r1], m0, 5
- pextrw [r3 + r1 * 2], m0, 6
- pextrw [r3 + r4], m0, 7
-
-.quit:
- RET
-
-cglobal intra_pred_ang8_27, 3,5,7
- mov r2, r3mp
- lea r3, [ang_table + 9 * 16]
- add r1, r1
-
- movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
- movu m1, [r2 + 4] ; [9 8 7 6 5 4 3 2]
-
- punpcklwd m3, m0, m1 ; [5 4 4 3 3 2 2 1]
- punpckhwd m0, m1 ; [9 8 8 7 7 6 6 5]
-
- mova m4, m3
- pmaddwd m4, [r3 - 7 * 16] ; [2]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r3 - 7 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- mova m2, m3
- pmaddwd m2, [r3 - 5 * 16] ; [4]
- paddd m2, [pd_16]
- psrld m2, 5
- mova m1, m0
- pmaddwd m1, [r3 - 5 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m2, m1
-
- mova m6, m3
- pmaddwd m6, [r3 - 3 * 16] ; [6]
- paddd m6, [pd_16]
- psrld m6, 5
- mova m1, m0
- pmaddwd m1, [r3 - 3 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- mova m5, m3
- pmaddwd m5, [r3 - 1 * 16] ; [8]
- paddd m5, [pd_16]
- psrld m5, 5
- mova m1, m0
- pmaddwd m1, [r3 - 1 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m5, m1
-
- lea r4, [r1 * 3]
- movu [r0], m4
- movu [r0 + r1], m2
- movu [r0 + r1 * 2], m6
- movu [r0 + r4], m5
-
- mova m4, m3
- pmaddwd m4, [r3 + 1 * 16] ; [10]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r3 + 1 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- mova m2, m3
- pmaddwd m2, [r3 + 3 * 16] ; [12]
- paddd m2, [pd_16]
- psrld m2, 5
- mova m1, m0
- pmaddwd m1, [r3 + 3 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m2, m1
-
- mova m6, m3
- pmaddwd m6, [r3 + 5 * 16] ; [14]
- paddd m6, [pd_16]
- psrld m6, 5
- mova m5, m0
- pmaddwd m5, [r3 + 5 * 16]
- paddd m5, [pd_16]
- psrld m5, 5
- packusdw m6, m5
-
- pmaddwd m3, [r3 + 7 * 16] ; [16]
- paddd m3, [pd_16]
- psrld m3, 5
- pmaddwd m0, [r3 + 7 * 16]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m3, m0
-
- lea r0, [r0 + r1 * 4]
- movu [r0], m4
- movu [r0 + r1], m2
- movu [r0 + r1 * 2], m6
- movu [r0 + r4], m3
-
- RET
-
-cglobal intra_pred_ang8_28, 3,5,7
- mov r2, r3mp
- lea r3, [ang_table + 17 * 16]
- add r1, r1
-
- movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
- movu m1, [r2 + 4] ; [9 8 7 6 5 4 3 2]
-
- punpcklwd m3, m0, m1 ; [5 4 4 3 3 2 2 1]
- punpckhwd m0, m1 ; [9 8 8 7 7 6 6 5]
-
- mova m4, m3
- pmaddwd m4, [r3 - 12 * 16] ; [5]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r3 - 12 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- mova m2, m3
- pmaddwd m2, [r3 - 7 * 16] ; [10]
- paddd m2, [pd_16]
- psrld m2, 5
- mova m1, m0
- pmaddwd m1, [r3 - 7 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m2, m1
-
- mova m6, m3
- pmaddwd m6, [r3 - 2 * 16] ; [15]
- paddd m6, [pd_16]
- psrld m6, 5
- mova m1, m0
- pmaddwd m1, [r3 - 2 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- mova m5, m3
- pmaddwd m5, [r3 + 3 * 16] ; [20]
- paddd m5, [pd_16]
- psrld m5, 5
- mova m1, m0
- pmaddwd m1, [r3 + 3 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m5, m1
-
- lea r4, [r1 * 3]
- movu [r0], m4
- movu [r0 + r1], m2
- movu [r0 + r1 * 2], m6
- movu [r0 + r4], m5
-
- mova m4, m3
- pmaddwd m4, [r3 + 8 * 16] ; [25]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r3 + 8 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- mova m2, m3
- pmaddwd m2, [r3 + 13 * 16] ; [30]
- paddd m2, [pd_16]
- psrld m2, 5
- mova m1, m0
- pmaddwd m1, [r3 + 13 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m2, m1
-
- movh m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
-
- palignr m6, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
- mova m5, m6
- pmaddwd m6, [r3 - 14 * 16] ; [3]
- paddd m6, [pd_16]
- psrld m6, 5
- palignr m1, m0, 4 ; [10 9 9 8 8 7 7 6]
- mova m3, m1
- pmaddwd m1, [r3 - 14 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- pmaddwd m5, [r3 - 9 * 16] ; [8]
- paddd m5, [pd_16]
- psrld m5, 5
- pmaddwd m3, [r3 - 9 * 16]
- paddd m3, [pd_16]
- psrld m3, 5
- packusdw m5, m3
-
- lea r0, [r0 + r1 * 4]
- movu [r0], m4
- movu [r0 + r1], m2
- movu [r0 + r1 * 2], m6
- movu [r0 + r4], m5
-
- RET
-
-cglobal intra_pred_ang8_29, 3,5,8
- mov r2, r3mp
- lea r3, [ang_table + 18 * 16]
- add r1, r1
-
- movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
- movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
- palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2]
- psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10]
-
- punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1]
- punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5]
- punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9]
-
- mova m4, m3
- pmaddwd m4, [r3 - 9 * 16] ; [9]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r3 - 9 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- mova m2, m3
- pmaddwd m2, [r3] ; [18]
- paddd m2, [pd_16]
- psrld m2, 5
- mova m1, m0
- pmaddwd m1, [r3]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m2, m1
-
- mova m6, m3
- pmaddwd m6, [r3 + 9 * 16] ; [27]
- paddd m6, [pd_16]
- psrld m6, 5
- mova m1, m0
- pmaddwd m1, [r3 + 9 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- palignr m7, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
- pmaddwd m7, [r3 - 14 * 16] ; [4]
- paddd m7, [pd_16]
- psrld m7, 5
- palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
- pmaddwd m1, [r3 - 14 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m7, m1
-
- lea r4, [r1 * 3]
- movu [r0], m4
- movu [r0 + r1], m2
- movu [r0 + r1 * 2], m6
- movu [r0 + r4], m7
-
- palignr m4, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
- mova m6, m4
- pmaddwd m4, [r3 - 5 * 16] ; [13]
- paddd m4, [pd_16]
- psrld m4, 5
- palignr m2, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
- mova m7, m2
- pmaddwd m2, [r3 - 5 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- pmaddwd m2, m6, [r3 + 4 * 16] ; [22]
- paddd m2, [pd_16]
- psrld m2, 5
- pmaddwd m1, m7, [r3 + 4 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m2, m1
-
- pmaddwd m6, [r3 + 13 * 16] ; [31]
- paddd m6, [pd_16]
- psrld m6, 5
- pmaddwd m7, [r3 + 13 * 16]
- paddd m7, [pd_16]
- psrld m7, 5
- packusdw m6, m7
-
- palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
- pmaddwd m7, [r3 - 10 * 16] ; [8]
- paddd m7, [pd_16]
- psrld m7, 5
- palignr m5, m0, 8 ; [11 10 10 9 9 8 8 7]
- pmaddwd m5, [r3 - 10 * 16]
- paddd m5, [pd_16]
- psrld m5, 5
- packusdw m7, m5
-
- lea r0, [r0 + r1 * 4]
- movu [r0], m4
- movu [r0 + r1], m2
- movu [r0 + r1 * 2], m6
- movu [r0 + r4], m7
-
- RET
-
-cglobal intra_pred_ang8_30, 3,5,8
- mov r2, r3mp
- lea r3, [ang_table + 14 * 16]
- add r1, r1
-
- movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
- movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
- palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2]
- psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10]
-
- punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1]
- punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5]
- punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9]
-
- mova m4, m3
- pmaddwd m4, [r3 - 1 * 16] ; [13]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r3 - 1 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- mova m2, m3
- pmaddwd m2, [r3 + 12 * 16] ; [26]
- paddd m2, [pd_16]
- psrld m2, 5
- mova m1, m0
- pmaddwd m1, [r3 + 12 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m2, m1
-
- palignr m6, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
- mova m7, m6
- pmaddwd m6, [r3 - 7 * 16] ; [7]
- paddd m6, [pd_16]
- psrld m6, 5
- palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
- pmaddwd m1, [r3 - 7 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- pmaddwd m7, [r3 + 6 * 16] ; [20]
- paddd m7, [pd_16]
- psrld m7, 5
- palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
- pmaddwd m1, [r3 + 6 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m7, m1
-
- lea r4, [r1 * 3]
- movu [r0], m4
- movu [r0 + r1], m2
- movu [r0 + r1 * 2], m6
- movu [r0 + r4], m7
-
- palignr m4, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
- mova m6, m4
- pmaddwd m4, [r3 - 13 * 16] ; [1]
- paddd m4, [pd_16]
- psrld m4, 5
- palignr m2, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
- mova m7, m2
- pmaddwd m2, [r3 - 13 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- pmaddwd m2, m6, [r3] ; [14]
- paddd m2, [pd_16]
- psrld m2, 5
- pmaddwd m1, m7, [r3]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m2, m1
-
- pmaddwd m6, [r3 + 13 * 16] ; [27]
- paddd m6, [pd_16]
- psrld m6, 5
- pmaddwd m7, [r3 + 13 * 16]
- paddd m7, [pd_16]
- psrld m7, 5
- packusdw m6, m7
-
- palignr m7, m0, m3, 12 ; [8 7 7 6 6 5 5 4]
- pmaddwd m7, [r3 - 6 * 16] ; [8]
- paddd m7, [pd_16]
- psrld m7, 5
- palignr m5, m0, 12 ; [12 11 11 10 10 9 9 8]
- pmaddwd m5, [r3 - 6 * 16]
- paddd m5, [pd_16]
- psrld m5, 5
- packusdw m7, m5
-
- lea r0, [r0 + r1 * 4]
- movu [r0], m4
- movu [r0 + r1], m2
- movu [r0 + r1 * 2], m6
- movu [r0 + r4], m7
-
- RET
-
-cglobal intra_pred_ang8_31, 3,5,8
- mov r2, r3mp
- lea r3, [ang_table + 13 * 16]
- add r1, r1
-
- movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
- movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
- palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2]
- psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10]
-
- punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1]
- punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5]
- punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9]
-
- mova m4, m3
- pmaddwd m4, [r3 + 4 * 16] ; [17]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r3 + 4 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- palignr m2, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
- mova m6, m2
- pmaddwd m2, [r3 - 11 * 16] ; [2]
- paddd m2, [pd_16]
- psrld m2, 5
- palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
- mova m7, m1
- pmaddwd m1, [r3 - 11 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m2, m1
-
- pmaddwd m6, [r3 + 6 * 16] ; [19]
- paddd m6, [pd_16]
- psrld m6, 5
- pmaddwd m7, [r3 + 6 * 16]
- paddd m7, [pd_16]
- psrld m7, 5
- packusdw m6, m7
-
- palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
- pmaddwd m7, [r3 - 9 * 16] ; [4]
- paddd m7, [pd_16]
- psrld m7, 5
- palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
- pmaddwd m1, [r3 - 9 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m7, m1
-
- lea r4, [r1 * 3]
- movu [r0], m4
- movu [r0 + r1], m2
- movu [r0 + r1 * 2], m6
- movu [r0 + r4], m7
-
- palignr m4, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
- pmaddwd m4, [r3 + 8 * 16] ; [21]
- paddd m4, [pd_16]
- psrld m4, 5
- palignr m2, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
- pmaddwd m2, [r3 + 8 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- palignr m2, m0, m3, 12 ; [8 7 7 6 6 5 5 4]
- mova m6, m2
- pmaddwd m2, [r3 - 7 * 16] ; [6]
- paddd m2, [pd_16]
- psrld m2, 5
- palignr m1, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
- mova m7, m1
- pmaddwd m1, [r3 - 7 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m2, m1
-
- pmaddwd m6, [r3 + 10 * 16] ; [23]
- paddd m6, [pd_16]
- psrld m6, 5
- pmaddwd m7, [r3 + 10 * 16]
- paddd m7, [pd_16]
- psrld m7, 5
- packusdw m6, m7
-
- mova m7, m0
- pmaddwd m7, [r3 - 5 * 16] ; [8]
- paddd m7, [pd_16]
- psrld m7, 5
- mova m1, m5
- pmaddwd m1, [r3 - 5 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m7, m1
-
- lea r0, [r0 + r1 * 4]
- movu [r0], m4
- movu [r0 + r1], m2
- movu [r0 + r1 * 2], m6
- movu [r0 + r4], m7
-
- RET
-
-cglobal intra_pred_ang8_32, 3,6,8
- mov r2, r3mp
- lea r3, [ang_table + 19 * 16]
- add r1, r1
-
- movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
- movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
- palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2]
- psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10]
-
- punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1]
- punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5]
- punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9]
-
- mova m4, m3
- pmaddwd m4, [r3 + 2 * 16] ; [21]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r3 + 2 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- palignr m2, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
- mova m6, m2
- pmaddwd m2, [r3 - 9 * 16] ; [10]
- paddd m2, [pd_16]
- psrld m2, 5
- palignr m1, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
- mova m7, m1
- pmaddwd m1, [r3 - 9 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m2, m1
-
- pmaddwd m6, [r3 + 12 * 16] ; [31]
- paddd m6, [pd_16]
- psrld m6, 5
- pmaddwd m7, [r3 + 12 * 16]
- paddd m7, [pd_16]
- psrld m7, 5
- packusdw m6, m7
-
- palignr m7, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
- pmaddwd m7, [r3 + 1 * 16] ; [20]
- paddd m7, [pd_16]
- psrld m7, 5
- palignr m1, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
- pmaddwd m1, [r3 + 1 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m7, m1
-
- lea r4, [r1 * 3]
- movu [r0], m4
- movu [r0 + r1], m2
- movu [r0 + r1 * 2], m6
- movu [r0 + r4], m7
-
- palignr m4, m0, m3, 12 ; [8 7 7 6 6 5 5 4]
- mova m2, m4
- pmaddwd m4, [r3 - 10 * 16] ; [ 9]
- paddd m4, [pd_16]
- psrld m4, 5
- palignr m3, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
- mova m6, m3
- pmaddwd m3, [r3 - 10 * 16]
- paddd m3, [pd_16]
- psrld m3, 5
- packusdw m4, m3
-
- pmaddwd m2, [r3 + 11 * 16] ; [30]
- paddd m2, [pd_16]
- psrld m2, 5
- pmaddwd m6, [r3 + 11 * 16]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m2, m6
-
- mova m6, m0
- pmaddwd m6, [r3] ; [19]
- paddd m6, [pd_16]
- psrld m6, 5
- mova m7, m5
- pmaddwd m7, [r3]
- paddd m7, [pd_16]
- psrld m7, 5
- packusdw m6, m7
-
- movh m1, [r2 + 26] ; [16 15 14 13]
- palignr m7, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
- pmaddwd m7, [r3 - 11 * 16] ; [8]
- paddd m7, [pd_16]
- psrld m7, 5
- palignr m1, m5, 4 ; [14 13 13 12 12 11 11 10]
- pmaddwd m1, [r3 - 11 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m7, m1
-
- lea r0, [r0 + r1 * 4]
- movu [r0], m4
- movu [r0 + r1], m2
- movu [r0 + r1 * 2], m6
- movu [r0 + r4], m7
-
- RET
-
-cglobal intra_pred_ang8_33, 3,5,8
- mov r2, r3mp
- lea r3, [ang_table + 14 * 16]
- add r1, r1
-
- movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
- movu m1, [r2 + 18] ; [16 15 14 13 12 11 10 9]
- palignr m2, m1, m0, 2 ; [9 8 7 6 5 4 3 2]
- psrldq m4, m1, 2 ; [x 16 15 14 13 12 11 10]
-
- punpcklwd m3, m0, m2 ; [5 4 4 3 3 2 2 1]
- punpckhwd m0, m2 ; [9 8 8 7 7 6 6 5]
- punpcklwd m5, m1, m4 ; [13 12 12 11 11 10 10 9]
- punpckhwd m1, m4 ; [x 16 16 15 15 14 14 13]
-
- mova m4, m3
- pmaddwd m4, [r3 + 12 * 16] ; [26]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m0
- pmaddwd m2, [r3 + 12 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- palignr m2, m0, m3, 4 ; [6 5 5 4 4 3 3 2]
- pmaddwd m2, [r3 + 6 * 16] ; [20]
- paddd m2, [pd_16]
- psrld m2, 5
- palignr m6, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
- pmaddwd m6, [r3 + 6 * 16]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m2, m6
-
- palignr m6, m0, m3, 8 ; [7 6 6 5 5 4 4 3]
- pmaddwd m6, [r3] ; [14]
- paddd m6, [pd_16]
- psrld m6, 5
- palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
- pmaddwd m7, [r3]
- paddd m7, [pd_16]
- psrld m7, 5
- packusdw m6, m7
-
- palignr m7, m0, m3, 12 ; [8 7 7 6 6 5 5 4]
- pmaddwd m7, [r3 - 6 * 16] ; [ 8]
- paddd m7, [pd_16]
- psrld m7, 5
- palignr m3, m5, m0, 12 ; [12 11 11 10 10 9 9 8]
- pmaddwd m3, [r3 - 6 * 16]
- paddd m3, [pd_16]
- psrld m3, 5
- packusdw m7, m3
-
- lea r4, [r1 * 3]
- movu [r0], m4
- movu [r0 + r1], m2
- movu [r0 + r1 * 2], m6
- movu [r0 + r4], m7
-
- mova m4, m0
- pmaddwd m4, [r3 - 12 * 16] ; [ 2]
- paddd m4, [pd_16]
- psrld m4, 5
- mova m2, m5
- pmaddwd m2, [r3 - 12 * 16]
- paddd m2, [pd_16]
- psrld m2, 5
- packusdw m4, m2
-
- mova m2, m0
- pmaddwd m2, [r3 + 14 * 16] ; [28]
- paddd m2, [pd_16]
- psrld m2, 5
- mova m6, m5
- pmaddwd m6, [r3 + 14 * 16]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m2, m6
-
- palignr m6, m5, m0, 4 ; [10 9 9 8 8 7 7 6]
- pmaddwd m6, [r3 + 8 * 16] ; [22]
- paddd m6, [pd_16]
- psrld m6, 5
- palignr m7, m1, m5, 4 ; [14 13 13 12 12 11 11 10]
- pmaddwd m7, [r3 + 8 * 16]
- paddd m7, [pd_16]
- psrld m7, 5
- packusdw m6, m7
-
- palignr m7, m5, m0, 8 ; [11 10 10 9 9 8 8 7]
- pmaddwd m7, [r3 + 2 * 16] ; [16]
- paddd m7, [pd_16]
- psrld m7, 5
- palignr m1, m5, 8 ; [15 14 14 13 13 12 12 11]
- pmaddwd m1, [r3 + 2 * 16]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m7, m1
-
- lea r0, [r0 + r1 * 4]
- movu [r0], m4
- movu [r0 + r1], m2
- movu [r0 + r1 * 2], m6
- movu [r0 + r4], m7
-
- RET
-
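
For reference, every removed ang8 kernel above follows the same two-tap pattern:
each pmaddwd against a row of ang_table computes ref[i] * (32 - fract) +
ref[i + 1] * fract for four output pixels, the paddd with [pd_16] and psrld by 5
apply the rounding, and packusdw repacks the dwords into 16-bit samples. A
minimal scalar sketch of that per-pixel step (illustrative names only, not taken
from x265):

    #include <stdint.h>

    /* Two-tap angular interpolation as performed by the pmaddwd/paddd/psrld
     * sequence in the kernels above; fract is the row's weight (0..31). */
    static inline uint16_t ang_interp(const uint16_t *ref, int idx, int fract)
    {
        return (uint16_t)((ref[idx] * (32 - fract) + ref[idx + 1] * fract + 16) >> 5);
    }
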
;-----------------------------------------------------------------------------------------
; void intraPredAng8(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
;-----------------------------------------------------------------------------------------
@@ -8568,89 +4914,6 @@
movu [r0 + r4], m7
RET
-;-----------------------------------------------------------------------------
-; void intraPredAng16(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;-----------------------------------------------------------------------------
-INIT_XMM ssse3
-cglobal intra_pred_ang16_2, 3,4,5
- cmp r4m, byte 34
- cmove r2, r3mp
- add r1, r1
- lea r3, [r1 * 3]
- movu m0, [r2 + 4]
- movu m1, [r2 + 20]
- movu m2, [r2 + 36]
-
- movu [r0], m0
- movu [r0 + 16], m1
- palignr m3, m1, m0, 2
- palignr m4, m2, m1, 2
- movu [r0 + r1], m3
- movu [r0 + r1 + 16], m4
- palignr m3, m1, m0, 4
- palignr m4, m2, m1, 4
- movu [r0 + r1 * 2], m3
- movu [r0 + r1 * 2 + 16], m4
- palignr m3, m1, m0, 6
- palignr m4, m2, m1, 6
- movu [r0 + r3], m3
- movu [r0 + r3 + 16], m4
-
- lea r0, [r0 + r1 * 4]
- palignr m3, m1, m0, 8
- palignr m4, m2, m1, 8
- movu [r0], m3
- movu [r0 + 16], m4
- palignr m3, m1, m0, 10
- palignr m4, m2, m1, 10
- movu [r0 + r1], m3
- movu [r0 + r1 + 16], m4
- palignr m3, m1, m0, 12
- palignr m4, m2, m1, 12
- movu [r0 + r1 * 2], m3
- movu [r0 + r1 * 2 + 16], m4
- palignr m3, m1, m0, 14
- palignr m4, m2, m1, 14
- movu [r0 + r3], m3
- movu [r0 + r3 + 16], m4
-
- movu m0, [r2 + 52]
- lea r0, [r0 + r1 * 4]
- movu [r0], m1
- movu [r0 + 16], m2
- palignr m3, m2, m1, 2
- palignr m4, m0, m2, 2
- movu [r0 + r1], m3
- movu [r0 + r1 + 16], m4
- palignr m3, m2, m1, 4
- palignr m4, m0, m2, 4
- movu [r0 + r1 * 2], m3
- movu [r0 + r1 * 2 + 16], m4
- palignr m3, m2, m1, 6
- palignr m4, m0, m2, 6
- movu [r0 + r3], m3
- movu [r0 + r3 + 16], m4
-
- lea r0, [r0 + r1 * 4]
- palignr m3, m2, m1, 8
- palignr m4, m0, m2, 8
- movu [r0], m3
- movu [r0 + 16], m4
- palignr m3, m2, m1, 10
- palignr m4, m0, m2, 10
- movu [r0 + r1], m3
- movu [r0 + r1 + 16], m4
- palignr m3, m2, m1, 12
- palignr m4, m0, m2, 12
- movu [r0 + r1 * 2], m3
- movu [r0 + r1 * 2 + 16], m4
- palignr m3, m2, m1, 14
- palignr m4, m0, m2, 14
- movu [r0 + r3], m3
- movu [r0 + r3 + 16], m4
-
- RET
-
%macro TRANSPOSE_STORE 6
jnz .skip%6
punpckhwd %5, %1, %2
@@ -8870,38 +5133,6 @@
ret
-cglobal intra_pred_ang16_3, 3,7,8
- xor r6d, r6d
- lea r3, [ang_table + 16 * 16]
- add r1, r1
- lea r4, [r1 * 3]
-
- call ang16_mode_3_33
-
- lea r2, [r2 + 16]
- lea r0, [r0 + r1 * 8]
-
- call ang16_mode_3_33
-
- RET
-
-cglobal intra_pred_ang16_33, 4,7,8
- xor r6d, r6d
- inc r6d
- mov r2, r3
- lea r3, [ang_table + 16 * 16]
- add r1, r1
- lea r4, [r1 * 3]
-
- call ang16_mode_3_33
-
- lea r2, [r2 + 16]
- lea r0, [r0 + 16]
-
- call ang16_mode_3_33
-
- RET
-
cglobal ang16_mode_4_32
test r6d, r6d
movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
@@ -9098,38 +5329,6 @@
ret
-cglobal intra_pred_ang16_4, 3,7,8
- xor r6d, r6d
- lea r3, [ang_table + 18 * 16]
- add r1, r1
- lea r4, [r1 * 3]
-
- call ang16_mode_4_32
-
- lea r2, [r2 + 16]
- lea r0, [r0 + r1 * 8]
-
- call ang16_mode_4_32
-
- RET
-
-cglobal intra_pred_ang16_32, 4,7,8
- xor r6d, r6d
- inc r6d
- mov r2, r3
- lea r3, [ang_table + 18 * 16]
- add r1, r1
- lea r4, [r1 * 3]
-
- call ang16_mode_4_32
-
- lea r2, [r2 + 16]
- lea r0, [r0 + 16]
-
- call ang16_mode_4_32
-
- RET
-
cglobal ang16_mode_5_31
test r6d, r6d
movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
@@ -9322,38 +5521,6 @@
ret
-cglobal intra_pred_ang16_5, 3,7,8
- xor r6d, r6d
- lea r3, [ang_table + 16 * 16]
- add r1, r1
- lea r4, [r1 * 3]
-
- call ang16_mode_5_31
-
- lea r2, [r2 + 16]
- lea r0, [r0 + r1 * 8]
-
- call ang16_mode_5_31
-
- RET
-
-cglobal intra_pred_ang16_31, 4,7,8
- xor r6d, r6d
- inc r6d
- mov r2, r3
- lea r3, [ang_table + 16 * 16]
- add r1, r1
- lea r4, [r1 * 3]
-
- call ang16_mode_5_31
-
- lea r2, [r2 + 16]
- lea r0, [r0 + 16]
-
- call ang16_mode_5_31
-
- RET
-
cglobal ang16_mode_6_30
test r6d, r6d
movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
@@ -9547,38 +5714,6 @@
ret
-cglobal intra_pred_ang16_6, 3,7,8
- xor r6d, r6d
- lea r3, [ang_table + 15 * 16]
- add r1, r1
- lea r4, [r1 * 3]
-
- call ang16_mode_6_30
-
- lea r2, [r2 + 16]
- lea r0, [r0 + r1 * 8]
-
- call ang16_mode_6_30
-
- RET
-
-cglobal intra_pred_ang16_30, 4,7,8
- xor r6d, r6d
- inc r6d
- mov r2, r3
- lea r3, [ang_table + 15 * 16]
- add r1, r1
- lea r4, [r1 * 3]
-
- call ang16_mode_6_30
-
- lea r2, [r2 + 16]
- lea r0, [r0 + 16]
-
- call ang16_mode_6_30
-
- RET
-
cglobal ang16_mode_7_29
test r6d, r6d
movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
@@ -9766,38 +5901,6 @@
ret
-cglobal intra_pred_ang16_7, 3,7,8
- xor r6d, r6d
- lea r3, [ang_table + 17 * 16]
- add r1, r1
- lea r4, [r1 * 3]
-
- call ang16_mode_7_29
-
- lea r2, [r2 + 16]
- lea r0, [r0 + r1 * 8]
-
- call ang16_mode_7_29
-
- RET
-
-cglobal intra_pred_ang16_29, 4,7,8
- xor r6d, r6d
- inc r6d
- mov r2, r3
- lea r3, [ang_table + 17 * 16]
- add r1, r1
- lea r4, [r1 * 3]
-
- call ang16_mode_7_29
-
- lea r2, [r2 + 16]
- lea r0, [r0 + 16]
-
- call ang16_mode_7_29
-
- RET
-
cglobal ang16_mode_8_28
test r6d, r6d
movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
@@ -9987,38 +6090,6 @@
ret
-cglobal intra_pred_ang16_8, 3,7,8
- xor r6d, r6d
- lea r3, [ang_table + 15 * 16]
- add r1, r1
- lea r4, [r1 * 3]
-
- call ang16_mode_8_28
-
- lea r2, [r2 + 16]
- lea r0, [r0 + r1 * 8]
-
- call ang16_mode_8_28
-
- RET
-
-cglobal intra_pred_ang16_28, 4,7,8
- xor r6d, r6d
- inc r6d
- mov r2, r3
- lea r3, [ang_table + 15 * 16]
- add r1, r1
- lea r4, [r1 * 3]
-
- call ang16_mode_8_28
-
- lea r2, [r2 + 16]
- lea r0, [r0 + 16]
-
- call ang16_mode_8_28
-
- RET
-
cglobal ang16_mode_9_27
test r6d, r6d
movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
@@ -10195,38 +6266,6 @@
ret
-cglobal intra_pred_ang16_9, 3,7,8
- xor r6d, r6d
- lea r3, [ang_table + 16 * 16]
- add r1, r1
- lea r4, [r1 * 3]
-
- call ang16_mode_9_27
-
- lea r2, [r2 + 16]
- lea r0, [r0 + r1 * 8]
-
- call ang16_mode_9_27
-
- RET
-
-cglobal intra_pred_ang16_27, 4,7,8
- xor r6d, r6d
- inc r6d
- mov r2, r3
- lea r3, [ang_table + 16 * 16]
- add r1, r1
- lea r4, [r1 * 3]
-
- call ang16_mode_9_27
-
- lea r2, [r2 + 16]
- lea r0, [r0 + 16]
-
- call ang16_mode_9_27
-
- RET
-
cglobal ang16_mode_11_25
test r6d, r6d
movu m0, [r2] ; [7 6 5 4 3 2 1 0]
@@ -10405,38 +6444,6 @@
ret
-cglobal intra_pred_ang16_11, 3,7,8
- xor r6d, r6d
- lea r3, [ang_table + 16 * 16]
- add r1, r1
- lea r4, [r1 * 3]
-
- call ang16_mode_11_25
-
- lea r2, [r2 + 16]
- lea r0, [r0 + r1 * 8]
-
- call ang16_mode_11_25
-
- RET
-
-cglobal intra_pred_ang16_25, 4,7,8
- xor r6d, r6d
- inc r6d
- mov r2, r3
- lea r3, [ang_table + 16 * 16]
- add r1, r1
- lea r4, [r1 * 3]
-
- call ang16_mode_11_25
-
- lea r2, [r2 + 16]
- lea r0, [r0 + 16]
-
- call ang16_mode_11_25
-
- RET
-
cglobal ang16_mode_12_24
test r3d, r3d
movu m0, [r2] ; [7 6 5 4 3 2 1 0]
@@ -10628,46 +6635,6 @@
ret
-cglobal intra_pred_ang16_12, 4,7,8
- add r1, r1
- lea r4, [r1 * 3]
- lea r6, [ang_table + 16 * 16]
- movu m5, [r3]
- pshufb m5, [pw_ang8_12]
- pinsrw m5, [r3 + 26], 5
- xor r3d, r3d
-
- call ang16_mode_12_24
-
- lea r0, [r0 + r1 * 8]
- movu m5, [r2 + 2]
- lea r2, [r2 + 16]
-
- call ang16_mode_12_24
-
- RET
-
-cglobal intra_pred_ang16_24, 4,7,8
- xchg r2, r3
- add r1, r1
- lea r4, [r1 * 3]
- lea r6, [ang_table + 16 * 16]
- movu m5, [r3]
- pshufb m5, [pw_ang8_12]
- pinsrw m5, [r3 + 26], 5
- xor r3d, r3d
- inc r3d
-
- call ang16_mode_12_24
-
- lea r0, [r0 + 16]
- movu m5, [r2 + 2]
- lea r2, [r2 + 16]
-
- call ang16_mode_12_24
-
- RET
-
cglobal ang16_mode_13_23
test r3d, r3d
movu m0, [r2] ; [7 6 5 4 3 2 1 0]
@@ -10867,52 +6834,6 @@
ret
-cglobal intra_pred_ang16_13, 4,7,8
- add r1, r1
- lea r4, [r1 * 3]
- lea r6, [ang_table + 15 * 16]
- movu m5, [r3]
- pshufb m5, [pw_ang16_13]
- movu m6, [r3 + 14]
- pshufb m6, [pw_ang8_13]
- pslldq m6, 2
- palignr m5, m6, 6
- xor r3d, r3d
-
- call ang16_mode_13_23
-
- lea r0, [r0 + r1 * 8]
- movu m5, [r2 + 2]
- lea r2, [r2 + 16]
-
- call ang16_mode_13_23
-
- RET
-
-cglobal intra_pred_ang16_23, 4,7,8
- xchg r2, r3
- add r1, r1
- lea r4, [r1 * 3]
- lea r6, [ang_table + 15 * 16]
- movu m5, [r3]
- pshufb m5, [pw_ang16_13]
- movu m6, [r3 + 14]
- pshufb m6, [pw_ang8_13]
- pslldq m6, 2
- palignr m5, m6, 6
- xor r3d, r3d
- inc r3d
-
- call ang16_mode_13_23
-
- lea r0, [r0 + 16]
- movu m5, [r2 + 2]
- lea r2, [r2 + 16]
-
- call ang16_mode_13_23
-
- RET
-
cglobal ang16_mode_14_22
test r3d, r3d
movu m0, [r2] ; [7 6 5 4 3 2 1 0]
@@ -11120,50 +7041,6 @@
ret
-cglobal intra_pred_ang16_14, 4,7,8
- add r1, r1
- lea r4, [r1 * 3]
- lea r6, [ang_table + 18 * 16]
- movu m6, [r3]
- pshufb m6, [pw_ang8_14]
- movu m5, [r3 + 20]
- pshufb m5, [pw_ang8_14]
- punpckhqdq m5, m6
- xor r3d, r3d
-
- call ang16_mode_14_22
-
- lea r0, [r0 + r1 * 8]
- movu m5, [r2 + 2]
- lea r2, [r2 + 16]
-
- call ang16_mode_14_22
-
- RET
-
-cglobal intra_pred_ang16_22, 4,7,8
- xchg r2, r3
- add r1, r1
- lea r4, [r1 * 3]
- lea r6, [ang_table + 18 * 16]
- movu m6, [r3]
- pshufb m6, [pw_ang8_14]
- movu m5, [r3 + 20]
- pshufb m5, [pw_ang8_14]
- punpckhqdq m5, m6
- xor r3d, r3d
- inc r3d
-
- call ang16_mode_14_22
-
- lea r0, [r0 + 16]
- movu m5, [r2 + 2]
- lea r2, [r2 + 16]
-
- call ang16_mode_14_22
-
- RET
-
cglobal ang16_mode_15_21
test r3d, r3d
movu m0, [r2] ; [7 6 5 4 3 2 1 0]
@@ -11380,50 +7257,6 @@
ret
-cglobal intra_pred_ang16_15, 4,7,8
- add r1, r1
- lea r4, [r1 * 3]
- lea r6, [ang_table + 15 * 16]
- movu m6, [r3 + 4]
- pshufb m6, [pw_ang8_15]
- movu m5, [r3 + 18]
- pshufb m5, [pw_ang8_15]
- punpckhqdq m5, m6
- xor r3d, r3d
-
- call ang16_mode_15_21
-
- lea r0, [r0 + r1 * 8]
- movu m5, [r2]
- lea r2, [r2 + 16]
-
- call ang16_mode_15_21
-
- RET
-
-cglobal intra_pred_ang16_21, 4,7,8
- xchg r2, r3
- add r1, r1
- lea r4, [r1 * 3]
- lea r6, [ang_table + 15 * 16]
- movu m6, [r3 + 4]
- pshufb m6, [pw_ang8_15]
- movu m5, [r3 + 18]
- pshufb m5, [pw_ang8_15]
- punpckhqdq m5, m6
- xor r3d, r3d
- inc r3d
-
- call ang16_mode_15_21
-
- lea r0, [r0 + 16]
- movu m5, [r2]
- lea r2, [r2 + 16]
-
- call ang16_mode_15_21
-
- RET
-
cglobal ang16_mode_16_20
test r4d, r4d
lea r4, [r1 * 3]
@@ -11651,57 +7484,6 @@
ret
-cglobal intra_pred_ang16_16, 4,7,8,0-(1*mmsize)
- add r1, r1
- lea r6, [ang_table + 13 * 16]
- movu m6, [r3 + 4]
- pshufb m6, [pw_ang16_16]
- movu m5, [r3 + 16]
- pshufb m5, [pw_ang16_16]
- punpckhqdq m5, m6
- mov [rsp], r3
- lea r3, [r3 + 24]
- xor r4, r4
-
- call ang16_mode_16_20
-
- lea r0, [r0 + r1 * 8]
- mov r3, [rsp]
- movu m5, [r2]
- lea r2, [r2 + 16]
- xor r4, r4
-
- call ang16_mode_16_20
-
- RET
-
-cglobal intra_pred_ang16_20, 4,7,8,0-(1*mmsize)
- xchg r2, r3
- add r1, r1
- lea r6, [ang_table + 13 * 16]
- movu m6, [r3 + 4]
- pshufb m6, [pw_ang16_16]
- movu m5, [r3 + 16]
- pshufb m5, [pw_ang16_16]
- punpckhqdq m5, m6
- mov [rsp], r3
- lea r3, [r3 + 24]
- xor r4, r4
- inc r4
-
- call ang16_mode_16_20
-
- lea r0, [r0 + 16]
- mov r3, [rsp]
- movu m5, [r2]
- lea r2, [r2 + 16]
- xor r4, r4
- inc r4
-
- call ang16_mode_16_20
-
- RET
-
cglobal ang16_mode_17_19
test r4d, r4d
lea r4, [r1 * 3]
@@ -11937,323 +7719,6 @@
ret
-cglobal intra_pred_ang16_17, 4,7,8,0-(1*mmsize)
- add r1, r1
- lea r6, [ang_table + 16 * 16]
- movu m6, [r3 + 2]
- pshufb m6, [pw_ang16_16]
- movu m5, [r3 + 12]
- pshufb m5, [pw_ang16_16]
- punpckhqdq m5, m6
- mov [rsp], r3
- lea r3, [r3 + 20]
- xor r4, r4
-
- call ang16_mode_17_19
-
- lea r0, [r0 + r1 * 8]
- mov r3, [rsp]
- movu m5, [r2]
- lea r2, [r2 + 16]
- xor r4, r4
-
- call ang16_mode_17_19
-
- RET
-
-cglobal intra_pred_ang16_19, 4,7,8,0-(1*mmsize)
- xchg r2, r3
- add r1, r1
- lea r6, [ang_table + 16 * 16]
- movu m6, [r3 + 2]
- pshufb m6, [pw_ang16_16]
- movu m5, [r3 + 12]
- pshufb m5, [pw_ang16_16]
- punpckhqdq m5, m6
- mov [rsp], r3
- lea r3, [r3 + 20]
- xor r4, r4
- inc r4
-
- call ang16_mode_17_19
-
- lea r0, [r0 + 16]
- mov r3, [rsp]
- movu m5, [r2]
- lea r2, [r2 + 16]
- xor r4, r4
- inc r4
-
- call ang16_mode_17_19
-
- RET
-
-cglobal intra_pred_ang16_18, 4,5,4
- add r1, r1
- lea r4, [r1 * 3]
- movu m1, [r3]
- movu m3, [r3 + 16]
- movu m0, [r2 + 2]
- pshufb m0, [pw_swap16]
- movu [r0], m1
- movu [r0 + 16], m3
- palignr m2, m1, m0, 14
- movu [r0 + r1], m2
- palignr m2, m3, m1, 14
- movu [r0 + r1 + 16], m2
- palignr m2, m1, m0, 12
- movu [r0 + r1 * 2], m2
- palignr m2, m3, m1, 12
- movu [r0 + r1 * 2 + 16], m2
- palignr m2, m1, m0, 10
- movu [r0 + r4], m2
- palignr m2, m3, m1, 10
- movu [r0 + r4 + 16], m2
-
- lea r0, [r0 + r1 * 4]
- palignr m2, m1, m0, 8
- movu [r0], m2
- palignr m2, m3, m1, 8
- movu [r0 + 16], m2
- palignr m2, m1, m0, 6
- movu [r0 + r1], m2
- palignr m2, m3, m1, 6
- movu [r0 + r1 + 16], m2
- palignr m2, m1, m0, 4
- movu [r0 + r1 * 2], m2
- palignr m2, m3, m1, 4
- movu [r0 + r1 * 2 + 16], m2
- palignr m2, m1, m0, 2
- movu [r0 + r4], m2
- palignr m3, m1, 2
- movu [r0 + r4 + 16], m3
-
- lea r0, [r0 + r1 * 4]
- movu [r0], m0
- movu [r0 + 16], m1
- movu m3, [r2 + 18]
- pshufb m3, [pw_swap16]
- palignr m2, m0, m3, 14
- movu [r0 + r1], m2
- palignr m2, m1, m0, 14
- movu [r0 + r1 + 16], m2
- palignr m2, m0, m3, 12
- movu [r0 + r1 * 2], m2
- palignr m2, m1, m0, 12
- movu [r0 + r1 * 2 + 16], m2
- palignr m2, m0, m3, 10
- movu [r0 + r4], m2
- palignr m2, m1, m0, 10
- movu [r0 + r4 + 16], m2
-
- lea r0, [r0 + r1 * 4]
- palignr m2, m0, m3, 8
- movu [r0], m2
- palignr m2, m1, m0, 8
- movu [r0 + 16], m2
- palignr m2, m0, m3, 6
- movu [r0 + r1], m2
- palignr m2, m1, m0, 6
- movu [r0 + r1 + 16], m2
- palignr m2, m0, m3, 4
- movu [r0 + r1 * 2], m2
- palignr m2, m1, m0, 4
- movu [r0 + r1 * 2 + 16], m2
- palignr m2, m0, m3, 2
- movu [r0 + r4], m2
- palignr m1, m0, 2
- movu [r0 + r4 + 16], m1
-
- RET
-
-cglobal intra_pred_ang16_10, 4,5,4
- movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
- movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9]
- pshufb m0, m1, [pw_unpackwdq] ; [1 1 1 1 1 1 1 1]
- add r1, r1
- lea r4, [r1 * 3]
-
- psrldq m1, 2
- pshufb m2, m1, [pw_unpackwdq] ; [2 2 2 2 2 2 2 2]
- movu [r0 + r1], m2
- movu [r0 + r1 + 16], m2
- psrldq m1, 2
- pshufb m2, m1, [pw_unpackwdq] ; [3 3 3 3 3 3 3 3]
- movu [r0 + r1 * 2], m2
- movu [r0 + r1 * 2 + 16], m2
- psrldq m1, 2
- pshufb m2, m1, [pw_unpackwdq] ; [4 4 4 4 4 4 4 4]
- movu [r0 + r4], m2
- movu [r0 + r4 + 16], m2
-
- lea r2, [r0 + r1 *4]
- psrldq m1, 2
- pshufb m2, m1, [pw_unpackwdq] ; [5 5 5 5 5 5 5 5]
- movu [r2], m2
- movu [r2 + 16], m2
- psrldq m1, 2
- pshufb m2, m1, [pw_unpackwdq] ; [6 6 6 6 6 6 6 6]
- movu [r2 + r1], m2
- movu [r2 + r1 + 16], m2
- psrldq m1, 2
- pshufb m2, m1, [pw_unpackwdq] ; [7 7 7 7 7 7 7 7]
- movu [r2 + r1 * 2], m2
- movu [r2 + r1 * 2 + 16], m2
- psrldq m1, 2
- pshufb m2, m1, [pw_unpackwdq] ; [8 8 8 8 8 8 8 8]
- movu [r2 + r4], m2
- movu [r2 + r4 + 16], m2
-
- lea r2, [r2 + r1 *4]
- pshufb m2, m3, [pw_unpackwdq] ; [9 9 9 9 9 9 9 9]
- movu [r2], m2
- movu [r2 + 16], m2
- psrldq m3, 2
- pshufb m2, m3, [pw_unpackwdq] ; [10 10 10 10 10 10 10 10]
- movu [r2 + r1], m2
- movu [r2 + r1 + 16], m2
- psrldq m3, 2
- pshufb m2, m3, [pw_unpackwdq] ; [11 11 11 11 11 11 11 11]
- movu [r2 + r1 * 2], m2
- movu [r2 + r1 * 2 + 16], m2
- psrldq m3, 2
- pshufb m2, m3, [pw_unpackwdq] ; [12 12 12 12 12 12 12 12]
- movu [r2 + r4], m2
- movu [r2 + r4 + 16], m2
-
- lea r2, [r2 + r1 *4]
- psrldq m3, 2
- pshufb m2, m3, [pw_unpackwdq] ; [13 13 13 13 13 13 13 13]
- movu [r2], m2
- movu [r2 + 16], m2
- psrldq m3, 2
- pshufb m2, m3, [pw_unpackwdq] ; [14 14 14 14 14 14 14 14]
- movu [r2 + r1], m2
- movu [r2 + r1 + 16], m2
- psrldq m3, 2
- pshufb m2, m3, [pw_unpackwdq] ; [15 15 15 15 15 15 15 15]
- movu [r2 + r1 * 2], m2
- movu [r2 + r1 * 2 + 16], m2
- psrldq m3, 2
- pshufb m2, m3, [pw_unpackwdq] ; [16 16 16 16 16 16 16 16]
- movu [r2 + r4], m2
- movu [r2 + r4 + 16], m2
- mova m3, m0
-
- cmp r5m, byte 0
- jz .quit
-
- ; filter
-
- movh m1, [r3] ; [3 2 1 0]
- pshufb m2, m1, [pw_unpackwdq] ; [0 0 0 0 0 0 0 0]
- movu m1, [r3 + 2] ; [8 7 6 5 4 3 2 1]
- movu m3, [r3 + 18] ; [16 15 14 13 12 11 10 9]
- psubw m1, m2
- psubw m3, m2
- psraw m1, 1
- psraw m3, 1
- paddw m3, m0
- paddw m0, m1
- pxor m1, m1
- pmaxsw m0, m1
- pminsw m0, [pw_1023]
- pmaxsw m3, m1
- pminsw m3, [pw_1023]
-.quit:
- movu [r0], m0
- movu [r0 + 16], m3
-
- RET
-
-cglobal intra_pred_ang16_26, 4,5,4
- movu m0, [r3 + 2] ; [8 7 6 5 4 3 2 1]
- movu m3, [r3 + 18] ; [16 15 14 13 12 11 10 9]
- add r1, r1
- lea r4, [r1 * 3]
-
- movu [r0], m0
- movu [r0 + 16], m3
- movu [r0 + r1], m0
- movu [r0 + r1 + 16], m3
- movu [r0 + r1 * 2], m0
- movu [r0 + r1 * 2 + 16], m3
- movu [r0 + r4], m0
- movu [r0 + r4 + 16], m3
-
- lea r3, [r0 + r1 *4]
- movu [r3], m0
- movu [r3 + 16], m3
- movu [r3 + r1], m0
- movu [r3 + r1 + 16], m3
- movu [r3 + r1 * 2], m0
- movu [r3 + r1 * 2 + 16], m3
- movu [r3 + r4], m0
- movu [r3 + r4 + 16], m3
-
- lea r3, [r3 + r1 *4]
- movu [r3], m0
- movu [r3 + 16], m3
- movu [r3 + r1], m0
- movu [r3 + r1 + 16], m3
- movu [r3 + r1 * 2], m0
- movu [r3 + r1 * 2 + 16], m3
- movu [r3 + r4], m0
- movu [r3 + r4 + 16], m3
-
- lea r3, [r3 + r1 *4]
- movu [r3], m0
- movu [r3 + 16], m3
- movu [r3 + r1], m0
- movu [r3 + r1 + 16], m3
- movu [r3 + r1 * 2], m0
- movu [r3 + r1 * 2 + 16], m3
- movu [r3 + r4], m0
- movu [r3 + r4 + 16], m3
-
- cmp r5m, byte 0
- jz .quit
-
- ; filter
-
- pshufb m0, [pw_unpackwdq]
- movh m1, [r2] ; [3 2 1 0]
- pshufb m2, m1, [pw_unpackwdq] ; [0 0 0 0 0 0 0 0]
- movu m1, [r2 + 2] ; [8 7 6 5 4 3 2 1]
- movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9]
- psubw m1, m2
- psubw m3, m2
- psraw m1, 1
- psraw m3, 1
- paddw m3, m0
- paddw m0, m1
- pxor m1, m1
- pmaxsw m0, m1
- pminsw m0, [pw_1023]
- pmaxsw m3, m1
- pminsw m3, [pw_1023]
- pextrw [r0], m0, 0
- pextrw [r0 + r1], m0, 1
- pextrw [r0 + r1 * 2], m0, 2
- pextrw [r0 + r4], m0, 3
- lea r0, [r0 + r1 * 4]
- pextrw [r0], m0, 4
- pextrw [r0 + r1], m0, 5
- pextrw [r0 + r1 * 2], m0, 6
- pextrw [r0 + r4], m0, 7
- lea r0, [r0 + r1 * 4]
- pextrw [r0], m3, 0
- pextrw [r0 + r1], m3, 1
- pextrw [r0 + r1 * 2], m3, 2
- pextrw [r0 + r4], m3, 3
- pextrw [r3], m3, 4
- pextrw [r3 + r1], m3, 5
- pextrw [r3 + r1 * 2], m3, 6
- pextrw [r3 + r4], m3, 7
-
-.quit:
- RET
-
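
The bFilter tail of the removed ang16_10/ang16_26 code just above (the
psubw/psraw/paddw sequence clamped against pw_1023) nudges the first predicted
line toward the orthogonal reference before writing it out. A hedged scalar
sketch of that step for 10-bit content, with illustrative names:

    #include <stdint.h>

    /* pred is the replicated boundary sample, side the orthogonal reference
     * sample, topLeft the corner sample; the result is clamped to [0, 1023]. */
    static inline uint16_t edge_filter(uint16_t pred, uint16_t side, uint16_t topLeft)
    {
        int v = pred + (((int)side - (int)topLeft) >> 1);
        if (v < 0)    v = 0;
        if (v > 1023) v = 1023;
        return (uint16_t)v;
    }
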
;------------------------------------------------------------------------------------------
; void intraPredAng16(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
;------------------------------------------------------------------------------------------
@@ -13344,25 +8809,6 @@
movu [r0 + r4 + 48], m5
lea r0, [r0 + r1 * 4]
%endmacro
-;--------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_2_34(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;--------------------------------------------------------------------------------------------------------------------
-INIT_XMM ssse3
-cglobal intra_pred_ang32_2, 3,6,6
- cmp r4m, byte 34
- cmove r2, r3mp
-
- add r1, r1
- lea r3, [r1 * 2]
- lea r4, [r1 * 3]
- mov r5, 2
-
-.loop:
- MODE_2_34
- add r2, 32
- dec r5
- jnz .loop
- RET
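
The removed intra_pred_ang32_2 above (like the ang16_2 kernel earlier in this
hunk) needs no interpolation at all: modes 2 and 34 advance by exactly one
sample per row, so each output line is the reference shifted by one position,
which is what the palignr chains implement. A scalar sketch, assuming the same
ref + 2 starting offset the ang16_2 code uses (names are illustrative):

    #include <stdint.h>

    /* Modes 2/34: copy the reference along the 45-degree diagonal. */
    static void ang_mode2_34(uint16_t *dst, intptr_t stride, const uint16_t *ref, int size)
    {
        for (int y = 0; y < size; y++)
            for (int x = 0; x < size; x++)
                dst[y * stride + x] = ref[x + y + 2];
    }
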
%macro TRANSPOSE_STORE_8x8 6
%if %2 == 1
@@ -13600,11 +9046,2402 @@
TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_3(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
+
+%macro MODE_4_32 1
+ movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
+ movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9]
+ palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2]
+ punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5]
+ punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1]
+
+ pmaddwd m4, m0, [r3 + 5 * 16] ; [21]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ palignr m5, m2, m0, 4 ; [6 5 5 4 4 3 3 2]
+ pmaddwd m1, m5, [r3 - 6 * 16] ; [10]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ pmaddwd m5, [r3 + 15 * 16] ; [31]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ palignr m6, m2, m0, 8
+ pmaddwd m6, [r3 + 4 * 16] ; [ 20]
+ paddd m6, [pd_16]
+ psrld m6, 5
+ packusdw m5, m6
+
+ palignr m1, m2, m0, 12
+ pmaddwd m6, m1, [r3 - 7 * 16] ; [ 9]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ pmaddwd m1, [r3 + 14 * 16] ; [30]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ pmaddwd m1, m2, [r3 + 3 * 16] ; [19]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ palignr m7, m3, m2, 4 ; [10 9 9 8 8 7 7 6]
+ pmaddwd m0, m7, [r3 - 8 * 16] ; [8]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m1, m0
+
+ TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
+
+ pmaddwd m4, m7, [r3 + 13 * 16] ; [29]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ movu m0, [r2 + 34] ; [24 23 22 21 20 19 18 17]
+
+ palignr m2, m0, m3, 2 ; [17 16 15 14 13 12 11 10]
+ palignr m1, m0, m3, 4 ; [18 17 16 15 14 13 12 11]
+ punpckhwd m3, m2, m1 ; [18 17 17 16 16 15 15 14]
+ punpcklwd m2, m1 ; [14 13 13 12 12 11 11 10]
+
+ palignr m1, m2, m7, 4 ; [11 10 10 9 9 8 8 7]
+ pmaddwd m1, [r3 + 2 * 16] ; [18]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ palignr m5, m2, m7, 8
+ mova m6, m5
+ pmaddwd m5, [r3 - 9 * 16] ; [07]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ pmaddwd m6, [r3 + 12 * 16] ; [28]
+ paddd m6, [pd_16]
+ psrld m6, 5
+ packusdw m5, m6
+
+ palignr m6, m2, m7, 12
+ pmaddwd m6, [r3 + 16] ; [17]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ pmaddwd m1, m2, [r3 - 10 * 16] ; [06]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ pmaddwd m1, m2, [r3 + 11 * 16] ; [27]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ palignr m7, m3, m2, 4
+ pmaddwd m7, [r3] ; [16]
+ paddd m7, [pd_16]
+ psrld m7, 5
+ packusdw m1, m7
+ mova m7, m0
+
+ TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
+
+ palignr m0, m3, m2, 8
+ pmaddwd m4, m0, [r3 - 11 * 16] ; [5]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ pmaddwd m1, m0, [r3 + 10 * 16] ; [26]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ palignr m5, m3, m2, 12
+ pmaddwd m5, [r3 - 16] ; [15]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ pmaddwd m1, m3, [r3 - 12 * 16] ; [4]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m5, m1
+
+ pmaddwd m6, m3, [r3 + 9 * 16] ; [25]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ movu m0, [r2 + 50] ; [32 31 30 29 28 27 26 25]
+ palignr m2, m0, m7, 2 ; [25 24 23 22 21 20 19 18]
+ palignr m1, m0, m7, 4 ; [26 25 24 23 22 21 20 19]
+ punpckhwd m7, m2, m1 ; [26 25 25 24 24 23 23 22]
+ punpcklwd m2, m1 ; [22 21 21 20 20 19 19 18]
+
+ palignr m1, m2, m3, 4
+ pmaddwd m1, [r3 - 2 * 16] ; [14]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ palignr m1, m2, m3, 8
+ mova m0, m1
+ pmaddwd m1, [r3 - 13 * 16] ; [3]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ pmaddwd m0, [r3 + 8 * 16] ; [24]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m1, m0
+
+ TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
+
+ palignr m4, m2, m3, 12
+ pmaddwd m4, [r3 - 3 * 16] ; [13]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ pmaddwd m1, m2, [r3 - 14 * 16] ; [2]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ pmaddwd m5, m2, [r3 + 7 * 16] ; [23]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ palignr m6, m7, m2, 4
+ pmaddwd m6, [r3 - 4 * 16] ; [12]
+ paddd m6, [pd_16]
+ psrld m6, 5
+ packusdw m5, m6
+
+ palignr m1, m7, m2, 8
+ pmaddwd m6, m1, [r3 - 15 * 16] ; [1]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ pmaddwd m1, [r3 + 6 * 16] ; [22]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ palignr m1, m7, m2, 12
+ pmaddwd m1, [r3 - 5 * 16] ; [11]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m1, m1
+ movhps m1, [r2 + 44] ; [00]
+
+ TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
+%endmacro
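+
The bracketed constants in MODE_4_32 ([21], [10], [31], [20], ...) are the
per-row interpolation fractions for mode 4, whose displacement is 21/32 of a
sample per row in the HEVC angle table; whenever the accumulated position
crosses a whole sample the macro advances the source window with palignr
instead of changing the weights. A small sketch of how such a schedule is
derived (assumed angle value from the HEVC table, illustrative names):

    /* For angle = 21 this reproduces the MODE_4_32 sequence:
     * fract = 21, 10, 31, 20, 9, 30, 19, 8, ... with the source window
     * advancing by offset[y] samples. */
    static void ang_schedule(int angle, int size, int *fract, int *offset)
    {
        for (int y = 0; y < size; y++)
        {
            int pos   = (y + 1) * angle;
            fract[y]  = pos & 31;  /* weight applied to ref[idx + 1] */
            offset[y] = pos >> 5;  /* whole-sample advance of the window */
        }
    }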
+
+%macro MODE_5_31 1
+ movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
+ movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9]
+ palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2]
+ punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5]
+ punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1]
+
+ pmaddwd m4, m0, [r3 + 16] ; [17]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ palignr m1, m2, m0, 4
+ mova m5, m1
+ pmaddwd m1, [r3 - 14 * 16] ; [2]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ pmaddwd m5, [r3 + 3 * 16] ; [19]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ palignr m6, m2, m0, 8
+ mova m1, m6
+ pmaddwd m6, [r3 - 12 * 16] ; [4]
+ paddd m6, [pd_16]
+ psrld m6, 5
+ packusdw m5, m6
+
+ pmaddwd m6, m1, [r3 + 5 * 16] ; [21]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ palignr m1, m2, m0, 12
+ mova m7, m1
+ pmaddwd m7, [r3 - 10 * 16] ; [6]
+ paddd m7, [pd_16]
+ psrld m7, 5
+ packusdw m6, m7
+
+ pmaddwd m1, [r3 + 7 * 16] ; [23]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ pmaddwd m7, m2, [r3 - 8 * 16] ; [8]
+ paddd m7, [pd_16]
+ psrld m7, 5
+ packusdw m1, m7
+
+ TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
+
+ pmaddwd m4, m2, [r3 + 9 * 16] ; [25]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ palignr m7, m3, m2, 4 ; [10 9 9 8 8 7 7 6]
+ pmaddwd m1, m7, [r3 - 6 * 16] ; [10]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ pmaddwd m5, m7, [r3 + 11 * 16] ; [27]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ movu m0, [r2 + 34] ; [24 23 22 21 20 19 18 17]
+ palignr m2, m0, m3, 2 ; [17 16 15 14 13 12 11 10]
+ palignr m1, m0, m3, 4 ; [18 17 16 15 14 13 12 11]
+ punpckhwd m3, m2, m1 ; [18 17 17 16 16 15 15 14]
+ punpcklwd m2, m1 ; [14 13 13 12 12 11 11 10]
+
+ palignr m6, m2, m7, 4
+ pmaddwd m1, m6, [r3 - 4 * 16] ; [12]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m5, m1
+
+ pmaddwd m6, [r3 + 13 * 16] ; [29]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ palignr m1, m2, m7, 8
+ mova m0, m1
+ pmaddwd m1, [r3 - 2 * 16] ; [14]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ pmaddwd m1, m0, [r3 + 15 * 16] ; [31]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ palignr m0, m2, m7, 12
+ pmaddwd m0, [r3] ; [16]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m1, m0
+
+ TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
+
+ pmaddwd m4, m2, [r3 - 15 * 16] ; [1]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ pmaddwd m1, m2, [r3 + 2 * 16] ; [18]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ palignr m1, m3, m2, 4
+ pmaddwd m5, m1, [r3 - 13 * 16] ; [3]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ pmaddwd m1, [r3 + 4 * 16] ; [20]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m5, m1
+
+ palignr m1, m3, m2, 8
+ pmaddwd m6, m1, [r3 - 11 * 16] ; [5]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ pmaddwd m1, [r3 + 6 * 16] ; [22]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ palignr m7, m3, m2, 12
+ pmaddwd m1, m7, [r3 - 9 * 16] ; [7]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ pmaddwd m7, [r3 + 8 * 16] ; [24]
+ paddd m7, [pd_16]
+ psrld m7, 5
+ packusdw m1, m7
+
+ TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
+
+ pmaddwd m4, m3, [r3 - 7 * 16] ; [9]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ movu m0, [r2 + 36] ; [25 24 23 22 21 20 19 18]
+ palignr m1, m0, 2 ; [x 25 24 23 22 21 20 19]
+ punpcklwd m0, m1 ; [22 21 21 20 20 19 19 18]
+
+ palignr m1, m0, m3, 4
+ pmaddwd m5, m1, [r3 - 5 * 16] ; [11]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ pmaddwd m1, [r3 + 12 * 16] ; [28]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m5, m1
+
+ palignr m1, m0, m3, 8
+ pmaddwd m6, m1, [r3 - 3 * 16] ; [13]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ pmaddwd m1, [r3 + 14 * 16] ; [30]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ palignr m1, m0, m3, 12
+ pmaddwd m1, [r3 - 16] ; [15]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m1, m1
+ movhps m1, [r2 + 36] ; [00]
+
+ TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
+%endmacro
+
+%macro MODE_6_30 1
+ movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
+ movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9]
+ palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2]
+ punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5]
+ punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1]
+
+ pmaddwd m4, m0, [r3 - 3 * 16] ; [13]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ pmaddwd m1, m0, [r3 + 10 * 16] ; [26]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ palignr m1, m2, m0, 4
+ pmaddwd m5, m1, [r3 - 9 * 16] ; [7]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ pmaddwd m1, [r3 + 4 * 16] ; [20]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m5, m1
+
+ palignr m1, m2, m0, 8
+ pmaddwd m6, m1, [r3 - 15 * 16] ; [1]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ pmaddwd m7, m1, [r3 - 2 * 16] ; [14]
+ paddd m7, [pd_16]
+ psrld m7, 5
+ packusdw m6, m7
+
+ pmaddwd m1, [r3 + 11 * 16] ; [27]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ palignr m7, m2, m0, 12
+ pmaddwd m0, m7, [r3 - 8 * 16] ; [8]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m1, m0
+
+ TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
+
+ pmaddwd m4, m7, [r3 + 5 * 16] ; [21]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ pmaddwd m1, m2, [r3 - 14 * 16] ; [2]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ pmaddwd m5, m2, [r3 - 16] ; [15]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ pmaddwd m6, m2, [r3 + 12 * 16] ; [28]
+ paddd m6, [pd_16]
+ psrld m6, 5
+ packusdw m5, m6
+
+ palignr m7, m3, m2, 4
+ pmaddwd m6, m7, [r3 - 7 * 16] ; [9]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ pmaddwd m1, m7, [r3 + 6 * 16] ; [22]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ movu m0, [r2 + 34] ; [24 23 22 21 20 19 18 17]
+ palignr m2, m0, m3, 2 ; [17 16 15 14 13 12 11 10]
+ palignr m1, m0, m3, 4 ; [18 17 16 15 14 13 12 11]
+ punpckhwd m3, m2, m1 ; [18 17 17 16 16 15 15 14]
+ punpcklwd m2, m1 ; [14 13 13 12 12 11 11 10]
+
+ palignr m0, m2, m7, 4
+ pmaddwd m1, m0, [r3 - 13 * 16] ; [3]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ pmaddwd m0, [r3] ; [16]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m1, m0
+
+ TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
+
+ palignr m4, m2, m7, 4
+ pmaddwd m4, [r3 + 13 * 16] ; [29]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ palignr m5, m2, m7, 8
+ pmaddwd m1, m5, [r3 - 6 * 16] ; [10]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ pmaddwd m5, [r3 + 7 * 16] ; [23]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ palignr m1, m2, m7, 12
+ pmaddwd m6, m1, [r3 - 12 * 16] ; [4]
+ paddd m6, [pd_16]
+ psrld m6, 5
+ packusdw m5, m6
+
+ pmaddwd m6, m1, [r3 + 16] ; [17]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ pmaddwd m1, [r3 + 14 * 16] ; [30]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ pmaddwd m1, m2, [r3 - 5 * 16] ; [11]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ pmaddwd m0, m2, [r3 + 8 * 16] ; [24]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m1, m0
+
+ TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
+
+ palignr m5, m3, m2, 4
+ pmaddwd m4, m5, [r3 - 11 * 16] ; [5]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ pmaddwd m1, m5, [r3 + 2 * 16] ; [18]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ pmaddwd m5, [r3 + 15 * 16] ; [31]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ palignr m6, m3, m2, 8
+ pmaddwd m1, m6, [r3 - 4 * 16] ; [12]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m5, m1
+
+ pmaddwd m6, [r3 + 9 * 16] ; [25]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ palignr m1, m3, m2, 12
+ pmaddwd m0, m1, [r3 - 10 * 16] ; [6]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m6, m0
+
+ pmaddwd m1, [r3 + 3 * 16] ; [19]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m1, m1
+ movhps m1, [r2 + 28] ; [00]
+
+ TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
+%endmacro
+
+%macro MODE_7_29 1
+ movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
+ movd m3, [r2 + 18] ; [16 15 14 13 12 11 10 9]
+ palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2]
+ punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5]
+ punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1]
+
+ pmaddwd m4, m0, [r3 - 7 * 16] ; [9]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ pmaddwd m1, m0, [r3 + 2 * 16] ; [18]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ pmaddwd m5, m0, [r3 + 11 * 16] ; [27]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ palignr m1, m2, m0, 4
+ pmaddwd m6, m1, [r3 - 12 * 16] ; [4]
+ paddd m6, [pd_16]
+ psrld m6, 5
+ packusdw m5, m6
+
+ pmaddwd m6, m1, [r3 - 3 * 16] ; [13]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ pmaddwd m7, m1, [r3 + 6 * 16] ; [22]
+ paddd m7, [pd_16]
+ psrld m7, 5
+ packusdw m6, m7
+
+ pmaddwd m1, [r3 + 15 * 16] ; [31]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ mova m3, m0
+ palignr m7, m2, m0, 8
+ pmaddwd m0, m7, [r3 - 8 * 16] ; [8]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m1, m0
+
+ TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
+
+ pmaddwd m4, m7, [r3 + 16] ; [17]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ pmaddwd m1, m7, [r3 + 10 * 16] ; [26]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ palignr m1, m2, m3, 12
+ pmaddwd m5, m1, [r3 - 13 * 16] ; [3]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ pmaddwd m6, m1, [r3 - 4 * 16] ; [12]
+ paddd m6, [pd_16]
+ psrld m6, 5
+ packusdw m5, m6
+
+ pmaddwd m6, m1, [r3 + 5 * 16] ; [21]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ pmaddwd m1, [r3 + 14 * 16] ; [30]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ pmaddwd m1, m2, [r3 - 9 * 16] ; [7]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ pmaddwd m0, m2, [r3] ; [16]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m1, m0
+
+ TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
+
+ pmaddwd m4, m2, [r3 + 9 * 16] ; [25]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ movu m7, [r2 + 18] ; [16 15 14 13 12 11 10 9]
+ palignr m1, m7, 2 ; [x 16 15 14 13 12 11 10]
+ punpcklwd m7, m1 ; [13 12 12 11 11 10 10 9]
+
+ palignr m6, m7, m2, 4
+ pmaddwd m1, m6, [r3 - 14 * 16] ; [2]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ pmaddwd m5, m6, [r3 - 5 * 16] ; [11]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ pmaddwd m0, m6, [r3 + 4 * 16] ; [20]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m5, m0
+
+ pmaddwd m6, [r3 + 13 * 16] ; [29]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ palignr m0, m7, m2, 8
+ pmaddwd m1, m0, [r3 - 10 * 16] ; [6]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ pmaddwd m1, m0, [r3 - 16] ; [15]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ pmaddwd m0, [r3 + 8 * 16] ; [24]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m1, m0
+
+ TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
+
+ palignr m0, m7, m2, 12
+ pmaddwd m4, m0, [r3 - 15 * 16] ; [1]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ pmaddwd m1, m0, [r3 - 6 * 16] ; [10]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ pmaddwd m5, m0, [r3 + 3 * 16] ; [19]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ pmaddwd m0, [r3 + 12 * 16] ; [28]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m5, m0
+
+ pmaddwd m6, m7, [r3 - 11 * 16] ; [5]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ pmaddwd m0, m7, [r3 - 2 * 16] ; [14]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m6, m0
+
+ pmaddwd m1, m7, [r3 + 7 * 16] ; [23]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m1, m1
+ movhps m1, [r2 + 20] ; [00]
+
+ TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
+%endmacro
+
+%macro MODE_8_28 1
+ movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
+ movd m3, [r2 + 18] ; [16 15 14 13 12 11 10 9]
+ palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2]
+ punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5]
+ punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1]
+
+ pmaddwd m4, m0, [r3 - 11 * 16] ; [5]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ pmaddwd m1, m0, [r3 - 6 * 16] ; [10]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ pmaddwd m5, m0, [r3 - 16] ; [15]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ pmaddwd m6, m0, [r3 + 4 * 16] ; [20]
+ paddd m6, [pd_16]
+ psrld m6, 5
+ packusdw m5, m6
+
+ pmaddwd m6, m0, [r3 + 9 * 16] ; [25]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ pmaddwd m1, m0, [r3 + 14 * 16] ; [30]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ palignr m7, m2, m0, 4
+ pmaddwd m1, m7, [r3 - 13 * 16] ; [3]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ mova m3, m0
+ pmaddwd m0, m7, [r3 - 8 * 16] ; [8]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m1, m0
+
+ TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
+
+ pmaddwd m4, m7, [r3 - 3 * 16] ; [13]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ pmaddwd m1, m7, [r3 + 2 * 16] ; [18]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ pmaddwd m5, m7, [r3 + 7 * 16] ; [23]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ pmaddwd m6, m7, [r3 + 12 * 16] ; [28]
+ paddd m6, [pd_16]
+ psrld m6, 5
+ packusdw m5, m6
+
+ palignr m7, m2, m3, 8
+ pmaddwd m6, m7, [r3 - 15 * 16] ; [1]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ pmaddwd m1, m7, [r3 - 10 * 16] ; [6]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ pmaddwd m1, m7, [r3 - 5 * 16] ; [11]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ pmaddwd m0, m7, [r3] ; [16]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m1, m0
+
+ TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
+
+ pmaddwd m4, m7, [r3 + 5 * 16] ; [21]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ pmaddwd m1, m7, [r3 + 10 * 16] ; [26]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ pmaddwd m5, m7, [r3 + 15 * 16] ; [31]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ palignr m7, m2, m3, 12
+ pmaddwd m0, m7, [r3 - 12 * 16] ; [4]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m5, m0
+
+ pmaddwd m6, m7, [r3 - 7 * 16] ; [9]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ pmaddwd m1, m7, [r3 - 2 * 16] ; [14]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ pmaddwd m1, m7, [r3 + 3 * 16] ; [19]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ pmaddwd m0, m7, [r3 + 8 * 16] ; [24]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m1, m0
+
+ TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
+
+ pmaddwd m4, m7, [r3 + 13 * 16] ; [29]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ pmaddwd m1, m2, [r3 - 14 * 16] ; [2]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ pmaddwd m5, m2, [r3 - 9 * 16] ; [7]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ pmaddwd m0, m2, [r3 - 4 * 16] ; [12]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m5, m0
+
+ pmaddwd m6, m2, [r3 + 16] ; [17]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ pmaddwd m0, m2, [r3 + 6 * 16] ; [22]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m6, m0
+
+ pmaddwd m1, m2, [r3 + 11 * 16] ; [27]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m1, m1
+ movhps m1, [r2 + 12] ; [00]
+
+ TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
+%endmacro
+
+%macro MODE_9_27 1
+ movu m3, [r2 + 2] ; [8 7 6 5 4 3 2 1]
+ palignr m1, m3, 2 ; [9 8 7 6 5 4 3 2]
+ punpckhwd m2, m3, m1 ; [9 8 8 7 7 6 6 5]
+ punpcklwd m3, m1 ; [5 4 4 3 3 2 2 1]
+
+ pmaddwd m4, m3, [r3 - 14 * 16] ; [2]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ pmaddwd m1, m3, [r3 - 12 * 16] ; [4]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ pmaddwd m5, m3, [r3 - 10 * 16] ; [6]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ pmaddwd m6, m3, [r3 - 8 * 16] ; [8]
+ paddd m6, [pd_16]
+ psrld m6, 5
+ packusdw m5, m6
+
+ pmaddwd m6, m3, [r3 - 6 * 16] ; [10]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ pmaddwd m1, m3, [r3 - 4 * 16] ; [12]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ pmaddwd m1, m3, [r3 - 2 * 16] ; [14]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ pmaddwd m0, m3, [r3] ; [16]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m1, m0
+
+ TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
+
+ pmaddwd m4, m3, [r3 + 2 * 16] ; [18]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ pmaddwd m1, m3, [r3 + 4 * 16] ; [20]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ pmaddwd m5, m3, [r3 + 6 * 16] ; [22]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ pmaddwd m6, m3, [r3 + 8 * 16] ; [24]
+ paddd m6, [pd_16]
+ psrld m6, 5
+ packusdw m5, m6
+
+ pmaddwd m6, m3, [r3 + 10 * 16] ; [26]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ pmaddwd m1, m3, [r3 + 12 * 16] ; [28]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ pmaddwd m1, m3, [r3 + 14 * 16] ; [30]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ packusdw m1, m1
+ movhps m1, [r2 + 4] ; [00]
+
+ TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
+
+ palignr m7, m2, m3, 4
+ pmaddwd m4, m7, [r3 - 14 * 16] ; [2]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ pmaddwd m1, m7, [r3 - 12 * 16] ; [4]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ pmaddwd m5, m7, [r3 - 10 * 16] ; [6]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ pmaddwd m0, m7, [r3 - 8 * 16] ; [8]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m5, m0
+
+ pmaddwd m6, m7, [r3 - 6 * 16] ; [10]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ pmaddwd m1, m7, [r3 - 4 * 16] ; [12]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ pmaddwd m1, m7, [r3 - 2 * 16] ; [14]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ pmaddwd m0, m7, [r3] ; [16]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m1, m0
+
+ TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
+
+ pmaddwd m4, m7, [r3 + 2 * 16] ; [18]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ pmaddwd m1, m7, [r3 + 4 * 16] ; [20]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ pmaddwd m5, m7, [r3 + 6 * 16] ; [22]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ pmaddwd m0, m7, [r3 + 8 * 16] ; [24]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m5, m0
+
+ pmaddwd m6, m7, [r3 + 10 * 16] ; [26]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ pmaddwd m0, m7, [r3 + 12 * 16] ; [28]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m6, m0
+
+ pmaddwd m7, [r3 + 14 * 16] ; [30]
+ paddd m7, [pd_16]
+ psrld m7, 5
+ packusdw m7, m7
+ movhps m7, [r2 + 6] ; [00]
+
+ TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m7
+%endmacro
+
+%macro MODE_11_25 1
+ movu m3, [r2 + 2] ; [7 6 5 4 3 2 1 0]
+ pshufb m3, [pw_punpcklwd] ; [4 3 3 2 2 1 1 0]
+
+ pmaddwd m4, m3, [r3 + 14 * 16] ; [30]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ pmaddwd m1, m3, [r3 + 12 * 16] ; [28]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ pmaddwd m5, m3, [r3 + 10 * 16] ; [26]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ pmaddwd m6, m3, [r3 + 8 * 16] ; [24]
+ paddd m6, [pd_16]
+ psrld m6, 5
+ packusdw m5, m6
+
+ pmaddwd m6, m3, [r3 + 6 * 16] ; [22]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ pmaddwd m1, m3, [r3 + 4 * 16] ; [20]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ pmaddwd m0, m3, [r3] ; [16]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m1, m0
+
+ TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
+
+ pmaddwd m4, m3, [r3 - 2 * 16] ; [14]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ pmaddwd m1, m3, [r3 - 4 * 16] ; [12]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ pmaddwd m5, m3, [r3 - 6 * 16] ; [10]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ pmaddwd m6, m3, [r3 - 8 * 16] ; [8]
+ paddd m6, [pd_16]
+ psrld m6, 5
+ packusdw m5, m6
+
+ pmaddwd m6, m3, [r3 - 10 * 16] ; [6]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ pmaddwd m1, m3, [r3 - 12 * 16] ; [4]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ pmaddwd m1, m3, [r3 - 14 * 16] ; [2]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ packusdw m1, m1
+ movhps m1, [r2 + 2] ; [00]
+
+ TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
+
+ movu m3, [r2] ; [6 5 4 3 2 1 0 16]
+ pshufb m3, [pw_punpcklwd] ; [3 2 2 1 1 0 0 16]
+
+ pmaddwd m4, m3, [r3 + 14 * 16] ; [30]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ pmaddwd m1, m3, [r3 + 12 * 16] ; [28]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ pmaddwd m5, m3, [r3 + 10 * 16] ; [26]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m5, m0
+
+ pmaddwd m6, m3, [r3 + 6 * 16] ; [22]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ pmaddwd m1, m3, [r3 + 4 * 16] ; [20]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ pmaddwd m0, m3, [r3] ; [16]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m1, m0
+
+ TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
+
+ pmaddwd m4, m3, [r3 - 2 * 16] ; [14]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ pmaddwd m1, m3, [r3 - 4 * 16] ; [12]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ pmaddwd m5, m3, [r3 - 6 * 16] ; [10]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ pmaddwd m6, m3, [r3 - 8 * 16] ; [8]
+ paddd m6, [pd_16]
+ psrld m6, 5
+ packusdw m5, m6
+
+ pmaddwd m6, m3, [r3 - 10 * 16] ; [6]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ pmaddwd m1, m3, [r3 - 12 * 16] ; [4]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ pmaddwd m1, m3, [r3 - 14 * 16] ; [2]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ packusdw m1, m1
+ movhps m1, [r2] ; [00]
+
+ TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
+%endmacro
+
+%macro MODE_12_24 1
+ movu m3, [r2 + 8] ; [7 6 5 4 3 2 1 0]
+ pshufb m3, m2 ; [4 3 3 2 2 1 1 0]
+
+ pmaddwd m4, m3, [r3 + 11 * 16] ; [27]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ pmaddwd m1, m3, [r3 + 6 * 16] ; [22]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ pmaddwd m5, m3, [r3 + 16] ; [17]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ pmaddwd m6, m3, [r3 - 4 * 16] ; [12]
+ paddd m6, [pd_16]
+ psrld m6, 5
+ packusdw m5, m6
+
+ pmaddwd m6, m3, [r3 - 9 * 16] ; [7]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ pmaddwd m1, m3, [r3 - 14 * 16] ; [2]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ movu m3, [r2 + 6]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 13 * 16] ; [29]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m1, m0
+
+ TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
+
+ pmaddwd m4, m3, [r3 + 3 * 16] ; [19]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ pmaddwd m1, m3, [r3 - 2 * 16] ; [14]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ pmaddwd m5, m3, [r3 - 7 * 16] ; [9]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ pmaddwd m6, m3, [r3 - 12 * 16] ; [4]
+ paddd m6, [pd_16]
+ psrld m6, 5
+ packusdw m5, m6
+
+ movu m3, [r2 + 4]
+ pshufb m3, m2
+
+ pmaddwd m6, m3, [r3 + 15 * 16] ; [31]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ pmaddwd m1, m3, [r3 + 5 * 16] ; [21]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ pmaddwd m0, m3, [r3] ; [16]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m1, m0
+
+ TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
+
+ pmaddwd m4, m3, [r3 - 5 * 16] ; [11]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ pmaddwd m1, m3, [r3 - 10 * 16] ; [6]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ pmaddwd m5, m3, [r3 - 15 * 16] ; [1]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ movu m3, [r2 + 2]
+ pshufb m3, m2
+
+ pmaddwd m0, m3, [r3 + 12 * 16] ; [28]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m5, m0
+
+ pmaddwd m6, m3, [r3 + 7 * 16] ; [23]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ pmaddwd m1, m3, [r3 - 3 * 16] ; [13]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ pmaddwd m0, m3, [r3 - 8 * 16] ; [8]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m1, m0
+
+ TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
+
+ pmaddwd m4, m3, [r3 - 13 * 16] ; [3]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ movu m3, [r2]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 14 * 16] ; [30]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ pmaddwd m5, m3, [r3 + 9 * 16] ; [25]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ pmaddwd m6, m3, [r3 + 4 * 16] ; [20]
+ paddd m6, [pd_16]
+ psrld m6, 5
+ packusdw m5, m6
+
+ pmaddwd m6, m3, [r3 - 16] ; [15]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ pmaddwd m1, m3, [r3 - 6 * 16] ; [10]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ pmaddwd m1, m3, [r3 - 11 * 16] ; [5]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ packusdw m1, m1
+ movhps m1, [r2] ; [00]
+
+ TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
+%endmacro
+
+%macro MODE_13_23 1
+ movu m3, [r2 + 16] ; [7 6 5 4 3 2 1 0]
+ pshufb m3, m2 ; [4 3 3 2 2 1 1 0]
+
+ pmaddwd m4, m3, [r3 + 7 * 16] ; [23]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ pmaddwd m1, m3, [r3 - 2 * 16] ; [14]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ pmaddwd m5, m3, [r3 - 11 * 16] ; [05]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ movu m3, [r2 + 14]
+ pshufb m3, m2
+
+ pmaddwd m6, m3, [r3 + 12 * 16] ; [28]
+ paddd m6, [pd_16]
+ psrld m6, 5
+ packusdw m5, m6
+
+ pmaddwd m6, m3, [r3 + 3 * 16] ; [19]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ pmaddwd m1, m3, [r3 - 6 * 16] ; [10]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ pmaddwd m1, m3, [r3 - 15 * 16] ; [01]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ movu m3, [r2 + 12]
+ pshufb m3, m2
+
+ pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m1, m0
+
+ TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
+
+ pmaddwd m4, m3, [r3 - 16] ; [15]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ pmaddwd m1, m3, [r3 - 10 * 16] ; [06]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ movu m3, [r2 + 10]
+ pshufb m3, m2
+
+ pmaddwd m5, m3, [r3 + 13 * 16] ; [29]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ pmaddwd m6, m3, [r3 + 4 * 16] ; [20]
+ paddd m6, [pd_16]
+ psrld m6, 5
+ packusdw m5, m6
+
+ pmaddwd m6, m3, [r3 - 5 * 16] ; [11]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ pmaddwd m1, m3, [r3 - 14 * 16] ; [02]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ movu m3, [r2 + 8]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 9 * 16] ; [25]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ pmaddwd m0, m3, [r3] ; [16]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m1, m0
+
+ TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
+
+ pmaddwd m4, m3, [r3 - 9 * 16] ; [07]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ movu m3, [r2 + 6]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 14 * 16] ; [30]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ pmaddwd m5, m3, [r3 + 5 * 16] ; [21]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ pmaddwd m0, m3, [r3 - 4 * 16] ; [12]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m5, m0
+
+ pmaddwd m6, m3, [r3 - 13 * 16] ; [03]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ movu m3, [r2 + 4]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ pmaddwd m1, m3, [r3 + 16] ; [17]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ pmaddwd m0, m3, [r3 - 8 * 16] ; [08]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m1, m0
+
+ TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
+
+ movu m3, [r2 + 2]
+ pshufb m3, m2
+
+ pmaddwd m4, m3, [r3 + 15 * 16] ; [31]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ pmaddwd m1, m3, [r3 + 6 * 16] ; [22]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ pmaddwd m5, m3, [r3 - 3 * 16] ; [13]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ pmaddwd m6, m3, [r3 - 12 * 16] ; [04]
+ paddd m6, [pd_16]
+ psrld m6, 5
+ packusdw m5, m6
+
+ movu m3, [r2]
+ pshufb m3, m2
+
+ pmaddwd m6, m3, [r3 + 11 * 16] ; [27]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ pmaddwd m1, m3, [r3 - 7 * 16] ; [09]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ packusdw m1, m1
+ movhps m1, [r2] ; [00]
+
+ TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
+%endmacro
+
+%macro MODE_14_22 1
+ movu m3, [r2 + 24] ; [7 6 5 4 3 2 1 0]
+ pshufb m3, m2 ; [4 3 3 2 2 1 1 0]
+
+ pmaddwd m4, m3, [r3 + 3 * 16] ; [19]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ pmaddwd m1, m3, [r3 - 10 * 16] ; [06]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ movu m3, [r2 + 22]
+ pshufb m3, m2
+
+ pmaddwd m5, m3, [r3 + 9 * 16] ; [25]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ pmaddwd m6, m3, [r3 - 4 * 16] ; [12]
+ paddd m6, [pd_16]
+ psrld m6, 5
+ packusdw m5, m6
+
+ movu m3, [r2 + 20]
+ pshufb m3, m2
+
+ pmaddwd m6, m3, [r3 + 15 * 16] ; [31]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ pmaddwd m1, m3, [r3 - 11 * 16] ; [05]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ movu m3, [r2 + 18]
+ pshufb m3, m2
+
+ pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m1, m0
+
+ TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
+
+ pmaddwd m4, m3, [r3 - 5 * 16] ; [11]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ movu m3, [r2 + 16]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 14 * 16] ; [30]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ pmaddwd m5, m3, [r3 + 16] ; [17]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ pmaddwd m6, m3, [r3 - 12 * 16] ; [04]
+ paddd m6, [pd_16]
+ psrld m6, 5
+ packusdw m5, m6
+
+ movu m3, [r2 + 14]
+ pshufb m3, m2
+
+ pmaddwd m6, m3, [r3 + 7 * 16] ; [23]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ pmaddwd m1, m3, [r3 - 6 * 16] ; [10]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ movu m3, [r2 + 12]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 13 * 16] ; [29]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ pmaddwd m0, m3, [r3] ; [16]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m1, m0
+
+ TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
+
+ pmaddwd m4, m3, [r3 - 13 * 16] ; [03]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ movu m3, [r2 + 10]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 6 * 16] ; [22]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ pmaddwd m5, m3, [r3 - 7 * 16] ; [09]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ movu m3, [r2 + 8]
+ pshufb m3, m2
+
+ pmaddwd m0, m3, [r3 + 12 * 16] ; [28]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m5, m0
+
+ pmaddwd m6, m3, [r3 - 16] ; [15]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ pmaddwd m1, m3, [r3 - 14 * 16] ; [02]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ movu m3, [r2 + 6]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 5 * 16] ; [21]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ pmaddwd m0, m3, [r3 - 8 * 16] ; [08]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m1, m0
+
+ TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
+
+ movu m3, [r2 + 4]
+ pshufb m3, m2
+
+ pmaddwd m4, m3, [r3 + 11 * 16] ; [27]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ pmaddwd m1, m3, [r3 - 2 * 16] ; [14]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ pmaddwd m5, m3, [r3 - 15 * 16] ; [01]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ movu m3, [r2 + 2]
+ pshufb m3, m2
+
+ pmaddwd m6, m3, [r3 + 4 * 16] ; [20]
+ paddd m6, [pd_16]
+ psrld m6, 5
+ packusdw m5, m6
+
+ pmaddwd m6, m3, [r3 - 9 * 16] ; [07]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ movu m3, [r2]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ pmaddwd m1, m3, [r3 - 3 * 16] ; [13]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ packusdw m1, m1
+ movhps m1, [r2] ; [00]
+
+ TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
+%endmacro
+
+%macro MODE_15_21 1
+ movu m3, [r2 + 32] ; [7 6 5 4 3 2 1 0]
+ pshufb m3, m2 ; [4 3 3 2 2 1 1 0]
+
+ pmaddwd m4, m3, [r3 - 16] ; [15]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ movu m3, [r2 + 30]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 14 * 16] ; [30]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ pmaddwd m5, m3, [r3 - 3 * 16] ; [13]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ movu m3, [r2 + 28]
+ pshufb m3, m2
+
+ pmaddwd m6, m3, [r3 + 12 * 16] ; [28]
+ paddd m6, [pd_16]
+ psrld m6, 5
+ packusdw m5, m6
+
+ pmaddwd m6, m3, [r3 - 5 * 16] ; [11]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ movu m3, [r2 + 26]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ pmaddwd m1, m3, [r3 - 7 * 16] ; [09]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ movu m3, [r2 + 24]
+ pshufb m3, m2
+
+ pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m1, m0
+
+ TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
+
+ pmaddwd m4, m3, [r3 - 9 * 16] ; [07]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ movu m3, [r2 + 22]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 6 * 16] ; [22]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ pmaddwd m5, m3, [r3 - 11 * 16] ; [05]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ movu m3, [r2 + 20]
+ pshufb m3, m2
+
+ pmaddwd m6, m3, [r3 + 4 * 16] ; [20]
+ paddd m6, [pd_16]
+ psrld m6, 5
+ packusdw m5, m6
+
+ pmaddwd m6, m3, [r3 - 13 * 16] ; [03]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ movu m3, [r2 + 18]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ pmaddwd m1, m3, [r3 - 15 * 16] ; [01]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ movu m3, [r2 + 16]
+ pshufb m3, m2
+
+ pmaddwd m0, m3, [r3] ; [16]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m1, m0
+
+ TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
+
+ movu m3, [r2 + 14]
+ pshufb m3, m2
+
+ pmaddwd m4, m3, [r3 + 15 * 16] ; [31]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ pmaddwd m1, m3, [r3 - 2 * 16] ; [14]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ movu m3, [r2 + 12]
+ pshufb m3, m2
+
+ pmaddwd m5, m3, [r3 + 13 * 16] ; [29]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ pmaddwd m0, m3, [r3 - 4 * 16] ; [12]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m5, m0
+
+ movu m3, [r2 + 10]
+ pshufb m3, m2
+
+ pmaddwd m6, m3, [r3 + 11 * 16] ; [27]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ pmaddwd m1, m3, [r3 - 6 * 16] ; [10]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ movu m3, [r2 + 8]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 9 * 16] ; [25]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ pmaddwd m0, m3, [r3 - 8 * 16] ; [08]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m1, m0
+
+ TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
+
+ movu m3, [r2 + 6]
+ pshufb m3, m2
+
+ pmaddwd m4, m3, [r3 + 7 * 16] ; [23]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ pmaddwd m1, m3, [r3 - 10 * 16] ; [06]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ movu m3, [r2 + 4]
+ pshufb m3, m2
+
+ pmaddwd m5, m3, [r3 + 5 * 16] ; [21]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ pmaddwd m6, m3, [r3 - 12 * 16] ; [04]
+ paddd m6, [pd_16]
+ psrld m6, 5
+ packusdw m5, m6
+
+ movu m3, [r2 + 2]
+ pshufb m3, m2
+
+ pmaddwd m6, m3, [r3 + 3 * 16] ; [19]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ pmaddwd m1, m3, [r3 - 14 * 16] ; [02]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ movu m3, [r2]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 16] ; [17]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ packusdw m1, m1
+ movhps m1, [r2] ; [00]
+
+ TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
+%endmacro
+
+%macro MODE_16_20 1
+ movu m3, [r2 + 40] ; [7 6 5 4 3 2 1 0]
+ pshufb m3, m2 ; [4 3 3 2 2 1 1 0]
+
+ pmaddwd m4, m3, [r3 - 5 * 16] ; [11]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ movu m3, [r2 + 38]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 6 * 16] ; [22]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ pmaddwd m5, m3, [r3 - 15 * 16] ; [01]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ movu m3, [r2 + 36]
+ pshufb m3, m2
+
+ pmaddwd m6, m3, [r3 - 4 * 16] ; [12]
+ paddd m6, [pd_16]
+ psrld m6, 5
+ packusdw m5, m6
+
+ movu m3, [r2 + 34]
+ pshufb m3, m2
+
+ pmaddwd m6, m3, [r3 + 7 * 16] ; [23]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ pmaddwd m1, m3, [r3 - 14 * 16] ; [02]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ movu m3, [r2 + 32]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 - 3 * 16] ; [13]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ movu m3, [r2 + 30]
+ pshufb m3, m2
+
+ pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m1, m0
+
+ TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
+
+ pmaddwd m4, m3, [r3 - 13 * 16] ; [03]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ movu m3, [r2 + 28]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 - 2 * 16] ; [14]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ movu m3, [r2 + 26]
+ pshufb m3, m2
+
+ pmaddwd m5, m3, [r3 + 9 * 16] ; [25]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ pmaddwd m6, m3, [r3 - 12 * 16] ; [04]
+ paddd m6, [pd_16]
+ psrld m6, 5
+ packusdw m5, m6
+
+ movu m3, [r2 + 24]
+ pshufb m3, m2
+
+ pmaddwd m6, m3, [r3 - 16] ; [15]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ movu m3, [r2 + 22]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ pmaddwd m1, m3, [r3 - 11 * 16] ; [05]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ movu m3, [r2 + 20]
+ pshufb m3, m2
+
+ pmaddwd m0, m3, [r3] ; [16]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m1, m0
+
+ TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
+
+ movu m3, [r2 + 18]
+ pshufb m3, m2
+
+ pmaddwd m4, m3, [r3 + 11 * 16] ; [27]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ pmaddwd m1, m3, [r3 - 10 * 16] ; [06]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ movu m3, [r2 + 16]
+ pshufb m3, m2
+
+ pmaddwd m5, m3, [r3 + 16] ; [17]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ movu m3, [r2 + 14]
+ pshufb m3, m2
+
+ pmaddwd m0, m3, [r3 + 12 * 16] ; [28]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m5, m0
+
+ pmaddwd m6, m3, [r3 - 9 * 16] ; [07]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ movu m3, [r2 + 12]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ movu m3, [r2 + 10]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 13 * 16] ; [29]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ pmaddwd m0, m3, [r3 - 8 * 16] ; [08]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m1, m0
+
+ TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
+
+ movu m3, [r2 + 8]
+ pshufb m3, m2
+
+ pmaddwd m4, m3, [r3 + 3 * 16] ; [19]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ movu m3, [r2 + 6]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 14 * 16] ; [30]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ pmaddwd m5, m3, [r3 - 7 * 16] ; [09]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ movu m3, [r2 + 4]
+ pshufb m3, m2
+
+ pmaddwd m6, m3, [r3 + 4 * 16] ; [20]
+ paddd m6, [pd_16]
+ psrld m6, 5
+ packusdw m5, m6
+
+ movu m3, [r2 + 2]
+ pshufb m3, m2
+
+ pmaddwd m6, m3, [r3 + 15 * 16] ; [31]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ pmaddwd m1, m3, [r3 - 6 * 16] ; [10]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ movu m3, [r2]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 5 * 16] ; [21]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ packusdw m1, m1
+ movhps m1, [r2] ; [00]
+
+ TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
+%endmacro
+
+%macro MODE_17_19 1
+ movu m3, [r2 + 50] ; [7 6 5 4 3 2 1 0]
+ pshufb m3, m2 ; [4 3 3 2 2 1 1 0]
+
+ pmaddwd m4, m3, [r3 - 10 * 16] ; [06]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ movu m3, [r2 + 48]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 - 4 * 16] ; [12]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ movu m3, [r2 + 46]
+ pshufb m3, m2
+
+ pmaddwd m5, m3, [r3 + 2 * 16] ; [18]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ movu m3, [r2 + 44]
+ pshufb m3, m2
+
+ pmaddwd m6, m3, [r3 + 8 * 16] ; [24]
+ paddd m6, [pd_16]
+ psrld m6, 5
+ packusdw m5, m6
+
+ movu m3, [r2 + 42]
+ pshufb m3, m2
+
+ pmaddwd m6, m3, [r3 + 14 * 16] ; [30]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ pmaddwd m1, m3, [r3 - 12 * 16] ; [04]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ movu m3, [r2 + 40]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 - 6 * 16] ; [10]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ movu m3, [r2 + 38]
+ pshufb m3, m2
+
+ pmaddwd m0, m3, [r3] ; [16]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m1, m0
+
+ TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
+
+ movu m3, [r2 + 36]
+ pshufb m3, m2
+
+ pmaddwd m4, m3, [r3 + 6 * 16] ; [22]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ movu m3, [r2 + 34]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 12 * 16] ; [28]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ pmaddwd m5, m3, [r3 - 14 * 16] ; [02]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ movu m3, [r2 + 32]
+ pshufb m3, m2
+
+ pmaddwd m6, m3, [r3 - 8 * 16] ; [08]
+ paddd m6, [pd_16]
+ psrld m6, 5
+ packusdw m5, m6
+
+ movu m3, [r2 + 30]
+ pshufb m3, m2
+
+ pmaddwd m6, m3, [r3 - 2 * 16] ; [14]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ movu m3, [r2 + 28]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 4 * 16] ; [20]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ movu m3, [r2 + 26]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ packusdw m1, m1
+ movhps m1, [r2 + 26] ; [00]
+
+ TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
+
+ movu m3, [r2 + 24]
+ pshufb m3, m2
+
+ pmaddwd m4, m3, [r3 - 10 * 16] ; [06]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ movu m3, [r2 + 22]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 - 4 * 16] ; [12]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ movu m3, [r2 + 20]
+ pshufb m3, m2
+
+ pmaddwd m5, m3, [r3 + 2 * 16] ; [18]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ movu m3, [r2 + 18]
+ pshufb m3, m2
+
+ pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m5, m0
+
+ movu m3, [r2 + 16]
+ pshufb m3, m2
+
+ pmaddwd m6, m3, [r3 + 14 * 16] ; [30]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ pmaddwd m1, m3, [r3 - 12 * 16] ; [04]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ movu m3, [r2 + 14]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 - 6 * 16] ; [10]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ movu m3, [r2 + 12]
+ pshufb m3, m2
+
+ pmaddwd m0, m3, [r3] ; [16]
+ paddd m0, [pd_16]
+ psrld m0, 5
+ packusdw m1, m0
+
+ TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
+
+ movu m3, [r2 + 10]
+ pshufb m3, m2
+
+ pmaddwd m4, m3, [r3 + 6 * 16] ; [22]
+ paddd m4, [pd_16]
+ psrld m4, 5
+
+ movu m3, [r2 + 8]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 12 * 16] ; [28]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m4, m1
+
+ pmaddwd m5, m3, [r3 - 14 * 16] ; [02]
+ paddd m5, [pd_16]
+ psrld m5, 5
+
+ movu m3, [r2 + 6]
+ pshufb m3, m2
+
+ pmaddwd m6, m3, [r3 - 8 * 16] ; [08]
+ paddd m6, [pd_16]
+ psrld m6, 5
+ packusdw m5, m6
+
+ movu m3, [r2 + 4]
+ pshufb m3, m2
+
+ pmaddwd m6, m3, [r3 - 2 * 16] ; [14]
+ paddd m6, [pd_16]
+ psrld m6, 5
+
+ movu m3, [r2 + 2]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 4 * 16] ; [20]
+ paddd m1, [pd_16]
+ psrld m1, 5
+ packusdw m6, m1
+
+ movu m3, [r2]
+ pshufb m3, m2
+
+ pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
+ paddd m1, [pd_16]
+ psrld m1, 5
+
+ packusdw m1, m1
+ movhps m1, [r2] ; [00]
+
+ TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
+%endmacro
+
+;------------------------------------------------------------------------------------------
+; void intraPredAng32(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
+;------------------------------------------------------------------------------------------
+INIT_XMM ssse3
+cglobal intra_pred_ang32_2_new, 3,6,6
+ lea r4, [r2]
+ add r2, 128
+ cmp r3m, byte 34
+ cmove r2, r4
+
+ add r1, r1
+ lea r3, [r1 * 2]
+ lea r4, [r1 * 3]
+ mov r5, 2
+
+.loop:
+ MODE_2_34
+ add r2, 32
+ dec r5
+ jnz .loop
+ RET
+
INIT_XMM sse4
-cglobal intra_pred_ang32_3, 3,6,8
+cglobal intra_pred_ang32_3_new, 3,6,8
+ add r2, 128
lea r3, [ang_table + 16 * 16]
mov r4d, 8
add r1, r1
@@ -13618,199 +11455,9 @@
jnz .loop
RET
-%macro MODE_4_32 1
- movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
- movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9]
- palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2]
- punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5]
- punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1]
-
- pmaddwd m4, m0, [r3 + 5 * 16] ; [21]
- paddd m4, [pd_16]
- psrld m4, 5
-
- palignr m5, m2, m0, 4 ; [6 5 5 4 4 3 3 2]
- pmaddwd m1, m5, [r3 - 6 * 16] ; [10]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- pmaddwd m5, [r3 + 15 * 16] ; [31]
- paddd m5, [pd_16]
- psrld m5, 5
-
- palignr m6, m2, m0, 8
- pmaddwd m6, [r3 + 4 * 16] ; [ 20]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m5, m6
-
- palignr m1, m2, m0, 12
- pmaddwd m6, m1, [r3 - 7 * 16] ; [ 9]
- paddd m6, [pd_16]
- psrld m6, 5
-
- pmaddwd m1, [r3 + 14 * 16] ; [30]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- pmaddwd m1, m2, [r3 + 3 * 16] ; [19]
- paddd m1, [pd_16]
- psrld m1, 5
-
- palignr m7, m3, m2, 4 ; [10 9 9 8 7 6 5 4]
- pmaddwd m0, m7, [r3 - 8 * 16] ; [8]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m1, m0
-
- TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
-
- pmaddwd m4, m7, [r3 + 13 * 16] ; [29]
- paddd m4, [pd_16]
- psrld m4, 5
-
- movu m0, [r2 + 34] ; [24 23 22 21 20 19 18 17]
-
- palignr m2, m0, m3, 2 ; [17 16 15 14 13 12 11 10]
- palignr m1, m0, m3, 4 ; [18 17 16 15 14 13 12 11]
- punpckhwd m3, m2, m1 ; [18 17 17 16 16 15 15 14]
- punpcklwd m2, m1 ; [14 13 13 12 12 11 11 10]
-
- palignr m1, m2, m7, 4 ; [11 10 10 9 9 8 7 6]
- pmaddwd m1, [r3 + 2 * 16] ; [18]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- palignr m5, m2, m7, 8
- mova m6, m5
- pmaddwd m5, [r3 - 9 * 16] ; [07]
- paddd m5, [pd_16]
- psrld m5, 5
-
- pmaddwd m6, [r3 + 12 * 16] ; [28]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m5, m6
-
- palignr m6, m2, m7, 12
- pmaddwd m6, [r3 + 16] ; [17]
- paddd m6, [pd_16]
- psrld m6, 5
-
- pmaddwd m1, m2, [r3 - 10 * 16] ; [06]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- pmaddwd m1, m2, [r3 + 11 * 16] ; [27]
- paddd m1, [pd_16]
- psrld m1, 5
-
- palignr m7, m3, m2, 4
- pmaddwd m7, [r3] ; [16]
- paddd m7, [pd_16]
- psrld m7, 5
- packusdw m1, m7
- mova m7, m0
-
- TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
-
- palignr m0, m3, m2, 8
- pmaddwd m4, m0, [r3 - 11 * 16] ; [5]
- paddd m4, [pd_16]
- psrld m4, 5
-
- pmaddwd m1, m0, [r3 + 10 * 16] ; [26]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- palignr m5, m3, m2, 12
- pmaddwd m5, [r3 - 16] ; [15]
- paddd m5, [pd_16]
- psrld m5, 5
-
- pmaddwd m1, m3, [r3 - 12 * 16] ; [4]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m5, m1
-
- pmaddwd m6, m3, [r3 + 9 * 16] ; [25]
- paddd m6, [pd_16]
- psrld m6, 5
-
- movu m0, [r2 + 50] ; [32 31 30 29 28 27 26 25]
- palignr m2, m0, m7, 2 ; [25 24 23 22 21 20 19 18]
- palignr m1, m0, m7, 4 ; [26 25 24 23 22 21 20 19]
- punpckhwd m7, m2, m1 ; [26 25 25 24 24 23 23 22]
- punpcklwd m2, m1 ; [22 21 21 20 20 19 19 18]
-
- palignr m1, m2, m3, 4
- pmaddwd m1, [r3 - 2 * 16] ; [14]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- palignr m1, m2, m3, 8
- mova m0, m1
- pmaddwd m1, [r3 - 13 * 16] ; [3]
- paddd m1, [pd_16]
- psrld m1, 5
-
- pmaddwd m0, [r3 + 8 * 16] ; [24]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m1, m0
-
- TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
-
- palignr m4, m2, m3, 12
- pmaddwd m4, [r3 - 3 * 16] ; [13]
- paddd m4, [pd_16]
- psrld m4, 5
-
- pmaddwd m1, m2, [r3 - 14 * 16] ; [2]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- pmaddwd m5, m2, [r3 + 7 * 16] ; [23]
- paddd m5, [pd_16]
- psrld m5, 5
-
- palignr m6, m7, m2, 4
- pmaddwd m6, [r3 - 4 * 16] ; [12]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m5, m6
-
- palignr m1, m7, m2, 8
- pmaddwd m6, m1, [r3 - 15 * 16] ; [1]
- paddd m6, [pd_16]
- psrld m6, 5
-
- pmaddwd m1, [r3 + 6 * 16] ; [22]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- palignr m1, m7, m2, 12
- pmaddwd m1, [r3 - 5 * 16] ; [11]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m1, m1
- movhps m1, [r2 + 44] ; [00]
-
- TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
-%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_4(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_ang32_4, 3,6,8
+cglobal intra_pred_ang32_4_new, 3,6,8
+ add r2, 128
lea r3, [ang_table + 16 * 16]
mov r4d, 8
add r1, r1
@@ -13824,194 +11471,9 @@
jnz .loop
RET
-%macro MODE_5_31 1
- movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
- movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9]
- palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2]
- punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5]
- punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1]
-
- pmaddwd m4, m0, [r3 + 16] ; [17]
- paddd m4, [pd_16]
- psrld m4, 5
-
- palignr m1, m2, m0, 4
- mova m5, m1
- pmaddwd m1, [r3 - 14 * 16] ; [2]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- pmaddwd m5, [r3 + 3 * 16] ; [19]
- paddd m5, [pd_16]
- psrld m5, 5
-
- palignr m6, m2, m0, 8
- mova m1, m6
- pmaddwd m6, [r3 - 12 * 16] ; [4]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m5, m6
-
- pmaddwd m6, m1, [r3 + 5 * 16] ; [21]
- paddd m6, [pd_16]
- psrld m6, 5
-
- palignr m1, m2, m0, 12
- mova m7, m1
- pmaddwd m7, [r3 - 10 * 16] ; [6]
- paddd m7, [pd_16]
- psrld m7, 5
- packusdw m6, m7
-
- pmaddwd m1, [r3 + 7 * 16] ; [23]
- paddd m1, [pd_16]
- psrld m1, 5
-
- pmaddwd m7, m2, [r3 - 8 * 16] ; [8]
- paddd m7, [pd_16]
- psrld m7, 5
- packusdw m1, m7
-
- TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
-
- pmaddwd m4, m2, [r3 + 9 * 16] ; [25]
- paddd m4, [pd_16]
- psrld m4, 5
-
- palignr m7, m3, m2, 4 ; [10 9 9 8 7 6 5 4]
- pmaddwd m1, m7, [r3 - 6 * 16] ; [10]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- pmaddwd m5, m7, [r3 + 11 * 16] ; [27]
- paddd m5, [pd_16]
- psrld m5, 5
-
- movu m0, [r2 + 34] ; [24 23 22 21 20 19 18 17]
- palignr m2, m0, m3, 2 ; [17 16 15 14 13 12 11 10]
- palignr m1, m0, m3, 4 ; [18 17 16 15 14 13 12 11]
- punpckhwd m3, m2, m1 ; [18 17 17 16 16 15 15 14]
- punpcklwd m2, m1 ; [14 13 13 12 12 11 11 10]
-
- palignr m6, m2, m7, 4
- pmaddwd m1, m6, [r3 - 4 * 16] ; [12]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m5, m1
-
- pmaddwd m6, [r3 + 13 * 16] ; [29]
- paddd m6, [pd_16]
- psrld m6, 5
-
- palignr m1, m2, m7, 8
- mova m0, m1
- pmaddwd m1, [r3 - 2 * 16] ; [14]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- pmaddwd m1, m0, [r3 + 15 * 16] ; [31]
- paddd m1, [pd_16]
- psrld m1, 5
-
- palignr m0, m2, m7, 12
- pmaddwd m0, [r3] ; [16]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m1, m0
-
- TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
-
- pmaddwd m4, m2, [r3 - 15 * 16] ; [1]
- paddd m4, [pd_16]
- psrld m4, 5
-
- pmaddwd m1, m2, [r3 + 2 * 16] ; [18]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- palignr m1, m3, m2, 4
- pmaddwd m5, m1, [r3 - 13 * 16] ; [3]
- paddd m5, [pd_16]
- psrld m5, 5
-
- pmaddwd m1, [r3 + 4 * 16] ; [20]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m5, m1
-
- palignr m1, m3, m2, 8
- pmaddwd m6, m1, [r3 - 11 * 16] ; [5]
- paddd m6, [pd_16]
- psrld m6, 5
-
- pmaddwd m1, [r3 + 6 * 16] ; [22]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- palignr m7, m3, m2, 12
- pmaddwd m1, m7, [r3 - 9 * 16] ; [7]
- paddd m1, [pd_16]
- psrld m1, 5
-
- pmaddwd m7, [r3 + 8 * 16] ; [24]
- paddd m7, [pd_16]
- psrld m7, 5
- packusdw m1, m7
-
- TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
-
- pmaddwd m4, m3, [r3 - 7 * 16] ; [9]
- paddd m4, [pd_16]
- psrld m4, 5
-
- pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- movu m0, [r2 + 36] ; [25 24 23 22 21 20 19 18]
- palignr m1, m0, 2 ; [x 25 24 23 22 21 20 19]
- punpcklwd m0, m1 ; [22 21 21 20 20 19 19 18]
-
- palignr m1, m0, m3, 4
- pmaddwd m5, m1, [r3 - 5 * 16] ; [11]
- paddd m5, [pd_16]
- psrld m5, 5
-
- pmaddwd m1, [r3 + 12 * 16] ; [28]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m5, m1
-
- palignr m1, m0, m3, 8
- pmaddwd m6, m1, [r3 - 3 * 16] ; [13]
- paddd m6, [pd_16]
- psrld m6, 5
-
- pmaddwd m1, [r3 + 14 * 16] ; [30]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- palignr m1, m0, m3, 12
- pmaddwd m1, [r3 - 16] ; [15]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m1, m1
- movhps m1, [r2 + 36] ; [00]
-
- TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
-%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_5(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_ang32_5, 3,6,8
+cglobal intra_pred_ang32_5_new, 3,6,8
+ add r2, 128
lea r3, [ang_table + 16 * 16]
mov r4d, 8
add r1, r1
@@ -14025,184 +11487,9 @@
jnz .loop
RET
-%macro MODE_6_30 1
- movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
- movu m3, [r2 + 18] ; [16 15 14 13 12 11 10 9]
- palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2]
- punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5]
- punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1]
-
- pmaddwd m4, m0, [r3 - 3 * 16] ; [13]
- paddd m4, [pd_16]
- psrld m4, 5
-
- pmaddwd m1, m0, [r3 + 10 * 16] ; [26]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- palignr m1, m2, m0, 4
- pmaddwd m5, m1, [r3 - 9 * 16] ; [7]
- paddd m5, [pd_16]
- psrld m5, 5
-
- pmaddwd m1, [r3 + 4 * 16] ; [20]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m5, m1
-
- palignr m1, m2, m0, 8
- pmaddwd m6, m1, [r3 - 15 * 16] ; [1]
- paddd m6, [pd_16]
- psrld m6, 5
-
- pmaddwd m7, m1, [r3 - 2 * 16] ; [14]
- paddd m7, [pd_16]
- psrld m7, 5
- packusdw m6, m7
-
- pmaddwd m1, [r3 + 11 * 16] ; [27]
- paddd m1, [pd_16]
- psrld m1, 5
-
- palignr m7, m2, m0, 12
- pmaddwd m0, m7, [r3 - 8 * 16] ; [8]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m1, m0
-
- TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
-
- pmaddwd m4, m7, [r3 + 5 * 16] ; [21]
- paddd m4, [pd_16]
- psrld m4, 5
-
- pmaddwd m1, m2, [r3 - 14 * 16] ; [2]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- pmaddwd m5, m2, [r3 - 16] ; [15]
- paddd m5, [pd_16]
- psrld m5, 5
-
- pmaddwd m6, m2, [r3 + 12 * 16] ; [28]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m5, m6
-
- palignr m7, m3, m2, 4
- pmaddwd m6, m7, [r3 - 7 * 16] ; [9]
- paddd m6, [pd_16]
- psrld m6, 5
-
- pmaddwd m1, m7, [r3 + 6 * 16] ; [22]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- movu m0, [r2 + 34] ; [24 23 22 21 20 19 18 17]
- palignr m2, m0, m3, 2 ; [17 16 15 14 13 12 11 10]
- palignr m1, m0, m3, 4 ; [18 17 16 15 14 13 12 11]
- punpckhwd m3, m2, m1 ; [18 17 17 16 16 15 15 14]
- punpcklwd m2, m1 ; [14 13 13 12 12 11 11 10]
-
- palignr m0, m2, m7, 4
- pmaddwd m1, m0, [r3 - 13 * 16] ; [3]
- paddd m1, [pd_16]
- psrld m1, 5
-
- pmaddwd m0, [r3] ; [16]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m1, m0
-
- TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
-
- palignr m4, m2, m7, 4
- pmaddwd m4, [r3 + 13 * 16] ; [29]
- paddd m4, [pd_16]
- psrld m4, 5
-
- palignr m5, m2, m7, 8
- pmaddwd m1, m5, [r3 - 6 * 16] ; [10]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- pmaddwd m5, [r3 + 7 * 16] ; [23]
- paddd m5, [pd_16]
- psrld m5, 5
-
- palignr m1, m2, m7, 12
- pmaddwd m6, m1, [r3 - 12 * 16] ; [4]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m5, m6
-
- pmaddwd m6, m1, [r3 + 16] ; [17]
- paddd m6, [pd_16]
- psrld m6, 5
-
- pmaddwd m1, [r3 + 14 * 16] ; [30]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- pmaddwd m1, m2, [r3 - 5 * 16] ; [11]
- paddd m1, [pd_16]
- psrld m1, 5
-
- pmaddwd m0, m2, [r3 + 8 * 16] ; [24]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m1, m0
-
- TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
-
- palignr m5, m3, m2, 4
- pmaddwd m4, m5, [r3 - 11 * 16] ; [5]
- paddd m4, [pd_16]
- psrld m4, 5
-
- pmaddwd m1, m5, [r3 + 2 * 16] ; [18]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- pmaddwd m5, [r3 + 15 * 16] ; [31]
- paddd m5, [pd_16]
- psrld m5, 5
-
- palignr m6, m3, m2, 8
- pmaddwd m1, m6, [r3 - 4 * 16] ; [12]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m5, m1
-
- pmaddwd m6, [r3 + 9 * 16] ; [25]
- paddd m6, [pd_16]
- psrld m6, 5
-
- palignr m1, m3, m2, 12
- pmaddwd m0, m1, [r3 - 10 * 16] ; [6]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m6, m0
-
- pmaddwd m1, [r3 + 3 * 16] ; [19]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m1, m1
- movhps m1, [r2 + 28] ; [00]
-
- TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
-%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_6(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_ang32_6, 3,6,8
+cglobal intra_pred_ang32_6_new, 3,6,8
+ add r2, 128
lea r3, [ang_table + 16 * 16]
mov r4d, 8
add r1, r1
@@ -14216,178 +11503,9 @@
jnz .loop
RET
-%macro MODE_7_29 1
- movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
- movd m3, [r2 + 18] ; [16 15 14 13 12 11 10 9]
- palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2]
- punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5]
- punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1]
-
- pmaddwd m4, m0, [r3 - 7 * 16] ; [9]
- paddd m4, [pd_16]
- psrld m4, 5
-
- pmaddwd m1, m0, [r3 + 2 * 16] ; [18]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- pmaddwd m5, m0, [r3 + 11 * 16] ; [27]
- paddd m5, [pd_16]
- psrld m5, 5
-
- palignr m1, m2, m0, 4
- pmaddwd m6, m1, [r3 - 12 * 16] ; [4]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m5, m6
-
- pmaddwd m6, m1, [r3 - 3 * 16] ; [13]
- paddd m6, [pd_16]
- psrld m6, 5
-
- pmaddwd m7, m1, [r3 + 6 * 16] ; [22]
- paddd m7, [pd_16]
- psrld m7, 5
- packusdw m6, m7
-
- pmaddwd m1, [r3 + 15 * 16] ; [31]
- paddd m1, [pd_16]
- psrld m1, 5
-
- mova m3, m0
- palignr m7, m2, m0, 8
- pmaddwd m0, m7, [r3 - 8 * 16] ; [8]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m1, m0
-
- TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
-
- pmaddwd m4, m7, [r3 + 16] ; [17]
- paddd m4, [pd_16]
- psrld m4, 5
-
- pmaddwd m1, m7, [r3 + 10 * 16] ; [26]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- palignr m1, m2, m3, 12
- pmaddwd m5, m1, [r3 - 13 * 16] ; [3]
- paddd m5, [pd_16]
- psrld m5, 5
-
- pmaddwd m6, m1, [r3 - 4 * 16] ; [12]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m5, m6
-
- pmaddwd m6, m1, [r3 + 5 * 16] ; [21]
- paddd m6, [pd_16]
- psrld m6, 5
-
- pmaddwd m1, [r3 + 14 * 16] ; [30]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- pmaddwd m1, m2, [r3 - 9 * 16] ; [7]
- paddd m1, [pd_16]
- psrld m1, 5
-
- pmaddwd m0, m2, [r3] ; [16]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m1, m0
-
- TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
-
- pmaddwd m4, m2, [r3 + 9 * 16] ; [25]
- paddd m4, [pd_16]
- psrld m4, 5
-
- movu m7, [r2 + 18] ; [16 15 14 13 12 11 10 9]
- palignr m1, m7, 2 ; [x 16 15 14 13 12 11 10]
- punpcklwd m7, m1 ; [13 12 12 11 11 10 10 9]
-
- palignr m6, m7, m2, 4
- pmaddwd m1, m6, [r3 - 14 * 16] ; [2]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- pmaddwd m5, m6, [r3 - 5 * 16] ; [11]
- paddd m5, [pd_16]
- psrld m5, 5
-
- pmaddwd m0, m6, [r3 + 4 * 16] ; [20]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m5, m0
-
- pmaddwd m6, [r3 + 13 * 16] ; [29]
- paddd m6, [pd_16]
- psrld m6, 5
-
- palignr m0, m7, m2, 8
- pmaddwd m1, m0, [r3 - 10 * 16] ; [6]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- pmaddwd m1, m0, [r3 - 16] ; [15]
- paddd m1, [pd_16]
- psrld m1, 5
-
- pmaddwd m0, [r3 + 8 * 16] ; [24]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m1, m0
-
- TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
-
- palignr m0, m7, m2, 12
- pmaddwd m4, m0, [r3 - 15 * 16] ; [1]
- paddd m4, [pd_16]
- psrld m4, 5
-
- pmaddwd m1, m0, [r3 - 6 * 16] ; [10]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- pmaddwd m5, m0, [r3 + 3 * 16] ; [19]
- paddd m5, [pd_16]
- psrld m5, 5
-
- pmaddwd m0, [r3 + 12 * 16] ; [28]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m5, m0
-
- pmaddwd m6, m7, [r3 - 11 * 16] ; [5]
- paddd m6, [pd_16]
- psrld m6, 5
-
- pmaddwd m0, m7, [r3 - 2 * 16] ; [14]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m6, m0
-
- pmaddwd m1, m7, [r3 + 7 * 16] ; [23]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m1, m1
- movhps m1, [r2 + 20] ; [00]
-
- TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
-%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_7(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_ang32_7, 3,6,8
+cglobal intra_pred_ang32_7_new, 3,6,8
+ add r2, 128
lea r3, [ang_table + 16 * 16]
mov r4d, 8
add r1, r1
@@ -14401,171 +11519,9 @@
jnz .loop
RET
-%macro MODE_8_28 1
- movu m0, [r2 + 2] ; [8 7 6 5 4 3 2 1]
- movd m3, [r2 + 18] ; [16 15 14 13 12 11 10 9]
- palignr m1, m3, m0, 2 ; [9 8 7 6 5 4 3 2]
- punpckhwd m2, m0, m1 ; [9 8 8 7 7 6 6 5]
- punpcklwd m0, m1 ; [5 4 4 3 3 2 2 1]
-
- pmaddwd m4, m0, [r3 - 11 * 16] ; [5]
- paddd m4, [pd_16]
- psrld m4, 5
-
- pmaddwd m1, m0, [r3 - 6 * 16] ; [10]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- pmaddwd m5, m0, [r3 - 16] ; [15]
- paddd m5, [pd_16]
- psrld m5, 5
-
- pmaddwd m6, m0, [r3 + 4 * 16] ; [20]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m5, m6
-
- pmaddwd m6, m0, [r3 + 9 * 16] ; [25]
- paddd m6, [pd_16]
- psrld m6, 5
-
- pmaddwd m1, m0, [r3 + 14 * 16] ; [30]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- palignr m7, m2, m0, 4
- pmaddwd m1, m7, [r3 - 13 * 16] ; [3]
- paddd m1, [pd_16]
- psrld m1, 5
-
- mova m3, m0
- pmaddwd m0, m7, [r3 - 8 * 16] ; [8]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m1, m0
-
- TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
-
- pmaddwd m4, m7, [r3 - 3 * 16] ; [13]
- paddd m4, [pd_16]
- psrld m4, 5
-
- pmaddwd m1, m7, [r3 + 2 * 16] ; [18]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- pmaddwd m5, m7, [r3 + 7 * 16] ; [23]
- paddd m5, [pd_16]
- psrld m5, 5
-
- pmaddwd m6, m7, [r3 + 12 * 16] ; [28]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m5, m6
-
- palignr m7, m2, m3, 8
- pmaddwd m6, m7, [r3 - 15 * 16] ; [1]
- paddd m6, [pd_16]
- psrld m6, 5
-
- pmaddwd m1, m7, [r3 - 10 * 16] ; [6]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- pmaddwd m1, m7, [r3 - 5 * 16] ; [11]
- paddd m1, [pd_16]
- psrld m1, 5
-
- pmaddwd m0, m7, [r3] ; [16]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m1, m0
-
- TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
-
- pmaddwd m4, m7, [r3 + 5 * 16] ; [21]
- paddd m4, [pd_16]
- psrld m4, 5
-
- pmaddwd m1, m7, [r3 + 10 * 16] ; [26]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- pmaddwd m5, m7, [r3 + 15 * 16] ; [31]
- paddd m5, [pd_16]
- psrld m5, 5
-
- palignr m7, m2, m3, 12
- pmaddwd m0, m7, [r3 - 12 * 16] ; [4]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m5, m0
-
- pmaddwd m6, m7, [r3 - 7 * 16] ; [9]
- paddd m6, [pd_16]
- psrld m6, 5
-
- pmaddwd m1, m7, [r3 - 2 * 16] ; [14]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- pmaddwd m1, m7, [r3 + 3 * 16] ; [19]
- paddd m1, [pd_16]
- psrld m1, 5
-
- pmaddwd m0, m7, [r3 + 8 * 16] ; [24]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m1, m0
-
- TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
-
- pmaddwd m4, m7, [r3 + 13 * 16] ; [29]
- paddd m4, [pd_16]
- psrld m4, 5
-
- pmaddwd m1, m2, [r3 - 14 * 16] ; [2]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- pmaddwd m5, m2, [r3 - 9 * 16] ; [7]
- paddd m5, [pd_16]
- psrld m5, 5
-
- pmaddwd m0, m2, [r3 - 4 * 16] ; [12]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m5, m0
-
- pmaddwd m6, m2, [r3 + 16] ; [17]
- paddd m6, [pd_16]
- psrld m6, 5
-
- pmaddwd m0, m2, [r3 + 6 * 16] ; [22]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m6, m0
-
- pmaddwd m1, m2, [r3 + 11 * 16] ; [27]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m1, m1
- movhps m1, [r2 + 12] ; [00]
-
- TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
-%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_8(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_ang32_8, 3,6,8
+cglobal intra_pred_ang32_8_new, 3,6,8
+ add r2, 128
lea r3, [ang_table + 16 * 16]
mov r4d, 8
add r1, r1
@@ -14579,165 +11535,9 @@
jnz .loop
RET
-%macro MODE_9_27 1
- movu m3, [r2 + 2] ; [8 7 6 5 4 3 2 1]
- palignr m1, m3, 2 ; [9 8 7 6 5 4 3 2]
- punpckhwd m2, m3, m1 ; [9 8 8 7 7 6 6 5]
- punpcklwd m3, m1 ; [5 4 4 3 3 2 2 1]
-
- pmaddwd m4, m3, [r3 - 14 * 16] ; [2]
- paddd m4, [pd_16]
- psrld m4, 5
-
- pmaddwd m1, m3, [r3 - 12 * 16] ; [4]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- pmaddwd m5, m3, [r3 - 10 * 16] ; [6]
- paddd m5, [pd_16]
- psrld m5, 5
-
- pmaddwd m6, m3, [r3 - 8 * 16] ; [8]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m5, m6
-
- pmaddwd m6, m3, [r3 - 6 * 16] ; [10]
- paddd m6, [pd_16]
- psrld m6, 5
-
- pmaddwd m1, m3, [r3 - 4 * 16] ; [12]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- pmaddwd m1, m3, [r3 - 2 * 16] ; [14]
- paddd m1, [pd_16]
- psrld m1, 5
-
- pmaddwd m0, m3, [r3] ; [16]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m1, m0
-
- TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
-
- pmaddwd m4, m3, [r3 + 2 * 16] ; [18]
- paddd m4, [pd_16]
- psrld m4, 5
-
- pmaddwd m1, m3, [r3 + 4 * 16] ; [20]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- pmaddwd m5, m3, [r3 + 6 * 16] ; [22]
- paddd m5, [pd_16]
- psrld m5, 5
-
- pmaddwd m6, m3, [r3 + 8 * 16] ; [24]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m5, m6
-
- pmaddwd m6, m3, [r3 + 10 * 16] ; [26]
- paddd m6, [pd_16]
- psrld m6, 5
-
- pmaddwd m1, m3, [r3 + 12 * 16] ; [28]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- pmaddwd m1, m3, [r3 + 14 * 16] ; [30]
- paddd m1, [pd_16]
- psrld m1, 5
-
- packusdw m1, m1
- movhps m1, [r2 + 4] ; [00]
-
- TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
-
- palignr m7, m2, m3, 4
- pmaddwd m4, m7, [r3 - 14 * 16] ; [2]
- paddd m4, [pd_16]
- psrld m4, 5
-
- pmaddwd m1, m7, [r3 - 12 * 16] ; [4]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- pmaddwd m5, m7, [r3 - 10 * 16] ; [6]
- paddd m5, [pd_16]
- psrld m5, 5
-
- pmaddwd m0, m7, [r3 - 8 * 16] ; [8]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m5, m0
-
- pmaddwd m6, m7, [r3 - 6 * 16] ; [10]
- paddd m6, [pd_16]
- psrld m6, 5
-
- pmaddwd m1, m7, [r3 - 4 * 16] ; [12]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- pmaddwd m1, m7, [r3 - 2 * 16] ; [14]
- paddd m1, [pd_16]
- psrld m1, 5
-
- pmaddwd m0, m7, [r3] ; [16]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m1, m0
-
- TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
-
- pmaddwd m4, m7, [r3 + 2 * 16] ; [18]
- paddd m4, [pd_16]
- psrld m4, 5
-
- pmaddwd m1, m7, [r3 + 4 * 16] ; [20]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- pmaddwd m5, m7, [r3 + 6 * 16] ; [22]
- paddd m5, [pd_16]
- psrld m5, 5
-
- pmaddwd m0, m7, [r3 + 8 * 16] ; [24]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m5, m0
-
- pmaddwd m6, m7, [r3 + 10 * 16] ; [26]
- paddd m6, [pd_16]
- psrld m6, 5
-
- pmaddwd m0, m7, [r3 + 12 * 16] ; [28]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m6, m0
-
- pmaddwd m7, [r3 + 14 * 16] ; [30]
- paddd m7, [pd_16]
- psrld m7, 5
- packusdw m7, m7
- movhps m7, [r2 + 6] ; [00]
-
- TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m7
-%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_9(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_ang32_9, 3,6,8
+cglobal intra_pred_ang32_9_new, 3,6,8
+ add r2, 128
lea r3, [ang_table + 16 * 16]
mov r4d, 8
add r1, r1
@@ -14751,11 +11551,9 @@
jnz .loop
RET
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_10(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_ang32_10, 4,7,8
+cglobal intra_pred_ang32_10_new, 3,7,8
+ add r2, 128
mov r6d, 4
add r1, r1
lea r5, [r1 * 3]
@@ -14828,167 +11626,12 @@
jnz .loop
RET
-%macro MODE_11_25 1
- movu m3, [r2 + 2] ; [7 6 5 4 3 2 1 0]
- pshufb m3, [pw_punpcklwd] ; [4 3 3 2 2 1 1 0]
-
- pmaddwd m4, m3, [r3 + 14 * 16] ; [30]
- paddd m4, [pd_16]
- psrld m4, 5
-
- pmaddwd m1, m3, [r3 + 12 * 16] ; [28]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- pmaddwd m5, m3, [r3 + 10 * 16] ; [26]
- paddd m5, [pd_16]
- psrld m5, 5
-
- pmaddwd m6, m3, [r3 + 8 * 16] ; [24]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m5, m6
-
- pmaddwd m6, m3, [r3 + 6 * 16] ; [22]
- paddd m6, [pd_16]
- psrld m6, 5
-
- pmaddwd m1, m3, [r3 + 4 * 16] ; [20]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
- paddd m1, [pd_16]
- psrld m1, 5
-
- pmaddwd m0, m3, [r3] ; [16]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m1, m0
-
- TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
-
- pmaddwd m4, m3, [r3 - 2 * 16] ; [14]
- paddd m4, [pd_16]
- psrld m4, 5
-
- pmaddwd m1, m3, [r3 - 4 * 16] ; [12]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- pmaddwd m5, m3, [r3 - 6 * 16] ; [10]
- paddd m5, [pd_16]
- psrld m5, 5
-
- pmaddwd m6, m3, [r3 - 8 * 16] ; [8]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m5, m6
-
- pmaddwd m6, m3, [r3 - 10 * 16] ; [6]
- paddd m6, [pd_16]
- psrld m6, 5
-
- pmaddwd m1, m3, [r3 - 12 * 16] ; [4]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- pmaddwd m1, m3, [r3 - 14 * 16] ; [2]
- paddd m1, [pd_16]
- psrld m1, 5
-
- packusdw m1, m1
- movhps m1, [r2 + 2] ; [00]
-
- TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
-
- movu m3, [r2] ; [6 5 4 3 2 1 0 16]
- pshufb m3, [pw_punpcklwd] ; [3 2 2 1 1 0 0 16]
-
- pmaddwd m4, m3, [r3 + 14 * 16] ; [30]
- paddd m4, [pd_16]
- psrld m4, 5
-
- pmaddwd m1, m3, [r3 + 12 * 16] ; [28]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- pmaddwd m5, m3, [r3 + 10 * 16] ; [26]
- paddd m5, [pd_16]
- psrld m5, 5
-
- pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m5, m0
-
- pmaddwd m6, m3, [r3 + 6 * 16] ; [22]
- paddd m6, [pd_16]
- psrld m6, 5
-
- pmaddwd m1, m3, [r3 + 4 * 16] ; [20]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
- paddd m1, [pd_16]
- psrld m1, 5
-
- pmaddwd m0, m3, [r3] ; [16]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m1, m0
-
- TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
-
- pmaddwd m4, m3, [r3 - 2 * 16] ; [14]
- paddd m4, [pd_16]
- psrld m4, 5
-
- pmaddwd m1, m3, [r3 - 4 * 16] ; [12]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- pmaddwd m5, m3, [r3 - 6 * 16] ; [10]
- paddd m5, [pd_16]
- psrld m5, 5
-
- pmaddwd m6, m3, [r3 - 8 * 16] ; [8]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m5, m6
-
- pmaddwd m6, m3, [r3 - 10 * 16] ; [6]
- paddd m6, [pd_16]
- psrld m6, 5
-
- pmaddwd m1, m3, [r3 - 12 * 16] ; [4]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- pmaddwd m1, m3, [r3 - 14 * 16] ; [2]
- paddd m1, [pd_16]
- psrld m1, 5
-
- packusdw m1, m1
- movhps m1, [r2] ; [00]
-
- TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
-%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_11(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_ang32_11, 4,6,7,0-(4*mmsize+4)
+cglobal intra_pred_ang32_11_new, 3,6,7,0-(4*mmsize+4)
+ mov r3, r2mp
+ add r2, 128
movu m0, [r2 + 0*mmsize]
+ pinsrw m0, [r3], 0
movu m1, [r2 + 1*mmsize]
movu m2, [r2 + 2*mmsize]
movu m3, [r2 + 3*mmsize]
@@ -15015,178 +11658,12 @@
jnz .loop
RET
-%macro MODE_12_24 1
- movu m3, [r2 + 8] ; [7 6 5 4 3 2 1 0]
- pshufb m3, m2 ; [4 3 3 2 2 1 1 0]
-
- pmaddwd m4, m3, [r3 + 11 * 16] ; [27]
- paddd m4, [pd_16]
- psrld m4, 5
-
- pmaddwd m1, m3, [r3 + 6 * 16] ; [22]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- pmaddwd m5, m3, [r3 + 16] ; [17]
- paddd m5, [pd_16]
- psrld m5, 5
-
- pmaddwd m6, m3, [r3 - 4 * 16] ; [12]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m5, m6
-
- pmaddwd m6, m3, [r3 - 9 * 16] ; [7]
- paddd m6, [pd_16]
- psrld m6, 5
-
- pmaddwd m1, m3, [r3 - 14 * 16] ; [2]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- movu m3, [r2 + 6]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 + 13 * 16] ; [29]
- paddd m1, [pd_16]
- psrld m1, 5
-
- pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m1, m0
-
- TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
-
- pmaddwd m4, m3, [r3 + 3 * 16] ; [19]
- paddd m4, [pd_16]
- psrld m4, 5
-
- pmaddwd m1, m3, [r3 - 2 * 16] ; [14]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- pmaddwd m5, m3, [r3 - 7 * 16] ; [9]
- paddd m5, [pd_16]
- psrld m5, 5
-
- pmaddwd m6, m3, [r3 - 12 * 16] ; [4]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m5, m6
-
- movu m3, [r2 + 4]
- pshufb m3, m2
-
- pmaddwd m6, m3, [r3 + 15 * 16] ; [31]
- paddd m6, [pd_16]
- psrld m6, 5
-
- pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- pmaddwd m1, m3, [r3 + 5 * 16] ; [21]
- paddd m1, [pd_16]
- psrld m1, 5
-
- pmaddwd m0, m3, [r3] ; [16]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m1, m0
-
- TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
-
- pmaddwd m4, m3, [r3 - 5 * 16] ; [11]
- paddd m4, [pd_16]
- psrld m4, 5
-
- pmaddwd m1, m3, [r3 - 10 * 16] ; [6]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- pmaddwd m5, m3, [r3 - 15 * 16] ; [1]
- paddd m5, [pd_16]
- psrld m5, 5
-
- movu m3, [r2 + 2]
- pshufb m3, m2
-
- pmaddwd m0, m3, [r3 + 12 * 16] ; [28]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m5, m0
-
- pmaddwd m6, m3, [r3 + 7 * 16] ; [23]
- paddd m6, [pd_16]
- psrld m6, 5
-
- pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- pmaddwd m1, m3, [r3 - 3 * 16] ; [13]
- paddd m1, [pd_16]
- psrld m1, 5
-
- pmaddwd m0, m3, [r3 - 8 * 16] ; [8]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m1, m0
-
- TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
-
- pmaddwd m4, m3, [r3 - 13 * 16] ; [3]
- paddd m4, [pd_16]
- psrld m4, 5
-
- movu m3, [r2]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 + 14 * 16] ; [30]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- pmaddwd m5, m3, [r3 + 9 * 16] ; [25]
- paddd m5, [pd_16]
- psrld m5, 5
-
- pmaddwd m6, m3, [r3 + 4 * 16] ; [20]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m5, m6
-
- pmaddwd m6, m3, [r3 - 16] ; [15]
- paddd m6, [pd_16]
- psrld m6, 5
-
- pmaddwd m1, m3, [r3 - 6 * 16] ; [10]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- pmaddwd m1, m3, [r3 - 11 * 16] ; [5]
- paddd m1, [pd_16]
- psrld m1, 5
-
- packusdw m1, m1
- movhps m1, [r2] ; [00]
-
- TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
-%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_12(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_ang32_12, 4,6,7,0-(4*mmsize+10)
+cglobal intra_pred_ang32_12_new, 3,6,7,0-(4*mmsize+10)
+ mov r3, r2mp
+ add r2, 128
movu m0, [r2 + 0*mmsize]
+ pinsrw m0, [r3], 0
movu m1, [r2 + 1*mmsize]
movu m2, [r2 + 2*mmsize]
movu m3, [r2 + 3*mmsize]
@@ -15221,190 +11698,12 @@
jnz .loop
RET
-%macro MODE_13_23 1
- movu m3, [r2 + 16] ; [7 6 5 4 3 2 1 0]
- pshufb m3, m2 ; [4 3 3 2 2 1 1 0]
-
- pmaddwd m4, m3, [r3 + 7 * 16] ; [23]
- paddd m4, [pd_16]
- psrld m4, 5
-
- pmaddwd m1, m3, [r3 - 2 * 16] ; [14]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- pmaddwd m5, m3, [r3 - 11 * 16] ; [05]
- paddd m5, [pd_16]
- psrld m5, 5
-
- movu m3, [r2 + 14]
- pshufb m3, m2
-
- pmaddwd m6, m3, [r3 + 12 * 16] ; [28]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m5, m6
-
- pmaddwd m6, m3, [r3 + 3 * 16] ; [19]
- paddd m6, [pd_16]
- psrld m6, 5
-
- pmaddwd m1, m3, [r3 - 6 * 16] ; [10]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- pmaddwd m1, m3, [r3 - 15 * 16] ; [01]
- paddd m1, [pd_16]
- psrld m1, 5
-
- movu m3, [r2 + 12]
- pshufb m3, m2
-
- pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m1, m0
-
- TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
-
- pmaddwd m4, m3, [r3 - 16] ; [15]
- paddd m4, [pd_16]
- psrld m4, 5
-
- pmaddwd m1, m3, [r3 - 10 * 16] ; [06]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- movu m3, [r2 + 10]
- pshufb m3, m2
-
- pmaddwd m5, m3, [r3 + 13 * 16] ; [29]
- paddd m5, [pd_16]
- psrld m5, 5
-
- pmaddwd m6, m3, [r3 + 4 * 16] ; [20]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m5, m6
-
- pmaddwd m6, m3, [r3 - 5 * 16] ; [11]
- paddd m6, [pd_16]
- psrld m6, 5
-
- pmaddwd m1, m3, [r3 - 14 * 16] ; [02]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- movu m3, [r2 + 8]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 + 9 * 16] ; [25]
- paddd m1, [pd_16]
- psrld m1, 5
-
- pmaddwd m0, m3, [r3] ; [16]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m1, m0
-
- TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
-
- pmaddwd m4, m3, [r3 - 9 * 16] ; [07]
- paddd m4, [pd_16]
- psrld m4, 5
-
- movu m3, [r2 + 6]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 + 14 * 16] ; [30]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- pmaddwd m5, m3, [r3 + 5 * 16] ; [21]
- paddd m5, [pd_16]
- psrld m5, 5
-
- pmaddwd m0, m3, [r3 - 4 * 16] ; [12]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m5, m0
-
- pmaddwd m6, m3, [r3 - 13 * 16] ; [03]
- paddd m6, [pd_16]
- psrld m6, 5
-
- movu m3, [r2 + 4]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- pmaddwd m1, m3, [r3 + 16] ; [17]
- paddd m1, [pd_16]
- psrld m1, 5
-
- pmaddwd m0, m3, [r3 - 8 * 16] ; [08]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m1, m0
-
- TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
-
- movu m3, [r2 + 2]
- pshufb m3, m2
-
- pmaddwd m4, m3, [r3 + 15 * 16] ; [31]
- paddd m4, [pd_16]
- psrld m4, 5
-
- pmaddwd m1, m3, [r3 + 6 * 16] ; [22]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- pmaddwd m5, m3, [r3 - 3 * 16] ; [13]
- paddd m5, [pd_16]
- psrld m5, 5
-
- pmaddwd m6, m3, [r3 - 12 * 16] ; [04]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m5, m6
-
- movu m3, [r2]
- pshufb m3, m2
-
- pmaddwd m6, m3, [r3 + 11 * 16] ; [27]
- paddd m6, [pd_16]
- psrld m6, 5
-
- pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- pmaddwd m1, m3, [r3 - 7 * 16] ; [09]
- paddd m1, [pd_16]
- psrld m1, 5
-
- packusdw m1, m1
- movhps m1, [r2] ; [00]
-
- TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
-%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_13(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_ang32_13, 4,6,7,0-(5*mmsize+2)
+cglobal intra_pred_ang32_13_new, 3,6,7,0-(5*mmsize+2)
+ mov r3, r2mp
+ add r2, 128
movu m0, [r2 + 0*mmsize]
+ pinsrw m0, [r3], 0
movu m1, [r2 + 1*mmsize]
movu m2, [r2 + 2*mmsize]
movu m3, [r2 + 3*mmsize]
@@ -15441,202 +11740,12 @@
jnz .loop
RET
-%macro MODE_14_22 1
- movu m3, [r2 + 24] ; [7 6 5 4 3 2 1 0]
- pshufb m3, m2 ; [4 3 3 2 2 1 1 0]
-
- pmaddwd m4, m3, [r3 + 3 * 16] ; [19]
- paddd m4, [pd_16]
- psrld m4, 5
-
- pmaddwd m1, m3, [r3 - 10 * 16] ; [06]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- movu m3, [r2 + 22]
- pshufb m3, m2
-
- pmaddwd m5, m3, [r3 + 9 * 16] ; [25]
- paddd m5, [pd_16]
- psrld m5, 5
-
- pmaddwd m6, m3, [r3 - 4 * 16] ; [12]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m5, m6
-
- movu m3, [r2 + 20]
- pshufb m3, m2
-
- pmaddwd m6, m3, [r3 + 15 * 16] ; [31]
- paddd m6, [pd_16]
- psrld m6, 5
-
- pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- pmaddwd m1, m3, [r3 - 11 * 16] ; [05]
- paddd m1, [pd_16]
- psrld m1, 5
-
- movu m3, [r2 + 18]
- pshufb m3, m2
-
- pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m1, m0
-
- TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
-
- pmaddwd m4, m3, [r3 - 5 * 16] ; [11]
- paddd m4, [pd_16]
- psrld m4, 5
-
- movu m3, [r2 + 16]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 + 14 * 16] ; [30]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- pmaddwd m5, m3, [r3 + 16] ; [17]
- paddd m5, [pd_16]
- psrld m5, 5
-
- pmaddwd m6, m3, [r3 - 12 * 16] ; [04]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m5, m6
-
- movu m3, [r2 + 14]
- pshufb m3, m2
-
- pmaddwd m6, m3, [r3 + 7 * 16] ; [23]
- paddd m6, [pd_16]
- psrld m6, 5
-
- pmaddwd m1, m3, [r3 - 6 * 16] ; [10]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- movu m3, [r2 + 12]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 + 13 * 16] ; [29]
- paddd m1, [pd_16]
- psrld m1, 5
-
- pmaddwd m0, m3, [r3] ; [16]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m1, m0
-
- TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
-
- pmaddwd m4, m3, [r3 - 13 * 16] ; [03]
- paddd m4, [pd_16]
- psrld m4, 5
-
- movu m3, [r2 + 10]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 + 6 * 16] ; [22]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- pmaddwd m5, m3, [r3 - 7 * 16] ; [09]
- paddd m5, [pd_16]
- psrld m5, 5
-
- movu m3, [r2 + 8]
- pshufb m3, m2
-
- pmaddwd m0, m3, [r3 + 12 * 16] ; [28]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m5, m0
-
- pmaddwd m6, m3, [r3 - 16] ; [15]
- paddd m6, [pd_16]
- psrld m6, 5
-
- pmaddwd m1, m3, [r3 - 14 * 16] ; [02]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- movu m3, [r2 + 6]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 + 5 * 16] ; [21]
- paddd m1, [pd_16]
- psrld m1, 5
-
- pmaddwd m0, m3, [r3 - 8 * 16] ; [08]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m1, m0
-
- TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
-
- movu m3, [r2 + 4]
- pshufb m3, m2
-
- pmaddwd m4, m3, [r3 + 11 * 16] ; [27]
- paddd m4, [pd_16]
- psrld m4, 5
-
- pmaddwd m1, m3, [r3 - 2 * 16] ; [14]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- pmaddwd m5, m3, [r3 - 15 * 16] ; [01]
- paddd m5, [pd_16]
- psrld m5, 5
-
- movu m3, [r2 + 2]
- pshufb m3, m2
-
- pmaddwd m6, m3, [r3 + 4 * 16] ; [20]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m5, m6
-
- pmaddwd m6, m3, [r3 - 9 * 16] ; [07]
- paddd m6, [pd_16]
- psrld m6, 5
-
- movu m3, [r2]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- pmaddwd m1, m3, [r3 - 3 * 16] ; [13]
- paddd m1, [pd_16]
- psrld m1, 5
-
- packusdw m1, m1
- movhps m1, [r2] ; [00]
-
- TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
-%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_14(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_ang32_14, 4,6,7,0-(5*mmsize+10)
+cglobal intra_pred_ang32_14_new, 3,6,7,0-(5*mmsize+10)
+ mov r3, r2mp
+ add r2, 128
movu m0, [r2 + 0*mmsize]
+ pinsrw m0, [r3], 0
movu m1, [r2 + 1*mmsize]
movu m2, [r2 + 2*mmsize]
movu m3, [r2 + 3*mmsize]
@@ -15674,214 +11783,12 @@
jnz .loop
RET
-%macro MODE_15_21 1
- movu m3, [r2 + 32] ; [7 6 5 4 3 2 1 0]
- pshufb m3, m2 ; [4 3 3 2 2 1 1 0]
-
- pmaddwd m4, m3, [r3 - 16] ; [15]
- paddd m4, [pd_16]
- psrld m4, 5
-
- movu m3, [r2 + 30]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 + 14 * 16] ; [30]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- pmaddwd m5, m3, [r3 - 3 * 16] ; [13]
- paddd m5, [pd_16]
- psrld m5, 5
-
- movu m3, [r2 + 28]
- pshufb m3, m2
-
- pmaddwd m6, m3, [r3 + 12 * 16] ; [28]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m5, m6
-
- pmaddwd m6, m3, [r3 - 5 * 16] ; [11]
- paddd m6, [pd_16]
- psrld m6, 5
-
- movu m3, [r2 + 26]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- pmaddwd m1, m3, [r3 - 7 * 16] ; [09]
- paddd m1, [pd_16]
- psrld m1, 5
-
- movu m3, [r2 + 24]
- pshufb m3, m2
-
- pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m1, m0
-
- TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
-
- pmaddwd m4, m3, [r3 - 9 * 16] ; [07]
- paddd m4, [pd_16]
- psrld m4, 5
-
- movu m3, [r2 + 22]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 + 6 * 16] ; [22]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- pmaddwd m5, m3, [r3 - 11 * 16] ; [05]
- paddd m5, [pd_16]
- psrld m5, 5
-
- movu m3, [r2 + 20]
- pshufb m3, m2
-
- pmaddwd m6, m3, [r3 + 4 * 16] ; [20]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m5, m6
-
- pmaddwd m6, m3, [r3 - 13 * 16] ; [03]
- paddd m6, [pd_16]
- psrld m6, 5
-
- movu m3, [r2 + 18]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- pmaddwd m1, m3, [r3 - 15 * 16] ; [01]
- paddd m1, [pd_16]
- psrld m1, 5
-
- movu m3, [r2 + 16]
- pshufb m3, m2
-
- pmaddwd m0, m3, [r3] ; [16]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m1, m0
-
- TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
-
- movu m3, [r2 + 14]
- pshufb m3, m2
-
- pmaddwd m4, m3, [r3 + 15 * 16] ; [31]
- paddd m4, [pd_16]
- psrld m4, 5
-
- pmaddwd m1, m3, [r3 - 2 * 16] ; [14]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- movu m3, [r2 + 12]
- pshufb m3, m2
-
- pmaddwd m5, m3, [r3 + 13 * 16] ; [29]
- paddd m5, [pd_16]
- psrld m5, 5
-
- pmaddwd m0, m3, [r3 - 4 * 16] ; [12]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m5, m0
-
- movu m3, [r2 + 10]
- pshufb m3, m2
-
- pmaddwd m6, m3, [r3 + 11 * 16] ; [27]
- paddd m6, [pd_16]
- psrld m6, 5
-
- pmaddwd m1, m3, [r3 - 6 * 16] ; [10]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- movu m3, [r2 + 8]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 + 9 * 16] ; [25]
- paddd m1, [pd_16]
- psrld m1, 5
-
- pmaddwd m0, m3, [r3 - 8 * 16] ; [08]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m1, m0
-
- TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
-
- movu m3, [r2 + 6]
- pshufb m3, m2
-
- pmaddwd m4, m3, [r3 + 7 * 16] ; [23]
- paddd m4, [pd_16]
- psrld m4, 5
-
- pmaddwd m1, m3, [r3 - 10 * 16] ; [06]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- movu m3, [r2 + 4]
- pshufb m3, m2
-
- pmaddwd m5, m3, [r3 + 5 * 16] ; [21]
- paddd m5, [pd_16]
- psrld m5, 5
-
- pmaddwd m6, m3, [r3 - 12 * 16] ; [04]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m5, m6
-
- movu m3, [r2 + 2]
- pshufb m3, m2
-
- pmaddwd m6, m3, [r3 + 3 * 16] ; [19]
- paddd m6, [pd_16]
- psrld m6, 5
-
- pmaddwd m1, m3, [r3 - 14 * 16] ; [02]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- movu m3, [r2]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 + 16] ; [17]
- paddd m1, [pd_16]
- psrld m1, 5
-
- packusdw m1, m1
- movhps m1, [r2] ; [00]
-
- TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
-%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_15(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_ang32_15, 4,6,7,0-(6*mmsize+2)
+cglobal intra_pred_ang32_15_new, 3,6,7,0-(6*mmsize+2)
+ mov r3, r2mp
+ add r2, 128
movu m0, [r2 + 0*mmsize]
+ pinsrw m0, [r3], 0
movu m1, [r2 + 1*mmsize]
movu m2, [r2 + 2*mmsize]
movu m3, [r2 + 3*mmsize]
@@ -15920,226 +11827,12 @@
jnz .loop
RET
-%macro MODE_16_20 1
- movu m3, [r2 + 40] ; [7 6 5 4 3 2 1 0]
- pshufb m3, m2 ; [4 3 3 2 2 1 1 0]
-
- pmaddwd m4, m3, [r3 - 5 * 16] ; [11]
- paddd m4, [pd_16]
- psrld m4, 5
-
- movu m3, [r2 + 38]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 + 6 * 16] ; [22]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- pmaddwd m5, m3, [r3 - 15 * 16] ; [01]
- paddd m5, [pd_16]
- psrld m5, 5
-
- movu m3, [r2 + 36]
- pshufb m3, m2
-
- pmaddwd m6, m3, [r3 - 4 * 16] ; [12]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m5, m6
-
- movu m3, [r2 + 34]
- pshufb m3, m2
-
- pmaddwd m6, m3, [r3 + 7 * 16] ; [23]
- paddd m6, [pd_16]
- psrld m6, 5
-
- pmaddwd m1, m3, [r3 - 14 * 16] ; [02]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- movu m3, [r2 + 32]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 - 3 * 16] ; [13]
- paddd m1, [pd_16]
- psrld m1, 5
-
- movu m3, [r2 + 30]
- pshufb m3, m2
-
- pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m1, m0
-
- TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
-
- pmaddwd m4, m3, [r3 - 13 * 16] ; [03]
- paddd m4, [pd_16]
- psrld m4, 5
-
- movu m3, [r2 + 28]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 - 2 * 16] ; [14]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- movu m3, [r2 + 26]
- pshufb m3, m2
-
- pmaddwd m5, m3, [r3 + 9 * 16] ; [25]
- paddd m5, [pd_16]
- psrld m5, 5
-
- pmaddwd m6, m3, [r3 - 12 * 16] ; [04]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m5, m6
-
- movu m3, [r2 + 24]
- pshufb m3, m2
-
- pmaddwd m6, m3, [r3 - 16] ; [15]
- paddd m6, [pd_16]
- psrld m6, 5
-
- movu m3, [r2 + 22]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- pmaddwd m1, m3, [r3 - 11 * 16] ; [05]
- paddd m1, [pd_16]
- psrld m1, 5
-
- movu m3, [r2 + 20]
- pshufb m3, m2
-
- pmaddwd m0, m3, [r3] ; [16]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m1, m0
-
- TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
-
- movu m3, [r2 + 18]
- pshufb m3, m2
-
- pmaddwd m4, m3, [r3 + 11 * 16] ; [27]
- paddd m4, [pd_16]
- psrld m4, 5
-
- pmaddwd m1, m3, [r3 - 10 * 16] ; [06]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- movu m3, [r2 + 16]
- pshufb m3, m2
-
- pmaddwd m5, m3, [r3 + 16] ; [17]
- paddd m5, [pd_16]
- psrld m5, 5
-
- movu m3, [r2 + 14]
- pshufb m3, m2
-
- pmaddwd m0, m3, [r3 + 12 * 16] ; [28]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m5, m0
-
- pmaddwd m6, m3, [r3 - 9 * 16] ; [07]
- paddd m6, [pd_16]
- psrld m6, 5
-
- movu m3, [r2 + 12]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 + 2 * 16] ; [18]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- movu m3, [r2 + 10]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 + 13 * 16] ; [29]
- paddd m1, [pd_16]
- psrld m1, 5
-
- pmaddwd m0, m3, [r3 - 8 * 16] ; [08]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m1, m0
-
- TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
-
- movu m3, [r2 + 8]
- pshufb m3, m2
-
- pmaddwd m4, m3, [r3 + 3 * 16] ; [19]
- paddd m4, [pd_16]
- psrld m4, 5
-
- movu m3, [r2 + 6]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 + 14 * 16] ; [30]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- pmaddwd m5, m3, [r3 - 7 * 16] ; [09]
- paddd m5, [pd_16]
- psrld m5, 5
-
- movu m3, [r2 + 4]
- pshufb m3, m2
-
- pmaddwd m6, m3, [r3 + 4 * 16] ; [20]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m5, m6
-
- movu m3, [r2 + 2]
- pshufb m3, m2
-
- pmaddwd m6, m3, [r3 + 15 * 16] ; [31]
- paddd m6, [pd_16]
- psrld m6, 5
-
- pmaddwd m1, m3, [r3 - 6 * 16] ; [10]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- movu m3, [r2]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 + 5 * 16] ; [21]
- paddd m1, [pd_16]
- psrld m1, 5
-
- packusdw m1, m1
- movhps m1, [r2] ; [00]
-
- TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
-%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_16(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_ang32_16, 4,6,7,0-(6*mmsize+10)
+cglobal intra_pred_ang32_16_new, 3,6,7,0-(6*mmsize+10)
+ mov r3, r2mp
+ add r2, 128
movu m0, [r2 + 0*mmsize]
+ pinsrw m0, [r3], 0
movu m1, [r2 + 1*mmsize]
movu m2, [r2 + 2*mmsize]
movu m3, [r2 + 3*mmsize]
@@ -16178,239 +11871,12 @@
jnz .loop
RET
-%macro MODE_17_19 1
- movu m3, [r2 + 50] ; [7 6 5 4 3 2 1 0]
- pshufb m3, m2 ; [4 3 3 2 2 1 1 0]
-
- pmaddwd m4, m3, [r3 - 10 * 16] ; [06]
- paddd m4, [pd_16]
- psrld m4, 5
-
- movu m3, [r2 + 48]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 - 4 * 16] ; [12]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- movu m3, [r2 + 46]
- pshufb m3, m2
-
- pmaddwd m5, m3, [r3 + 2 * 16] ; [18]
- paddd m5, [pd_16]
- psrld m5, 5
-
- movu m3, [r2 + 44]
- pshufb m3, m2
-
- pmaddwd m6, m3, [r3 + 8 * 16] ; [24]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m5, m6
-
- movu m3, [r2 + 42]
- pshufb m3, m2
-
- pmaddwd m6, m3, [r3 + 14 * 16] ; [30]
- paddd m6, [pd_16]
- psrld m6, 5
-
- pmaddwd m1, m3, [r3 - 12 * 16] ; [04]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- movu m3, [r2 + 40]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 - 6 * 16] ; [10]
- paddd m1, [pd_16]
- psrld m1, 5
-
- movu m3, [r2 + 38]
- pshufb m3, m2
-
- pmaddwd m0, m3, [r3] ; [16]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m1, m0
-
- TRANSPOSE_STORE_8x8 0, %1, m4, m5, m6, m1
-
- movu m3, [r2 + 36]
- pshufb m3, m2
-
- pmaddwd m4, m3, [r3 + 6 * 16] ; [22]
- paddd m4, [pd_16]
- psrld m4, 5
-
- movu m3, [r2 + 34]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 + 12 * 16] ; [28]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- pmaddwd m5, m3, [r3 - 14 * 16] ; [02]
- paddd m5, [pd_16]
- psrld m5, 5
-
- movu m3, [r2 + 32]
- pshufb m3, m2
-
- pmaddwd m6, m3, [r3 - 8 * 16] ; [08]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m5, m6
-
- movu m3, [r2 + 30]
- pshufb m3, m2
-
- pmaddwd m6, m3, [r3 - 2 * 16] ; [14]
- paddd m6, [pd_16]
- psrld m6, 5
-
- movu m3, [r2 + 28]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 + 4 * 16] ; [20]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- movu m3, [r2 + 26]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
- paddd m1, [pd_16]
- psrld m1, 5
-
- packusdw m1, m1
- movhps m1, [r2 + 26] ; [00]
-
- TRANSPOSE_STORE_8x8 16, %1, m4, m5, m6, m1
-
- movu m3, [r2 + 24]
- pshufb m3, m2
-
- pmaddwd m4, m3, [r3 - 10 * 16] ; [06]
- paddd m4, [pd_16]
- psrld m4, 5
-
- movu m3, [r2 + 22]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 - 4 * 16] ; [12]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- movu m3, [r2 + 20]
- pshufb m3, m2
-
- pmaddwd m5, m3, [r3 + 2 * 16] ; [18]
- paddd m5, [pd_16]
- psrld m5, 5
-
- movu m3, [r2 + 18]
- pshufb m3, m2
-
- pmaddwd m0, m3, [r3 + 8 * 16] ; [24]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m5, m0
-
- movu m3, [r2 + 16]
- pshufb m3, m2
-
- pmaddwd m6, m3, [r3 + 14 * 16] ; [30]
- paddd m6, [pd_16]
- psrld m6, 5
-
- pmaddwd m1, m3, [r3 - 12 * 16] ; [04]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- movu m3, [r2 + 14]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 - 6 * 16] ; [10]
- paddd m1, [pd_16]
- psrld m1, 5
-
- movu m3, [r2 + 12]
- pshufb m3, m2
-
- pmaddwd m0, m3, [r3] ; [16]
- paddd m0, [pd_16]
- psrld m0, 5
- packusdw m1, m0
-
- TRANSPOSE_STORE_8x8 32, %1, m4, m5, m6, m1
-
- movu m3, [r2 + 10]
- pshufb m3, m2
-
- pmaddwd m4, m3, [r3 + 6 * 16] ; [22]
- paddd m4, [pd_16]
- psrld m4, 5
-
- movu m3, [r2 + 8]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 + 12 * 16] ; [28]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m4, m1
-
- pmaddwd m5, m3, [r3 - 14 * 16] ; [02]
- paddd m5, [pd_16]
- psrld m5, 5
-
- movu m3, [r2 + 6]
- pshufb m3, m2
-
- pmaddwd m6, m3, [r3 - 8 * 16] ; [08]
- paddd m6, [pd_16]
- psrld m6, 5
- packusdw m5, m6
-
- movu m3, [r2 + 4]
- pshufb m3, m2
-
- pmaddwd m6, m3, [r3 - 2 * 16] ; [14]
- paddd m6, [pd_16]
- psrld m6, 5
-
- movu m3, [r2 + 2]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 + 4 * 16] ; [20]
- paddd m1, [pd_16]
- psrld m1, 5
- packusdw m6, m1
-
- movu m3, [r2]
- pshufb m3, m2
-
- pmaddwd m1, m3, [r3 + 10 * 16] ; [26]
- paddd m1, [pd_16]
- psrld m1, 5
-
- packusdw m1, m1
- movhps m1, [r2] ; [00]
-
- TRANSPOSE_STORE_8x8 48, %1, m4, m5, m6, m1
-%endmacro
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_17(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_ang32_17, 4,6,7,0-(7*mmsize+4)
+cglobal intra_pred_ang32_17_new, 3,6,7,0-(7*mmsize+4)
+ mov r3, r2mp
+ add r2, 128
movu m0, [r2 + 0*mmsize]
+ pinsrw m0, [r3], 0
movu m1, [r2 + 1*mmsize]
movu m2, [r2 + 2*mmsize]
movu m3, [r2 + 3*mmsize]
@@ -16455,11 +11921,10 @@
jnz .loop
RET
-;-------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_18(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;-------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_ang32_18, 4,7,8
+cglobal intra_pred_ang32_18_new, 3,7,8
+ mov r3, r2mp
+ add r2, 128
movu m0, [r3] ; [7 6 5 4 3 2 1 0]
movu m1, [r3 + 16] ; [15 14 13 12 11 10 9 8]
movu m2, [r3 + 32] ; [23 22 21 20 19 18 17 16]
@@ -16767,12 +12232,9 @@
movu [r0 + r3 + 48], m6
RET
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_19(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal intra_pred_ang32_19, 4,7,7,0-(7*mmsize+4)
- xchg r2, r3
+cglobal intra_pred_ang32_19_new, 3,7,7,0-(7*mmsize+4)
+ lea r3, [r2 + 128]
movu m0, [r2 + 0*mmsize]
movu m1, [r2 + 1*mmsize]
movu m2, [r2 + 2*mmsize]
@@ -16820,1346 +12282,6 @@
jnz .loop
RET
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_20(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_20, 4,7,7,0-(6*mmsize+10)
- xchg r2, r3
- movu m0, [r2 + 0*mmsize]
- movu m1, [r2 + 1*mmsize]
- movu m2, [r2 + 2*mmsize]
- movu m3, [r2 + 3*mmsize]
- movu [rsp + 2*mmsize + 8], m0
- movu [rsp + 3*mmsize + 8], m1
- movu [rsp + 4*mmsize + 8], m2
- movu [rsp + 5*mmsize + 8], m3
-
- mov r4w, [r2 + 64]
- mov [rsp + 104], r4w
- movu m0, [r3 + 4]
- movu m1, [r3 + 22]
- movu m2, [r3 + 40]
- movd m3, [r3 + 58]
- pshufb m0, [shuf_mode_16_20]
- pshufb m1, [shuf_mode_16_20]
- pshufb m2, [shuf_mode_16_20]
- pshufb m3, [shuf_mode_16_20]
- movu [rsp + 24], m0
- movu [rsp + 12], m1
- movu [rsp], m2
- movd [rsp], m3
-
- lea r3, [ang_table + 16 * 16]
- mov r4d, 8
- mov r2, rsp
- add r1, r1
- lea r5, [r1 * 3]
- mova m2, [pw_punpcklwd]
- mov r6, r0
-
-.loop:
- MODE_16_20 0
- add r6, 8
- mov r0, r6
- add r2, 8
- dec r4
- jnz .loop
- RET
-
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_21(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_21, 4,7,7,0-(6*mmsize+2)
- xchg r2, r3
- movu m0, [r2 + 0*mmsize]
- movu m1, [r2 + 1*mmsize]
- movu m2, [r2 + 2*mmsize]
- movu m3, [r2 + 3*mmsize]
- movu [rsp + 2*mmsize], m0
- movu [rsp + 3*mmsize], m1
- movu [rsp + 4*mmsize], m2
- movu [rsp + 5*mmsize], m3
-
- mov r4w, [r2 + 64]
- mov [rsp + 96], r4w
- movu m0, [r3 + 4]
- movu m1, [r3 + 18]
- movu m2, [r3 + 34]
- movu m3, [r3 + 48]
- pshufb m0, [shuf_mode_15_21]
- pshufb m1, [shuf_mode_15_21]
- pshufb m2, [shuf_mode_15_21]
- pshufb m3, [shuf_mode_15_21]
- movh [rsp + 24], m0
- movh [rsp + 16], m1
- movh [rsp + 8], m2
- movh [rsp], m3
-
- lea r3, [ang_table + 16 * 16]
- mov r4d, 8
- mov r2, rsp
- add r1, r1
- lea r5, [r1 * 3]
- mova m2, [pw_punpcklwd]
- mov r6, r0
-
-.loop:
- MODE_15_21 0
- add r6, 8
- mov r0, r6
- add r2, 8
- dec r4
- jnz .loop
- RET
-
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_22(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_22, 4,7,7,0-(5*mmsize+10)
- xchg r2, r3
- movu m0, [r2 + 0*mmsize]
- movu m1, [r2 + 1*mmsize]
- movu m2, [r2 + 2*mmsize]
- movu m3, [r2 + 3*mmsize]
- movu [rsp + 1*mmsize + 8], m0
- movu [rsp + 2*mmsize + 8], m1
- movu [rsp + 3*mmsize + 8], m2
- movu [rsp + 4*mmsize + 8], m3
-
- mov r4w, [r2 + 64]
- mov [rsp + 88], r4w
- mov r4w, [r3+4]
- mov [rsp+22], r4w
- movu m0, [r3 + 10]
- movu m1, [r3 + 30]
- movu m2, [r3 + 50]
- pshufb m0, [shuf_mode_14_22]
- pshufb m1, [shuf_mode_14_22]
- pshufb m2, [shuf_mode_14_22]
- movh [rsp + 14], m0
- movh [rsp + 6], m1
- movh [rsp - 2], m2
-
- lea r3, [ang_table + 16 * 16]
- mov r4d, 8
- mov r2, rsp
- add r1, r1
- lea r5, [r1 * 3]
- mova m2, [pw_punpcklwd]
- mov r6, r0
-
-.loop:
- MODE_14_22 0
- add r6, 8
- mov r0, r6
- add r2, 8
- dec r4
- jnz .loop
- RET
-
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_23(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_23, 4,7,7,0-(5*mmsize+2)
- xchg r2, r3
- movu m0, [r2 + 0*mmsize]
- movu m1, [r2 + 1*mmsize]
- movu m2, [r2 + 2*mmsize]
- movu m3, [r2 + 3*mmsize]
- movu [rsp + 1*mmsize], m0
- movu [rsp + 2*mmsize], m1
- movu [rsp + 3*mmsize], m2
- movu [rsp + 4*mmsize], m3
-
- mov r4w, [r2+64]
- mov [rsp+80], r4w
- movu m0, [r3 + 8]
- movu m1, [r3 + 36]
- pshufb m0, [shuf_mode_13_23]
- pshufb m1, [shuf_mode_13_23]
- movh [rsp + 8], m0
- movh [rsp], m1
- mov r4w, [r3+28]
- mov [rsp+8], r4w
- mov r4w, [r3+56]
- mov [rsp], r4w
-
- lea r3, [ang_table + 16 * 16]
- mov r4d, 8
- mov r2, rsp
- add r1, r1
- lea r5, [r1 * 3]
- mova m2, [pw_punpcklwd]
- mov r6, r0
-
-.loop:
- MODE_13_23 0
- add r6, 8
- mov r0, r6
- add r2, 8
- dec r4
- jnz .loop
- RET
-
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_24(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_24, 4,7,7,0-(4*mmsize+10)
- xchg r2, r3
- movu m0, [r2 + 0*mmsize]
- movu m1, [r2 + 1*mmsize]
- movu m2, [r2 + 2*mmsize]
- movu m3, [r2 + 3*mmsize]
-
- movu [rsp + 0*mmsize + 8], m0
- movu [rsp + 1*mmsize + 8], m1
- movu [rsp + 2*mmsize + 8], m2
- movu [rsp + 3*mmsize + 8], m3
-
- mov r4w, [r2+64]
- mov [rsp+72], r4w
- mov r4w, [r3+12]
- mov [rsp+6], r4w
- mov r4w, [r3+26]
- mov [rsp+4], r4w
- mov r4w, [r3+38]
- mov [rsp+2], r4w
- mov r4w, [r3+52]
- mov [rsp], r4w
-
- lea r3, [ang_table + 16 * 16]
- mov r4d, 8
- mov r2, rsp
- add r1, r1
- lea r5, [r1 * 3]
- mov r6, r0
- mova m2, [pw_punpcklwd]
-
-.loop:
- MODE_12_24 0
- add r6, 8
- mov r0, r6
- add r2, 8
- dec r4
- jnz .loop
- RET
-
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_25(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_25, 4,7,7,0-(4*mmsize+4)
- xchg r2, r3
- movu m0, [r2 + 0*mmsize]
- movu m1, [r2 + 1*mmsize]
- movu m2, [r2 + 2*mmsize]
- movu m3, [r2 + 3*mmsize]
- movu [rsp + 0*mmsize + 2], m0
- movu [rsp + 1*mmsize + 2], m1
- movu [rsp + 2*mmsize + 2], m2
- movu [rsp + 3*mmsize + 2], m3
- mov r4w, [r3+32]
- mov [rsp], r4w
- mov r4w, [r2+64]
- mov [rsp+66], r4w
-
- lea r3, [ang_table + 16 * 16]
- mov r4d, 8
- mov r2, rsp
- add r1, r1
- lea r5, [r1 * 3]
- mov r6, r0
-
-.loop:
- MODE_11_25 0
- add r6, 8
- mov r0, r6
- add r2, 8
- dec r4
- jnz .loop
- RET
-
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_26(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_26, 4,7,5
- mov r6d, 4
- add r1, r1
- lea r2, [r1 * 2]
- lea r4, [r1 * 3]
- lea r5, [r1 * 4]
- mova m4, [c_mode32_10_0]
-
- movu m0, [r3 + 2]
- movu m1, [r3 + 18]
- movu m2, [r3 + 34]
- movu m3, [r3 + 50]
-
-.loop:
- movu [r0], m0
- movu [r0 + 16], m1
- movu [r0 + 32], m2
- movu [r0 + 48], m3
-
- movu [r0 + r1], m0
- movu [r0 + r1 + 16], m1
- movu [r0 + r1 + 32], m2
- movu [r0 + r1 + 48], m3
-
- movu [r0 + r2], m0
- movu [r0 + r2 + 16], m1
- movu [r0 + r2 + 32], m2
- movu [r0 + r2 + 48], m3
-
- movu [r0 + r4], m0
- movu [r0 + r4 + 16], m1
- movu [r0 + r4 + 32], m2
- movu [r0 + r4 + 48], m3
-
- add r0, r5
-
- movu [r0], m0
- movu [r0 + 16], m1
- movu [r0 + 32], m2
- movu [r0 + 48], m3
-
- movu [r0 + r1], m0
- movu [r0 + r1 + 16], m1
- movu [r0 + r1 + 32], m2
- movu [r0 + r1 + 48], m3
-
- movu [r0 + r2], m0
- movu [r0 + r2 + 16], m1
- movu [r0 + r2 + 32], m2
- movu [r0 + r2 + 48], m3
-
- movu [r0 + r4], m0
- movu [r0 + r4 + 16], m1
- movu [r0 + r4 + 32], m2
- movu [r0 + r4 + 48], m3
-
- add r0, r5
- dec r6d
- jnz .loop
- RET
-
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_27(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_27, 4,7,8
- xchg r2, r3mp
- lea r3, [ang_table + 16 * 16]
- add r1, r1
- lea r5, [r1 * 3]
- mov r6, r0
- mov r4d, 8
-
-.loop:
- MODE_9_27 0
- add r6, 8
- mov r0, r6
- add r2, 8
- dec r4
- jnz .loop
- RET
-
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_28(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_28, 4,7,8
- xchg r2, r3mp
- lea r3, [ang_table + 16 * 16]
- add r1, r1
- lea r5, [r1 * 3]
- mov r6, r0
- mov r4d, 8
-
-.loop:
- MODE_8_28 0
- add r6, 8
- mov r0, r6
- add r2, 8
- dec r4
- jnz .loop
- RET
-
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_29(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_29, 4,7,8
- xchg r2, r3mp
- lea r3, [ang_table + 16 * 16]
- add r1, r1
- lea r5, [r1 * 3]
- mov r6, r0
- mov r4d, 8
-
-.loop:
- MODE_7_29 0
- add r6, 8
- mov r0, r6
- add r2, 8
- dec r4
- jnz .loop
- RET
-
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_30(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_30, 4,7,8
- xchg r2, r3mp
- lea r3, [ang_table + 16 * 16]
- add r1, r1
- lea r5, [r1 * 3]
- mov r6, r0
- mov r4d, 8
-
-.loop:
- MODE_6_30 0
- add r6, 8
- mov r0, r6
- add r2, 8
- dec r4
- jnz .loop
- RET
-
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_31(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_31, 4,7,8
- xchg r2, r3mp
- lea r3, [ang_table + 16 * 16]
- add r1, r1
- lea r5, [r1 * 3]
- mov r6, r0
- mov r4d, 8
-
-.loop:
- MODE_5_31 0
- add r6, 8
- mov r0, r6
- add r2, 8
- dec r4
- jnz .loop
- RET
-
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_32(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_32, 4,7,8
- xchg r2, r3mp
- lea r3, [ang_table + 16 * 16]
- add r1, r1
- lea r5, [r1 * 3]
- mov r6, r0
- mov r4d, 8
-
-.loop:
- MODE_4_32 0
- add r6, 8
- mov r0, r6
- add r2, 8
- dec r4
- jnz .loop
- RET
-
-;------------------------------------------------------------------------------------------------------------------
-; void intraPredAng32_33(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------------------------------
-INIT_XMM sse4
-cglobal intra_pred_ang32_33, 4,7,8
- xchg r2, r3mp
- lea r3, [ang_table + 16 * 16]
- add r1, r1
- lea r5, [r1 * 3]
- mov r6, r0
- mov r4d, 8
-.loop:
- MODE_3_33 0
- add r6, 8
- mov r0, r6
- add r2, 8
- dec r4
- jnz .loop
- RET
-
-;------------------------------------------------------------------------------------------
-; void intraPredAng32(pixel* dst, intptr_t dstStride, pixel* src, int dirMode, int bFilter)
-;------------------------------------------------------------------------------------------
-INIT_XMM ssse3
-cglobal intra_pred_ang32_2_new, 3,6,6
- lea r4, [r2]
- add r2, 128
- cmp r3m, byte 34
- cmove r2, r4
-
- add r1, r1
- lea r3, [r1 * 2]
- lea r4, [r1 * 3]
- mov r5, 2
-
-.loop:
- MODE_2_34
- add r2, 32
- dec r5
- jnz .loop
- RET
-
-INIT_XMM sse4
-cglobal intra_pred_ang32_3_new, 3,6,8
- add r2, 128
- lea r3, [ang_table + 16 * 16]
- mov r4d, 8
- add r1, r1
- lea r5, [r1 * 3]
-
-.loop:
- MODE_3_33 1
- lea r0, [r0 + r1 * 4 ]
- add r2, 8
- dec r4
- jnz .loop
- RET
-
-INIT_XMM sse4
-cglobal intra_pred_ang32_4_new, 3,6,8
- add r2, 128
- lea r3, [ang_table + 16 * 16]
- mov r4d, 8
- add r1, r1
- lea r5, [r1 * 3]
-
-.loop:
- MODE_4_32 1
- lea r0, [r0 + r1 * 4 ]
- add r2, 8
- dec r4
- jnz .loop
- RET
-
-INIT_XMM sse4
-cglobal intra_pred_ang32_5_new, 3,6,8
- add r2, 128
- lea r3, [ang_table + 16 * 16]
- mov r4d, 8
- add r1, r1
- lea r5, [r1 * 3]
-
-.loop:
- MODE_5_31 1
- lea r0, [r0 + r1 * 4 ]
- add r2, 8
- dec r4
- jnz .loop
- RET
-
-INIT_XMM sse4
-cglobal intra_pred_ang32_6_new, 3,6,8
- add r2, 128
- lea r3, [ang_table + 16 * 16]
- mov r4d, 8
- add r1, r1
- lea r5, [r1 * 3]
-
-.loop:
- MODE_6_30 1
- lea r0, [r0 + r1 * 4 ]
- add r2, 8
- dec r4
- jnz .loop
- RET
-
-INIT_XMM sse4
-cglobal intra_pred_ang32_7_new, 3,6,8
- add r2, 128
- lea r3, [ang_table + 16 * 16]
- mov r4d, 8
- add r1, r1
- lea r5, [r1 * 3]
-
-.loop:
- MODE_7_29 1
- lea r0, [r0 + r1 * 4 ]
- add r2, 8
- dec r4
- jnz .loop
- RET
-
-INIT_XMM sse4
-cglobal intra_pred_ang32_8_new, 3,6,8
- add r2, 128
- lea r3, [ang_table + 16 * 16]
- mov r4d, 8
- add r1, r1
- lea r5, [r1 * 3]
-
-.loop:
- MODE_8_28 1
- lea r0, [r0 + r1 * 4 ]
- add r2, 8
- dec r4
- jnz .loop
- RET
-
-INIT_XMM sse4
-cglobal intra_pred_ang32_9_new, 3,6,8
- add r2, 128
- lea r3, [ang_table + 16 * 16]
- mov r4d, 8
- add r1, r1
- lea r5, [r1 * 3]
-
-.loop:
- MODE_9_27 1
- lea r0, [r0 + r1 * 4 ]
- add r2, 8
- dec r4
- jnz .loop
- RET
-
-INIT_XMM sse4
-cglobal intra_pred_ang32_10_new, 3,7,8
- add r2, 128
- mov r6d, 4
- add r1, r1
- lea r5, [r1 * 3]
- lea r4, [r1 * 2]
- lea r3, [r1 * 4]
- mova m7, [c_mode32_10_0]
-
-.loop:
- movu m0, [r2 + 2]
- pshufb m1, m0, m7
- movu [r0], m1
- movu [r0 + 16], m1
- movu [r0 + 32], m1
- movu [r0 + 48], m1
-
- palignr m1, m0, 2
- pshufb m1, m7
- movu [r0 + r1], m1
- movu [r0 + r1 + 16], m1
- movu [r0 + r1 + 32], m1
- movu [r0 + r1 + 48], m1
-
- palignr m1, m0, 4
- pshufb m1, m7
- movu [r0 + r4], m1
- movu [r0 + r4 + 16], m1
- movu [r0 + r4 + 32], m1
- movu [r0 + r4 + 48], m1
-
- palignr m1, m0, 6
- pshufb m1, m7
- movu [r0 + r5], m1
- movu [r0 + r5 + 16], m1
- movu [r0 + r5 + 32], m1
- movu [r0 + r5 + 48], m1
-
- add r0, r3
-
- palignr m1, m0, 8
- pshufb m1, m7
- movu [r0], m1
- movu [r0 + 16], m1
- movu [r0 + 32], m1
- movu [r0 + 48], m1
-
- palignr m1, m0, 10
- pshufb m1, m7
- movu [r0 + r1], m1
- movu [r0 + r1 + 16], m1
- movu [r0 + r1 + 32], m1
- movu [r0 + r1 + 48], m1
-
- palignr m1, m0, 12
- pshufb m1, m7
- movu [r0 + r4], m1
- movu [r0 + r4 + 16], m1
- movu [r0 + r4 + 32], m1
- movu [r0 + r4 + 48], m1
-
- palignr m1, m0, 14
- pshufb m1, m7
- movu [r0 + r5], m1
- movu [r0 + r5 + 16], m1
- movu [r0 + r5 + 32], m1
- movu [r0 + r5 + 48], m1
-
- add r0, r3
- add r2, 16
- dec r6d
- jnz .loop
- RET
-
-INIT_XMM sse4
-cglobal intra_pred_ang32_11_new, 3,6,7,0-(4*mmsize+4)
- mov r3, r2mp
- add r2, 128
- movu m0, [r2 + 0*mmsize]
- pinsrw m0, [r3], 0
- movu m1, [r2 + 1*mmsize]
- movu m2, [r2 + 2*mmsize]
- movu m3, [r2 + 3*mmsize]
- movu [rsp + 0*mmsize + 2], m0
- movu [rsp + 1*mmsize + 2], m1
- movu [rsp + 2*mmsize + 2], m2
- movu [rsp + 3*mmsize + 2], m3
- mov r4w, [r3+32]
- mov [rsp], r4w
- mov r4w, [r2+64]
- mov [rsp+66], r4w
-
- lea r3, [ang_table + 16 * 16]
- mov r4d, 8
- mov r2, rsp
- add r1, r1
- lea r5, [r1 * 3]
-
-.loop:
- MODE_11_25 1
- lea r0, [r0 + r1 * 4 ]
- add r2, 8
- dec r4
- jnz .loop
- RET
-
-INIT_XMM sse4
-cglobal intra_pred_ang32_12_new, 3,6,7,0-(4*mmsize+10)
- mov r3, r2mp
- add r2, 128
- movu m0, [r2 + 0*mmsize]
- pinsrw m0, [r3], 0
- movu m1, [r2 + 1*mmsize]
- movu m2, [r2 + 2*mmsize]
- movu m3, [r2 + 3*mmsize]
- movu [rsp + 0*mmsize + 8], m0
- movu [rsp + 1*mmsize + 8], m1
- movu [rsp + 2*mmsize + 8], m2
- movu [rsp + 3*mmsize + 8], m3
-
- mov r4w, [r2+64]
- mov [rsp+72], r4w
- mov r4w, [r3+12]
- mov [rsp+6], r4w
- mov r4w, [r3+26]
- mov [rsp+4], r4w
- mov r4w, [r3+38]
- mov [rsp+2], r4w
- mov r4w, [r3+52]
- mov [rsp], r4w
-
- lea r3, [ang_table + 16 * 16]
- mov r4d, 8
- mov r2, rsp
- add r1, r1
- lea r5, [r1 * 3]
- mova m2, [pw_punpcklwd]
-
-.loop:
- MODE_12_24 1
- lea r0, [r0 + r1 * 4 ]
- add r2, 8
- dec r4
- jnz .loop
- RET
-
-INIT_XMM sse4
-cglobal intra_pred_ang32_13_new, 3,6,7,0-(5*mmsize+2)
- mov r3, r2mp
- add r2, 128
- movu m0, [r2 + 0*mmsize]
- pinsrw m0, [r3], 0
- movu m1, [r2 + 1*mmsize]
- movu m2, [r2 + 2*mmsize]
- movu m3, [r2 + 3*mmsize]
- movu [rsp + 1*mmsize], m0
- movu [rsp + 2*mmsize], m1
- movu [rsp + 3*mmsize], m2
- movu [rsp + 4*mmsize], m3
-
- mov r4w, [r2+64]
- mov [rsp+80], r4w
- movu m0, [r3 + 8]
- movu m1, [r3 + 36]
- pshufb m0, [shuf_mode_13_23]
- pshufb m1, [shuf_mode_13_23]
- movh [rsp + 8], m0
- movh [rsp], m1
- mov r4w, [r3+28]
- mov [rsp+8], r4w
- mov r4w, [r3+56]
- mov [rsp], r4w
-
- lea r3, [ang_table + 16 * 16]
- mov r4d, 8
- mov r2, rsp
- add r1, r1
- lea r5, [r1 * 3]
- mova m2, [pw_punpcklwd]
-
-.loop:
- MODE_13_23 1
- lea r0, [r0 + r1 * 4 ]
- add r2, 8
- dec r4
- jnz .loop
- RET
-
-INIT_XMM sse4
-cglobal intra_pred_ang32_14_new, 3,6,7,0-(5*mmsize+10)
- mov r3, r2mp
- add r2, 128
- movu m0, [r2 + 0*mmsize]
- pinsrw m0, [r3], 0
- movu m1, [r2 + 1*mmsize]
- movu m2, [r2 + 2*mmsize]
- movu m3, [r2 + 3*mmsize]
- movu [rsp + 1*mmsize + 8], m0
- movu [rsp + 2*mmsize + 8], m1
- movu [rsp + 3*mmsize + 8], m2
- movu [rsp + 4*mmsize + 8], m3
-
- mov r4w, [r2 + 64]
- mov [rsp + 88], r4w
- mov r4w, [r3+4]
- mov [rsp+22], r4w
- movu m0, [r3 + 10]
- movu m1, [r3 + 30]
- movu m2, [r3 + 50]
- pshufb m0, [shuf_mode_14_22]
- pshufb m1, [shuf_mode_14_22]
- pshufb m2, [shuf_mode_14_22]
- movh [rsp + 14], m0
- movh [rsp + 6], m1
- movh [rsp - 2], m2
-
- lea r3, [ang_table + 16 * 16]
- mov r4d, 8
- mov r2, rsp
- add r1, r1
- lea r5, [r1 * 3]
- mova m2, [pw_punpcklwd]
-
-.loop:
- MODE_14_22 1
- lea r0, [r0 + r1 * 4 ]
- add r2, 8
- dec r4
- jnz .loop
- RET
-
-INIT_XMM sse4
-cglobal intra_pred_ang32_15_new, 3,6,7,0-(6*mmsize+2)
- mov r3, r2mp
- add r2, 128
- movu m0, [r2 + 0*mmsize]
- pinsrw m0, [r3], 0
- movu m1, [r2 + 1*mmsize]
- movu m2, [r2 + 2*mmsize]
- movu m3, [r2 + 3*mmsize]
- movu [rsp + 2*mmsize], m0
- movu [rsp + 3*mmsize], m1
- movu [rsp + 4*mmsize], m2
- movu [rsp + 5*mmsize], m3
-
- mov r4w, [r2 + 64]
- mov [rsp + 96], r4w
- movu m0, [r3 + 4]
- movu m1, [r3 + 18]
- movu m2, [r3 + 34]
- movu m3, [r3 + 48]
- pshufb m0, [shuf_mode_15_21]
- pshufb m1, [shuf_mode_15_21]
- pshufb m2, [shuf_mode_15_21]
- pshufb m3, [shuf_mode_15_21]
- movh [rsp + 24], m0
- movh [rsp + 16], m1
- movh [rsp + 8], m2
- movh [rsp], m3
-
- lea r3, [ang_table + 16 * 16]
- mov r4d, 8
- mov r2, rsp
- add r1, r1
- lea r5, [r1 * 3]
- mova m2, [pw_punpcklwd]
-
-.loop:
- MODE_15_21 1
- lea r0, [r0 + r1 * 4 ]
- add r2, 8
- dec r4
- jnz .loop
- RET
-
-INIT_XMM sse4
-cglobal intra_pred_ang32_16_new, 3,6,7,0-(6*mmsize+10)
- mov r3, r2mp
- add r2, 128
- movu m0, [r2 + 0*mmsize]
- pinsrw m0, [r3], 0
- movu m1, [r2 + 1*mmsize]
- movu m2, [r2 + 2*mmsize]
- movu m3, [r2 + 3*mmsize]
- movu [rsp + 2*mmsize + 8], m0
- movu [rsp + 3*mmsize + 8], m1
- movu [rsp + 4*mmsize + 8], m2
- movu [rsp + 5*mmsize + 8], m3
-
- mov r4w, [r2 + 64]
- mov [rsp + 104], r4w
- movu m0, [r3 + 4]
- movu m1, [r3 + 22]
- movu m2, [r3 + 40]
- movd m3, [r3 + 58]
- pshufb m0, [shuf_mode_16_20]
- pshufb m1, [shuf_mode_16_20]
- pshufb m2, [shuf_mode_16_20]
- pshufb m3, [shuf_mode_16_20]
- movu [rsp + 24], m0
- movu [rsp + 12], m1
- movu [rsp], m2
- movd [rsp], m3
-
- lea r3, [ang_table + 16 * 16]
- mov r4d, 8
- mov r2, rsp
- add r1, r1
- lea r5, [r1 * 3]
- mova m2, [pw_punpcklwd]
-
-.loop:
- MODE_16_20 1
- lea r0, [r0 + r1 * 4 ]
- add r2, 8
- dec r4
- jnz .loop
- RET
-
-INIT_XMM sse4
-cglobal intra_pred_ang32_17_new, 3,6,7,0-(7*mmsize+4)
- mov r3, r2mp
- add r2, 128
- movu m0, [r2 + 0*mmsize]
- pinsrw m0, [r3], 0
- movu m1, [r2 + 1*mmsize]
- movu m2, [r2 + 2*mmsize]
- movu m3, [r2 + 3*mmsize]
- movu [rsp + 3*mmsize + 2], m0
- movu [rsp + 4*mmsize + 2], m1
- movu [rsp + 5*mmsize + 2], m2
- movu [rsp + 6*mmsize + 2], m3
-
- mov r4w, [r2 + 64]
- mov [rsp + 114], r4w
- movu m0, [r3 + 8]
- movu m1, [r3 + 30]
- movu m2, [r3 + 50]
- movd m3, [r3 + 2]
- pshufb m0, [shuf_mode_17_19]
- pshufb m1, [shuf_mode_17_19]
- pshufb m2, [shuf_mode_17_19]
- pshufb m3, [shuf_mode_16_20]
- movd [rsp + 46], m3
- movu [rsp + 30], m0
- movu [rsp + 12], m1
- movu [rsp - 4], m2
- mov r4w, [r3 + 24]
- mov [rsp + 30], r4w
- mov r4w, [r3 + 28]
- mov [rsp + 28], r4w
- mov r4w, [r3 + 46]
- mov [rsp + 12], r4w
-
- lea r3, [ang_table + 16 * 16]
- mov r4d, 8
- mov r2, rsp
- add r1, r1
- lea r5, [r1 * 3]
- mova m2, [pw_punpcklwd]
-
-.loop:
- MODE_17_19 1
- lea r0, [r0 + r1 * 4 ]
- add r2, 8
- dec r4
- jnz .loop
- RET
-
-INIT_XMM sse4
-cglobal intra_pred_ang32_18_new, 3,7,8
- mov r3, r2mp
- add r2, 128
- movu m0, [r3] ; [7 6 5 4 3 2 1 0]
- movu m1, [r3 + 16] ; [15 14 13 12 11 10 9 8]
- movu m2, [r3 + 32] ; [23 22 21 20 19 18 17 16]
- movu m3, [r3 + 48] ; [31 30 29 28 27 26 25 24]
- movu m4, [r2 + 2] ; [8 7 6 5 4 3 2 1]
- movu m5, [r2 + 18] ; [16 15 14 13 12 11 10 9]
-
- add r1, r1
- lea r6, [r1 * 2]
- lea r3, [r1 * 3]
- lea r4, [r1 * 4]
-
- movu [r0], m0
- movu [r0 + 16], m1
- movu [r0 + 32], m2
- movu [r0 + 48], m3
-
- pshufb m4, [shuf_mode32_18] ; [1 2 3 4 5 6 7 8]
- pshufb m5, [shuf_mode32_18] ; [9 10 11 12 13 14 15 16]
-
- palignr m6, m0, m4, 14
- movu [r0 + r1], m6
- palignr m6, m1, m0, 14
- movu [r0 + r1 + 16], m6
- palignr m6, m2, m1, 14
- movu [r0 + r1 + 32], m6
- palignr m6, m3, m2, 14
- movu [r0 + r1 + 48], m6
-
- palignr m6, m0, m4, 12
- movu [r0 + r6], m6
- palignr m6, m1, m0, 12
- movu [r0 + r6 + 16], m6
- palignr m6, m2, m1, 12
- movu [r0 + r6 + 32], m6
- palignr m6, m3, m2, 12
- movu [r0 + r6 + 48], m6
-
- palignr m6, m0, m4, 10
- movu [r0 + r3], m6
- palignr m6, m1, m0, 10
- movu [r0 + r3 + 16], m6
- palignr m6, m2, m1, 10
- movu [r0 + r3 + 32], m6
- palignr m6, m3, m2, 10
- movu [r0 + r3 + 48], m6
-
- add r0, r4
-
- palignr m6, m0, m4, 8
- movu [r0], m6
- palignr m6, m1, m0, 8
- movu [r0 + 16], m6
- palignr m6, m2, m1, 8
- movu [r0 + 32], m6
- palignr m6, m3, m2, 8
- movu [r0 + 48], m6
-
- palignr m6, m0, m4, 6
- movu [r0 + r1], m6
- palignr m6, m1, m0, 6
- movu [r0 + r1 + 16], m6
- palignr m6, m2, m1, 6
- movu [r0 + r1 + 32], m6
- palignr m6, m3, m2, 6
- movu [r0 + r1 + 48], m6
-
- palignr m6, m0, m4, 4
- movu [r0 + r6], m6
- palignr m6, m1, m0, 4
- movu [r0 + r6 + 16], m6
- palignr m6, m2, m1, 4
- movu [r0 + r6 + 32], m6
- palignr m6, m3, m2, 4
- movu [r0 + r6 + 48], m6
-
- palignr m6, m0, m4, 2
- movu [r0 + r3], m6
- palignr m6, m1, m0, 2
- movu [r0 + r3 + 16], m6
- palignr m6, m2, m1, 2
- movu [r0 + r3 + 32], m6
- palignr m6, m3, m2, 2
- movu [r0 + r3 + 48], m6
-
- add r0, r4
-
- movu [r0], m4
- movu [r0 + 16], m0
- movu [r0 + 32], m1
- movu [r0 + 48], m2
-
- palignr m6, m4, m5, 14
- movu [r0 + r1], m6
- palignr m6, m0, m4, 14
- movu [r0 + r1 + 16], m6
- palignr m6, m1, m0, 14
- movu [r0 + r1 + 32], m6
- palignr m6, m2, m1, 14
- movu [r0 + r1 + 48], m6
-
- palignr m6, m4, m5, 12
- movu [r0 + r6], m6
- palignr m6, m0, m4, 12
- movu [r0 + r6 + 16], m6
- palignr m6, m1, m0, 12
- movu [r0 + r6 + 32], m6
- palignr m6, m2, m1, 12
- movu [r0 + r6 + 48], m6
-
- palignr m6, m4, m5, 10
- movu [r0 + r3], m6
- palignr m6, m0, m4, 10
- movu [r0 + r3 + 16], m6
- palignr m6, m1, m0, 10
- movu [r0 + r3 + 32], m6
- palignr m6, m2, m1, 10
- movu [r0 + r3 + 48], m6
-
- add r0, r4
-
- palignr m6, m4, m5, 8
- movu [r0], m6
- palignr m6, m0, m4, 8
- movu [r0 + 16], m6
- palignr m6, m1, m0, 8
- movu [r0 + 32], m6
- palignr m6, m2, m1, 8
- movu [r0 + 48], m6
-
- palignr m6, m4, m5, 6
- movu [r0 + r1], m6
- palignr m6, m0, m4, 6
- movu [r0 + r1 + 16], m6
- palignr m6, m1, m0, 6
- movu [r0 + r1 + 32], m6
- palignr m6, m2, m1, 6
- movu [r0 + r1 + 48], m6
-
- palignr m6, m4, m5, 4
- movu [r0 + r6], m6
- palignr m6, m0, m4, 4
- movu [r0 + r6 + 16], m6
- palignr m6, m1, m0, 4
- movu [r0 + r6 + 32], m6
- palignr m6, m2, m1, 4
- movu [r0 + r6 + 48], m6
-
- palignr m6, m4, m5, 2
- movu [r0 + r3], m6
- palignr m6, m0, m4, 2
- movu [r0 + r3 + 16], m6
- palignr m6, m1, m0, 2
- movu [r0 + r3 + 32], m6
- palignr m6, m2, m1, 2
- movu [r0 + r3 + 48], m6
-
- add r0, r4
-
- movu m2, [r2 + 34]
- movu m3, [r2 + 50]
- pshufb m2, [shuf_mode32_18]
- pshufb m3, [shuf_mode32_18]
-
- movu [r0], m5
- movu [r0 + 16], m4
- movu [r0 + 32], m0
- movu [r0 + 48], m1
-
- palignr m6, m5, m2, 14
- movu [r0 + r1], m6
- palignr m6, m4, m5, 14
- movu [r0 + r1 + 16], m6
- palignr m6, m0, m4, 14
- movu [r0 + r1 + 32], m6
- palignr m6, m1, m0, 14
- movu [r0 + r1 + 48], m6
-
- palignr m6, m5, m2, 12
- movu [r0 + r6], m6
- palignr m6, m4, m5, 12
- movu [r0 + r6 + 16], m6
- palignr m6, m0, m4, 12
- movu [r0 + r6 + 32], m6
- palignr m6, m1, m0, 12
- movu [r0 + r6 + 48], m6
-
- palignr m6, m5, m2, 10
- movu [r0 + r3], m6
- palignr m6, m4, m5, 10
- movu [r0 + r3 + 16], m6
- palignr m6, m0, m4, 10
- movu [r0 + r3 + 32], m6
- palignr m6, m1, m0, 10
- movu [r0 + r3 + 48], m6
-
- add r0, r4
-
- palignr m6, m5, m2, 8
- movu [r0], m6
- palignr m6, m4, m5, 8
- movu [r0 + 16], m6
- palignr m6, m0, m4, 8
- movu [r0 + 32], m6
- palignr m6, m1, m0, 8
- movu [r0 + 48], m6
-
- palignr m6, m5, m2, 6
- movu [r0 + r1], m6
- palignr m6, m4, m5, 6
- movu [r0 + r1 + 16], m6
- palignr m6, m0, m4, 6
- movu [r0 + r1 + 32], m6
- palignr m6, m1, m0, 6
- movu [r0 + r1 + 48], m6
-
- palignr m6, m5, m2, 4
- movu [r0 + r6], m6
- palignr m6, m4, m5, 4
- movu [r0 + r6 + 16], m6
- palignr m6, m0, m4, 4
- movu [r0 + r6 + 32], m6
- palignr m6, m1, m0, 4
- movu [r0 + r6 + 48], m6
-
- palignr m6, m5, m2, 2
- movu [r0 + r3], m6
- palignr m6, m4, m5, 2
- movu [r0 + r3 + 16], m6
- palignr m6, m0, m4, 2
- movu [r0 + r3 + 32], m6
- palignr m6, m1, m0, 2
- movu [r0 + r3 + 48], m6
-
- add r0, r4
-
- movu [r0], m2
- movu [r0 + 16], m5
- movu [r0 + 32], m4
- movu [r0 + 48], m0
-
- palignr m6, m2, m3, 14
- movu [r0 + r1], m6
- palignr m6, m5, m2, 14
- movu [r0 + r1 + 16], m6
- palignr m6, m4, m5, 14
- movu [r0 + r1 + 32], m6
- palignr m6, m0, m4, 14
- movu [r0 + r1 + 48], m6
-
- palignr m6, m2, m3, 12
- movu [r0 + r6], m6
- palignr m6, m5, m2, 12
- movu [r0 + r6 + 16], m6
- palignr m6, m4, m5, 12
- movu [r0 + r6 + 32], m6
- palignr m6, m0, m4, 12
- movu [r0 + r6 + 48], m6
-
- palignr m6, m2, m3, 10
- movu [r0 + r3], m6
- palignr m6, m5, m2, 10
- movu [r0 + r3 + 16], m6
- palignr m6, m4, m5, 10
- movu [r0 + r3 + 32], m6
- palignr m6, m0, m4, 10
- movu [r0 + r3 + 48], m6
-
- add r0, r4
-
- palignr m6, m2, m3, 8
- movu [r0], m6
- palignr m6, m5, m2, 8
- movu [r0 + 16], m6
- palignr m6, m4, m5, 8
- movu [r0 + 32], m6
- palignr m6, m0, m4, 8
- movu [r0 + 48], m6
-
- palignr m6, m2, m3, 6
- movu [r0 + r1], m6
- palignr m6, m5, m2, 6
- movu [r0 + r1 + 16], m6
- palignr m6, m4, m5, 6
- movu [r0 + r1 + 32], m6
- palignr m6, m0, m4, 6
- movu [r0 + r1 + 48], m6
-
- palignr m6, m2, m3, 4
- movu [r0 + r6], m6
- palignr m6, m5, m2, 4
- movu [r0 + r6 + 16], m6
- palignr m6, m4, m5, 4
- movu [r0 + r6 + 32], m6
- palignr m6, m0, m4, 4
- movu [r0 + r6 + 48], m6
-
- palignr m6, m2, m3, 2
- movu [r0 + r3], m6
- palignr m6, m5, m2, 2
- movu [r0 + r3 + 16], m6
- palignr m6, m4, m5, 2
- movu [r0 + r3 + 32], m6
- palignr m6, m0, m4, 2
- movu [r0 + r3 + 48], m6
- RET
-
-INIT_XMM sse4
-cglobal intra_pred_ang32_19_new, 3,7,7,0-(7*mmsize+4)
- lea r3, [r2 + 128]
- movu m0, [r2 + 0*mmsize]
- movu m1, [r2 + 1*mmsize]
- movu m2, [r2 + 2*mmsize]
- movu m3, [r2 + 3*mmsize]
- movu [rsp + 3*mmsize + 2], m0
- movu [rsp + 4*mmsize + 2], m1
- movu [rsp + 5*mmsize + 2], m2
- movu [rsp + 6*mmsize + 2], m3
-
- mov r4w, [r2 + 64]
- mov [rsp + 114], r4w
- movu m0, [r3 + 8]
- movu m1, [r3 + 30]
- movu m2, [r3 + 50]
- movd m3, [r3 + 2]
- pshufb m0, [shuf_mode_17_19]
- pshufb m1, [shuf_mode_17_19]
- pshufb m2, [shuf_mode_17_19]
- pshufb m3, [shuf_mode_16_20]
- movd [rsp + 46], m3
- movu [rsp + 30], m0
- movu [rsp + 12], m1
- movu [rsp - 4], m2
- mov r4w, [r3 + 24]
- mov [rsp + 30], r4w
- mov r4w, [r3 + 28]
- mov [rsp + 28], r4w
- mov r4w, [r3 + 46]
- mov [rsp + 12], r4w
-
- lea r3, [ang_table + 16 * 16]
- mov r4d, 8
- mov r2, rsp
- add r1, r1
- lea r5, [r1 * 3]
- mova m2, [pw_punpcklwd]
- mov r6, r0
-
-.loop:
- MODE_17_19 0
- add r6, 8
- mov r0, r6
- add r2, 8
- dec r4
- jnz .loop
- RET
-
INIT_XMM sse4
cglobal intra_pred_ang32_20_new, 3,7,7,0-(6*mmsize+10)
lea r3, [r2 + 128]