[x265] [PATCH] asm: modify intrapred16 modes 3, 4, 32 and 33 to use the TRANSPOSE_STORE macro of intrapred32
murugan at multicorewareinc.com
Tue Feb 4 07:16:09 CET 2014
# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1391494543 -19800
# Tue Feb 04 11:45:43 2014 +0530
# Node ID 17e6299fe4107448f5cc0eaef6d90795b73abd04
# Parent 930b251ac6b7c59edb12f5a872a38123e4056d9c
asm: modify intrapred16 modes 3, 4, 32 and 33 to use the TRANSPOSE_STORE macro of intrapred32
diff -r 930b251ac6b7 -r 17e6299fe410 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm Mon Feb 03 18:45:25 2014 -0600
+++ b/source/common/x86/intrapred8.asm Tue Feb 04 11:45:43 2014 +0530
@@ -1396,40 +1396,54 @@
movu [r0 + r1], m2
RET
-
-%macro TRANSPOSE_STORE_8x8 1
- punpckhbw m0, m4, m5
- punpcklbw m4, m5
- punpckhbw m2, m4, m0
- punpcklbw m4, m0
-
- punpckhbw m0, m6, m1
- punpcklbw m6, m1
- punpckhbw m1, m6, m0
- punpcklbw m6, m0
-
- punpckhdq m5, m4, m6
- punpckldq m4, m6
- punpckldq m6, m2, m1
- punpckhdq m2, m1
-
- movh [r0 + + %1 * 8], m4
- movhps [r0 + r1 + %1 * 8], m4
- movh [r0 + r1*2 + %1 * 8], m5
- movhps [r0 + r5 + %1 * 8], m5
- movh [r6 + %1 * 8], m6
- movhps [r6 + r1 + %1 * 8], m6
- movh [r6 + r1*2 + %1 * 8], m2
- movhps [r6 + r5 + %1 * 8], m2
+%macro TRANSPOSE_STORE_8x8 6
+ %if %2 == 1
+ ; transpose 8x8 and then store, used by angle BLOCK_16x16 and BLOCK_32x32
+ punpckhbw m0, %3, %4
+ punpcklbw %3, %4
+ punpckhbw %4, %3, m0
+ punpcklbw %3, m0
+
+ punpckhbw m0, %5, m1
+ punpcklbw %5, %6
+ punpckhbw %6, %5, m0
+ punpcklbw %5, m0
+
+ punpckhdq m0, %3, %5
+ punpckldq %3, %5
+ punpckldq %5, %4, %6
+ punpckhdq %4, %6
+
+ movh [r0 + + %1 * 8], %3
+ movhps [r0 + r1 + %1 * 8], %3
+ movh [r0 + r1*2 + %1 * 8], m0
+ movhps [r0 + r5 + %1 * 8], m0
+ movh [r6 + %1 * 8], %5
+ movhps [r6 + r1 + %1 * 8], %5
+ movh [r6 + r1*2 + %1 * 8], %4
+ movhps [r6 + r5 + %1 * 8], %4
+ %else
+ ; store 8x8, used by angle BLOCK_16x16 and BLOCK_32x32
+ movh [r0 ], %3
+ movhps [r0 + r1 ], %3
+ movh [r0 + r1 * 2], %4
+ movhps [r0 + r5 ], %4
+ lea r0, [r0 + r1 * 4]
+ movh [r0 ], %5
+ movhps [r0 + r1 ], %5
+ movh [r0 + r1 * 2], %6
+ movhps [r0 + r5 ], %6
+ lea r0, [r0 + r1 * 4]
+ %endif
%endmacro
INIT_XMM sse4
cglobal intra_pred_ang16_3, 3,7,8
lea r3, [ang_table + 16 * 16]
- mov r4d, 2
- lea r5, [r1 * 3] ; r5 -> 3 * stride
- lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
+ mov r4d, 2
+ lea r5, [r1 * 3] ; r5 -> 3 * stride
+ lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024]
.loop:
@@ -1486,7 +1500,7 @@
pmulhrsw m2, m7
packuswb m1, m2
- TRANSPOSE_STORE_8x8 0
+ TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
movu m0, [r2 + 8]
palignr m1, m0, 1
@@ -1537,36 +1551,24 @@
packuswb m1, m1
movhps m1, [r2 + 14] ; [00]
- TRANSPOSE_STORE_8x8 1
-
- lea r0, [r6 + r1 * 4]
- lea r6, [r6 + r1 * 8]
- add r2, 8
+ TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
+
+ lea r0, [r6 + r1 * 4]
+ lea r6, [r6 + r1 * 8]
+ add r2, 8
dec r4
jnz .loop
RET
-%macro STORE_8x8 4 ; rows 1-2, 3-4, 5-6, 7-8
- movh [r0 ], %1
- movhps [r0 + r1 ], %1
- movh [r0 + r1 * 2], %2
- movhps [r0 + r5 ], %2
- lea r0, [r0 + r1 * 4]
- movh [r0 ], %3
- movhps [r0 + r1 ], %3
- movh [r0 + r1 * 2], %4
- movhps [r0 + r5 ], %4
-%endmacro
-
INIT_XMM sse4
cglobal intra_pred_ang16_33, 3,7,8
- mov r2, r3mp
- lea r3, [ang_table + 16 * 16]
- mov r4d, 2
- lea r5, [r1 * 3]
- mov r6, r0
- mova m7, [pw_1024]
+ mov r2, r3mp
+ lea r3, [ang_table + 16 * 16]
+ mov r4d, 2
+ lea r5, [r1 * 3]
+ mov r6, r0
+ mova m7, [pw_1024]
.loop:
movu m0, [r2 + 1]
@@ -1622,7 +1624,7 @@
pmulhrsw m2, m7
packuswb m1, m2
- STORE_8x8 m4, m5, m6, m1
+ TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
movu m0, [r2 + 8]
palignr m1, m0, 1
@@ -1673,7 +1675,6 @@
packuswb m1, m1
movh m2, [r2 + 14] ; [00]
- lea r0, [r0 + r1 * 4]
movh [r0 ], m4
movhps [r0 + r1 ], m4
movh [r0 + r1 * 2], m5
@@ -1696,8 +1697,8 @@
lea r3, [ang_table + 16 * 16]
mov r4d, 2
- lea r5, [r1 * 3] ; r5 -> 3 * stride
- lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
+ lea r5, [r1 * 3] ; r5 -> 3 * stride
+ lea r6, [r0 + r1 * 4] ; r6 -> 4 * stride
mova m7, [pw_1024]
.loop:
@@ -1709,8 +1710,8 @@
palignr m1, m2, m0, 2
mova m5, m1
- movu m3, [r3 + 5 * 16] ; [21]
- movu m6, [r3 - 6 * 16] ; [10]
+ movu m3, [r3 + 5 * 16] ; [21]
+ movu m6, [r3 - 6 * 16] ; [10]
pmaddubsw m4, m0, m3
pmulhrsw m4, m7
@@ -1747,17 +1748,17 @@
pmulhrsw m1, m7
palignr m2, m0, 10
- mova m3, m2
+
movu m0, [r3 - 8 * 16] ; [8]
- pmaddubsw m2, m0
- pmulhrsw m2, m7
- packuswb m1, m2
-
- TRANSPOSE_STORE_8x8 0
+ pmaddubsw m3, m2, m0
+ pmulhrsw m3, m7
+ packuswb m1, m3
+
+ TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
movu m0, [r3 + 13 * 16] ; [29]
- pmaddubsw m4, m3, m0
+ pmaddubsw m4, m2, m0
pmulhrsw m4, m7
movu m0, [r2 + 6]
@@ -1767,7 +1768,7 @@
punpcklbw m0, m1
palignr m1, m2, m0, 2
- movu m6, [r3 + 2 * 16] ; [18]
+ movu m6, [r3 + 2 * 16] ; [18]
pmaddubsw m1, m6
pmulhrsw m1, m7
@@ -1776,7 +1777,7 @@
palignr m5, m2, m0, 4
movu m6, m5
- movu m3, [r3 - 9 * 16] ; [07]
+ movu m3, [r3 - 9 * 16] ; [07]
pmaddubsw m5, m3
pmulhrsw m5, m7
@@ -1792,9 +1793,10 @@
pmulhrsw m6, m7
palignr m1, m2, m0, 8
-
-
- pmaddubsw m3, m1, [r3 - 10 * 16] ; [06]
+ palignr m2, m0, 10
+
+ movu m0, [r3 - 10 * 16] ; [06]
+ pmaddubsw m3, m1, m0
pmulhrsw m3, m7
packuswb m6, m3
@@ -1802,14 +1804,12 @@
pmaddubsw m1, m3
pmulhrsw m1, m7
- palignr m2, m0, 10
-
movu m3, [r3] ; [16]
pmaddubsw m2, m3
pmulhrsw m2, m7
packuswb m1, m2
- TRANSPOSE_STORE_8x8 1
+ TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
lea r0, [r6 + r1 * 4]
lea r6, [r6 + r1 * 8]
@@ -1824,7 +1824,7 @@
mov r2, r3mp
lea r3, [ang_table + 16 * 16]
mov r4d, 2
- lea r5, [r1 * 3] ; r5 -> 3 * stride
+ lea r5, [r1 * 3] ; r5 -> 3 * stride
mov r6, r0
mova m7, [pw_1024]
@@ -1837,8 +1837,8 @@
palignr m1, m2, m0, 2
mova m5, m1
- movu m3, [r3 + 5 * 16] ; [21]
- movu m6, [r3 - 6 * 16] ; [10]
+ movu m3, [r3 + 5 * 16] ; [21]
+ movu m6, [r3 - 6 * 16] ; [10]
pmaddubsw m4, m0, m3
pmulhrsw m4, m7
@@ -1875,17 +1875,16 @@
pmulhrsw m1, m7
palignr m2, m0, 10
- mova m3, m2
movu m0, [r3 - 8 * 16] ; [8]
- pmaddubsw m2, m0
- pmulhrsw m2, m7
- packuswb m1, m2
-
- STORE_8x8 m4, m5, m6, m1
+ pmaddubsw m3, m2, m0
+ pmulhrsw m3, m7
+ packuswb m1, m3
+
+ TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
movu m0, [r3 + 13 * 16] ; [29]
- pmaddubsw m4, m3, m0
+ pmaddubsw m4, m2, m0
pmulhrsw m4, m7
movu m0, [r2 + 6]
@@ -1895,7 +1894,7 @@
punpcklbw m0, m1
palignr m1, m2, m0, 2
- movu m6, [r3 + 2 * 16] ; [18]
+ movu m6, [r3 + 2 * 16] ; [18]
pmaddubsw m1, m6
pmulhrsw m1, m7
@@ -1904,7 +1903,7 @@
palignr m5, m2, m0, 4
movu m6, m5
- movu m3, [r3 - 9 * 16] ; [07]
+ movu m3, [r3 - 9 * 16] ; [07]
pmaddubsw m5, m3
pmulhrsw m5, m7
@@ -1920,9 +1919,10 @@
pmulhrsw m6, m7
palignr m1, m2, m0, 8
-
-
- pmaddubsw m3, m1, [r3 - 10 * 16] ; [06]
+ palignr m2, m0, 10
+
+ movu m0, [r3 - 10 * 16] ; [06]
+ pmaddubsw m3, m1, m0
pmulhrsw m3, m7
packuswb m6, m3
@@ -1930,15 +1930,12 @@
pmaddubsw m1, m3
pmulhrsw m1, m7
- palignr m2, m0, 10
-
movu m3, [r3] ; [16]
pmaddubsw m2, m3
pmulhrsw m2, m7
packuswb m1, m2
- lea r0, [r0 + r1 * 4]
- STORE_8x8 m4, m5, m6, m1
+ TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
lea r0, [r6 + 8]
add r2, 8