[x265] [PATCH] asm: Modifications to intrapred16 modes 3, 4, 32 and 33 such that they use the TRANSPOSE_STORE macro of intrapred32

murugan at multicorewareinc.com murugan at multicorewareinc.com
Tue Feb 4 08:30:52 CET 2014


# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1391499044 -19800
#      Tue Feb 04 13:00:44 2014 +0530
# Node ID 711827aaab063bb0e02aa6ae52cdd9e7b8b9fef4
# Parent  930b251ac6b7c59edb12f5a872a38123e4056d9c
asm: Modifications to intrapred16 modes 3, 4, 32 and 33 such that it uses TRANSPOSE_STORE macro of intrapred32

diff -r 930b251ac6b7 -r 711827aaab06 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm	Mon Feb 03 18:45:25 2014 -0600
+++ b/source/common/x86/intrapred8.asm	Tue Feb 04 13:00:44 2014 +0530
@@ -1396,40 +1396,54 @@
     movu            [r0 + r1], m2
     RET
 
-
-%macro TRANSPOSE_STORE_8x8 1
-    punpckhbw   m0,        m4, m5
-    punpcklbw   m4,        m5
-    punpckhbw   m2,        m4, m0
-    punpcklbw   m4,        m0
-
-    punpckhbw   m0,        m6, m1
-    punpcklbw   m6,        m1
-    punpckhbw   m1,        m6, m0
-    punpcklbw   m6,        m0
-
-    punpckhdq   m5,        m4, m6
-    punpckldq   m4,        m6
-    punpckldq   m6,        m2, m1
-    punpckhdq   m2,        m1
-
-    movh        [r0 +       + %1 * 8], m4
-    movhps      [r0 +  r1   + %1 * 8], m4
-    movh        [r0 +  r1*2 + %1 * 8], m5
-    movhps      [r0 +  r5   + %1 * 8], m5
-    movh        [r6         + %1 * 8], m6
-    movhps      [r6 +  r1   + %1 * 8], m6
-    movh        [r6 +  r1*2 + %1 * 8], m2
-    movhps      [r6 +  r5   + %1 * 8], m2
+%macro TRANSPOSE_STORE_8x8 6
+  %if %2 == 1
+    ; transpose 8x8 and then store, used by angle BLOCK_16x16 and BLOCK_32x32
+    punpckhbw   m0,        %3, %4
+    punpcklbw   %3,        %4
+    punpckhbw   %4,        %3, m0
+    punpcklbw   %3,        m0
+
+    punpckhbw   m0,        %5, m1
+    punpcklbw   %5,        %6
+    punpckhbw   %6,        %5, m0
+    punpcklbw   %5,        m0
+
+    punpckhdq   m0,        %3, %5
+    punpckldq   %3,        %5
+    punpckldq   %5,        %4, %6
+    punpckhdq   %4,        %6
+
+    movh        [r0 +       + %1 * 8], %3
+    movhps      [r0 +  r1   + %1 * 8], %3
+    movh        [r0 +  r1*2 + %1 * 8], m0
+    movhps      [r0 +  r5   + %1 * 8], m0
+    movh        [r6         + %1 * 8], %5
+    movhps      [r6 +  r1   + %1 * 8], %5
+    movh        [r6 +  r1*2 + %1 * 8], %4
+    movhps      [r6 +  r5   + %1 * 8], %4
+  %else
+    ; store 8x8, used by angle BLOCK_16x16 and BLOCK_32x32
+    movh        [r0         ], %3
+    movhps      [r0 + r1    ], %3
+    movh        [r0 + r1 * 2], %4
+    movhps      [r0 + r5    ], %4
+    lea         r0, [r0 + r1 * 4]
+    movh        [r0         ], %5
+    movhps      [r0 + r1    ], %5
+    movh        [r0 + r1 * 2], %6
+    movhps      [r0 + r5    ], %6
+    lea         r0, [r0 + r1 * 4]
+  %endif
 %endmacro
 
 INIT_XMM sse4
 cglobal intra_pred_ang16_3, 3,7,8
 
     lea         r3,        [ang_table + 16 * 16]
-    mov         r4d, 2
-    lea         r5, [r1 * 3]            ; r5 -> 3 * stride
-    lea         r6, [r0 + r1 * 4]       ; r6 -> 4 * stride
+    mov         r4d,       2
+    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
+    lea         r6,        [r0 + r1 * 4]              ; r6 -> 4 * stride
     mova        m7,        [pw_1024]
 
 .loop:
@@ -1440,53 +1454,44 @@
     punpcklbw   m0,        m1
     palignr     m1,        m2, m0, 2
 
-    movu        m3,        [r3 + 10 * 16]             ; [26]
-    movu        m6,        [r3 + 4 * 16]              ; [20]
-
-    pmaddubsw   m4,        m0, m3
+    pmaddubsw   m4,        m0, [r3 + 10 * 16]         ; [26]
     pmulhrsw    m4,        m7
-    pmaddubsw   m1,        m6
+    pmaddubsw   m1,        [r3 + 4 * 16]              ; [20]
     pmulhrsw    m1,        m7
     packuswb    m4,        m1
 
     palignr     m5,        m2, m0, 4
 
-    movu        m3,        [r3 - 2 * 16]              ; [14]
-    pmaddubsw   m5,        m3
+    pmaddubsw   m5,        [r3 - 2 * 16]              ; [14]
     pmulhrsw    m5,        m7
 
     palignr     m6,        m2, m0, 6
 
-    movu        m3,        [r3 - 8 * 16]              ; [ 8]
-    pmaddubsw   m6,        m3
+    pmaddubsw   m6,        [r3 - 8 * 16]              ; [ 8]
     pmulhrsw    m6,        m7
     packuswb    m5,        m6
 
     palignr     m1,        m2, m0, 8
 
-    movu        m3,        [r3 - 14 * 16]             ; [ 2]
-    pmaddubsw   m6,        m1, m3
+    pmaddubsw   m6,        m1, [r3 - 14 * 16]         ; [ 2]
     pmulhrsw    m6,        m7
 
-    movu        m3,        [r3 + 12 * 16]             ; [28]
-    pmaddubsw   m1,        m3
+    pmaddubsw   m1,        [r3 + 12 * 16]             ; [28]
     pmulhrsw    m1,        m7
     packuswb    m6,        m1
 
     palignr     m1,        m2, m0, 10
 
-    movu        m3,        [r3 + 6 * 16]              ; [22]
-    pmaddubsw   m1,        m3
+    pmaddubsw   m1,        [r3 + 6 * 16]              ; [22]
     pmulhrsw    m1,        m7
 
     palignr     m2,        m0, 12
 
-    movu        m3,        [r3]                       ; [16]
-    pmaddubsw   m2,        m3
+    pmaddubsw   m2,        [r3]                       ; [16]
     pmulhrsw    m2,        m7
     packuswb    m1,        m2
 
-    TRANSPOSE_STORE_8x8 0
+    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
 
     movu        m0,        [r2 + 8]
     palignr     m1,        m0, 1
@@ -1495,78 +1500,58 @@
     punpcklbw   m0,        m1
     palignr     m5,        m2, m0, 2
 
-    movu        m3,        [r3 - 6 * 16]              ; [10]
-    movu        m6,        [r3 - 12 * 16]             ; [04]
-
-    pmaddubsw   m4,        m0, m3
+    pmaddubsw   m4,        m0, [r3 - 6 * 16]          ; [10]
     pmulhrsw    m4,        m7
-    pmaddubsw   m1,        m5, m6
+    pmaddubsw   m1,        m5, [r3 - 12 * 16]         ; [04]
     pmulhrsw    m1,        m7
     packuswb    m4,        m1
 
-    movu        m3,        [r3 + 14 * 16]             ; [30]
-    pmaddubsw   m5,        m3
+    pmaddubsw   m5,        [r3 + 14 * 16]             ; [30]
     pmulhrsw    m5,        m7
 
     palignr     m6,        m2, m0, 4
 
-    movu        m3,        [r3 + 8 * 16]              ; [24]
-    pmaddubsw   m6,        m3
+    pmaddubsw   m6,        [r3 + 8 * 16]              ; [24]
     pmulhrsw    m6,        m7
     packuswb    m5,        m6
 
     palignr     m1,        m2, m0, 6
 
-    movu        m3,        [r3 + 2 * 16]              ; [18]
-    pmaddubsw   m6,        m1, m3
+    pmaddubsw   m6,        m1, [r3 + 2 * 16]          ; [18]
     pmulhrsw    m6,        m7
 
     palignr     m1,        m2, m0, 8
 
-    movu        m3,        [r3 - 4 * 16]              ; [12]
-    pmaddubsw   m1,        m3
+    pmaddubsw   m1,        [r3 - 4 * 16]              ; [12]
     pmulhrsw    m1,        m7
     packuswb    m6,        m1
 
     palignr     m1,        m2, m0, 10
 
-    movu        m3,        [r3 - 10 * 16]             ; [06]
-    pmaddubsw   m1,        m3
+    pmaddubsw   m1,        [r3 - 10 * 16]             ; [06]
     pmulhrsw    m1,        m7
-
     packuswb    m1,        m1
+
     movhps      m1,        [r2 + 14]                  ; [00]
 
-    TRANSPOSE_STORE_8x8 1
-
-    lea         r0, [r6 + r1 * 4]
-    lea         r6, [r6 + r1 * 8]
-    add         r2, 8
+    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
+
+    lea         r0,        [r6 + r1 * 4]
+    lea         r6,        [r6 + r1 * 8]
+    add         r2,        8
     dec         r4
     jnz        .loop
 
     RET
 
-%macro STORE_8x8 4 ; rows 1-2, 3-4, 5-6, 7-8
-    movh        [r0         ], %1
-    movhps      [r0 + r1    ], %1
-    movh        [r0 + r1 * 2], %2
-    movhps      [r0 + r5    ], %2
-    lea         r0, [r0 + r1 * 4]
-    movh        [r0         ], %3
-    movhps      [r0 + r1    ], %3
-    movh        [r0 + r1 * 2], %4
-    movhps      [r0 + r5    ], %4
-%endmacro
-
 INIT_XMM sse4
 cglobal intra_pred_ang16_33, 3,7,8
-    mov         r2,  r3mp
-    lea         r3,  [ang_table + 16 * 16]
-    mov         r4d, 2
-    lea         r5,  [r1 * 3]
-    mov         r6,  r0
-    mova        m7,  [pw_1024]
+    mov         r2,        r3mp
+    lea         r3,        [ang_table + 16 * 16]
+    mov         r4d,       2
+    lea         r5,        [r1 * 3]
+    mov         r6,        r0
+    mova        m7,        [pw_1024]
 
 .loop:
     movu        m0,        [r2 + 1]
@@ -1576,53 +1561,44 @@
     punpcklbw   m0,        m1
     palignr     m1,        m2, m0, 2
 
-    movu        m3,        [r3 + 10 * 16]             ; [26]
-    movu        m6,        [r3 + 4 * 16]              ; [20]
-
-    pmaddubsw   m4,        m0, m3
+    pmaddubsw   m4,        m0, [r3 + 10 * 16]         ; [26]
     pmulhrsw    m4,        m7
-    pmaddubsw   m1,        m6
+    pmaddubsw   m1,        [r3 + 4 * 16]              ; [20]
     pmulhrsw    m1,        m7
     packuswb    m4,        m1
 
     palignr     m5,        m2, m0, 4
 
-    movu        m3,        [r3 - 2 * 16]              ; [14]
-    pmaddubsw   m5,        m3
+    pmaddubsw   m5,        [r3 - 2 * 16]              ; [14]
     pmulhrsw    m5,        m7
 
     palignr     m6,        m2, m0, 6
 
-    movu        m3,        [r3 - 8 * 16]              ; [ 8]
-    pmaddubsw   m6,        m3
+    pmaddubsw   m6,        [r3 - 8 * 16]              ; [ 8]
     pmulhrsw    m6,        m7
     packuswb    m5,        m6
 
     palignr     m1,        m2, m0, 8
 
-    movu        m3,        [r3 - 14 * 16]             ; [ 2]
-    pmaddubsw   m6,        m1, m3
+    pmaddubsw   m6,        m1, [r3 - 14 * 16]         ; [ 2]
     pmulhrsw    m6,        m7
 
-    movu        m3,        [r3 + 12 * 16]             ; [28]
-    pmaddubsw   m1,        m3
+    pmaddubsw   m1,        [r3 + 12 * 16]             ; [28]
     pmulhrsw    m1,        m7
     packuswb    m6,        m1
 
     palignr     m1,        m2, m0, 10
 
-    movu        m3,        [r3 + 6 * 16]              ; [22]
-    pmaddubsw   m1,        m3
+    pmaddubsw   m1,        [r3 + 6 * 16]              ; [22]
     pmulhrsw    m1,        m7
 
     palignr     m2,        m0, 12
 
-    movu        m3,        [r3]                       ; [16]
-    pmaddubsw   m2,        m3
+    pmaddubsw   m2,        [r3]                       ; [16]
     pmulhrsw    m2,        m7
     packuswb    m1,        m2
 
-    STORE_8x8 m4, m5, m6, m1
+    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
 
     movu        m0,        [r2 + 8]
     palignr     m1,        m0, 1
@@ -1631,49 +1607,40 @@
     punpcklbw   m0,        m1
     palignr     m5,        m2, m0, 2
 
-    movu        m3,        [r3 - 6 * 16]              ; [10]
-    movu        m6,        [r3 - 12 * 16]             ; [04]
-
-    pmaddubsw   m4,        m0, m3
+    pmaddubsw   m4,        m0, [r3 - 6 * 16]          ; [10]
     pmulhrsw    m4,        m7
-    pmaddubsw   m1,        m5, m6
+    pmaddubsw   m1,        m5, [r3 - 12 * 16]         ; [04]
     pmulhrsw    m1,        m7
     packuswb    m4,        m1
 
-    movu        m3,        [r3 + 14 * 16]             ; [30]
-    pmaddubsw   m5,        m3
+    pmaddubsw   m5,        [r3 + 14 * 16]             ; [30]
     pmulhrsw    m5,        m7
 
     palignr     m6,        m2, m0, 4
 
-    movu        m3,        [r3 + 8 * 16]              ; [24]
-    pmaddubsw   m6,        m3
+    pmaddubsw   m6,        [r3 + 8 * 16]              ; [24]
     pmulhrsw    m6,        m7
     packuswb    m5,        m6
 
     palignr     m1,        m2, m0, 6
 
-    movu        m3,        [r3 + 2 * 16]              ; [18]
-    pmaddubsw   m6,        m1, m3
+    pmaddubsw   m6,        m1, [r3 + 2 * 16]          ; [18]
     pmulhrsw    m6,        m7
 
     palignr     m1,        m2, m0, 8
 
-    movu        m3,        [r3 - 4 * 16]              ; [12]
-    pmaddubsw   m1,        m3
+    pmaddubsw   m1,        [r3 - 4 * 16]              ; [12]
     pmulhrsw    m1,        m7
     packuswb    m6,        m1
 
     palignr     m1,        m2, m0, 10
 
-    movu        m3,        [r3 - 10 * 16]             ; [06]
-    pmaddubsw   m1,        m3
+    pmaddubsw   m1,        [r3 - 10 * 16]             ; [06]
     pmulhrsw    m1,        m7
-
     packuswb    m1,        m1
+
     movh        m2,        [r2 + 14]                  ; [00]
 
-    lea         r0,        [r0 + r1 * 4]
     movh        [r0         ], m4
     movhps      [r0 + r1    ], m4
     movh        [r0 + r1 * 2], m5
@@ -1696,8 +1663,8 @@
 
     lea         r3,        [ang_table + 16 * 16]
     mov         r4d,       2
-    lea         r5,        [r1 * 3]            ; r5 -> 3 * stride
-    lea         r6,        [r0 + r1 * 4]       ; r6 -> 4 * stride
+    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
+    lea         r6,        [r0 + r1 * 4]              ; r6 -> 4 * stride
     mova        m7,        [pw_1024]
 
 .loop:
@@ -1709,55 +1676,44 @@
     palignr     m1,        m2, m0, 2
     mova        m5,        m1
 
-    movu        m3,        [r3 + 5 * 16]             ; [21]
-    movu        m6,        [r3 - 6 * 16]             ; [10]
-
-    pmaddubsw   m4,        m0, m3
+    pmaddubsw   m4,        m0, [r3 + 5 * 16]          ; [21]
     pmulhrsw    m4,        m7
-    pmaddubsw   m1,        m6
+    pmaddubsw   m1,        [r3 - 6 * 16]              ; [10]
     pmulhrsw    m1,        m7
     packuswb    m4,        m1
 
-    movu        m3,        [r3 + 15 * 16]             ; [31]
-    pmaddubsw   m5,        m3
+    pmaddubsw   m5,        [r3 + 15 * 16]             ; [31]
     pmulhrsw    m5,        m7
 
     palignr     m6,        m2, m0, 4
 
-    movu        m3,        [r3 + 4 * 16]              ; [ 20]
-    pmaddubsw   m6,        m3
+    pmaddubsw   m6,        [r3 + 4 * 16]              ; [ 20]
     pmulhrsw    m6,        m7
     packuswb    m5,        m6
 
     palignr     m1,        m2, m0, 6
 
-    movu        m3,        [r3 - 7 * 16]              ; [ 9]
-    pmaddubsw   m6,        m1, m3
+    pmaddubsw   m6,        m1, [r3 - 7 * 16]          ; [ 9]
     pmulhrsw    m6,        m7
 
-    movu        m3,        [r3 + 14 * 16]             ; [30]
-    pmaddubsw   m1,        m3
+    pmaddubsw   m1,        [r3 + 14 * 16]             ; [30]
     pmulhrsw    m1,        m7
     packuswb    m6,        m1
 
     palignr     m1,        m2, m0, 8
 
-    movu        m3,        [r3 + 3 * 16]              ; [19]
-    pmaddubsw   m1,        m3
+    pmaddubsw   m1,        [r3 + 3 * 16]              ; [19]
     pmulhrsw    m1,        m7
 
     palignr     m2,        m0, 10
-    mova        m3,        m2
-
-    movu        m0,        [r3 - 8 * 16]              ; [8]
-    pmaddubsw   m2,        m0
-    pmulhrsw    m2,        m7
-    packuswb    m1,        m2
-
-    TRANSPOSE_STORE_8x8 0
-
-    movu        m0,        [r3 + 13 * 16]             ; [29]
-    pmaddubsw   m4,        m3, m0
+
+    pmaddubsw   m3,        m2, [r3 - 8 * 16]          ; [8]
+    pmulhrsw    m3,        m7
+    packuswb    m1,        m3
+
+    TRANSPOSE_STORE_8x8 0, 1, m4, m5, m6, m1
+
+    pmaddubsw   m4,        m2, [r3 + 13 * 16]         ; [29]
     pmulhrsw    m4,        m7
 
     movu        m0,        [r2 + 6]
@@ -1767,49 +1723,40 @@
     punpcklbw   m0,        m1
     palignr     m1,        m2, m0, 2
 
-    movu        m6,        [r3 +  2 * 16]            ; [18]
-
-    pmaddubsw   m1,        m6
+    pmaddubsw   m1,        [r3 +  2 * 16]             ; [18]
     pmulhrsw    m1,        m7
     packuswb    m4,        m1
 
     palignr     m5,        m2, m0, 4
-    movu        m6,        m5
-
-    movu        m3,        [r3 - 9 * 16]             ; [07]
-    pmaddubsw   m5,        m3
+    mova        m6,        m5
+
+    pmaddubsw   m5,        [r3 - 9 * 16]              ; [07]
     pmulhrsw    m5,        m7
 
-    movu        m3,        [r3 + 12 * 16]             ; [28]
-    pmaddubsw   m6,        m3
+    pmaddubsw   m6,        [r3 + 12 * 16]             ; [28]
     pmulhrsw    m6,        m7
     packuswb    m5,        m6
 
     palignr     m6,        m2, m0, 6
 
-    movu        m3,        [r3 +      16]             ; [17]
-    pmaddubsw   m6,        m3
+    pmaddubsw   m6,        [r3 +      16]             ; [17]
     pmulhrsw    m6,        m7
 
     palignr     m1,        m2, m0, 8
-
+    palignr     m2,        m0, 10
 
     pmaddubsw   m3,        m1, [r3 - 10 * 16]         ; [06]
     pmulhrsw    m3,        m7
     packuswb    m6,        m3
 
-    movu        m3,        [r3 + 11 * 16]             ; [27]
-    pmaddubsw   m1,        m3
+    pmaddubsw   m1,        [r3 + 11 * 16]             ; [27]
     pmulhrsw    m1,        m7
 
-    palignr     m2,        m0, 10
-
-    movu        m3,        [r3]                       ; [16]
-    pmaddubsw   m2,        m3
+    pmaddubsw   m2,        [r3]                       ; [16]
     pmulhrsw    m2,        m7
     packuswb    m1,        m2
 
-    TRANSPOSE_STORE_8x8 1
+    TRANSPOSE_STORE_8x8 1, 1, m4, m5, m6, m1
 
     lea         r0,        [r6 + r1 * 4]
     lea         r6,        [r6 + r1 * 8]
@@ -1824,7 +1771,7 @@
     mov         r2,        r3mp
     lea         r3,        [ang_table + 16 * 16]
     mov         r4d,       2
-    lea         r5,        [r1 * 3]            ; r5 -> 3 * stride
+    lea         r5,        [r1 * 3]                   ; r5 -> 3 * stride
     mov         r6,        r0
     mova        m7,        [pw_1024]
 
@@ -1837,55 +1784,45 @@
     palignr     m1,        m2, m0, 2
     mova        m5,        m1
 
-    movu        m3,        [r3 + 5 * 16]             ; [21]
-    movu        m6,        [r3 - 6 * 16]             ; [10]
-
-    pmaddubsw   m4,        m0, m3
+
+    pmaddubsw   m4,        m0, [r3 + 5 * 16]          ; [21]
     pmulhrsw    m4,        m7
-    pmaddubsw   m1,        m6
+    pmaddubsw   m1,        [r3 - 6 * 16]              ; [10]
     pmulhrsw    m1,        m7
     packuswb    m4,        m1
 
-    movu        m3,        [r3 + 15 * 16]             ; [31]
-    pmaddubsw   m5,        m3
+    pmaddubsw   m5,        [r3 + 15 * 16]             ; [31]
     pmulhrsw    m5,        m7
 
     palignr     m6,        m2, m0, 4
 
-    movu        m3,        [r3 + 4 * 16]              ; [ 20]
-    pmaddubsw   m6,        m3
+    pmaddubsw   m6,        [r3 + 4 * 16]              ; [ 20]
     pmulhrsw    m6,        m7
     packuswb    m5,        m6
 
     palignr     m1,        m2, m0, 6
 
-    movu        m3,        [r3 - 7 * 16]              ; [ 9]
-    pmaddubsw   m6,        m1, m3
+    pmaddubsw   m6,        m1, [r3 - 7 * 16]          ; [ 9]
     pmulhrsw    m6,        m7
 
-    movu        m3,        [r3 + 14 * 16]             ; [30]
-    pmaddubsw   m1,        m3
+    pmaddubsw   m1,        [r3 + 14 * 16]             ; [30]
     pmulhrsw    m1,        m7
     packuswb    m6,        m1
 
     palignr     m1,        m2, m0, 8
 
-    movu        m3,        [r3 + 3 * 16]              ; [19]
-    pmaddubsw   m1,        m3
+    pmaddubsw   m1,        [r3 + 3 * 16]              ; [19]
     pmulhrsw    m1,        m7
 
     palignr     m2,        m0, 10
-    mova        m3,        m2
-
-    movu        m0,        [r3 - 8 * 16]              ; [8]
-    pmaddubsw   m2,        m0
-    pmulhrsw    m2,        m7
-    packuswb    m1,        m2
-
-    STORE_8x8 m4, m5, m6, m1
-
-    movu        m0,        [r3 + 13 * 16]             ; [29]
-    pmaddubsw   m4,        m3, m0
+
+    pmaddubsw   m3,        m2, [r3 - 8 * 16]          ; [8]
+    pmulhrsw    m3,        m7
+    packuswb    m1,        m3
+
+    TRANSPOSE_STORE_8x8 0, 0, m4, m5, m6, m1
+
+    pmaddubsw   m4,        m2, [r3 + 13 * 16]         ; [29]
     pmulhrsw    m4,        m7
 
     movu        m0,        [r2 + 6]
@@ -1895,50 +1832,40 @@
     punpcklbw   m0,        m1
     palignr     m1,        m2, m0, 2
 
-    movu        m6,        [r3 +  2 * 16]            ; [18]
-
-    pmaddubsw   m1,        m6
+    pmaddubsw   m1,        [r3 +  2 * 16]             ; [18]
     pmulhrsw    m1,        m7
     packuswb    m4,        m1
 
     palignr     m5,        m2, m0, 4
-    movu        m6,        m5
-
-    movu        m3,        [r3 - 9 * 16]             ; [07]
-    pmaddubsw   m5,        m3
+    mova        m6,        m5
+
+    pmaddubsw   m5,        [r3 - 9 * 16]              ; [07]
     pmulhrsw    m5,        m7
 
-    movu        m3,        [r3 + 12 * 16]             ; [28]
-    pmaddubsw   m6,        m3
+    pmaddubsw   m6,        [r3 + 12 * 16]             ; [28]
     pmulhrsw    m6,        m7
     packuswb    m5,        m6
 
     palignr     m6,        m2, m0, 6
 
-    movu        m3,        [r3 +      16]             ; [17]
-    pmaddubsw   m6,        m3
+    pmaddubsw   m6,        [r3 +      16]             ; [17]
     pmulhrsw    m6,        m7
 
     palignr     m1,        m2, m0, 8
-
+    palignr     m2,        m0, 10
 
     pmaddubsw   m3,        m1, [r3 - 10 * 16]         ; [06]
     pmulhrsw    m3,        m7
     packuswb    m6,        m3
 
-    movu        m3,        [r3 + 11 * 16]             ; [27]
-    pmaddubsw   m1,        m3
+    pmaddubsw   m1,        [r3 + 11 * 16]             ; [27]
     pmulhrsw    m1,        m7
 
-    palignr     m2,        m0, 10
-
-    movu        m3,        [r3]                       ; [16]
-    pmaddubsw   m2,        m3
+    pmaddubsw   m2,        [r3]                       ; [16]
     pmulhrsw    m2,        m7
     packuswb    m1,        m2
 
-    lea         r0,        [r0 + r1 * 4]
-    STORE_8x8 m4, m5, m6, m1
+    TRANSPOSE_STORE_8x8 1, 0, m4, m5, m6, m1
 
     lea         r0,        [r6 + 8]
     add         r2,        8


More information about the x265-devel mailing list