[x265] [PATCH] asm: intra pred all_angs_pred_4x4 sse2

dtyx265 at gmail.com dtyx265 at gmail.com
Sat Apr 11 03:58:38 CEST 2015


# HG changeset patch
# User David T Yuen <dtyx265 at gmail.com>
# Date 1428717487 25200
# Node ID c40653978caea4a4bf8940ae3b0e8db74bbe07d7
# Parent  ee76a15fa312ac59549965821d9cbff03237226f
asm: intra pred all_angs_pred_4x4 sse2

This replaces the C code and is backported from the SSE4 version.
The processing of modes 10 and 26 was merged and moved to follow mode 2.

64-bit

./test/TestBench --testbench intrapred | grep intra_allangs4x4
intra_allangs4x4	9.99x 	 6449.98  	 64435.56

32-bit

./test/TestBench --testbench intrapred | grep intra_allangs4x4
intra_allangs4x4	13.31x 	 6512.49  	 86709.86

diff -r ee76a15fa312 -r c40653978cae source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Apr 10 10:24:55 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp	Fri Apr 10 18:58:07 2015 -0700
@@ -1259,6 +1259,8 @@
         p.cu[BLOCK_4x4].intra_pred[32] = x265_intra_pred_ang4_4_sse2;
         p.cu[BLOCK_4x4].intra_pred[33] = x265_intra_pred_ang4_3_sse2;
 
+        p.cu[BLOCK_4x4].intra_pred_allangs = x265_all_angs_pred_4x4_sse2;
+
         p.cu[BLOCK_4x4].calcresidual = x265_getResidual4_sse2;
         p.cu[BLOCK_8x8].calcresidual = x265_getResidual8_sse2;
 
diff -r ee76a15fa312 -r c40653978cae source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm	Fri Apr 10 10:24:55 2015 -0500
+++ b/source/common/x86/const-a.asm	Fri Apr 10 18:58:07 2015 -0700
@@ -53,6 +53,10 @@
 const pb_shuf8x8c,          times  1 db   0,   0,   0,   0,   2,   2,   2,   2,   4,   4,   4,   4,   6,   6,   6,   6
 const pb_movemask,          times 16 db 0x00
                             times 16 db 0xFF
+const pb_0000000000000F0F,  times  2 db 0xff, 0x00            ; bytes 0 and 2 set: keeps the low byte of words 0 and 1
+                            times 12 db 0x00                  ; 4 + 12 = 16 bytes, exactly one XMM-wide mask
+const pb_000000000000000F,           db 0xff                  ; byte 0 set: keeps only the lowest byte
+                            times 15 db 0x00                  ; 1 + 15 = 16 bytes
 
 ;; 16-bit constants
 
@@ -94,6 +98,8 @@
 const multiH2,              times  1 dw  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  32
 const pw_planar16_mul,      times  1 dw  15,  14,  13,  12,  11,  10,   9,   8,   7,   6,   5,   4,   3,   2,   1,   0
 const pw_planar32_mul,      times  1 dw  31,  30,  29,  28,  27,  26,  25,  24,  23,  22,  21,  20,  19,  18,  17,  16
+const pw_FFFFFFF0,                   dw 0x00                  ; word 0 cleared when used as a pand mask
+                            times 7  dw 0xff                  ; words 1..7 = 0x00ff (not 0xffff): only the low byte of each word survives
 
 
 ;; 32-bit constants
diff -r ee76a15fa312 -r c40653978cae source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h	Fri Apr 10 10:24:55 2015 -0500
+++ b/source/common/x86/intrapred.h	Fri Apr 10 18:58:07 2015 -0700
@@ -275,6 +275,7 @@
 void x265_intra_pred_ang32_23_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang32_22_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang32_21_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
+void x265_all_angs_pred_4x4_sse2(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
 void x265_all_angs_pred_4x4_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
 void x265_all_angs_pred_8x8_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
 void x265_all_angs_pred_16x16_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
diff -r ee76a15fa312 -r c40653978cae source/common/x86/intrapred8_allangs.asm
--- a/source/common/x86/intrapred8_allangs.asm	Fri Apr 10 10:24:55 2015 -0500
+++ b/source/common/x86/intrapred8_allangs.asm	Fri Apr 10 18:58:07 2015 -0700
@@ -34,10 +34,17 @@
 
 ; common constant with intrapred8.asm
 cextern ang_table
+cextern pw_ang_table
 cextern tab_S1
 cextern tab_S2
 cextern tab_Si
 
+; constants from const-a.asm
+cextern pw_16
+cextern pb_000000000000000F
+cextern pb_0000000000000F0F
+cextern pw_FFFFFFF0
+
 
 ;-----------------------------------------------------------------------------
 ; void all_angs_pred_4x4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
@@ -23006,3 +23013,780 @@
     palignr    m4,              m2,       m1,    14
     movu       [r0 + 2111 * 16],   m4
     RET
+
+;-----------------------------------------------------------------------------
+; void all_angs_pred_4x4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal all_angs_pred_4x4, 4, 4, 8
+; r0 = dest (mode N occupies the 16 bytes at r0 + (N - 2) * 16), r1 = refPix; filtPix and bLuma are never read.
+; r2 and r3 are reused as scratch; dest must be 16-byte aligned because most rows are stored with mova.
+; mode 2: row i is the four bytes starting at r1[10 + i]
+    movh        m6,             [r1 + 9]        ; m6 = bytes r1[9..16]; reused by modes 10/26 and 3
+    movh        m2,             m6
+    psrldq      m2,             1
+    movd        [r0],           m2              ; row 0 = r1[10..13]
+    psrldq      m2,             1
+    movd        [r0 + 4],       m2              ; row 1 = r1[11..14]
+    psrldq      m2,             1
+    movd        [r0 + 8],       m2              ; row 2 = r1[12..15]
+    psrldq      m2,             1
+    movd        [r0 + 12],      m2              ; row 3 = r1[13..16]
+
+; mode 10/26: replicate four reference bytes into every row, then patch byte 0 of each row
+
+    pxor        m7,             m7
+    pshufd      m5,             m6,        0
+    movu        [r0 + 128],     m5              ;mode 10: r1[9..12] in all four rows
+
+    movd        m4,             [r1 + 1]
+    pshufd      m4,             m4,        0
+    movu        [r0 + 384],     m4              ;mode 26: r1[1..4] in all four rows
+
+    movd        m1,             [r1]
+    punpcklbw   m1,             m7
+    pshuflw     m1,             m1,     0x00
+    punpcklqdq  m1,             m1              ; m1 = r1[0] in all eight words
+
+    punpckldq   m4,             m5
+    punpcklbw   m4,             m7              ; m4 = words r1[1..4], r1[9..12]
+    pshuflw     m2,             m4,     0x00
+    pshufhw     m2,             m2,     0x00    ; m2 = words r1[1] x4, r1[9] x4
+
+    psubw       m4,             m1
+    psraw       m4,             1               ; (ref - r1[0]) >> 1; arithmetic shift keeps the sign
+
+    pshufd      m2,             m2,     q1032   ; swap halves: m2 = words r1[9] x4, r1[1] x4
+    paddw       m4,             m2
+    packuswb    m4,             m4              ; saturate to bytes: bytes 0..3 patch mode 10, bytes 4..7 patch mode 26
+
+%if ARCH_X86_64
+    movq        r2,             m4              ; all eight patch bytes in one GPR
+
+    mov         [r0 + 128],     r2b              ;mode 10
+    shr         r2,             8
+    mov         [r0 + 132],     r2b
+    shr         r2,             8
+    mov         [r0 + 136],     r2b
+    shr         r2,             8
+    mov         [r0 + 140],     r2b
+    shr         r2,             8
+    mov         [r0 + 384],     r2b              ;mode 26
+    shr         r2d,            8               ; only bytes 4..7 are left, so 32-bit shifts suffice
+    mov         [r0 + 388],     r2b
+    shr         r2d,            8
+    mov         [r0 + 392],     r2b
+    shr         r2d,            8
+    mov         [r0 + 396],     r2b
+
+%else
+    movd        r2d,             m4             ; 32-bit build: low four patch bytes first
+
+    mov         [r0 + 128],     r2b              ;mode 10
+    shr         r2d,             8
+    mov         [r0 + 132],     r2b
+    shr         r2d,             8
+    mov         [r0 + 136],     r2b
+    shr         r2d,             8
+    mov         [r0 + 140],     r2b
+
+    psrldq      m4,             4
+    movd        r2d,            m4              ; then the four mode 26 patch bytes
+
+    mov         [r0 + 384],     r2b              ;mode 26
+    shr         r2d,            8
+    mov         [r0 + 388],     r2b
+    shr         r2d,            8
+    mov         [r0 + 392],     r2b
+    shr         r2d,            8
+    mov         [r0 + 396],     r2b
+%endif
+
+; mode 3 (table rows 26, 20, 14, 8); every row below is packuswb((pmaddwd(pairs, row) + m2) >> 5)
+
+    mova        m2,             [pw_16]         ; m2 = rounding constant added before every >> 5
+    lea         r3,             [pw_ang_table]  ; r3 + f * 16 = table row f (presumably word pairs (32 - f, f) -- confirm in intrapred8.asm)
+
+    punpcklbw   m6,             m6
+    psrldq      m6,             1               ; bytes r1[9],r1[10], r1[10],r1[11], ... (adjacent pairs)
+    movh        m1,             m6
+    psrldq      m6,             2
+    movh        m0,             m6
+    psrldq      m6,             2
+    movh        m3,             m6
+    psrldq      m6,             2
+    punpcklbw   m1,             m7              ; m1 = word pairs (r1[9 + i],  r1[10 + i]), i = 0..3
+    punpcklbw   m0,             m7              ; m0 = word pairs (r1[10 + i], r1[11 + i])
+    punpcklbw   m3,             m7              ; m3 = word pairs (r1[11 + i], r1[12 + i])
+    punpcklbw   m6,             m7              ; m6 = word pairs (r1[12 + i], r1[13 + i])
+
+    mova        m7,             [r3 + 20 * 16]  ; table row 20 stays in m7 for modes 3, 4 and 8
+
+    pmaddwd     m5,             m1,     [r3 + 26 * 16]
+    pmaddwd     m4,             m0,     m7
+
+    packssdw    m5,             m4
+    paddw       m5,             m2
+    psraw       m5,             5
+
+    pmaddwd     m4,             m3,     [r3 + 14 * 16]
+    pmaddwd     m6,             [r3 + 8 * 16]
+
+    packssdw    m4,             m6
+    paddw       m4,             m2
+    psraw       m4,             5
+
+    packuswb    m5,             m4
+    mova        [r0 + 16],      m5
+    movd        [r0 + 68],      m5                      ;mode 6 row 1 (same pairs and table row as mode 3 row 0)
+    psrldq      m5,             4
+    movd        [r0 + 76],      m5                      ;mode 6 row 3 (same as mode 3 row 1)
+
+; mode 4 (table rows 21, 10, 31, 20); rows 2/3 are computed first, then rows 0/1
+
+    pmaddwd     m4,             m0,     [r3 + 31 * 16]
+    pmaddwd     m6,             m3,     m7
+
+    packssdw    m4,             m6
+    paddw       m4,             m2
+    psraw       m4,             5
+
+    pmaddwd     m5,             m1,     [r3 + 21 * 16]
+    pmaddwd     m6,             m0,     [r3 + 10 * 16]
+
+    packssdw    m5,             m6
+    paddw       m5,             m2
+    psraw       m5,             5
+
+    packuswb    m5,             m4
+    mova        [r0 + 32],      m5
+
+; mode 5 (table rows 17, 2, 19, 4)
+
+    pmaddwd     m5,             m1,     [r3 + 17 * 16]
+    pmaddwd     m6,             m0,     [r3 + 2 * 16]
+
+    packssdw    m5,             m6
+    paddw       m5,             m2
+    psraw       m5,             5
+
+    pmaddwd     m4,             m0,     [r3 + 19 * 16]
+    pmaddwd     m3,             [r3 + 4 * 16]
+
+    packssdw    m4,             m3
+    paddw       m4,             m2
+    psraw       m4,             5
+
+    packuswb    m5,             m4
+    mova        [r0 + 48],      m5
+
+; mode 6 (table rows 13 and 7 for rows 0 and 2; rows 1 and 3 were stored during mode 3)
+
+    pmaddwd     m5,             m1,     [r3 + 13 * 16]
+    pmaddwd     m6,             m0,     [r3 + 7 * 16]
+
+    packssdw    m5,             m6
+    paddw       m5,             m2
+    psraw       m5,             5
+
+    packuswb    m5,             m6              ; upper half is don't-care; only bytes 0..7 are stored
+    movd        [r0 + 64],      m5
+    psrldq      m5,             4
+    movd        [r0 + 72],      m5
+
+; mode 7 (table rows 9, 18, 27, 4)
+
+    pmaddwd     m5,             m1,     [r3 + 9 * 16]
+    pmaddwd     m6,             m1,     [r3 + 18 * 16]
+
+    packssdw    m5,             m6
+    paddw       m5,             m2
+    psraw       m5,             5
+
+    mova        m3,             [r3 + 27 * 16]  ; table row 27 stays in m3 for mode 12
+    pmaddwd     m4,             m1,     m3
+    pmaddwd     m0,             [r3 + 4 * 16]
+
+    packssdw    m4,             m0
+    paddw       m4,             m2
+    psraw       m4,             5
+
+    packuswb    m5,             m4
+    mova        [r0 + 80],      m5
+
+; mode 8 (table rows 5, 10, 15, 20)
+
+    mova        m0,             [r3 + 5 * 16]   ; table row 5 stays in m0 for mode 13
+    pmaddwd     m5,             m1,     m0
+    pmaddwd     m6,             m1,     [r3 + 10 * 16]
+
+    packssdw    m5,             m6
+    paddw       m5,             m2
+    psraw       m5,             5
+
+    pmaddwd     m4,             m1,     [r3 + 15 * 16]
+    pmaddwd     m7,             m1              ; m7 = table row 20 (loaded in mode 3)
+
+    packssdw    m4,             m7
+    paddw       m4,             m2
+    psraw       m4,             5
+
+    packuswb    m5,             m4
+    mova        [r0 + 96],      m5
+
+; mode 9 (table rows 2, 4, 6, 8)
+
+    pmaddwd     m5,             m1,     [r3 + 2 * 16]
+    pmaddwd     m6,             m1,     [r3 + 4 * 16]
+
+    packssdw    m5,             m6
+    paddw       m5,             m2
+    psraw       m5,             5
+
+    pmaddwd     m4,             m1,     [r3 + 6 * 16]
+    pmaddwd     m6,             m1,     [r3 + 8 * 16]
+
+    packssdw    m4,             m6
+    paddw       m4,             m2
+    psraw       m4,             5
+
+    packuswb    m5,             m4
+    mova        [r0 + 112],     m5
+
+; mode 11 (table rows 30, 28, 26, 24); the pair window moves one sample back so it starts at r1[0]
+
+    movd        m5,             [r1]
+    punpcklwd   m5,             m1
+    pand        m5,             [pb_0000000000000F0F]   ; m5 = words r1[0], r1[9]
+    pslldq      m1,             4
+    por         m1,             m5              ; m1 = word pairs (r1[0], r1[9]), (r1[9], r1[10]), (r1[10], r1[11]), (r1[11], r1[12])
+
+    pmaddwd     m5,             m1,     [r3 + 30 * 16]
+    pmaddwd     m6,             m1,     [r3 + 28 * 16]
+
+    packssdw    m5,             m6
+    paddw       m5,             m2
+    psraw       m5,             5
+
+    pmaddwd     m4,             m1,     [r3 + 26 * 16]
+    pmaddwd     m6,             m1,     [r3 + 24 * 16]
+
+    packssdw    m4,             m6
+    paddw       m4,             m2
+    psraw       m4,             5
+
+    packuswb    m5,             m4
+    mova        [r0 + 144],     m5
+
+; mode 12 (table rows 27, 22, 17, 12)
+
+    pmaddwd     m3,             m1              ; m3 = table row 27 (loaded in mode 7)
+    pmaddwd     m6,             m1,     [r3 + 22 * 16]
+
+    packssdw    m3,             m6
+    paddw       m3,             m2
+    psraw       m3,             5
+
+    pmaddwd     m4,             m1,     [r3 + 17 * 16]
+    pmaddwd     m6,             m1,     [r3 + 12 * 16]
+
+    packssdw    m4,             m6
+    paddw       m4,             m2
+    psraw       m4,             5
+
+    packuswb    m3,             m4
+    mova        [r0 + 160],     m3
+
+; mode 13 (table rows 23, 14, 5, 28); row 3 uses a window extended by r1[4]
+
+    mova        m3,             m1
+    movd        m7,             [r1 + 4]
+    punpcklwd   m7,             m1
+    pand        m7,             [pb_0000000000000F0F]   ; m7 = words r1[4], r1[0] (word 0 is reused by mode 15)
+    pslldq      m3,             4
+    por         m3,             m7              ; m3 = m1 moved up one pair with (r1[4], r1[0]) inserted in front
+
+    pmaddwd     m5,             m1,     [r3 + 23 * 16]
+    pmaddwd     m6,             m1,     [r3 + 14 * 16]
+
+    packssdw    m5,             m6
+    paddw       m5,             m2
+    psraw       m5,             5
+
+    pmaddwd     m4,             m1,     m0      ; m0 = table row 5 (loaded in mode 8)
+    pmaddwd     m6,             m3,     [r3 + 28 * 16]
+
+    packssdw    m4,             m6
+    paddw       m4,             m2
+    psraw       m4,             5
+
+    packuswb    m5,             m4
+    mova        [r0 + 176],     m5
+
+; mode 14 (table rows 19, 6, 25, 12)
+
+    pmaddwd     m5,             m1,     [r3 + 19 * 16]
+    pmaddwd     m6,             m1,     [r3 + 6 * 16]
+
+    packssdw    m5,             m6
+    paddw       m5,             m2
+    psraw       m5,             5
+
+    movd        m6,             [r1 + 2]
+    pand        m3,             [pw_FFFFFFF0]
+    pand        m6,             [pb_000000000000000F]
+    por         m3,             m6              ; word 0 of m3 replaced by r1[2]
+
+    pmaddwd     m4,             m3,     [r3 + 25 * 16]
+    pmaddwd     m6,             m3,     [r3 + 12 * 16]
+
+    packssdw    m4,             m6
+    paddw       m4,             m2
+    psraw       m4,             5
+
+    packuswb    m5,             m4
+    mova        [r0 + 192],     m5
+    psrldq      m5,             4
+    movd        [r0 + 240],     m5                              ;mode 17 row 0 (same as mode 14 row 1)
+
+; mode 15 (table rows 15, 30, 13, 28)
+
+    pmaddwd     m5,             m1,     [r3 + 15 * 16]
+    pmaddwd     m6,             m3,     [r3 + 30 * 16]
+
+    packssdw    m5,             m6
+    paddw       m5,             m2
+    psraw       m5,             5
+
+    pmaddwd     m6,             m3,     [r3 + 13 * 16]
+
+    mova        m0,             m3
+    punpcklwd   m7,             m3              ; m7 word 0 is still r1[4] from mode 13
+    pslldq      m0,             4
+    pand        m7,             [pb_0000000000000F0F]   ; m7 = words r1[4], r1[2]
+    por         m0,             m7              ; m0 = m3 moved up one pair with (r1[4], r1[2]) inserted in front
+
+    pmaddwd     m4,             m0,     [r3 + 28 * 16]
+
+    packssdw    m6,             m4
+    paddw       m6,             m2
+    psraw       m6,             5
+
+    packuswb    m5,             m6
+    mova        [r0 + 208],     m5
+
+; mode 16 (table rows 11, 22, 1, 12)
+
+    pmaddwd     m5,             m1,     [r3 + 11 * 16]
+    pmaddwd     m6,             m3,     [r3 + 22 * 16]
+
+    packssdw    m5,             m6
+    paddw       m5,             m2
+    psraw       m5,             5
+
+    pmaddwd     m3,             [r3 + 1 * 16]
+
+    movd        m6,             [r1 + 3]
+    pand        m0,             [pw_FFFFFFF0]
+    pand        m6,             [pb_000000000000000F]
+    por         m0,             m6              ; word 0 of m0 replaced by r1[3]
+
+    pmaddwd     m0,             [r3 + 12 * 16]
+    packssdw    m3,             m0
+    paddw       m3,             m2
+    psraw       m3,             5
+
+    packuswb    m5,             m3
+    mova        [r0 + 224],     m5
+
+; mode 17 (table rows 12, 18, 24 for rows 1..3; row 0 was stored during mode 14)
+
+    movd        m4,             [r1 + 1]
+    punpcklwd   m4,             m1
+    pand        m4,             [pb_0000000000000F0F]   ; words r1[1], r1[0]
+    pslldq      m1,             4
+    por         m1,             m4              ; window now starts (r1[1], r1[0]), (r1[0], r1[9]), ...
+
+    pmaddwd     m6,             m1,     [r3 + 12 * 16]
+
+    packssdw    m6,             m6
+    paddw       m6,             m2
+    psraw       m6,             5
+
+    movh        m5,             [r1 + 2]
+    punpcklwd   m5,             m1
+    pand        m5,             [pb_0000000000000F0F]   ; words r1[2], r1[1]
+    pslldq      m1,             4
+    por         m1,             m5
+
+    pmaddwd     m4,             m1,     [r3 + 18 * 16]
+
+    punpcklwd   m7,             m1              ; m7 word 0 is still r1[4] from mode 15
+    pand        m7,             [pb_0000000000000F0F]   ; words r1[4], r1[2]
+    pslldq      m1,             4
+    por         m1,             m7
+
+    pmaddwd     m1,             [r3 + 24 * 16]
+    packssdw    m4,             m1
+    paddw       m4,             m2
+    psraw       m4,             5
+
+    packuswb    m6,             m4              ; bytes: row 1, row 1 (duplicate), row 2, row 3
+    movd        [r0 + 244],     m6
+    psrldq      m6,             8
+    movh        [r0 + 248],     m6              ; rows 2 and 3
+
+; mode 18: plain byte copies, each row prepends one more byte from r1[9..11]
+
+    movh        m1,             [r1]
+    movd        [r0 + 256],     m1              ; row 0 = r1[0..3]
+
+    movh        m3,             [r1 + 2]        ; byte 7 of this load is r1[9]
+    punpcklqdq  m3,             m1
+    psrldq      m3,             7
+    movd        [r0 + 260],     m3              ; row 1 = r1[9], r1[0], r1[1], r1[2]
+
+    movh        m4,             [r1 + 3]        ; byte 7 of this load is r1[10]
+    punpcklqdq  m4,             m3
+    psrldq      m4,             7
+    movd        [r0 + 264],     m4              ; row 2 = r1[10], r1[9], r1[0], r1[1]
+
+    movh        m0,             [r1 + 4]        ; byte 7 of this load is r1[11]
+    punpcklqdq  m0,             m4
+    psrldq      m0,             7
+    movd        [r0 + 268],     m0              ; row 3 = r1[11], r1[10], r1[9], r1[0]
+
+; mode 19 (table rows 6, 12, 18, 24); pair windows are built from the mode 18 byte rows
+
+    pxor        m7,             m7
+    punpcklbw   m4,             m3
+    punpcklbw   m3,             m1
+    punpcklbw   m1,             m1
+    punpcklbw   m4,             m7              ; m4 = word pairs (r1[10], r1[9]), (r1[9], r1[0]), (r1[0], r1[1]), (r1[1], r1[2])
+    punpcklbw   m3,             m7              ; m3 = word pairs (r1[9], r1[0]), (r1[0], r1[1]), (r1[1], r1[2]), (r1[2], r1[3])
+    psrldq      m1,             1
+    punpcklbw   m1,             m7              ; m1 = word pairs (r1[i], r1[i + 1]), i = 0..3
+
+    pmaddwd     m6,             m1,     [r3 + 6 * 16]
+    pmaddwd     m7,             m3,     [r3 + 12 * 16]
+
+    packssdw    m6,             m7
+    paddw       m6,             m2
+    psraw       m6,             5
+
+    pmaddwd     m5,             m4,     [r3 + 18 * 16]
+
+    movd        m7,             [r1 + 12]
+    punpcklwd   m7,             m4
+    pand        m7,             [pb_0000000000000F0F]   ; m7 = words r1[12], r1[10] (word 0 is reused by modes 21 and 23)
+    pslldq      m4,             4
+    por         m4,             m7
+
+    pmaddwd     m4,             [r3 + 24 * 16]
+    packssdw    m5,             m4
+    paddw       m5,             m2
+    psraw       m5,             5
+
+    packuswb    m6,             m5
+    mova        [r0 + 272],     m6
+    movd        [r0 + 324],     m6                              ;mode 22 row 1 (same as mode 19 row 0)
+
+; mode 20 (table rows 11, 22, 1, 12)
+
+    pmaddwd     m5,             m1,     [r3 + 11 * 16]
+
+    movd        m4,             [r1 + 10]
+    pand        m3,             [pw_FFFFFFF0]
+    pand        m4,             [pb_000000000000000F]
+    por         m3,             m4              ; word 0 of m3 replaced by r1[10]
+
+    pmaddwd     m6,             m3,     [r3 + 22 * 16]
+
+    packssdw    m5,             m6
+    paddw       m5,             m2
+    psraw       m5,             5
+
+    pmaddwd     m4,             m3,     [r3 + 1 * 16]
+
+    punpcklwd   m0,             m3              ; m0 still holds mode 18 row 3, whose byte 0 is r1[11]
+    pand        m0,             [pb_0000000000000F0F]   ; words r1[11], r1[10]
+    mova        m6,             m3
+    pslldq      m6,             4
+    por         m0,             m6              ; m0 = m3 moved up one pair with (r1[11], r1[10]) inserted in front
+
+    pmaddwd     m6,             m0,     [r3 + 12 * 16]
+
+    packssdw    m4,             m6
+    paddw       m4,             m2
+    psraw       m4,             5
+
+    packuswb    m5,             m4
+    mova        [r0 + 288],     m5
+
+; mode 21 (table rows 15, 30, 13, 28)
+
+    pmaddwd     m4,             m1,     [r3 + 15 * 16]
+    pmaddwd     m6,             m3,     [r3 + 30 * 16]
+
+    packssdw    m4,             m6
+    paddw       m4,             m2
+    psraw       m4,             5
+
+    pmaddwd     m5,             m3,     [r3 + 13 * 16]
+
+    pand        m0,             [pw_FFFFFFF0]
+    pand        m7,             [pb_000000000000000F]   ; m7 = r1[12] alone (kept for mode 23)
+    por         m0,             m7              ; word 0 of m0 replaced by r1[12]
+
+    pmaddwd     m0,             [r3 + 28 * 16]
+    packssdw    m5,             m0
+    paddw       m5,             m2
+    psraw       m5,             5
+
+    packuswb    m4,             m5
+    mova        [r0 + 304],     m4
+
+; mode 22 (table rows 19, 25, 12 for rows 0, 2, 3; row 1 was stored during mode 19)
+
+    pmaddwd     m4,             m1,     [r3 + 19 * 16]
+    packssdw    m4,             m4
+    paddw       m4,             m2
+    psraw       m4,             5
+
+    mova        m0,             [r3 + 12 * 16]  ; table row 12 stays in m0 for mode 24
+    pmaddwd     m5,             m3,     [r3 + 25 * 16]
+    pmaddwd     m6,             m3,     m0
+
+    packssdw    m5,             m6
+    paddw       m5,             m2
+    psraw       m5,             5
+
+    packuswb    m4,             m5              ; bytes: row 0, row 0 (duplicate), row 2, row 3
+    movd        [r0 + 320],     m4
+    psrldq      m4,             8
+    movh        [r0 + 328],     m4              ; rows 2 and 3
+
+; mode 23 (table rows 23, 14, 5, 28)
+
+    pmaddwd     m4,             m1,     [r3 + 23 * 16]
+    pmaddwd     m5,             m1,     [r3 + 14 * 16]
+
+    packssdw    m4,             m5
+    paddw       m4,             m2
+    psraw       m4,             5
+
+    pmaddwd     m6,             m1,     [r3 + 5 * 16]
+
+    pand        m3,             [pw_FFFFFFF0]
+    por         m3,             m7              ; word 0 of m3 replaced by r1[12] (m7 from mode 21)
+
+    pmaddwd     m3,             [r3 + 28 * 16]
+    packssdw    m6,             m3
+    paddw       m6,             m2
+    psraw       m6,             5
+
+    packuswb    m4,             m6
+    mova        [r0 + 336],     m4
+
+; mode 24 (table rows 27, 22, 17, 12)
+
+    pmaddwd     m4,             m1,     [r3 + 27 * 16]
+    pmaddwd     m5,             m1,     [r3 + 22 * 16]
+
+    packssdw    m4,             m5
+    paddw       m4,             m2
+    psraw       m4,             5
+
+    pmaddwd     m6,             m1,     [r3 + 17 * 16]
+    pmaddwd     m0,             m1              ; m0 = table row 12 (loaded in mode 22)
+
+    packssdw    m6,             m0
+    paddw       m6,             m2
+    psraw       m6,             5
+
+    packuswb    m4,             m6
+    mova        [r0 + 352],     m4
+
+; mode 25 (table rows 30, 28, 26, 24)
+
+    pmaddwd     m4,             m1,     [r3 + 30 * 16]
+    pmaddwd     m5,             m1,     [r3 + 28 * 16]
+
+    packssdw    m4,             m5
+    paddw       m4,             m2
+    psraw       m4,             5
+
+    pmaddwd     m6,             m1,     [r3 + 26 * 16]
+    pmaddwd     m1,             [r3 + 24 * 16]
+
+    packssdw    m6,             m1
+    paddw       m6,             m2
+    psraw       m6,             5
+
+    packuswb    m4,             m6
+    mova        [r0 + 368],     m4
+
+; mode 27 (table rows 2, 4, 6, 8); pair windows are rebuilt starting at r1[1]
+
+    movh        m0,             [r1 + 1]
+    pxor        m7,             m7
+    punpcklbw   m0,             m0
+    psrldq      m0,             1
+    movh        m1,             m0
+    psrldq      m0,             2
+    movh        m3,             m0
+    psrldq      m0,             2
+    punpcklbw   m1,             m7              ; m1 = word pairs (r1[1 + i], r1[2 + i]), i = 0..3
+    punpcklbw   m3,             m7              ; m3 = word pairs (r1[2 + i], r1[3 + i])
+    punpcklbw   m0,             m7              ; m0 = word pairs (r1[3 + i], r1[4 + i])
+
+    mova        m7,             [r3 + 4 * 16]   ; table row 4 stays in m7 for modes 27, 29 and 31
+
+    pmaddwd     m4,             m1,     [r3 + 2 * 16]
+    pmaddwd     m5,             m1,     m7
+
+    packssdw    m4,             m5
+    paddw       m4,             m2
+    psraw       m4,             5
+
+    pmaddwd     m6,             m1,     [r3 + 6 * 16]
+    pmaddwd     m5,             m1,     [r3 + 8 * 16]
+
+    packssdw    m6,             m5
+    paddw       m6,             m2
+    psraw       m6,             5
+
+    packuswb    m4,             m6
+    mova        [r0 + 400],     m4
+
+; mode 28 (table rows 5, 10, 15, 20)
+
+    pmaddwd     m4,             m1,     [r3 + 5 * 16]
+    pmaddwd     m5,             m1,     [r3 + 10 * 16]
+
+    packssdw    m4,             m5
+    paddw       m4,             m2
+    psraw       m4,             5
+
+    pmaddwd     m6,             m1,     [r3 + 15 * 16]
+    pmaddwd     m5,             m1,     [r3 + 20 * 16]
+
+    packssdw    m6,             m5
+    paddw       m6,             m2
+    psraw       m6,             5
+
+    packuswb    m4,             m6
+    mova        [r0 + 416],     m4
+
+; mode 29 (table rows 9, 18, 27, 4)
+
+    pmaddwd     m4,             m1,     [r3 + 9 * 16]
+    pmaddwd     m6,             m1,     [r3 + 18 * 16]
+
+    packssdw    m4,             m6
+    paddw       m4,             m2
+    psraw       m4,             5
+
+    pmaddwd     m6,             m1,     [r3 + 27 * 16]
+    pmaddwd     m5,             m3,     m7
+
+    packssdw    m6,             m5
+    paddw       m6,             m2
+    psraw       m6,             5
+
+    packuswb    m4,             m6
+    mova        [r0 + 432],     m4
+
+; mode 30 (table rows 13, 26, 7, 20); its rows 1 and 3 double as mode 33 rows 0 and 1
+
+    pmaddwd     m4,             m1,     [r3 + 13 * 16]
+    pmaddwd     m5,             m1,     [r3 + 26 * 16]
+
+    packssdw    m4,             m5
+    paddw       m4,             m2
+    psraw       m4,             5
+
+    pmaddwd     m6,             m3,     [r3 + 7 * 16]
+    pmaddwd     m5,             m3,     [r3 + 20 * 16]
+
+    packssdw    m6,             m5
+    paddw       m6,             m2
+    psraw       m6,             5
+
+    packuswb    m4,             m6
+    mova        [r0 + 448],     m4
+    psrldq      m4,             4
+    movd        [r0 + 496],     m4                      ;mode 33 row 0 (4-byte store; +500 is written just below)
+    psrldq      m4,             8
+    movd        [r0 + 500],     m4                      ;mode 33 row 1
+
+; mode 31 (table rows 17, 2, 19, 4)
+
+    pmaddwd     m4,             m1,     [r3 + 17 * 16]
+    pmaddwd     m5,             m3,     [r3 + 2 * 16]
+
+    packssdw    m4,             m5
+    paddw       m4,             m2
+    psraw       m4,             5
+
+    pmaddwd     m6,             m3,     [r3 + 19 * 16]
+    pmaddwd     m7,             m0                      ; m7 = table row 4 (loaded in mode 27)
+
+    packssdw    m6,             m7
+    paddw       m6,             m2
+    psraw       m6,             5
+
+    packuswb    m4,             m6
+    mova        [r0 + 464],     m4
+
+; mode 32 (table rows 21, 10, 31, 20)
+
+    pmaddwd     m1,             [r3 + 21 * 16]
+    pmaddwd     m5,             m3,     [r3 + 10 * 16]
+
+    packssdw    m1,             m5
+    paddw       m1,             m2
+    psraw       m1,             5
+
+    pmaddwd     m3,             [r3 + 31 * 16]
+    pmaddwd     m5,             m0,     [r3 + 20 * 16]
+    packssdw    m3,             m5
+    paddw       m3,             m2
+    psraw       m3,             5
+
+    packuswb    m1,             m3
+    mova        [r0 + 480],     m1
+
+; mode 33 (table rows 14 and 8 for rows 2 and 3; rows 0 and 1 were stored during mode 30)
+
+    pmaddwd     m0,             [r3 + 14 * 16]
+    pxor        m7,             m7
+    movh        m4,             [r1 + 4]
+    punpcklbw   m4,             m4
+    psrldq      m4,             1
+    punpcklbw   m4,             m7              ; m4 = word pairs (r1[4 + i], r1[5 + i]), i = 0..3
+
+    pmaddwd     m4,             [r3 + 8 * 16]
+
+    packssdw    m0,             m4
+    paddw       m0,             m2
+    psraw       m0,             5
+
+    packuswb    m0,             m0
+    movh        [r0 + 504],     m0              ; rows 2 and 3
+
+; mode 34: row i is the four bytes starting at r1[2 + i]
+
+    movh        m7,             [r1 + 2]
+    movd        [r0 + 512],     m7
+
+    psrldq      m7,             1
+    movd        [r0 + 516],     m7
+
+    psrldq      m7,             1
+    movd        [r0 + 520],     m7
+
+    psrldq      m7,             1
+    movd        [r0 + 524],     m7
+
+    RET
\ No newline at end of file


More information about the x265-devel mailing list