[x265] [PATCH] asm: intra pred all_angs_pred_4x4 sse2

dtyx265 at gmail.com dtyx265 at gmail.com
Tue Apr 14 03:38:21 CEST 2015


# HG changeset patch
# User David T Yuen <dtyx265 at gmail.com>
# Date 1428958891 25200
# Node ID 9a581851fd66679eca3175921b6eef428cdec1ce
# Parent  4cccf22b00ee188a72c8dc3896d7dc1613d855ad
asm: intra pred all_angs_pred_4x4 sse2

This replaces the C implementation and is backported from the SSE4 version.
The processing of modes 10 and 26 was merged and moved to directly after mode 2.

The new constants are declared with explicit long form names

64-bit

./test/TestBench --testbench intrapred | grep intra_allangs4x4
intra_allangs4x4	9.89x 	 6434.99  	 63671.87

32-bit

./test/TestBench --testbench intrapred | grep intra_allangs4x4
intra_allangs4x4	13.38x 	 6497.50  	 86943.55

diff -r 4cccf22b00ee -r 9a581851fd66 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Apr 10 18:15:38 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp	Mon Apr 13 14:01:31 2015 -0700
@@ -1259,6 +1259,8 @@
         p.cu[BLOCK_4x4].intra_pred[32] = x265_intra_pred_ang4_4_sse2;
         p.cu[BLOCK_4x4].intra_pred[33] = x265_intra_pred_ang4_3_sse2;
 
+        p.cu[BLOCK_4x4].intra_pred_allangs = x265_all_angs_pred_4x4_sse2;
+
         p.cu[BLOCK_4x4].calcresidual = x265_getResidual4_sse2;
         p.cu[BLOCK_8x8].calcresidual = x265_getResidual8_sse2;
 
diff -r 4cccf22b00ee -r 9a581851fd66 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm	Fri Apr 10 18:15:38 2015 -0500
+++ b/source/common/x86/const-a.asm	Mon Apr 13 14:01:31 2015 -0700
@@ -53,6 +53,10 @@
 const pb_shuf8x8c,          times  1 db   0,   0,   0,   0,   2,   2,   2,   2,   4,   4,   4,   4,   6,   6,   6,   6
 const pb_movemask,          times 16 db 0x00
                             times 16 db 0xFF
+const pb_00000000000000000000000000FF00FF,      times 2 db 0xff, 0x00    ; bytes FF 00 FF 00: keeps the low byte of words 0 and 1
+                            times 12 db 0x00                                ; the remaining 12 bytes are zero
+const pb_000000000000000000000000000000FF,              db 0xff           ; keeps only the lowest byte
+                            times 15 db 0x00                                ; the remaining 15 bytes are zero
 
 ;; 16-bit constants
 
@@ -94,6 +98,8 @@
 const multiH2,              times  1 dw  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  32
 const pw_planar16_mul,      times  1 dw  15,  14,  13,  12,  11,  10,   9,   8,   7,   6,   5,   4,   3,   2,   1,   0
 const pw_planar32_mul,      times  1 dw  31,  30,  29,  28,  27,  26,  25,  24,  23,  22,  21,  20,  19,  18,  17,  16
+const pw_FFFFFFFFFFFFFFFFFFFFFFFFFFFF0000,      dw 0x00                    ; word 0 is cleared by this mask
+                            times 7  dw 0xffff                              ; words 1..7 are kept in full, as the name states (was 0x00ff)
 
 
 ;; 32-bit constants
diff -r 4cccf22b00ee -r 9a581851fd66 source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h	Fri Apr 10 18:15:38 2015 -0500
+++ b/source/common/x86/intrapred.h	Mon Apr 13 14:01:31 2015 -0700
@@ -277,6 +277,7 @@
 void x265_intra_pred_ang32_23_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang32_22_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang32_21_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
+void x265_all_angs_pred_4x4_sse2(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
 void x265_all_angs_pred_4x4_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
 void x265_all_angs_pred_8x8_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
 void x265_all_angs_pred_16x16_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
diff -r 4cccf22b00ee -r 9a581851fd66 source/common/x86/intrapred8_allangs.asm
--- a/source/common/x86/intrapred8_allangs.asm	Fri Apr 10 18:15:38 2015 -0500
+++ b/source/common/x86/intrapred8_allangs.asm	Mon Apr 13 14:01:31 2015 -0700
@@ -34,9 +34,14 @@
 
 ; common constant with intrapred8.asm
 cextern ang_table
+cextern pw_ang_table
 cextern tab_S1
 cextern tab_S2
 cextern tab_Si
+cextern pw_16
+cextern pb_000000000000000000000000000000FF
+cextern pb_00000000000000000000000000FF00FF
+cextern pw_FFFFFFFFFFFFFFFFFFFFFFFFFFFF0000
 
 
 ;-----------------------------------------------------------------------------
@@ -23006,3 +23011,780 @@
     palignr    m4,              m2,       m1,    14
     movu       [r0 + 2111 * 16],   m4
     RET
+
+;-----------------------------------------------------------------------------
+; void all_angs_pred_4x4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal all_angs_pred_4x4, 4, 4, 8
+; r0 = dest (written with mova, so presumably 16-byte aligned), r1 = refPix; filtPix and bLuma are never read, r2/r3 are reused as scratch.
+; Mode m is written as 16 bytes at [r0 + (m - 2) * 16]. In the comments, 0-9 and A-G name the refPix byte offsets 0-16.
+; mode 2: four overlapping 4-byte windows of refPix starting at byte A
+    movh        m6,             [r1 + 9]
+    mova        m2,             m6
+    psrldq      m2,             1
+    movd        [r0],           m2              ;byte[A, B, C, D]
+    psrldq      m2,             1
+    movd        [r0 + 4],       m2              ;byte[B, C, D, E]
+    psrldq      m2,             1
+    movd        [r0 + 8],       m2              ;byte[C, D, E, F]
+    psrldq      m2,             1
+    movd        [r0 + 12],      m2              ;byte[D, E, F, G]
+
+; mode 10/26: broadcast 4 source bytes to all 16 output bytes, then overwrite output bytes 0, 4, 8, 12 with filtered values
+
+    pxor        m7,             m7
+    pshufd      m5,             m6,        0
+    mova        [r0 + 128],     m5              ;mode 10 byte[9, A, B, C, 9, A, B, C, 9, A, B, C, 9, A, B, C]
+
+    movd        m4,             [r1 + 1]
+    pshufd      m4,             m4,        0
+    mova        [r0 + 384],     m4              ;mode 26 byte[1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4]
+
+    movd        m1,             [r1]
+    punpcklbw   m1,             m7
+    pshuflw     m1,             m1,     0x00
+    punpcklqdq  m1,             m1              ;m1 = word[0, 0, 0, 0, 0, 0, 0, 0]
+
+    punpckldq   m4,             m5
+    punpcklbw   m4,             m7              ;m4 = word[1, 2, 3, 4, 9, A, B, C]
+    pshuflw     m2,             m4,     0x00
+    pshufhw     m2,             m2,     0x00    ;m2 = word[1, 1, 1, 1, 9, 9, 9, 9]
+
+    psubw       m4,             m1
+    psraw       m4,             1               ;signed (x - byte 0) >> 1
+
+    pshufd      m2,             m2,     q1032   ;m2 = word[9, 9, 9, 9, 1, 1, 1, 1]
+    paddw       m4,             m2
+    packuswb    m4,             m4              ;saturate to bytes: low 4 bytes go to mode 10, next 4 to mode 26
+
+%if ARCH_X86_64
+    movq        r2,             m4
+
+    mov         [r0 + 128],     r2b              ;mode 10
+    shr         r2,             8
+    mov         [r0 + 132],     r2b
+    shr         r2,             8
+    mov         [r0 + 136],     r2b
+    shr         r2,             8
+    mov         [r0 + 140],     r2b
+    shr         r2,             8
+    mov         [r0 + 384],     r2b              ;mode 26
+    shr         r2d,            8
+    mov         [r0 + 388],     r2b
+    shr         r2d,            8
+    mov         [r0 + 392],     r2b
+    shr         r2d,            8
+    mov         [r0 + 396],     r2b
+
+%else
+    movd        r2d,             m4
+
+    mov         [r0 + 128],     r2b              ;mode 10
+    shr         r2d,             8
+    mov         [r0 + 132],     r2b
+    shr         r2d,             8
+    mov         [r0 + 136],     r2b
+    shr         r2d,             8
+    mov         [r0 + 140],     r2b
+
+    psrldq      m4,             4
+    movd        r2d,            m4
+
+    mov         [r0 + 384],     r2b              ;mode 26
+    shr         r2d,            8
+    mov         [r0 + 388],     r2b
+    shr         r2d,            8
+    mov         [r0 + 392],     r2b
+    shr         r2d,            8
+    mov         [r0 + 396],     r2b
+%endif
+
+; mode 3 (pw_ang_table entries 26, 20, 14, 8)
+
+    mova        m2,             [pw_16]                     ;added before every "psraw 5" below (presumably 8 x word 16 for rounding)
+    lea         r3,             [pw_ang_table + 7 * 16]     ;r3 = entry 7 of pw_ang_table (16 bytes per entry)
+    lea         r2,             [pw_ang_table + 23 * 16]    ;r2 = entry 23 of pw_ang_table
+    punpcklbw   m6,             m6
+    psrldq      m6,             1
+    movh        m1,             m6
+    psrldq      m6,             2
+    movh        m0,             m6
+    psrldq      m6,             2
+    movh        m3,             m6
+    psrldq      m6,             2
+    punpcklbw   m1,             m7              ;m1 = word[9, A, A, B, B, C, C, D]
+    punpcklbw   m0,             m7              ;m0 = word[A, B, B, C, C, D, D, E]
+    punpcklbw   m3,             m7              ;m3 = word[B, C, C, D, D, E, E, F]
+    punpcklbw   m6,             m7              ;m6 = word[C, D, D, E, E, F, F, G]
+
+    mova        m7,             [r2 - 3 * 16]   ;entry 20, kept in m7 until mode 8 consumes it
+
+    pmaddwd     m5,             m1,     [r2 + 3 * 16]
+    pmaddwd     m4,             m0,     m7
+
+    packssdw    m5,             m4
+    paddw       m5,             m2
+    psraw       m5,             5
+
+    pmaddwd     m4,             m3,     [r3 + 7 * 16]
+    pmaddwd     m6,             [r3 + 1 * 16]
+
+    packssdw    m4,             m6
+    paddw       m4,             m2
+    psraw       m4,             5
+
+    packuswb    m5,             m4
+    mova        [r0 + 16],      m5
+    movd        [r0 + 68],      m5              ;mode 6 row 1
+    psrldq      m5,             4
+    movd        [r0 + 76],      m5              ;mode 6 row 3
+
+; mode 4 (pw_ang_table entries 21, 10, 31, 20)
+
+    pmaddwd     m4,             m0,     [r2 + 8 * 16]
+    pmaddwd     m6,             m3,     m7
+
+    packssdw    m4,             m6
+    paddw       m4,             m2
+    psraw       m4,             5
+
+    pmaddwd     m5,             m1,     [r2 - 2 * 16]
+    pmaddwd     m6,             m0,     [r3 + 3 * 16]
+
+    packssdw    m5,             m6
+    paddw       m5,             m2
+    psraw       m5,             5
+
+    packuswb    m5,             m4
+    mova        [r0 + 32],      m5
+
+; mode 5 (pw_ang_table entries 17, 2, 19, 4)
+
+    pmaddwd     m5,             m1,     [r2 - 6 * 16]
+    pmaddwd     m6,             m0,     [r3 - 5 * 16]
+
+    packssdw    m5,             m6
+    paddw       m5,             m2
+    psraw       m5,             5
+
+    pmaddwd     m4,             m0,     [r2 - 4 * 16]
+    pmaddwd     m3,             [r3 - 3 * 16]
+
+    packssdw    m4,             m3
+    paddw       m4,             m2
+    psraw       m4,             5
+
+    packuswb    m5,             m4
+    mova        [r0 + 48],      m5
+
+; mode 6 (entries 13 and 7 for rows 0 and 2; rows 1 and 3 were already stored by mode 3)
+
+    pmaddwd     m5,             m1,     [r3 + 6 * 16]
+    pmaddwd     m6,             m0,     [r3 + 0 * 16]
+
+    packssdw    m5,             m6
+    paddw       m5,             m2
+    psraw       m5,             5
+
+    packuswb    m5,             m6              ;only the low 8 bytes (from m5) are stored below
+    movd        [r0 + 64],      m5
+    psrldq      m5,             4
+    movd        [r0 + 72],      m5
+
+; mode 7 (pw_ang_table entries 9, 18, 27, 4)
+
+    pmaddwd     m5,             m1,     [r3 + 2 * 16]
+    pmaddwd     m6,             m1,     [r2 - 5 * 16]
+
+    packssdw    m5,             m6
+    paddw       m5,             m2
+    psraw       m5,             5
+
+    mova        m3,             [r2 + 4 * 16]   ;entry 27, kept in m3 for mode 12
+    pmaddwd     m4,             m1,     m3
+    pmaddwd     m0,             [r3 - 3 * 16]
+
+    packssdw    m4,             m0
+    paddw       m4,             m2
+    psraw       m4,             5
+
+    packuswb    m5,             m4
+    mova        [r0 + 80],      m5
+
+; mode 8 (pw_ang_table entries 5, 10, 15, 20)
+
+    mova        m0,             [r3 - 2 * 16]   ;entry 5, kept in m0 for mode 13
+    pmaddwd     m5,             m1,     m0
+    pmaddwd     m6,             m1,     [r3 + 3 * 16]
+
+    packssdw    m5,             m6
+    paddw       m5,             m2
+    psraw       m5,             5
+
+    pmaddwd     m4,             m1,     [r3 + 8 * 16]
+    pmaddwd     m7,             m1              ;m7 held entry 20 since mode 3
+
+    packssdw    m4,             m7
+    paddw       m4,             m2
+    psraw       m4,             5
+
+    packuswb    m5,             m4
+    mova        [r0 + 96],      m5
+
+; mode 9 (pw_ang_table entries 2, 4, 6, 8)
+
+    pmaddwd     m5,             m1,     [r3 - 5 * 16]
+    pmaddwd     m6,             m1,     [r3 - 3 * 16]
+
+    packssdw    m5,             m6
+    paddw       m5,             m2
+    psraw       m5,             5
+
+    pmaddwd     m4,             m1,     [r3 - 1 * 16]
+    pmaddwd     m6,             m1,     [r3 + 1 * 16]
+
+    packssdw    m4,             m6
+    paddw       m4,             m2
+    psraw       m4,             5
+
+    packuswb    m5,             m4
+    mova        [r0 + 112],     m5
+
+; mode 11 (pw_ang_table entries 30, 28, 26, 24)
+
+    movd        m5,             [r1]
+    punpcklwd   m5,             m1
+    pand        m5,             [pb_00000000000000000000000000FF00FF]
+    pslldq      m1,             4
+    por         m1,             m5              ;m1 = word[0, 9, 9, A, A, B, B, C]
+
+    pmaddwd     m5,             m1,     [r2 + 7 * 16]
+    pmaddwd     m6,             m1,     [r2 + 5 * 16]
+
+    packssdw    m5,             m6
+    paddw       m5,             m2
+    psraw       m5,             5
+
+    pmaddwd     m4,             m1,     [r2 + 3 * 16]
+    pmaddwd     m6,             m1,     [r2 + 1 * 16]
+
+    packssdw    m4,             m6
+    paddw       m4,             m2
+    psraw       m4,             5
+
+    packuswb    m5,             m4
+    mova        [r0 + 144],     m5
+
+; mode 12 (pw_ang_table entries 27, 22, 17, 12)
+
+    pmaddwd     m3,             m1              ;m3 held entry 27 since mode 7
+    pmaddwd     m6,             m1,     [r2 - 1 * 16]
+
+    packssdw    m3,             m6
+    paddw       m3,             m2
+    psraw       m3,             5
+
+    pmaddwd     m4,             m1,     [r2 - 6 * 16]
+    pmaddwd     m6,             m1,     [r3 + 5 * 16]
+
+    packssdw    m4,             m6
+    paddw       m4,             m2
+    psraw       m4,             5
+
+    packuswb    m3,             m4
+    mova        [r0 + 160],     m3
+
+; mode 13 (pw_ang_table entries 23, 14, 5, 28)
+
+    mova        m3,             m1
+    movd        m7,             [r1 + 4]
+    punpcklwd   m7,             m1
+    pand        m7,             [pb_00000000000000000000000000FF00FF]   ;m7 = word[4, 0, zeros...], reused by mode 15
+    pslldq      m3,             4
+    por         m3,             m7              ;m3 = word[4, 0, 0, 9, 9, A, A, B]
+
+    pmaddwd     m5,             m1,     [r2 + 0 * 16]
+    pmaddwd     m6,             m1,     [r3 + 7 * 16]
+
+    packssdw    m5,             m6
+    paddw       m5,             m2
+    psraw       m5,             5
+
+    pmaddwd     m4,             m1,     m0      ;m0 held entry 5 since mode 8
+    pmaddwd     m6,             m3,     [r2 + 5 * 16]
+
+    packssdw    m4,             m6
+    paddw       m4,             m2
+    psraw       m4,             5
+
+    packuswb    m5,             m4
+    mova        [r0 + 176],     m5
+
+; mode 14 (pw_ang_table entries 19, 6, 25, 12)
+
+    pmaddwd     m5,             m1,     [r2 - 4 * 16]
+    pmaddwd     m6,             m1,     [r3 - 1 * 16]
+
+    packssdw    m5,             m6
+    paddw       m5,             m2
+    psraw       m5,             5
+
+    movd        m6,             [r1 + 2]
+    pand        m3,             [pw_FFFFFFFFFFFFFFFFFFFFFFFFFFFF0000]   ;clear word 0 so it can be replaced
+    pand        m6,             [pb_000000000000000000000000000000FF]
+    por         m3,             m6              ;m3 = word[2, 0, 0, 9, 9, A, A, B]
+
+    pmaddwd     m4,             m3,     [r2 + 2 * 16]
+    pmaddwd     m6,             m3,     [r3 + 5 * 16]
+
+    packssdw    m4,             m6
+    paddw       m4,             m2
+    psraw       m4,             5
+
+    packuswb    m5,             m4
+    mova        [r0 + 192],     m5
+    psrldq      m5,             4
+    movd        [r0 + 240],     m5              ;mode 17 row 0
+
+; mode 15 (pw_ang_table entries 15, 30, 13, 28)
+
+    pmaddwd     m5,             m1,     [r3 + 8 * 16]
+    pmaddwd     m6,             m3,     [r2 + 7 * 16]
+
+    packssdw    m5,             m6
+    paddw       m5,             m2
+    psraw       m5,             5
+
+    pmaddwd     m6,             m3,     [r3 + 6 * 16]
+
+    mova        m0,             m3
+    punpcklwd   m7,             m3              ;m7 still holds word[4, 0, zeros...] from mode 13
+    pslldq      m0,             4
+    pand        m7,             [pb_00000000000000000000000000FF00FF]   ;m7 = word[4, 2, zeros...], reused by mode 17
+    por         m0,             m7              ;m0 = word[4, 2, 2, 0, 0, 9, 9, A]
+
+    pmaddwd     m4,             m0,     [r2 + 5 * 16]
+
+    packssdw    m6,             m4
+    paddw       m6,             m2
+    psraw       m6,             5
+
+    packuswb    m5,             m6
+    mova        [r0 + 208],     m5
+
+; mode 16 (pw_ang_table entries 11, 22, 1, 12)
+
+    pmaddwd     m5,             m1,     [r3 + 4 * 16]
+    pmaddwd     m6,             m3,     [r2 - 1 * 16]
+
+    packssdw    m5,             m6
+    paddw       m5,             m2
+    psraw       m5,             5
+
+    pmaddwd     m3,             [r3 - 6 * 16]
+
+    movd        m6,             [r1 + 3]
+    pand        m0,             [pw_FFFFFFFFFFFFFFFFFFFFFFFFFFFF0000]
+    pand        m6,             [pb_000000000000000000000000000000FF]
+    por         m0,             m6              ;m0 = word[3, 2, 2, 0, 0, 9, 9, A]
+
+    pmaddwd     m0,             [r3 + 5 * 16]
+    packssdw    m3,             m0
+    paddw       m3,             m2
+    psraw       m3,             5
+
+    packuswb    m5,             m3
+    mova        [r0 + 224],     m5
+
+; mode 17 (entries 12, 18, 24 for rows 1-3; row 0 was already stored by mode 14)
+
+    movd        m4,             [r1 + 1]
+    punpcklwd   m4,             m1
+    pand        m4,             [pb_00000000000000000000000000FF00FF]
+    pslldq      m1,             4
+    por         m1,             m4              ;m1 = word[1, 0, 0, 9, 9, A, A, B]
+
+    pmaddwd     m6,             m1,     [r3 + 5 * 16]
+
+    packssdw    m6,             m6
+    paddw       m6,             m2
+    psraw       m6,             5
+
+    movd        m5,             [r1 + 2]
+    punpcklwd   m5,             m1
+    pand        m5,             [pb_00000000000000000000000000FF00FF]
+    pslldq      m1,             4
+    por         m1,             m5              ;m1 = word[2, 1, 1, 0, 0, 9, 9, A]
+
+    pmaddwd     m4,             m1,     [r2 - 5 * 16]
+
+    punpcklwd   m7,             m1              ;m7 still holds word[4, 2, zeros...] from mode 15
+    pand        m7,             [pb_00000000000000000000000000FF00FF]
+    pslldq      m1,             4
+    por         m1,             m7              ;m1 = word[4, 2, 2, 1, 1, 0, 0, 9]
+
+    pmaddwd     m1,             [r2 + 1 * 16]
+    packssdw    m4,             m1
+    paddw       m4,             m2
+    psraw       m4,             5
+
+    packuswb    m6,             m4              ;m6 = byte[row 1, row 1, row 2, row 3]
+    movd        [r0 + 244],     m6
+    psrldq      m6,             8
+    movh        [r0 + 248],     m6
+
+; mode 18: plain byte copies, no interpolation
+
+    movh        m1,             [r1]
+    movd        [r0 + 256],     m1              ;byte[0, 1, 2, 3]
+
+    movh        m3,             [r1 + 2]
+    punpcklqdq  m3,             m1
+    psrldq      m3,             7
+    movd        [r0 + 260],     m3              ;byte[9, 0, 1, 2]
+
+    movh        m4,             [r1 + 3]
+    punpcklqdq  m4,             m3
+    psrldq      m4,             7
+    movd        [r0 + 264],     m4              ;byte[A, 9, 0, 1]
+
+    movh        m0,             [r1 + 4]
+    punpcklqdq  m0,             m4
+    psrldq      m0,             7               ;m0 = byte[B, A, 9, 0, ...], reused by mode 20
+    movd        [r0 + 268],     m0              ;byte[B, A, 9, 0]
+
+; mode 19 (pw_ang_table entries 6, 12, 18, 24)
+
+    pxor        m7,             m7
+    punpcklbw   m4,             m3
+    punpcklbw   m3,             m1
+    punpcklbw   m1,             m1
+    punpcklbw   m4,             m7              ;m4 = word[A, 9, 9, 0, 0, 1, 1, 2]
+    punpcklbw   m3,             m7              ;m3 = word[9, 0, 0, 1, 1, 2, 2, 3]
+    psrldq      m1,             1
+    punpcklbw   m1,             m7              ;m1 = word[0, 1, 1, 2, 2, 3, 3, 4]
+
+    pmaddwd     m6,             m1,     [r3 - 1 * 16]
+    pmaddwd     m7,             m3,     [r3 + 5 * 16]
+
+    packssdw    m6,             m7
+    paddw       m6,             m2
+    psraw       m6,             5
+
+    pmaddwd     m5,             m4,     [r2 - 5 * 16]
+
+    movd        m7,             [r1 + 12]
+    punpcklwd   m7,             m4
+    pand        m7,             [pb_00000000000000000000000000FF00FF]   ;m7 = word[C, A, zeros...], reused by modes 21 and 23
+    pslldq      m4,             4
+    por         m4,             m7              ;m4 = word[C, A, A, 9, 9, 0, 0, 1]
+
+    pmaddwd     m4,             [r2 + 1 * 16]
+    packssdw    m5,             m4
+    paddw       m5,             m2
+    psraw       m5,             5
+
+    packuswb    m6,             m5
+    mova        [r0 + 272],     m6
+    movd        [r0 + 324],     m6              ;mode 22 row 1
+
+; mode 20 (pw_ang_table entries 11, 22, 1, 12)
+
+    pmaddwd     m5,             m1,     [r3 + 4 * 16]
+
+    movd        m4,             [r1 + 10]
+    pand        m3,             [pw_FFFFFFFFFFFFFFFFFFFFFFFFFFFF0000]
+    pand        m4,             [pb_000000000000000000000000000000FF]
+    por         m3,             m4              ;m3 = word[A, 0, 0, 1, 1, 2, 2, 3]
+
+    pmaddwd     m6,             m3,     [r2 - 1 * 16]
+
+    packssdw    m5,             m6
+    paddw       m5,             m2
+    psraw       m5,             5
+
+    pmaddwd     m4,             m3,     [r3 - 6 * 16]
+
+    punpcklwd   m0,             m3              ;m0 still holds byte[B, A, 9, 0, ...] from mode 18
+    pand        m0,             [pb_00000000000000000000000000FF00FF]
+    mova        m6,             m3
+    pslldq      m6,             4
+    por         m0,             m6              ;m0 = word[B, A, A, 0, 0, 1, 1, 2]
+
+    pmaddwd     m6,             m0,     [r3 + 5 * 16]
+
+    packssdw    m4,             m6
+    paddw       m4,             m2
+    psraw       m4,             5
+
+    packuswb    m5,             m4
+    mova        [r0 + 288],     m5
+
+; mode 21 (pw_ang_table entries 15, 30, 13, 28)
+
+    pmaddwd     m4,             m1,     [r3 + 8 * 16]
+    pmaddwd     m6,             m3,     [r2 + 7 * 16]
+
+    packssdw    m4,             m6
+    paddw       m4,             m2
+    psraw       m4,             5
+
+    pmaddwd     m5,             m3,     [r3 + 6 * 16]
+
+    pand        m0,             [pw_FFFFFFFFFFFFFFFFFFFFFFFFFFFF0000]
+    pand        m7,             [pb_000000000000000000000000000000FF]   ;m7 held word[C, A, ...] from mode 19; now only byte C remains
+    por         m0,             m7              ;m0 = word[C, A, A, 0, 0, 1, 1, 2]
+
+    pmaddwd     m0,             [r2 + 5 * 16]
+    packssdw    m5,             m0
+    paddw       m5,             m2
+    psraw       m5,             5
+
+    packuswb    m4,             m5
+    mova        [r0 + 304],     m4
+
+; mode 22 (entries 19, 25, 12 for rows 0, 2, 3; row 1 was already stored by mode 19)
+
+    pmaddwd     m4,             m1,     [r2 - 4 * 16]
+    packssdw    m4,             m4
+    paddw       m4,             m2
+    psraw       m4,             5
+
+    mova        m0,             [r3 + 5 * 16]   ;entry 12, kept in m0 for mode 24
+    pmaddwd     m5,             m3,     [r2 + 2 * 16]
+    pmaddwd     m6,             m3,     m0
+
+    packssdw    m5,             m6
+    paddw       m5,             m2
+    psraw       m5,             5
+
+    packuswb    m4,             m5              ;m4 = byte[row 0, row 0, row 2, row 3]
+    movd        [r0 + 320],     m4
+    psrldq      m4,             8
+    movh        [r0 + 328],     m4
+
+; mode 23 (pw_ang_table entries 23, 14, 5, 28)
+
+    pmaddwd     m4,             m1,     [r2 + 0 * 16]
+    pmaddwd     m5,             m1,     [r3 + 7 * 16]
+
+    packssdw    m4,             m5
+    paddw       m4,             m2
+    psraw       m4,             5
+
+    pmaddwd     m6,             m1,     [r3 - 2 * 16]
+
+    pand        m3,             [pw_FFFFFFFFFFFFFFFFFFFFFFFFFFFF0000]
+    por         m3,             m7              ;m3 = word[C, 0, 0, 1, 1, 2, 2, 3]
+
+    pmaddwd     m3,             [r2 + 5 * 16]
+    packssdw    m6,             m3
+    paddw       m6,             m2
+    psraw       m6,             5
+
+    packuswb    m4,             m6
+    mova        [r0 + 336],     m4
+
+; mode 24 (pw_ang_table entries 27, 22, 17, 12)
+
+    pmaddwd     m4,             m1,     [r2 + 4 * 16]
+    pmaddwd     m5,             m1,     [r2 - 1 * 16]
+
+    packssdw    m4,             m5
+    paddw       m4,             m2
+    psraw       m4,             5
+
+    pmaddwd     m6,             m1,     [r2 - 6 * 16]
+    pmaddwd     m0,             m1              ;m0 held entry 12 since mode 22
+
+    packssdw    m6,             m0
+    paddw       m6,             m2
+    psraw       m6,             5
+
+    packuswb    m4,             m6
+    mova        [r0 + 352],     m4
+
+; mode 25 (pw_ang_table entries 30, 28, 26, 24)
+
+    pmaddwd     m4,             m1,     [r2 + 7 * 16]
+    pmaddwd     m5,             m1,     [r2 + 5 * 16]
+
+    packssdw    m4,             m5
+    paddw       m4,             m2
+    psraw       m4,             5
+
+    pmaddwd     m6,             m1,     [r2 + 3 * 16]
+    pmaddwd     m1,             [r2 + 1 * 16]
+
+    packssdw    m6,             m1
+    paddw       m6,             m2
+    psraw       m6,             5
+
+    packuswb    m4,             m6
+    mova        [r0 + 368],     m4
+
+; mode 27 (pw_ang_table entries 2, 4, 6, 8)
+
+    movh        m0,             [r1 + 1]
+    pxor        m7,             m7
+    punpcklbw   m0,             m0
+    psrldq      m0,             1
+    movh        m1,             m0
+    psrldq      m0,             2
+    movh        m3,             m0
+    psrldq      m0,             2
+    punpcklbw   m1,             m7              ;m1 = word[1, 2, 2, 3, 3, 4, 4, 5]
+    punpcklbw   m3,             m7              ;m3 = word[2, 3, 3, 4, 4, 5, 5, 6]
+    punpcklbw   m0,             m7              ;m0 = word[3, 4, 4, 5, 5, 6, 6, 7]
+
+    mova        m7,             [r3 - 3 * 16]   ;entry 4, kept in m7 for modes 29 and 31
+
+    pmaddwd     m4,             m1,     [r3 - 5 * 16]
+    pmaddwd     m5,             m1,     m7
+
+    packssdw    m4,             m5
+    paddw       m4,             m2
+    psraw       m4,             5
+
+    pmaddwd     m6,             m1,     [r3 - 1 * 16]
+    pmaddwd     m5,             m1,     [r3 + 1 * 16]
+
+    packssdw    m6,             m5
+    paddw       m6,             m2
+    psraw       m6,             5
+
+    packuswb    m4,             m6
+    mova        [r0 + 400],     m4
+
+; mode 28 (pw_ang_table entries 5, 10, 15, 20)
+
+    pmaddwd     m4,             m1,     [r3 - 2 * 16]
+    pmaddwd     m5,             m1,     [r3 + 3 * 16]
+
+    packssdw    m4,             m5
+    paddw       m4,             m2
+    psraw       m4,             5
+
+    pmaddwd     m6,             m1,     [r3 + 8 * 16]
+    pmaddwd     m5,             m1,     [r2 - 3 * 16]
+
+    packssdw    m6,             m5
+    paddw       m6,             m2
+    psraw       m6,             5
+
+    packuswb    m4,             m6
+    mova        [r0 + 416],     m4
+
+; mode 29 (pw_ang_table entries 9, 18, 27, 4)
+
+    pmaddwd     m4,             m1,     [r3 + 2 * 16]
+    pmaddwd     m6,             m1,     [r2 - 5 * 16]
+
+    packssdw    m4,             m6
+    paddw       m4,             m2
+    psraw       m4,             5
+
+    pmaddwd     m6,             m1,     [r2 + 4 * 16]
+    pmaddwd     m5,             m3,     m7
+
+    packssdw    m6,             m5
+    paddw       m6,             m2
+    psraw       m6,             5
+
+    packuswb    m4,             m6
+    mova        [r0 + 432],     m4
+
+; mode 30 (pw_ang_table entries 13, 26, 7, 20)
+
+    pmaddwd     m4,             m1,     [r3 + 6 * 16]
+    pmaddwd     m5,             m1,     [r2 + 3 * 16]
+
+    packssdw    m4,             m5
+    paddw       m4,             m2
+    psraw       m4,             5
+
+    pmaddwd     m6,             m3,     [r3 + 0 * 16]
+    pmaddwd     m5,             m3,     [r2 - 3 * 16]
+
+    packssdw    m6,             m5
+    paddw       m6,             m2
+    psraw       m6,             5
+
+    packuswb    m4,             m6
+    mova        [r0 + 448],     m4
+    psrldq      m4,             4
+    movh        [r0 + 496],     m4              ;mode 33 row 0 (the 8-byte store also fills row 1, which is overwritten next)
+    psrldq      m4,             8
+    movd        [r0 + 500],     m4              ;mode 33 row 1
+
+; mode 31 (pw_ang_table entries 17, 2, 19, 4)
+
+    pmaddwd     m4,             m1,     [r2 - 6 * 16]
+    pmaddwd     m5,             m3,     [r3 - 5 * 16]
+
+    packssdw    m4,             m5
+    paddw       m4,             m2
+    psraw       m4,             5
+
+    pmaddwd     m6,             m3,     [r2 - 4 * 16]
+    pmaddwd     m7,             m0              ;m7 held entry 4 since mode 27
+
+    packssdw    m6,             m7
+    paddw       m6,             m2
+    psraw       m6,             5
+
+    packuswb    m4,             m6
+    mova        [r0 + 464],     m4
+
+; mode 32 (pw_ang_table entries 21, 10, 31, 20)
+
+    pmaddwd     m1,             [r2 - 2 * 16]
+    pmaddwd     m5,             m3,     [r3 + 3 * 16]
+
+    packssdw    m1,             m5
+    paddw       m1,             m2
+    psraw       m1,             5
+
+    pmaddwd     m3,             [r2 + 8 * 16]
+    pmaddwd     m5,             m0,     [r2 - 3 * 16]
+    packssdw    m3,             m5
+    paddw       m3,             m2
+    psraw       m3,             5
+
+    packuswb    m1,             m3
+    mova        [r0 + 480],     m1
+
+; mode 33 (entries 14 and 8 for rows 2-3; rows 0-1 were already stored by mode 30)
+
+    pmaddwd     m0,             [r3 + 7 * 16]
+    pxor        m7,             m7
+    movh        m4,             [r1 + 4]
+    punpcklbw   m4,             m4
+    psrldq      m4,             1
+    punpcklbw   m4,             m7              ;m4 = word[4, 5, 5, 6, 6, 7, 7, 8]
+
+    pmaddwd     m4,             [r3 + 1 * 16]
+
+    packssdw    m0,             m4
+    paddw       m0,             m2
+    psraw       m0,             5
+
+    packuswb    m0,             m0
+    movh        [r0 + 504],     m0
+
+; mode 34: plain byte copies, no interpolation
+
+    movh        m7,             [r1 + 2]
+    movd        [r0 + 512],     m7              ;byte[2, 3, 4, 5]
+
+    psrldq      m7,             1
+    movd        [r0 + 516],     m7              ;byte[3, 4, 5, 6]
+
+    psrldq      m7,             1
+    movd        [r0 + 520],     m7              ;byte[4, 5, 6, 7]
+
+    psrldq      m7,             1
+    movd        [r0 + 524],     m7              ;byte[5, 6, 7, 8]
+
+RET


More information about the x265-devel mailing list