[x265] [PATCH] asm: intra pred all_angs_pred_4x4 sse2

chen chenm003 at 163.com
Sat Apr 11 09:34:15 CEST 2015


Adding comments to the data would improve readability.
Some suggestions are inline below.

At 2015-04-11 09:58:38,dtyx265 at gmail.com wrote:
># HG changeset patch
># User David T Yuen <dtyx265 at gmail.com>
># Date 1428717487 25200
># Node ID c40653978caea4a4bf8940ae3b0e8db74bbe07d7
># Parent  ee76a15fa312ac59549965821d9cbff03237226f
>asm: intra pred all_angs_pred_4x4 sse2
>
>This replaces c code and is backported from sse4
>The processing of modes 10 and 26 were merged and moved to after mode 2
>
>64-bit
>
>./test/TestBench --testbench intrapred | grep intra_allangs4x4
>intra_allangs4x4	9.99x 	 6449.98  	 64435.56
>
>32-bit
>
>./test/TestBench --testbench intrapred | grep intra_allangs4x4
>intra_allangs4x4	13.31x 	 6512.49  	 86709.86
>
>diff -r ee76a15fa312 -r c40653978cae source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp	Fri Apr 10 10:24:55 2015 -0500
>+++ b/source/common/x86/asm-primitives.cpp	Fri Apr 10 18:58:07 2015 -0700
>@@ -1259,6 +1259,8 @@
>         p.cu[BLOCK_4x4].intra_pred[32] = x265_intra_pred_ang4_4_sse2;
>         p.cu[BLOCK_4x4].intra_pred[33] = x265_intra_pred_ang4_3_sse2;
> 
>+        p.cu[BLOCK_4x4].intra_pred_allangs = x265_all_angs_pred_4x4_sse2;
>+
>         p.cu[BLOCK_4x4].calcresidual = x265_getResidual4_sse2;
>         p.cu[BLOCK_8x8].calcresidual = x265_getResidual8_sse2;
> 
>diff -r ee76a15fa312 -r c40653978cae source/common/x86/const-a.asm
>--- a/source/common/x86/const-a.asm	Fri Apr 10 10:24:55 2015 -0500
>+++ b/source/common/x86/const-a.asm	Fri Apr 10 18:58:07 2015 -0700
>@@ -53,6 +53,10 @@
> const pb_shuf8x8c,          times  1 db   0,   0,   0,   0,   2,   2,   2,   2,   4,   4,   4,   4,   6,   6,   6,   6
> const pb_movemask,          times 16 db 0x00
>                             times 16 db 0xFF
>+const pb_0000000000000F0F,  times 2  db 0xff, 0x00

The name of the constant above is wrong.

>+                            times 14 db 0x00
>+const pb_000000000000000F,           db 0xff
>+                            times 15 db 0x00
> 
> ;; 16-bit constants
> 
>@@ -94,6 +98,8 @@
> const multiH2,              times  1 dw  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  32
> const pw_planar16_mul,      times  1 dw  15,  14,  13,  12,  11,  10,   9,   8,   7,   6,   5,   4,   3,   2,   1,   0
> const pw_planar32_mul,      times  1 dw  31,  30,  29,  28,  27,  26,  25,  24,  23,  22,  21,  20,  19,  18,  17,  16
>+const pw_FFFFFFF0,                   dw 0x00
>+                            times 7  dw 0xff
> 
> 
> ;; 32-bit constants
>diff -r ee76a15fa312 -r c40653978cae source/common/x86/intrapred.h
>--- a/source/common/x86/intrapred.h	Fri Apr 10 10:24:55 2015 -0500
>+++ b/source/common/x86/intrapred.h	Fri Apr 10 18:58:07 2015 -0700
>@@ -275,6 +275,7 @@
> void x265_intra_pred_ang32_23_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
> void x265_intra_pred_ang32_22_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
> void x265_intra_pred_ang32_21_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
>+void x265_all_angs_pred_4x4_sse2(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
> void x265_all_angs_pred_4x4_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
> void x265_all_angs_pred_8x8_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
> void x265_all_angs_pred_16x16_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
>diff -r ee76a15fa312 -r c40653978cae source/common/x86/intrapred8_allangs.asm
>--- a/source/common/x86/intrapred8_allangs.asm	Fri Apr 10 10:24:55 2015 -0500
>+++ b/source/common/x86/intrapred8_allangs.asm	Fri Apr 10 18:58:07 2015 -0700
>@@ -34,10 +34,17 @@
> 
> ; common constant with intrapred8.asm
> cextern ang_table
>+cextern pw_ang_table
> cextern tab_S1
> cextern tab_S2
> cextern tab_Si
> 
>+; constants from const-a.asm
>+cextern pw_16
>+cextern pb_000000000000000F
>+cextern pb_0000000000000F0F
>+cextern pw_FFFFFFF0
>+
> 
> ;-----------------------------------------------------------------------------
> ; void all_angs_pred_4x4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
>@@ -23006,3 +23013,780 @@
>     palignr    m4,              m2,       m1,    14
>     movu       [r0 + 2111 * 16],   m4
>     RET
>+
>+;-----------------------------------------------------------------------------
>+; void all_angs_pred_4x4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
>+;-----------------------------------------------------------------------------
>+INIT_XMM sse2
>+cglobal all_angs_pred_4x4, 4, 4, 8
>+
>+; mode 2
>+
>+    movh        m6,             [r1 + 9]
>+    movh        m2,             m6
Prefer mova here: a register-to-register mova is handled by register renaming.

>+    psrldq      m2,             1
>+    movd        [r0],           m2
>+    psrldq      m2,             1
>+    movd        [r0 + 4],       m2
>+    psrldq      m2,             1
>+    movd        [r0 + 8],       m2
>+    psrldq      m2,             1
>+    movd        [r0 + 12],      m2
>+
>+; mode 10/26
>+
>+    pxor        m7,             m7
>+    pshufd      m5,             m6,        0
>+    movu        [r0 + 128],     m5              ;mode 10
>+
>+    movd        m4,             [r1 + 1]
>+    pshufd      m4,             m4,        0
>+    movu        [r0 + 384],     m4              ;mode 26
>+
>+    movd        m1,             [r1]
>+    punpcklbw   m1,             m7
>+    pshuflw     m1,             m1,     0x00
>+    punpcklqdq  m1,             m1
>+
>+    punpckldq   m4,             m5
>+    punpcklbw   m4,             m7
>+    pshuflw     m2,             m4,     0x00
>+    pshufhw     m2,             m2,     0x00
>+
>+    psubw       m4,             m1
>+    psraw       m4,             1
>+
>+    pshufd      m2,             m2,     q1032
>+    paddw       m4,             m2
>+    packuswb    m4,             m4
>+
>+%if ARCH_X86_64
>+    movq        r2,             m4
>+
>+    mov         [r0 + 128],     r2b              ;mode 10
>+    shr         r2,             8
>+    mov         [r0 + 132],     r2b
>+    shr         r2,             8
>+    mov         [r0 + 136],     r2b
>+    shr         r2,             8
>+    mov         [r0 + 140],     r2b
>+    shr         r2,             8
>+    mov         [r0 + 384],     r2b              ;mode 26
>+    shr         r2d,            8
>+    mov         [r0 + 388],     r2b
>+    shr         r2d,            8
>+    mov         [r0 + 392],     r2b
>+    shr         r2d,            8
>+    mov         [r0 + 396],     r2b
>+
>+%else
>+    movd        r2d,             m4
>+
>+    mov         [r0 + 128],     r2b              ;mode 10
>+    shr         r2d,             8
>+    mov         [r0 + 132],     r2b
>+    shr         r2d,             8
>+    mov         [r0 + 136],     r2b
>+    shr         r2d,             8
>+    mov         [r0 + 140],     r2b
>+
>+    psrldq      m4,             4
>+    movd        r2d,            m4
>+
>+    mov         [r0 + 384],     r2b              ;mode 26
>+    shr         r2d,            8
>+    mov         [r0 + 388],     r2b
>+    shr         r2d,            8
>+    mov         [r0 + 392],     r2b
>+    shr         r2d,            8
>+    mov         [r0 + 396],     r2b
>+%endif
>+
>+; mode 3
>+
>+    mova        m2,             [pw_16]
>+    lea         r3,             [pw_ang_table]
>+
>+    punpcklbw   m6,             m6
>+    psrldq      m6,             1
>+    movh        m1,             m6
If we keep MOVH here, we can avoid the memory operand in modes 11, 13, 15, 17, etc.

>+    psrldq      m6,             2
>+    movh        m0,             m6
>+    psrldq      m6,             2
>+    movh        m3,             m6
>+    psrldq      m6,             2
>+    punpcklbw   m1,             m7
>+    punpcklbw   m0,             m7
>+    punpcklbw   m3,             m7
>+    punpcklbw   m6,             m7
>+
>+    mova        m7,             [r3 + 20 * 16]
An offset above 127 is encoded with a 4-byte displacement instead of a 1-byte one.


>+    pmaddwd     m5,             m1,     [r3 + 26 * 16]
>+    pmaddwd     m4,             m0,     m7
>+
>+    packssdw    m5,             m4
>+    paddw       m5,             m2
>+    psraw       m5,             5
>+
>+    pmaddwd     m4,             m3,     [r3 + 14 * 16]
>+    pmaddwd     m6,             [r3 + 8 * 16]
>+
>+    packssdw    m4,             m6
>+    paddw       m4,             m2
>+    psraw       m4,             5
>+
>+    packuswb    m5,             m4
>+    mova        [r0 + 16],      m5
>+    movd        [r0 + 68],      m5                      ;mode 6 row 1
>+    psrldq      m5,             4
>+    movd        [r0 + 76],      m5                      ;mode 6 row 3
>+
>+; mode 4
>+
>+    pmaddwd     m4,             m0,     [r3 + 31 * 16]
>+    pmaddwd     m6,             m3,     m7
>+
>+    packssdw    m4,             m6
>+    paddw       m4,             m2
>+    psraw       m4,             5
>+
>+    pmaddwd     m5,             m1,     [r3 + 21 * 16]
>+    pmaddwd     m6,             m0,     [r3 + 10 * 16]
>+
>+    packssdw    m5,             m6
>+    paddw       m5,             m2
>+    psraw       m5,             5
>+
>+    packuswb    m5,             m4
>+    mova        [r0 + 32],      m5
>+
>+; mode 5
>+
>+    pmaddwd     m5,             m1,     [r3 + 17 * 16]
>+    pmaddwd     m6,             m0,     [r3 + 2 * 16]
>+
>+    packssdw    m5,             m6
>+    paddw       m5,             m2
>+    psraw       m5,             5
>+
>+    pmaddwd     m4,             m0,     [r3 + 19 * 16]
>+    pmaddwd     m3,             [r3 + 4 * 16]
>+
>+    packssdw    m4,             m3
>+    paddw       m4,             m2
>+    psraw       m4,             5
>+
>+    packuswb    m5,             m4
>+    mova        [r0 + 48],      m5
>+
>+; mode 6
>+
>+    pmaddwd     m5,             m1,     [r3 + 13 * 16]
>+    pmaddwd     m6,             m0,     [r3 + 7 * 16]
>+
>+    packssdw    m5,             m6
>+    paddw       m5,             m2
>+    psraw       m5,             5
>+
>+    packuswb    m5,             m6
>+    movd        [r0 + 64],      m5
>+    psrldq      m5,             4
>+    movd        [r0 + 72],      m5
>+
>+; mode 7
>+
>+    pmaddwd     m5,             m1,     [r3 + 9 * 16]
>+    pmaddwd     m6,             m1,     [r3 + 18 * 16]
>+
>+    packssdw    m5,             m6
>+    paddw       m5,             m2
>+    psraw       m5,             5
>+
>+    mova        m3,             [r3 + 27 * 16]
>+    pmaddwd     m4,             m1,     m3
>+    pmaddwd     m0,             [r3 + 4 * 16]
>+
>+    packssdw    m4,             m0
>+    paddw       m4,             m2
>+    psraw       m4,             5
>+
>+    packuswb    m5,             m4
>+    mova        [r0 + 80],      m5
>+
>+; mode 8
>+
>+    mova        m0,             [r3 + 5 * 16]
>+    pmaddwd     m5,             m1,     m0
>+    pmaddwd     m6,             m1,     [r3 + 10 * 16]
>+
>+    packssdw    m5,             m6
>+    paddw       m5,             m2
>+    psraw       m5,             5
>+
>+    pmaddwd     m4,             m1,     [r3 + 15 * 16]
>+    pmaddwd     m7,             m1
>+
>+    packssdw    m4,             m7
>+    paddw       m4,             m2
>+    psraw       m4,             5
>+
>+    packuswb    m5,             m4
>+    mova        [r0 + 96],      m5
>+
>+; mode 9
>+
>+    pmaddwd     m5,             m1,     [r3 + 2 * 16]
>+    pmaddwd     m6,             m1,     [r3 + 4 * 16]
>+
>+    packssdw    m5,             m6
>+    paddw       m5,             m2
>+    psraw       m5,             5
>+
>+    pmaddwd     m4,             m1,     [r3 + 6 * 16]
>+    pmaddwd     m6,             m1,     [r3 + 8 * 16]
>+
>+    packssdw    m4,             m6
>+    paddw       m4,             m2
>+    psraw       m4,             5
>+
>+    packuswb    m5,             m4
>+    mova        [r0 + 112],     m5
>+
>+; mode 11
>+
>+    movd        m5,             [r1]
>+    punpcklwd   m5,             m1
>+    pand        m5,             [pb_0000000000000F0F]
You only want the lowest 2 words here, and the high qwords of both m1 and m5 are zero, so this can be replaced by PSHUFD.




>+    pslldq      m1,             4
>+    por         m1,             m5
>+
>+    pmaddwd     m5,             m1,     [r3 + 30 * 16]
>+    pmaddwd     m6,             m1,     [r3 + 28 * 16]
>+
>+    packssdw    m5,             m6
>+    paddw       m5,             m2
>+    psraw       m5,             5
>+
>+    pmaddwd     m4,             m1,     [r3 + 26 * 16]
>+    pmaddwd     m6,             m1,     [r3 + 24 * 16]
>+
>+    packssdw    m4,             m6
>+    paddw       m4,             m2
>+    psraw       m4,             5
>+
>+    packuswb    m5,             m4
>+    mova        [r0 + 144],     m5
>+
>+; mode 12
>+
>+    pmaddwd     m3,             m1
>+    pmaddwd     m6,             m1,     [r3 + 22 * 16]
>+
>+    packssdw    m3,             m6
>+    paddw       m3,             m2
>+    psraw       m3,             5
>+
>+    pmaddwd     m4,             m1,     [r3 + 17 * 16]
>+    pmaddwd     m6,             m1,     [r3 + 12 * 16]
>+
>+    packssdw    m4,             m6
>+    paddw       m4,             m2
>+    psraw       m4,             5
>+
>+    packuswb    m3,             m4
>+    mova        [r0 + 160],     m3
>+
>+; mode 13
>+
>+    mova        m3,             m1
>+    movd        m7,             [r1 + 4]
>+    punpcklwd   m7,             m1
>+    pand        m7,             [pb_0000000000000F0F]
>+    pslldq      m3,             4
>+    por         m3,             m7
>+
>+    pmaddwd     m5,             m1,     [r3 + 23 * 16]
>+    pmaddwd     m6,             m1,     [r3 + 14 * 16]
>+
>+    packssdw    m5,             m6
>+    paddw       m5,             m2
>+    psraw       m5,             5
>+
>+    pmaddwd     m4,             m1,     m0
>+    pmaddwd     m6,             m3,     [r3 + 28 * 16]
>+
>+    packssdw    m4,             m6
>+    paddw       m4,             m2
>+    psraw       m4,             5
>+
>+    packuswb    m5,             m4
>+    mova        [r0 + 176],     m5
>+
>+; mode 14
>+
>+    pmaddwd     m5,             m1,     [r3 + 19 * 16]
>+    pmaddwd     m6,             m1,     [r3 + 6 * 16]
>+
>+    packssdw    m5,             m6
>+    paddw       m5,             m2
>+    psraw       m5,             5
>+
>+    movd        m6,             [r1 + 2]
>+    pand        m3,             [pw_FFFFFFF0]
>+    pand        m6,             [pb_000000000000000F]
>+    por         m3,             m6
>+
>+    pmaddwd     m4,             m3,     [r3 + 25 * 16]
>+    pmaddwd     m6,             m3,     [r3 + 12 * 16]
>+
>+    packssdw    m4,             m6
>+    paddw       m4,             m2
>+    psraw       m4,             5
>+
>+    packuswb    m5,             m4
>+    mova        [r0 + 192],     m5
>+    psrldq      m5,             4
>+    movd        [r0 + 240],     m5                              ;mode 17 row 0
>+
>+; mode 15
>+
>+    pmaddwd     m5,             m1,     [r3 + 15 * 16]
>+    pmaddwd     m6,             m3,     [r3 + 30 * 16]
>+
>+    packssdw    m5,             m6
>+    paddw       m5,             m2
>+    psraw       m5,             5
>+
>+    pmaddwd     m6,             m3,     [r3 + 13 * 16]
>+
>+    mova        m0,             m3
>+    punpcklwd   m7,             m3
>+    pslldq      m0,             4
>+    pand        m7,             [pb_0000000000000F0F]
>+    por         m0,             m7
>+
>+    pmaddwd     m4,             m0,     [r3 + 28 * 16]
>+
>+    packssdw    m6,             m4
>+    paddw       m6,             m2
>+    psraw       m6,             5
>+
>+    packuswb    m5,             m6
>+    mova        [r0 + 208],     m5
>+
>+; mode 16
>+
>+    pmaddwd     m5,             m1,     [r3 + 11 * 16]
>+    pmaddwd     m6,             m3,     [r3 + 22 * 16]
>+
>+    packssdw    m5,             m6
>+    paddw       m5,             m2
>+    psraw       m5,             5
>+
>+    pmaddwd     m3,             [r3 + 1 * 16]
>+
>+    movd        m6,             [r1 + 3]
>+    pand        m0,             [pw_FFFFFFF0]
>+    pand        m6,             [pb_000000000000000F]
>+    por         m0,             m6
>+
>+    pmaddwd     m0,             [r3 + 12 * 16]
>+    packssdw    m3,             m0
>+    paddw       m3,             m2
>+    psraw       m3,             5
>+
>+    packuswb    m5,             m3
>+    mova        [r0 + 224],     m5
>+
>+; mode 17
>+
>+    movd        m4,             [r1 + 1]
>+    punpcklwd   m4,             m1
>+    pand        m4,             [pb_0000000000000F0F]
>+    pslldq      m1,             4
>+    por         m1,             m4
>+
>+    pmaddwd     m6,             m1,     [r3 + 12 * 16]
>+
>+    packssdw    m6,             m6
>+    paddw       m6,             m2
>+    psraw       m6,             5
>+
>+    movh        m5,             [r1 + 2]
>+    punpcklwd   m5,             m1
>+    pand        m5,             [pb_0000000000000F0F]
>+    pslldq      m1,             4
>+    por         m1,             m5
>+
>+    pmaddwd     m4,             m1,     [r3 + 18 * 16]
>+
>+    punpcklwd   m7,             m1
>+    pand        m7,             [pb_0000000000000F0F]
>+    pslldq      m1,             4
>+    por         m1,             m7
>+
>+    pmaddwd     m1,             [r3 + 24 * 16]
>+    packssdw    m4,             m1
>+    paddw       m4,             m2
>+    psraw       m4,             5
>+
>+    packuswb    m6,             m4
>+    movd        [r0 + 244],     m6
>+    psrldq      m6,             8
>+    movh        [r0 + 248],     m6
>+
>+; mode 18
>+
>+    movh        m1,             [r1]
>+    movd        [r0 + 256],     m1
>+
>+    movh        m3,             [r1 + 2]
>+    punpcklqdq  m3,             m1
>+    psrldq      m3,             7
>+    movd        [r0 + 260],     m3
>+
>+    movh        m4,             [r1 + 3]
>+    punpcklqdq  m4,             m3
>+    psrldq      m4,             7
>+    movd        [r0 + 264],     m4
>+
>+    movh        m0,             [r1 + 4]
>+    punpcklqdq  m0,             m4
>+    psrldq      m0,             7
>+    movd        [r0 + 268],     m0
>+
>+; mode 19
>+
>+    pxor        m7,             m7
>+    punpcklbw   m4,             m3
>+    punpcklbw   m3,             m1
>+    punpcklbw   m1,             m1
>+    punpcklbw   m4,             m7
>+    punpcklbw   m3,             m7
>+    psrldq      m1,             1
>+    punpcklbw   m1,             m7
>+
>+    pmaddwd     m6,             m1,     [r3 + 6 * 16]
>+    pmaddwd     m7,             m3,     [r3 + 12 * 16]
>+
>+    packssdw    m6,             m7
>+    paddw       m6,             m2
>+    psraw       m6,             5
>+
>+    pmaddwd     m5,             m4,     [r3 + 18 * 16]
>+
>+    movd        m7,             [r1 + 12]
>+    punpcklwd   m7,             m4
>+    pand        m7,             [pb_0000000000000F0F]
>+    pslldq      m4,             4
>+    por         m4,             m7
>+
>+    pmaddwd     m4,             [r3 + 24 * 16]
>+    packssdw    m5,             m4
>+    paddw       m5,             m2
>+    psraw       m5,             5
>+
>+    packuswb    m6,             m5
>+    mova        [r0 + 272],     m6
>+    movd        [r0 + 324],     m6                              ;mode 22 row 1
>+
>+; mode 20
>+
>+    pmaddwd     m5,             m1,     [r3 + 11 * 16]
>+
>+    movd        m4,             [r1 + 10]
>+    pand        m3,             [pw_FFFFFFF0]
>+    pand        m4,             [pb_000000000000000F]
>+    por         m3,             m4
>+
>+    pmaddwd     m6,             m3,     [r3 + 22 * 16]
>+
>+    packssdw    m5,             m6
>+    paddw       m5,             m2
>+    psraw       m5,             5
>+
>+    pmaddwd     m4,             m3,     [r3 + 1 * 16]
>+
>+    punpcklwd   m0,             m3
>+    pand        m0,             [pb_0000000000000F0F]
>+    mova        m6,             m3
>+    pslldq      m6,             4
>+    por         m0,             m6
>+
>+    pmaddwd     m6,             m0,     [r3 + 12 * 16]
>+
>+    packssdw    m4,             m6
>+    paddw       m4,             m2
>+    psraw       m4,             5
>+
>+    packuswb    m5,             m4
>+    mova        [r0 + 288],     m5
>+
>+; mode 21
>+
>+    pmaddwd     m4,             m1,     [r3 + 15 * 16]
>+    pmaddwd     m6,             m3,     [r3 + 30 * 16]
>+
>+    packssdw    m4,             m6
>+    paddw       m4,             m2
>+    psraw       m4,             5
>+
>+    pmaddwd     m5,             m3,     [r3 + 13 * 16]
>+
>+    pand        m0,             [pw_FFFFFFF0]
>+    pand        m7,             [pb_000000000000000F]
>+    por         m0,             m7
>+
>+    pmaddwd     m0,             [r3 + 28 * 16]
>+    packssdw    m5,             m0
>+    paddw       m5,             m2
>+    psraw       m5,             5
>+
>+    packuswb    m4,             m5
>+    mova        [r0 + 304],     m4
>+
>+; mode 22
>+
>+    pmaddwd     m4,             m1,     [r3 + 19 * 16]
>+    packssdw    m4,             m4
>+    paddw       m4,             m2
>+    psraw       m4,             5
>+
>+    mova        m0,             [r3 + 12 * 16]
>+    pmaddwd     m5,             m3,     [r3 + 25 * 16]
>+    pmaddwd     m6,             m3,     m0
>+
>+    packssdw    m5,             m6
>+    paddw       m5,             m2
>+    psraw       m5,             5
>+
>+    packuswb    m4,             m5
>+    movd        [r0 + 320],     m4
>+    psrldq      m4,             8
>+    movh        [r0 + 328],     m4
>+
>+; mode 23
>+
>+    pmaddwd     m4,             m1,     [r3 + 23 * 16]
>+    pmaddwd     m5,             m1,     [r3 + 14 * 16]
>+
>+    packssdw    m4,             m5
>+    paddw       m4,             m2
>+    psraw       m4,             5
>+
>+    pmaddwd     m6,             m1,     [r3 + 5 * 16]
>+
>+    pand        m3,             [pw_FFFFFFF0]
>+    por         m3,             m7
>+
>+    pmaddwd     m3,             [r3 + 28 * 16]
>+    packssdw    m6,             m3
>+    paddw       m6,             m2
>+    psraw       m6,             5
>+
>+    packuswb    m4,             m6
>+    mova        [r0 + 336],     m4
>+
>+; mode 24
>+
>+    pmaddwd     m4,             m1,     [r3 + 27 * 16]
>+    pmaddwd     m5,             m1,     [r3 + 22 * 16]
>+
>+    packssdw    m4,             m5
>+    paddw       m4,             m2
>+    psraw       m4,             5
>+
>+    pmaddwd     m6,             m1,     [r3 + 17 * 16]
>+    pmaddwd     m0,             m1
>+
>+    packssdw    m6,             m0
>+    paddw       m6,             m2
>+    psraw       m6,             5
>+
>+    packuswb    m4,             m6
>+    mova        [r0 + 352],     m4
>+
>+; mode 25
>+
>+    pmaddwd     m4,             m1,     [r3 + 30 * 16]
>+    pmaddwd     m5,             m1,     [r3 + 28 * 16]
>+
>+    packssdw    m4,             m5
>+    paddw       m4,             m2
>+    psraw       m4,             5
>+
>+    pmaddwd     m6,             m1,     [r3 + 26 * 16]
>+    pmaddwd     m1,             [r3 + 24 * 16]
>+
>+    packssdw    m6,             m1
>+    paddw       m6,             m2
>+    psraw       m6,             5
>+
>+    packuswb    m4,             m6
>+    mova        [r0 + 368],     m4
>+
>+; mode 27
>+
>+    movh        m0,             [r1 + 1]
>+    pxor        m7,             m7
>+    punpcklbw   m0,             m0
>+    psrldq      m0,             1
>+    movh        m1,             m0
>+    psrldq      m0,             2
>+    movh        m3,             m0
>+    psrldq      m0,             2
>+    punpcklbw   m1,             m7
>+    punpcklbw   m3,             m7
>+    punpcklbw   m0,             m7
>+
>+    mova        m7,             [r3 + 4 * 16]
>+
>+    pmaddwd     m4,             m1,     [r3 + 2 * 16]
>+    pmaddwd     m5,             m1,     m7
>+
>+    packssdw    m4,             m5
>+    paddw       m4,             m2
>+    psraw       m4,             5
>+
>+    pmaddwd     m6,             m1,     [r3 + 6 * 16]
>+    pmaddwd     m5,             m1,     [r3 + 8 * 16]
>+
>+    packssdw    m6,             m5
>+    paddw       m6,             m2
>+    psraw       m6,             5
>+
>+    packuswb    m4,             m6
>+    mova        [r0 + 400],     m4
>+
>+; mode 28
>+
>+    pmaddwd     m4,             m1,     [r3 + 5 * 16]
>+    pmaddwd     m5,             m1,     [r3 + 10 * 16]
>+
>+    packssdw    m4,             m5
>+    paddw       m4,             m2
>+    psraw       m4,             5
>+
>+    pmaddwd     m6,             m1,     [r3 + 15 * 16]
>+    pmaddwd     m5,             m1,     [r3 + 20 * 16]
>+
>+    packssdw    m6,             m5
>+    paddw       m6,             m2
>+    psraw       m6,             5
>+
>+    packuswb    m4,             m6
>+    mova        [r0 + 416],     m4
>+
>+; mode 29
>+
>+    pmaddwd     m4,             m1,     [r3 + 9 * 16]
>+    pmaddwd     m6,             m1,     [r3 + 18 * 16]
>+
>+    packssdw    m4,             m6
>+    paddw       m4,             m2
>+    psraw       m4,             5
>+
>+    pmaddwd     m6,             m1,     [r3 + 27 * 16]
>+    pmaddwd     m5,             m3,     m7
>+
>+    packssdw    m6,             m5
>+    paddw       m6,             m2
>+    psraw       m6,             5
>+
>+    packuswb    m4,             m6
>+    mova        [r0 + 432],     m4
>+
>+; mode 30
>+
>+    pmaddwd     m4,             m1,     [r3 + 13 * 16]
>+    pmaddwd     m5,             m1,     [r3 + 26 * 16]
>+
>+    packssdw    m4,             m5
>+    paddw       m4,             m2
>+    psraw       m4,             5
>+
>+    pmaddwd     m6,             m3,     [r3 + 7 * 16]
>+    pmaddwd     m5,             m3,     [r3 + 20 * 16]
>+
>+    packssdw    m6,             m5
>+    paddw       m6,             m2
>+    psraw       m6,             5
>+
>+    packuswb    m4,             m6
>+    mova        [r0 + 448],     m4
>+    psrldq      m4,             4
>+    movh        [r0 + 496],     m4                      ;mode 33 row 0
>+    psrldq      m4,             8
>+    movd        [r0 + 500],     m4                      ;mode 33 row 1
>+
>+; mode 31
>+
>+    pmaddwd     m4,             m1,     [r3 + 17 * 16]
>+    pmaddwd     m5,             m3,     [r3 + 2 * 16]
>+
>+    packssdw    m4,             m5
>+    paddw       m4,             m2
>+    psraw       m4,             5
>+
>+    pmaddwd     m6,             m3,     [r3 + 19 * 16]
>+    pmaddwd     m7,             m0;,     [r3 + 4 * 16]
>+
>+    packssdw    m6,             m7
>+    paddw       m6,             m2
>+    psraw       m6,             5
>+
>+    packuswb    m4,             m6
>+    mova        [r0 + 464],     m4
>+
>+; mode 32
>+
>+    pmaddwd     m1,             [r3 + 21 * 16]
>+    pmaddwd     m5,             m3,     [r3 + 10 * 16]
>+
>+    packssdw    m1,             m5
>+    paddw       m1,             m2
>+    psraw       m1,             5
>+
>+    pmaddwd     m3,             [r3 + 31 * 16]
>+    pmaddwd     m5,             m0,     [r3 + 20 * 16]
>+    packssdw    m3,             m5
>+    paddw       m3,             m2
>+    psraw       m3,             5
>+
>+    packuswb    m1,             m3
>+    mova        [r0 + 480],     m1
>+
>+; mode 33
>+
>+    pmaddwd     m0,             [r3 + 14 * 16]
>+    pxor        m7,             m7
>+    movh        m4,             [r1 + 4]
>+    punpcklbw   m4,             m4
>+    psrldq      m4,             1
>+    punpcklbw   m4,             m7
>+
>+    pmaddwd     m4,             [r3 + 8 * 16]
>+
>+    packssdw    m0,             m4
>+    paddw       m0,             m2
>+    psraw       m0,             5
>+
>+    packuswb    m0,             m0
>+    movh        [r0 + 504],     m0
>+
>+; mode 34
>+
>+    movh        m7,             [r1 + 2]
>+    movd        [r0 + 512],     m7
>+
>+    psrldq      m7,             1
>+    movd        [r0 + 516],     m7
>+
>+    psrldq      m7,             1
>+    movd        [r0 + 520],     m7
>+
>+    psrldq      m7,             1
>+    movd        [r0 + 524],     m7
>+
>+RET
>\ No newline at end of file
>_______________________________________________
>x265-devel mailing list
>x265-devel at videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150411/3d369a90/attachment-0001.html>


More information about the x265-devel mailing list