[x265] [PATCH] asm: intra pred all_angs_pred_4x4 sse2

dave dtyx265 at gmail.com
Sat Apr 11 20:06:41 CEST 2015


On 04/11/2015 12:34 AM, chen wrote:
> add data comment increment readable
Can you explain a little more about this?
> some suggest inline below
responses below
> At 2015-04-11 09:58:38,dtyx265 at gmail.com wrote:
> ># HG changeset patch
> ># User David T Yuen <dtyx265 at gmail.com>
> ># Date 1428717487 25200
> ># Node ID c40653978caea4a4bf8940ae3b0e8db74bbe07d7
> ># Parent  ee76a15fa312ac59549965821d9cbff03237226f
> >asm: intra pred all_angs_pred_4x4 sse2
> >
> >This replaces c code and is backported from sse4
> >The processing of modes 10 and 26 were merged and moved to after mode 2
> >
> >64-bit
> >
> >./test/TestBench --testbench intrapred | grep intra_allangs4x4
> >intra_allangs4x4	9.99x 	 6449.98  	 64435.56
> >
> >32-bit
> >
> >./test/TestBench --testbench intrapred | grep intra_allangs4x4
> >intra_allangs4x4	13.31x 	 6512.49  	 86709.86
> >
> >diff -r ee76a15fa312 -r c40653978cae source/common/x86/asm-primitives.cpp
> >--- a/source/common/x86/asm-primitives.cpp	Fri Apr 10 10:24:55 2015 -0500
> >+++ b/source/common/x86/asm-primitives.cpp	Fri Apr 10 18:58:07 2015 -0700
> >@@ -1259,6 +1259,8 @@
> >         p.cu[BLOCK_4x4].intra_pred[32] = x265_intra_pred_ang4_4_sse2;
> >         p.cu[BLOCK_4x4].intra_pred[33] = x265_intra_pred_ang4_3_sse2;
> >
> >+        p.cu[BLOCK_4x4].intra_pred_allangs = x265_all_angs_pred_4x4_sse2;
> >+
> >         p.cu[BLOCK_4x4].calcresidual = x265_getResidual4_sse2;
> >         p.cu[BLOCK_8x8].calcresidual = x265_getResidual8_sse2;
> >
> >diff -r ee76a15fa312 -r c40653978cae source/common/x86/const-a.asm
> >--- a/source/common/x86/const-a.asm	Fri Apr 10 10:24:55 2015 -0500
> >+++ b/source/common/x86/const-a.asm	Fri Apr 10 18:58:07 2015 -0700
> >@@ -53,6 +53,10 @@
> > const pb_shuf8x8c,          times  1 db   0,   0,   0,   0,   2,   2,   2,   2,   4,   4,   4,   4,   6,   6,   6,   6
> > const pb_movemask,          times 16 db 0x00
> >                             times 16 db 0xFF
> >+const pb_0000000000000F0F,  times 2  db 0xff, 0x00
>
> constant name mistake
I was trying to keep the name short, but if you prefer the full-width form
pb_00000000000000000000000000FF00FF I can do that, and I can do the same
for the other constants.
> >+                            times 14 db 0x00
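Also, I should have made this 12, not 14: "times 2 db 0xff, 0x00" already
emits 4 bytes, so only 12 more zero bytes are needed to reach 16 bytes.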
> >+const pb_000000000000000F,           db 0xff
> >+                            times 15 db 0x00
> >
> > ;; 16-bit constants
> >
> >@@ -94,6 +98,8 @@
> > const multiH2,              times  1 dw  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  32
> > const pw_planar16_mul,      times  1 dw  15,  14,  13,  12,  11,  10,   9,   8,   7,   6,   5,   4,   3,   2,   1,   0
> > const pw_planar32_mul,      times  1 dw  31,  30,  29,  28,  27,  26,  25,  24,  23,  22,  21,  20,  19,  18,  17,  16
> >+const pw_FFFFFFF0,                   dw 0x00
> >+                            times 7  dw 0xff
> >
> >
> > ;; 32-bit constants
> >diff -r ee76a15fa312 -r c40653978cae source/common/x86/intrapred.h
> >--- a/source/common/x86/intrapred.h	Fri Apr 10 10:24:55 2015 -0500
> >+++ b/source/common/x86/intrapred.h	Fri Apr 10 18:58:07 2015 -0700
> >@@ -275,6 +275,7 @@
> > void x265_intra_pred_ang32_23_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
> > void x265_intra_pred_ang32_22_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
> > void x265_intra_pred_ang32_21_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
> >+void x265_all_angs_pred_4x4_sse2(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
> > void x265_all_angs_pred_4x4_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
> > void x265_all_angs_pred_8x8_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
> > void x265_all_angs_pred_16x16_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
> >diff -r ee76a15fa312 -r c40653978cae source/common/x86/intrapred8_allangs.asm
> >--- a/source/common/x86/intrapred8_allangs.asm	Fri Apr 10 10:24:55 2015 -0500
> >+++ b/source/common/x86/intrapred8_allangs.asm	Fri Apr 10 18:58:07 2015 -0700
> >@@ -34,10 +34,17 @@
> >
> > ; common constant with intrapred8.asm
> > cextern ang_table
> >+cextern pw_ang_table
> > cextern tab_S1
> > cextern tab_S2
> > cextern tab_Si
> >
> >+; constants from const-a.asm
> >+cextern pw_16
> >+cextern pb_000000000000000F
> >+cextern pb_0000000000000F0F
> >+cextern pw_FFFFFFF0
> >+
> >
> > ;-----------------------------------------------------------------------------
> > ; void all_angs_pred_4x4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
> >@@ -23006,3 +23013,780 @@
> >     palignr    m4,              m2,       m1,    14
> >     movu       [r0 + 2111 * 16],   m4
> >     RET
> >+
> >+;-----------------------------------------------------------------------------
> >+; void all_angs_pred_4x4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma)
> >+;-----------------------------------------------------------------------------
> >+INIT_XMM sse2
> >+cglobal all_angs_pred_4x4, 4, 4, 8
> >+
> >+; mode 2
> >+
> >+    movh        m6,             [r1 + 9]
> >+    movh        m2,             m6
> mova mapping to register rename
Will do.
> >+    psrldq      m2,             1
> >+    movd        [r0],           m2
> >+    psrldq      m2,             1
> >+    movd        [r0 + 4],       m2
> >+    psrldq      m2,             1
> >+    movd        [r0 + 8],       m2
> >+    psrldq      m2,             1
> >+    movd        [r0 + 12],      m2
> >+
> >+; mode 10/26
> >+
> >+    pxor        m7,             m7
> >+    pshufd      m5,             m6,        0
> >+    movu        [r0 + 128],     m5              ;mode 10
> >+
> >+    movd        m4,             [r1 + 1]
> >+    pshufd      m4,             m4,        0
> >+    movu        [r0 + 384],     m4              ;mode 26
> >+
> >+    movd        m1,             [r1]
> >+    punpcklbw   m1,             m7
> >+    pshuflw     m1,             m1,     0x00
> >+    punpcklqdq  m1,             m1
> >+
> >+    punpckldq   m4,             m5
> >+    punpcklbw   m4,             m7
> >+    pshuflw     m2,             m4,     0x00
> >+    pshufhw     m2,             m2,     0x00
> >+
> >+    psubw       m4,             m1
> >+    psraw       m4,             1
> >+
> >+    pshufd      m2,             m2,     q1032
> >+    paddw       m4,             m2
> >+    packuswb    m4,             m4
> >+
> >+%if ARCH_X86_64
> >+    movq        r2,             m4
> >+
> >+    mov         [r0 + 128],     r2b              ;mode 10
> >+    shr         r2,             8
> >+    mov         [r0 + 132],     r2b
> >+    shr         r2,             8
> >+    mov         [r0 + 136],     r2b
> >+    shr         r2,             8
> >+    mov         [r0 + 140],     r2b
> >+    shr         r2,             8
> >+    mov         [r0 + 384],     r2b              ;mode 26
> >+    shr         r2d,            8
> >+    mov         [r0 + 388],     r2b
> >+    shr         r2d,            8
> >+    mov         [r0 + 392],     r2b
> >+    shr         r2d,            8
> >+    mov         [r0 + 396],     r2b
> >+
> >+%else
> >+    movd        r2d,             m4
> >+
> >+    mov         [r0 + 128],     r2b              ;mode 10
> >+    shr         r2d,             8
> >+    mov         [r0 + 132],     r2b
> >+    shr         r2d,             8
> >+    mov         [r0 + 136],     r2b
> >+    shr         r2d,             8
> >+    mov         [r0 + 140],     r2b
> >+
> >+    psrldq      m4,             4
> >+    movd        r2d,            m4
> >+
> >+    mov         [r0 + 384],     r2b              ;mode 26
> >+    shr         r2d,            8
> >+    mov         [r0 + 388],     r2b
> >+    shr         r2d,            8
> >+    mov         [r0 + 392],     r2b
> >+    shr         r2d,            8
> >+    mov         [r0 + 396],     r2b
> >+%endif
> >+
> >+; mode 3
> >+
> >+    mova        m2,             [pw_16]
> >+    lea         r3,             [pw_ang_table]
> >+
> >+    punpcklbw   m6,             m6
> >+    psrldq      m6,             1
> >+    movh        m1,             m6
> when we keep MOVH here, we can avoid memory operator in mode 11,13,15,17,etc
>
> >+    psrldq      m6,             2
> >+    movh        m0,             m6
> >+    psrldq      m6,             2
> >+    movh        m3,             m6
> >+    psrldq      m6,             2
> >+    punpcklbw   m1,             m7
> >+    punpcklbw   m0,             m7
> >+    punpcklbw   m3,             m7
> >+    punpcklbw   m6,             m7
> >+
> >+    mova        m7,             [r3 + 20 * 16]
> offset more than 128 will generate 4-bytes address code
I will adjust r3 and use r2
>
> >+    pmaddwd     m5,             m1,     [r3 + 26 * 16]
> >+    pmaddwd     m4,             m0,     m7
> >+
> >+    packssdw    m5,             m4
> >+    paddw       m5,             m2
> >+    psraw       m5,             5
> >+
> >+    pmaddwd     m4,             m3,     [r3 + 14 * 16]
> >+    pmaddwd     m6,             [r3 + 8 * 16]
> >+
> >+    packssdw    m4,             m6
> >+    paddw       m4,             m2
> >+    psraw       m4,             5
> >+
> >+    packuswb    m5,             m4
> >+    mova        [r0 + 16],      m5
> >+    movd        [r0 + 68],      m5                      ;mode 6 row 1
> >+    psrldq      m5,             4
> >+    movd        [r0 + 76],      m5                      ;mode 6 row 3
> >+
> >+; mode 4
> >+
> >+    pmaddwd     m4,             m0,     [r3 + 31 * 16]
> >+    pmaddwd     m6,             m3,     m7
> >+
> >+    packssdw    m4,             m6
> >+    paddw       m4,             m2
> >+    psraw       m4,             5
> >+
> >+    pmaddwd     m5,             m1,     [r3 + 21 * 16]
> >+    pmaddwd     m6,             m0,     [r3 + 10 * 16]
> >+
> >+    packssdw    m5,             m6
> >+    paddw       m5,             m2
> >+    psraw       m5,             5
> >+
> >+    packuswb    m5,             m4
> >+    mova        [r0 + 32],      m5
> >+
> >+; mode 5
> >+
> >+    pmaddwd     m5,             m1,     [r3 + 17 * 16]
> >+    pmaddwd     m6,             m0,     [r3 + 2 * 16]
> >+
> >+    packssdw    m5,             m6
> >+    paddw       m5,             m2
> >+    psraw       m5,             5
> >+
> >+    pmaddwd     m4,             m0,     [r3 + 19 * 16]
> >+    pmaddwd     m3,             [r3 + 4 * 16]
> >+
> >+    packssdw    m4,             m3
> >+    paddw       m4,             m2
> >+    psraw       m4,             5
> >+
> >+    packuswb    m5,             m4
> >+    mova        [r0 + 48],      m5
> >+
> >+; mode 6
> >+
> >+    pmaddwd     m5,             m1,     [r3 + 13 * 16]
> >+    pmaddwd     m6,             m0,     [r3 + 7 * 16]
> >+
> >+    packssdw    m5,             m6
> >+    paddw       m5,             m2
> >+    psraw       m5,             5
> >+
> >+    packuswb    m5,             m6
> >+    movd        [r0 + 64],      m5
> >+    psrldq      m5,             4
> >+    movd        [r0 + 72],      m5
> >+
> >+; mode 7
> >+
> >+    pmaddwd     m5,             m1,     [r3 + 9 * 16]
> >+    pmaddwd     m6,             m1,     [r3 + 18 * 16]
> >+
> >+    packssdw    m5,             m6
> >+    paddw       m5,             m2
> >+    psraw       m5,             5
> >+
> >+    mova        m3,             [r3 + 27 * 16]
> >+    pmaddwd     m4,             m1,     m3
> >+    pmaddwd     m0,             [r3 + 4 * 16]
> >+
> >+    packssdw    m4,             m0
> >+    paddw       m4,             m2
> >+    psraw       m4,             5
> >+
> >+    packuswb    m5,             m4
> >+    mova        [r0 + 80],      m5
> >+
> >+; mode 8
> >+
> >+    mova        m0,             [r3 + 5 * 16]
> >+    pmaddwd     m5,             m1,     m0
> >+    pmaddwd     m6,             m1,     [r3 + 10 * 16]
> >+
> >+    packssdw    m5,             m6
> >+    paddw       m5,             m2
> >+    psraw       m5,             5
> >+
> >+    pmaddwd     m4,             m1,     [r3 + 15 * 16]
> >+    pmaddwd     m7,             m1
> >+
> >+    packssdw    m4,             m7
> >+    paddw       m4,             m2
> >+    psraw       m4,             5
> >+
> >+    packuswb    m5,             m4
> >+    mova        [r0 + 96],      m5
> >+
> >+; mode 9
> >+
> >+    pmaddwd     m5,             m1,     [r3 + 2 * 16]
> >+    pmaddwd     m6,             m1,     [r3 + 4 * 16]
> >+
> >+    packssdw    m5,             m6
> >+    paddw       m5,             m2
> >+    psraw       m5,             5
> >+
> >+    pmaddwd     m4,             m1,     [r3 + 6 * 16]
> >+    pmaddwd     m6,             m1,     [r3 + 8 * 16]
> >+
> >+    packssdw    m4,             m6
> >+    paddw       m4,             m2
> >+    psraw       m4,             5
> >+
> >+    packuswb    m5,             m4
> >+    mova        [r0 + 112],     m5
> >+
> >+; mode 11
> >+
> >+    movd        m5,             [r1]
> >+    punpcklwd   m5,             m1
> >+    pand        m5,             [pb_0000000000000F0F]
> you just want to get lowest 2 Word, and both m1, m5 high QWord are zero, so can replace by PSHUFD
>   
The high QWord of m1 is not zero; it holds values that are still needed.
This code changes m1 from DCCBBAA9 to CBBAA990, where each character is
the r1 index of a neighboring pixel expanded to 16 bits.
>
>
> >+    pslldq      m1,             4
> >+    por         m1,             m5
> >+
> >+    pmaddwd     m5,             m1,     [r3 + 30 * 16]
> >+    pmaddwd     m6,             m1,     [r3 + 28 * 16]
> >+
> >+    packssdw    m5,             m6
> >+    paddw       m5,             m2
> >+    psraw       m5,             5
> >+
> >+    pmaddwd     m4,             m1,     [r3 + 26 * 16]
> >+    pmaddwd     m6,             m1,     [r3 + 24 * 16]
> >+
> >+    packssdw    m4,             m6
> >+    paddw       m4,             m2
> >+    psraw       m4,             5
> >+
> >+    packuswb    m5,             m4
> >+    mova        [r0 + 144],     m5
> >+
> >+; mode 12
> >+
> >+    pmaddwd     m3,             m1
> >+    pmaddwd     m6,             m1,     [r3 + 22 * 16]
> >+
> >+    packssdw    m3,             m6
> >+    paddw       m3,             m2
> >+    psraw       m3,             5
> >+
> >+    pmaddwd     m4,             m1,     [r3 + 17 * 16]
> >+    pmaddwd     m6,             m1,     [r3 + 12 * 16]
> >+
> >+    packssdw    m4,             m6
> >+    paddw       m4,             m2
> >+    psraw       m4,             5
> >+
> >+    packuswb    m3,             m4
> >+    mova        [r0 + 160],     m3
> >+
> >+; mode 13
> >+
> >+    mova        m3,             m1
> >+    movd        m7,             [r1 + 4]
> >+    punpcklwd   m7,             m1
> >+    pand        m7,             [pb_0000000000000F0F]
> >+    pslldq      m3,             4
> >+    por         m3,             m7
> >+
> >+    pmaddwd     m5,             m1,     [r3 + 23 * 16]
> >+    pmaddwd     m6,             m1,     [r3 + 14 * 16]
> >+
> >+    packssdw    m5,             m6
> >+    paddw       m5,             m2
> >+    psraw       m5,             5
> >+
> >+    pmaddwd     m4,             m1,     m0
> >+    pmaddwd     m6,             m3,     [r3 + 28 * 16]
> >+
> >+    packssdw    m4,             m6
> >+    paddw       m4,             m2
> >+    psraw       m4,             5
> >+
> >+    packuswb    m5,             m4
> >+    mova        [r0 + 176],     m5
> >+
> >+; mode 14
> >+
> >+    pmaddwd     m5,             m1,     [r3 + 19 * 16]
> >+    pmaddwd     m6,             m1,     [r3 + 6 * 16]
> >+
> >+    packssdw    m5,             m6
> >+    paddw       m5,             m2
> >+    psraw       m5,             5
> >+
> >+    movd        m6,             [r1 + 2]
> >+    pand        m3,             [pw_FFFFFFF0]
> >+    pand        m6,             [pb_000000000000000F]
> >+    por         m3,             m6
> >+
> >+    pmaddwd     m4,             m3,     [r3 + 25 * 16]
> >+    pmaddwd     m6,             m3,     [r3 + 12 * 16]
> >+
> >+    packssdw    m4,             m6
> >+    paddw       m4,             m2
> >+    psraw       m4,             5
> >+
> >+    packuswb    m5,             m4
> >+    mova        [r0 + 192],     m5
> >+    psrldq      m5,             4
> >+    movd        [r0 + 240],     m5                              ;mode 17 row 0
> >+
> >+; mode 15
> >+
> >+    pmaddwd     m5,             m1,     [r3 + 15 * 16]
> >+    pmaddwd     m6,             m3,     [r3 + 30 * 16]
> >+
> >+    packssdw    m5,             m6
> >+    paddw       m5,             m2
> >+    psraw       m5,             5
> >+
> >+    pmaddwd     m6,             m3,     [r3 + 13 * 16]
> >+
> >+    mova        m0,             m3
> >+    punpcklwd   m7,             m3
> >+    pslldq      m0,             4
> >+    pand        m7,             [pb_0000000000000F0F]
> >+    por         m0,             m7
> >+
> >+    pmaddwd     m4,             m0,     [r3 + 28 * 16]
> >+
> >+    packssdw    m6,             m4
> >+    paddw       m6,             m2
> >+    psraw       m6,             5
> >+
> >+    packuswb    m5,             m6
> >+    mova        [r0 + 208],     m5
> >+
> >+; mode 16
> >+
> >+    pmaddwd     m5,             m1,     [r3 + 11 * 16]
> >+    pmaddwd     m6,             m3,     [r3 + 22 * 16]
> >+
> >+    packssdw    m5,             m6
> >+    paddw       m5,             m2
> >+    psraw       m5,             5
> >+
> >+    pmaddwd     m3,             [r3 + 1 * 16]
> >+
> >+    movd        m6,             [r1 + 3]
> >+    pand        m0,             [pw_FFFFFFF0]
> >+    pand        m6,             [pb_000000000000000F]
> >+    por         m0,             m6
> >+
> >+    pmaddwd     m0,             [r3 + 12 * 16]
> >+    packssdw    m3,             m0
> >+    paddw       m3,             m2
> >+    psraw       m3,             5
> >+
> >+    packuswb    m5,             m3
> >+    mova        [r0 + 224],     m5
> >+
> >+; mode 17
> >+
> >+    movd        m4,             [r1 + 1]
> >+    punpcklwd   m4,             m1
> >+    pand        m4,             [pb_0000000000000F0F]
> >+    pslldq      m1,             4
> >+    por         m1,             m4
> >+
> >+    pmaddwd     m6,             m1,     [r3 + 12 * 16]
> >+
> >+    packssdw    m6,             m6
> >+    paddw       m6,             m2
> >+    psraw       m6,             5
> >+
> >+    movh        m5,             [r1 + 2]
> >+    punpcklwd   m5,             m1
> >+    pand        m5,             [pb_0000000000000F0F]
> >+    pslldq      m1,             4
> >+    por         m1,             m5
> >+
> >+    pmaddwd     m4,             m1,     [r3 + 18 * 16]
> >+
> >+    punpcklwd   m7,             m1
> >+    pand        m7,             [pb_0000000000000F0F]
> >+    pslldq      m1,             4
> >+    por         m1,             m7
> >+
> >+    pmaddwd     m1,             [r3 + 24 * 16]
> >+    packssdw    m4,             m1
> >+    paddw       m4,             m2
> >+    psraw       m4,             5
> >+
> >+    packuswb    m6,             m4
> >+    movd        [r0 + 244],     m6
> >+    psrldq      m6,             8
> >+    movh        [r0 + 248],     m6
> >+
> >+; mode 18
> >+
> >+    movh        m1,             [r1]
> >+    movd        [r0 + 256],     m1
> >+
> >+    movh        m3,             [r1 + 2]
> >+    punpcklqdq  m3,             m1
> >+    psrldq      m3,             7
> >+    movd        [r0 + 260],     m3
> >+
> >+    movh        m4,             [r1 + 3]
> >+    punpcklqdq  m4,             m3
> >+    psrldq      m4,             7
> >+    movd        [r0 + 264],     m4
> >+
> >+    movh        m0,             [r1 + 4]
> >+    punpcklqdq  m0,             m4
> >+    psrldq      m0,             7
> >+    movd        [r0 + 268],     m0
> >+
> >+; mode 19
> >+
> >+    pxor        m7,             m7
> >+    punpcklbw   m4,             m3
> >+    punpcklbw   m3,             m1
> >+    punpcklbw   m1,             m1
> >+    punpcklbw   m4,             m7
> >+    punpcklbw   m3,             m7
> >+    psrldq      m1,             1
> >+    punpcklbw   m1,             m7
> >+
> >+    pmaddwd     m6,             m1,     [r3 + 6 * 16]
> >+    pmaddwd     m7,             m3,     [r3 + 12 * 16]
> >+
> >+    packssdw    m6,             m7
> >+    paddw       m6,             m2
> >+    psraw       m6,             5
> >+
> >+    pmaddwd     m5,             m4,     [r3 + 18 * 16]
> >+
> >+    movd        m7,             [r1 + 12]
> >+    punpcklwd   m7,             m4
> >+    pand        m7,             [pb_0000000000000F0F]
> >+    pslldq      m4,             4
> >+    por         m4,             m7
> >+
> >+    pmaddwd     m4,             [r3 + 24 * 16]
> >+    packssdw    m5,             m4
> >+    paddw       m5,             m2
> >+    psraw       m5,             5
> >+
> >+    packuswb    m6,             m5
> >+    mova        [r0 + 272],     m6
> >+    movd        [r0 + 324],     m6                              ;mode 22 row 1
> >+
> >+; mode 20
> >+
> >+    pmaddwd     m5,             m1,     [r3 + 11 * 16]
> >+
> >+    movd        m4,             [r1 + 10]
> >+    pand        m3,             [pw_FFFFFFF0]
> >+    pand        m4,             [pb_000000000000000F]
> >+    por         m3,             m4
> >+
> >+    pmaddwd     m6,             m3,     [r3 + 22 * 16]
> >+
> >+    packssdw    m5,             m6
> >+    paddw       m5,             m2
> >+    psraw       m5,             5
> >+
> >+    pmaddwd     m4,             m3,     [r3 + 1 * 16]
> >+
> >+    punpcklwd   m0,             m3
> >+    pand        m0,             [pb_0000000000000F0F]
> >+    mova        m6,             m3
> >+    pslldq      m6,             4
> >+    por         m0,             m6
> >+
> >+    pmaddwd     m6,             m0,     [r3 + 12 * 16]
> >+
> >+    packssdw    m4,             m6
> >+    paddw       m4,             m2
> >+    psraw       m4,             5
> >+
> >+    packuswb    m5,             m4
> >+    mova        [r0 + 288],     m5
> >+
> >+; mode 21
> >+
> >+    pmaddwd     m4,             m1,     [r3 + 15 * 16]
> >+    pmaddwd     m6,             m3,     [r3 + 30 * 16]
> >+
> >+    packssdw    m4,             m6
> >+    paddw       m4,             m2
> >+    psraw       m4,             5
> >+
> >+    pmaddwd     m5,             m3,     [r3 + 13 * 16]
> >+
> >+    pand        m0,             [pw_FFFFFFF0]
> >+    pand        m7,             [pb_000000000000000F]
> >+    por         m0,             m7
> >+
> >+    pmaddwd     m0,             [r3 + 28 * 16]
> >+    packssdw    m5,             m0
> >+    paddw       m5,             m2
> >+    psraw       m5,             5
> >+
> >+    packuswb    m4,             m5
> >+    mova        [r0 + 304],     m4
> >+
> >+; mode 22
> >+
> >+    pmaddwd     m4,             m1,     [r3 + 19 * 16]
> >+    packssdw    m4,             m4
> >+    paddw       m4,             m2
> >+    psraw       m4,             5
> >+
> >+    mova        m0,             [r3 + 12 * 16]
> >+    pmaddwd     m5,             m3,     [r3 + 25 * 16]
> >+    pmaddwd     m6,             m3,     m0
> >+
> >+    packssdw    m5,             m6
> >+    paddw       m5,             m2
> >+    psraw       m5,             5
> >+
> >+    packuswb    m4,             m5
> >+    movd        [r0 + 320],     m4
> >+    psrldq      m4,             8
> >+    movh        [r0 + 328],     m4
> >+
> >+; mode 23
> >+
> >+    pmaddwd     m4,             m1,     [r3 + 23 * 16]
> >+    pmaddwd     m5,             m1,     [r3 + 14 * 16]
> >+
> >+    packssdw    m4,             m5
> >+    paddw       m4,             m2
> >+    psraw       m4,             5
> >+
> >+    pmaddwd     m6,             m1,     [r3 + 5 * 16]
> >+
> >+    pand        m3,             [pw_FFFFFFF0]
> >+    por         m3,             m7
> >+
> >+    pmaddwd     m3,             [r3 + 28 * 16]
> >+    packssdw    m6,             m3
> >+    paddw       m6,             m2
> >+    psraw       m6,             5
> >+
> >+    packuswb    m4,             m6
> >+    mova        [r0 + 336],     m4
> >+
> >+; mode 24
> >+
> >+    pmaddwd     m4,             m1,     [r3 + 27 * 16]
> >+    pmaddwd     m5,             m1,     [r3 + 22 * 16]
> >+
> >+    packssdw    m4,             m5
> >+    paddw       m4,             m2
> >+    psraw       m4,             5
> >+
> >+    pmaddwd     m6,             m1,     [r3 + 17 * 16]
> >+    pmaddwd     m0,             m1
> >+
> >+    packssdw    m6,             m0
> >+    paddw       m6,             m2
> >+    psraw       m6,             5
> >+
> >+    packuswb    m4,             m6
> >+    mova        [r0 + 352],     m4
> >+
> >+; mode 25
> >+
> >+    pmaddwd     m4,             m1,     [r3 + 30 * 16]
> >+    pmaddwd     m5,             m1,     [r3 + 28 * 16]
> >+
> >+    packssdw    m4,             m5
> >+    paddw       m4,             m2
> >+    psraw       m4,             5
> >+
> >+    pmaddwd     m6,             m1,     [r3 + 26 * 16]
> >+    pmaddwd     m1,             [r3 + 24 * 16]
> >+
> >+    packssdw    m6,             m1
> >+    paddw       m6,             m2
> >+    psraw       m6,             5
> >+
> >+    packuswb    m4,             m6
> >+    mova        [r0 + 368],     m4
> >+
> >+; mode 27
> >+
> >+    movh        m0,             [r1 + 1]
> >+    pxor        m7,             m7
> >+    punpcklbw   m0,             m0
> >+    psrldq      m0,             1
> >+    movh        m1,             m0
> >+    psrldq      m0,             2
> >+    movh        m3,             m0
> >+    psrldq      m0,             2
> >+    punpcklbw   m1,             m7
> >+    punpcklbw   m3,             m7
> >+    punpcklbw   m0,             m7
> >+
> >+    mova        m7,             [r3 + 4 * 16]
> >+
> >+    pmaddwd     m4,             m1,     [r3 + 2 * 16]
> >+    pmaddwd     m5,             m1,     m7
> >+
> >+    packssdw    m4,             m5
> >+    paddw       m4,             m2
> >+    psraw       m4,             5
> >+
> >+    pmaddwd     m6,             m1,     [r3 + 6 * 16]
> >+    pmaddwd     m5,             m1,     [r3 + 8 * 16]
> >+
> >+    packssdw    m6,             m5
> >+    paddw       m6,             m2
> >+    psraw       m6,             5
> >+
> >+    packuswb    m4,             m6
> >+    mova        [r0 + 400],     m4
> >+
> >+; mode 28
> >+
> >+    pmaddwd     m4,             m1,     [r3 + 5 * 16]
> >+    pmaddwd     m5,             m1,     [r3 + 10 * 16]
> >+
> >+    packssdw    m4,             m5
> >+    paddw       m4,             m2
> >+    psraw       m4,             5
> >+
> >+    pmaddwd     m6,             m1,     [r3 + 15 * 16]
> >+    pmaddwd     m5,             m1,     [r3 + 20 * 16]
> >+
> >+    packssdw    m6,             m5
> >+    paddw       m6,             m2
> >+    psraw       m6,             5
> >+
> >+    packuswb    m4,             m6
> >+    mova        [r0 + 416],     m4
> >+
> >+; mode 29
> >+
> >+    pmaddwd     m4,             m1,     [r3 + 9 * 16]
> >+    pmaddwd     m6,             m1,     [r3 + 18 * 16]
> >+
> >+    packssdw    m4,             m6
> >+    paddw       m4,             m2
> >+    psraw       m4,             5
> >+
> >+    pmaddwd     m6,             m1,     [r3 + 27 * 16]
> >+    pmaddwd     m5,             m3,     m7
> >+
> >+    packssdw    m6,             m5
> >+    paddw       m6,             m2
> >+    psraw       m6,             5
> >+
> >+    packuswb    m4,             m6
> >+    mova        [r0 + 432],     m4
> >+
> >+; mode 30
> >+
> >+    pmaddwd     m4,             m1,     [r3 + 13 * 16]
> >+    pmaddwd     m5,             m1,     [r3 + 26 * 16]
> >+
> >+    packssdw    m4,             m5
> >+    paddw       m4,             m2
> >+    psraw       m4,             5
> >+
> >+    pmaddwd     m6,             m3,     [r3 + 7 * 16]
> >+    pmaddwd     m5,             m3,     [r3 + 20 * 16]
> >+
> >+    packssdw    m6,             m5
> >+    paddw       m6,             m2
> >+    psraw       m6,             5
> >+
> >+    packuswb    m4,             m6
> >+    mova        [r0 + 448],     m4
> >+    psrldq      m4,             4
> >+    movh        [r0 + 496],     m4                      ;mode 33 row 0
> >+    psrldq      m4,             8
> >+    movd        [r0 + 500],     m4                      ;mode 33 row 1
> >+
> >+; mode 31
> >+
> >+    pmaddwd     m4,             m1,     [r3 + 17 * 16]
> >+    pmaddwd     m5,             m3,     [r3 + 2 * 16]
> >+
> >+    packssdw    m4,             m5
> >+    paddw       m4,             m2
> >+    psraw       m4,             5
> >+
> >+    pmaddwd     m6,             m3,     [r3 + 19 * 16]
> >+    pmaddwd     m7,             m0;,     [r3 + 4 * 16]
> >+
> >+    packssdw    m6,             m7
> >+    paddw       m6,             m2
> >+    psraw       m6,             5
> >+
> >+    packuswb    m4,             m6
> >+    mova        [r0 + 464],     m4
> >+
> >+; mode 32
> >+
> >+    pmaddwd     m1,             [r3 + 21 * 16]
> >+    pmaddwd     m5,             m3,     [r3 + 10 * 16]
> >+
> >+    packssdw    m1,             m5
> >+    paddw       m1,             m2
> >+    psraw       m1,             5
> >+
> >+    pmaddwd     m3,             [r3 + 31 * 16]
> >+    pmaddwd     m5,             m0,     [r3 + 20 * 16]
> >+    packssdw    m3,             m5
> >+    paddw       m3,             m2
> >+    psraw       m3,             5
> >+
> >+    packuswb    m1,             m3
> >+    mova        [r0 + 480],     m1
> >+
> >+; mode 33
> >+
> >+    pmaddwd     m0,             [r3 + 14 * 16]
> >+    pxor        m7,             m7
> >+    movh        m4,             [r1 + 4]
> >+    punpcklbw   m4,             m4
> >+    psrldq      m4,             1
> >+    punpcklbw   m4,             m7
> >+
> >+    pmaddwd     m4,             [r3 + 8 * 16]
> >+
> >+    packssdw    m0,             m4
> >+    paddw       m0,             m2
> >+    psraw       m0,             5
> >+
> >+    packuswb    m0,             m0
> >+    movh        [r0 + 504],     m0
> >+
> >+; mode 34
> >+
> >+    movh        m7,             [r1 + 2]
> >+    movd        [r0 + 512],     m7
> >+
> >+    psrldq      m7,             1
> >+    movd        [r0 + 516],     m7
> >+
> >+    psrldq      m7,             1
> >+    movd        [r0 + 520],     m7
> >+
> >+    psrldq      m7,             1
> >+    movd        [r0 + 524],     m7
> >+
> >+RET
> >\ No newline at end of file
> >_______________________________________________
> >x265-devel mailing list
> >x265-devel at videolan.org
> >https://mailman.videolan.org/listinfo/x265-devel
>
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150411/14f2e53e/attachment-0001.html>


More information about the x265-devel mailing list