[x265] [PATCH] all_angs_pred_4x4, asm code for all modes
Praveen Tiwari
praveen at multicorewareinc.com
Wed Dec 4 15:52:44 CET 2013
Replaced the mova load from tab_Zero with pxor, as suggested.
Regards,
Praveen Tiwari
2013/12/4 chen <chenm003 at 163.com>
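> mova m3, [tab_Zero]
> Use "pxor m3, m3" here instead: zeroing the register with pxor is faster than loading an all-zero constant from memory.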
>
> At 2013-12-04 21:57:09,praveen at multicorewareinc.com wrote:
>
> ># HG changeset patch
> ># User Praveen Tiwari
> ># Date 1386165418 -19800
> ># Node ID 1e130f825cbbfa805602d4eb381b38e62ff82458
> ># Parent 1d2d60f4eb81882fa0f3ba6c4e7aa9a220968f7a
> >all_angs_pred_4x4, asm code for all modes
> >
> >diff -r 1d2d60f4eb81 -r 1e130f825cbb source/common/x86/asm-primitives.cpp
> >--- a/source/common/x86/asm-primitives.cpp Tue Dec 03 23:56:22 2013 -0600
> >+++ b/source/common/x86/asm-primitives.cpp Wed Dec 04 19:26:58 2013 +0530
> >@@ -780,6 +780,8 @@
> > p.intra_pred_planar[BLOCK_8x8] = x265_intra_pred_planar8_sse4;
> > p.intra_pred_planar[BLOCK_16x16] = x265_intra_pred_planar16_sse4;
> > p.intra_pred_planar[BLOCK_32x32] = x265_intra_pred_planar32_sse4;
> >+
> >+ p.intra_pred_allangs[BLOCK_4x4] = x265_all_angs_pred_4x4_sse4;
> > }
> > if (cpuMask & X265_CPU_AVX)
> > {
> >diff -r 1d2d60f4eb81 -r 1e130f825cbb source/common/x86/intrapred.h
> >--- a/source/common/x86/intrapred.h Tue Dec 03 23:56:22 2013 -0600
> >+++ b/source/common/x86/intrapred.h Wed Dec 04 19:26:58 2013 +0530
> >@@ -39,4 +39,6 @@
> > void x265_intra_pred_ang4_2_ssse3(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter);
> > void x265_intra_pred_ang4_3_ssse3(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter);
> >
> >+void x265_all_angs_pred_4x4_sse4(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma);
> >+
> > #endif // ifndef X265_INTRAPRED_H
> >diff -r 1d2d60f4eb81 -r 1e130f825cbb source/common/x86/intrapred8.asm
> >--- a/source/common/x86/intrapred8.asm Tue Dec 03 23:56:22 2013 -0600
> >+++ b/source/common/x86/intrapred8.asm Wed Dec 04 19:26:58 2013 +0530
> >@@ -34,6 +34,8 @@
> >
> > c_trans_4x4 db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
> >
> >+tab_Zero: db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
> >+
> > const ang_table
> > %assign x 0
> > %rep 32
> >@@ -750,3 +752,793 @@
> > lea r1, [r1 * 3]
> > movd [r0 + r1], m3
> > RET
> >+
> >+;-----------------------------------------------------------------------------
> >+; void all_angs_pred_4x4(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma)
> >+;-----------------------------------------------------------------------------
> >+INIT_XMM sse4
> >+cglobal all_angs_pred_4x4, 6, 6, 8 dest, above0, left0, above1, left1, bLuma
> >+
> >+; mode 2
> >+
> >+movh m0, [r2 + 2]
> >+movd [r0], m0
> >+
> >+palignr m1, m0, 1
> >+movd [r0 + 4], m1
> >+
> >+palignr m1, m0, 2
> >+movd [r0 + 8], m1
> >+
> >+psrldq m0, 3
> >+movd [r0 + 12], m0
> >+
> >+; mode 3
> >+
> >+mova m0, [pw_1024]
> >+
> >+movh m1, [r2 + 1]
> >+
> >+palignr m2, m1, 1
> >+punpcklbw m1, m2
> >+
> >+lea r5, [ang_table]
> >+
> >+pmaddubsw m5, m1, [r5 + 26 * 16]
> >+pmulhrsw m5, m0
> >+packuswb m5, m5
> >+movd [r0 + 16], m5
> >+
> >+palignr m2, m1, 2
> >+
> >+pmaddubsw m6, m2, [r5 + 20 * 16]
> >+pmulhrsw m6, m0
> >+packuswb m6, m6
> >+movd [r0 + 20], m6
> >+
> >+palignr m3, m1, 4
> >+
> >+pmaddubsw m4, m3, [r5 + 14 * 16]
> >+pmulhrsw m4, m0
> >+packuswb m4, m4
> >+movd [r0 + 24], m4
> >+
> >+palignr m4, m1, 6
> >+
> >+pmaddubsw m4, [r5 + 8 * 16]
> >+pmulhrsw m4, m0
> >+packuswb m4, m4
> >+movd [r0 + 28], m4
> >+
> >+; mode 4
> >+
> >+pmaddubsw m4, m1, [r5 + 21 * 16]
> >+pmulhrsw m4, m0
> >+packuswb m4, m4
> >+movd [r0 + 32], m4
> >+
> >+pmaddubsw m4, m2, [r5 + 10 * 16]
> >+pmulhrsw m4, m0
> >+packuswb m4, m4
> >+movd [r0 + 36], m4
> >+
> >+pmaddubsw m4, m2, [r5 + 31 * 16]
> >+pmulhrsw m4, m0
> >+packuswb m4, m4
> >+movd [r0 + 40], m4
> >+
> >+pmaddubsw m4, m3, [r5 + 20 * 16]
> >+pmulhrsw m4, m0
> >+packuswb m4, m4
> >+movd [r0 + 44], m4
> >+
> >+; mode 5
> >+
> >+pmaddubsw m4, m1, [r5 + 17 * 16]
> >+pmulhrsw m4, m0
> >+packuswb m4, m4
> >+movd [r0 + 48], m4
> >+
> >+pmaddubsw m4, m2, [r5 + 2 * 16]
> >+pmulhrsw m4, m0
> >+packuswb m4, m4
> >+movd [r0 + 52], m4
> >+
> >+pmaddubsw m4, m2, [r5 + 19 * 16]
> >+pmulhrsw m4, m0
> >+packuswb m4, m4
> >+movd [r0 + 56], m4
> >+
> >+pmaddubsw m3, [r5 + 4 * 16]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 60], m3
> >+
> >+; mode 6
> >+
> >+pmaddubsw m3, m1, [r5 + 13 * 16]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 64], m3
> >+
> >+movd [r0 + 68], m5
> >+
> >+pmaddubsw m3, m2, [r5 + 7 * 16]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 72], m3
> >+
> >+movd [r0 + 76], m6
> >+
> >+; mode 7
> >+
> >+pmaddubsw m3, m1, [r5 + 9 * 16]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 80], m3
> >+
> >+pmaddubsw m3, m1, [r5 + 18 * 16]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 84], m3
> >+
> >+pmaddubsw m3, m1, [r5 + 27 * 16]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 88], m3
> >+
> >+pmaddubsw m2, [r5 + 4 * 16]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 92], m2
> >+
> >+; mode 8
> >+
> >+pmaddubsw m2, m1, [r5 + 5 * 16]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 96], m2
> >+
> >+pmaddubsw m2, m1, [r5 + 10 * 16]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 100], m2
> >+
> >+pmaddubsw m2, m1, [r5 + 15 * 16]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 104], m2
> >+
> >+pmaddubsw m2, m1, [r5 + 20 * 16]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 108], m2
> >+
> >+; mode 9
> >+
> >+pmaddubsw m2, m1, [r5 + 2 * 16]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 112], m2
> >+
> >+pmaddubsw m2, m1, [r5 + 4 * 16]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 116], m2
> >+
> >+pmaddubsw m2, m1, [r5 + 6 * 16]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 120], m2
> >+
> >+pmaddubsw m1, [r5 + 8 * 16]
> >+pmulhrsw m1, m0
> >+packuswb m1, m1
> >+movd [r0 + 124], m1
> >+
> >+; mode 10
> >+
> >+movh m1, [r2]
> >+palignr m2, m1, 1
> >+pshufd m3, m2, 0
> >+movu [r0 + 128], m3
> >+
> >+mova m3, [tab_Zero]
> >+
> >+pshufb m4, m2, m3
> >+punpcklbw m4, m3
> >+
> >+movh m5, [r1]
> >+
> >+pshufb m6, m5, m3
> >+punpcklbw m6, m3
> >+
> >+psrldq m5, 1
> >+punpcklbw m5, m3
> >+
> >+psubw m5, m6
> >+psraw m5, 1
> >+
> >+paddw m4, m5
> >+
> >+packuswb m4, m3
> >+
> >+pextrb [r0 + 128], m4, 0
> >+pextrb [r0 + 132], m4, 1
> >+pextrb [r0 + 136], m4, 2
> >+pextrb [r0 + 140], m4, 3
> >+
> >+; mode 11
> >+
> >+palignr m2, m1, 1
> >+punpcklbw m1, m2
> >+
> >+pmaddubsw m2, m1, [r5 + 30 * 16]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 144], m2
> >+
> >+pmaddubsw m2, m1, [r5 + 28 * 16]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 148], m2
> >+
> >+pmaddubsw m2, m1, [r5 + 26 * 16]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 152], m2
> >+
> >+pmaddubsw m2, m1, [r5 + 24 * 16]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 156], m2
> >+
> >+; mode 12
> >+
> >+pmaddubsw m2, m1, [r5 + 27 * 16]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 160], m2
> >+
> >+pmaddubsw m2, m1, [r5 + 22 * 16]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 164], m2
> >+
> >+pmaddubsw m2, m1, [r5 + 17 * 16]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 168], m2
> >+
> >+pmaddubsw m2, m1, [r5 + 12 * 16]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 172], m2
> >+
> >+; mode 13
> >+
> >+pmaddubsw m2, m1, [r5 + 23 * 16]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 176], m2
> >+
> >+pmaddubsw m2, m1, [r5 + 14 * 16]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 180], m2
> >+
> >+pmaddubsw m2, m1, [r5 + 5 * 16]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 184], m2
> >+
> >+pslldq m2, m1, 2
> >+pinsrb m2, [r1 + 0], 1
> >+pinsrb m2, [r1 + 4], 0
> >+
> >+pmaddubsw m3, m2, [r5 + 28 * 16]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 188], m3
> >+
> >+; mode 14
> >+
> >+pmaddubsw m3, m1, [r5 + 19 * 16]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 192], m3
> >+
> >+pmaddubsw m5, m1, [r5 + 6 * 16]
> >+pmulhrsw m5, m0
> >+packuswb m5, m5
> >+movd [r0 + 196], m5
> >+
> >+pinsrb m2, [r1 + 2], 0
> >+
> >+pmaddubsw m3, m2, [r5 + 25 * 16]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 200], m3
> >+
> >+pmaddubsw m3, m2, [r5 + 12 * 16]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 204], m3
> >+
> >+; mode 15
> >+
> >+pmaddubsw m3, m1, [r5 + 15 * 16]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 208], m3
> >+
> >+pmaddubsw m3, m2, [r5 + 30 * 16]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 212], m3
> >+
> >+pmaddubsw m3, m2, [r5 + 13 * 16]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 216], m3
> >+
> >+pslldq m3, m2, 2
> >+pinsrb m3, [r1 + 2], 1
> >+pinsrb m3, [r1 + 4], 0
> >+
> >+pmaddubsw m4, m3, [r5 + 28 * 16]
> >+pmulhrsw m4, m0
> >+packuswb m4, m4
> >+movd [r0 + 220], m4
> >+
> >+; mode 16
> >+
> >+pmaddubsw m4, m1, [r5 + 11 * 16]
> >+pmulhrsw m4, m0
> >+packuswb m4, m4
> >+movd [r0 + 224], m4
> >+
> >+pmaddubsw m4, m2, [r5 + 22 * 16]
> >+pmulhrsw m4, m0
> >+packuswb m4, m4
> >+movd [r0 + 228], m4
> >+
> >+pmaddubsw m4, m2, [r5 + 1 * 16]
> >+pmulhrsw m4, m0
> >+packuswb m4, m4
> >+movd [r0 + 232], m4
> >+
> >+pinsrb m3, [r1 + 3], 0
> >+
> >+pmaddubsw m3, [r5 + 12 * 16]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 236], m3
> >+
> >+; mode 17
> >+
> >+movd [r0 + 240], m5
> >+
> >+pslldq m1, 2
> >+pinsrb m1, [r1 + 1], 0
> >+pinsrb m1, [r1 + 0], 1
> >+
> >+pmaddubsw m2, m1, [r5 + 12 * 16]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 244], m2
> >+
> >+pslldq m1, 2
> >+pinsrb m1, [r1 + 2], 0
> >+pinsrb m1, [r1 + 1], 1
> >+
> >+pmaddubsw m2, m1, [r5 + 18 * 16]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 248], m2
> >+
> >+pslldq m1, 2
> >+pinsrb m1, [r1 + 4], 0
> >+pinsrb m1, [r1 + 2], 1
> >+
> >+pmaddubsw m1, [r5 + 24 * 16]
> >+pmulhrsw m1, m0
> >+packuswb m1, m1
> >+movd [r0 + 252], m1
> >+
> >+; mode 18
> >+
> >+movh m1, [r1]
> >+movd [r0 + 256], m1
> >+
> >+pslldq m2, m1, 1
> >+pinsrb m2, [r2 + 1], 0
> >+movd [r0 + 260], m2
> >+
> >+pslldq m3, m2, 1
> >+pinsrb m3, [r2 + 2], 0
> >+movd [r0 + 264], m3
> >+
> >+pslldq m4, m3, 1
> >+pinsrb m4, [r2 + 3], 0
> >+movd [r0 + 268], m4
> >+
> >+; mode 19
> >+
> >+palignr m4, m1, 1
> >+punpcklbw m1, m4
> >+
> >+pmaddubsw m5, m1, [r5 + 6 * 16]
> >+pmulhrsw m5, m0
> >+packuswb m5, m5
> >+movd [r0 + 272], m5
> >+
> >+pslldq m2, m1, 2
> >+pinsrb m2, [r2 + 1], 0
> >+pinsrb m2, [r2], 1
> >+
> >+pmaddubsw m3, m2, [r5 + 12 * 16]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 276], m3
> >+
> >+pslldq m3, m2, 2
> >+pinsrb m3, [r2 + 1], 1
> >+pinsrb m3, [r2 + 2], 0
> >+
> >+pmaddubsw m4, m3, [r5 + 18 * 16]
> >+pmulhrsw m4, m0
> >+packuswb m4, m4
> >+movd [r0 + 280], m4
> >+
> >+pslldq m3, 2
> >+pinsrb m3, [r2 + 2], 1
> >+pinsrb m3, [r2 + 4], 0
> >+
> >+pmaddubsw m3, [r5 + 24 * 16]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 284], m3
> >+
> >+; mode 20
> >+
> >+pmaddubsw m3, m1, [r5 + 11 * 16]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 288], m3
> >+
> >+pinsrb m2, [r2 + 2], 0
> >+
> >+pmaddubsw m3, m2, [r5 + 22 * 16]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 292], m3
> >+
> >+pmaddubsw m3, m2, [r5 + 1 * 16]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 296], m3
> >+
> >+pslldq m3, m2, 2
> >+pinsrb m3, [r2 + 2], 1
> >+pinsrb m3, [r2 + 3], 0
> >+
> >+pmaddubsw m4, m3, [r5 + 12 * 16]
> >+pmulhrsw m4, m0
> >+packuswb m4, m4
> >+movd [r0 + 300], m4
> >+
> >+; mode 21
> >+
> >+pmaddubsw m4, m1, [r5 + 15 * 16]
> >+pmulhrsw m4, m0
> >+packuswb m4, m4
> >+movd [r0 + 304], m4
> >+
> >+pmaddubsw m4, m2, [r5 + 30 * 16]
> >+pmulhrsw m4, m0
> >+packuswb m4, m4
> >+movd [r0 + 308], m4
> >+
> >+pmaddubsw m4, m2, [r5 + 13 * 16]
> >+pmulhrsw m4, m0
> >+packuswb m4, m4
> >+movd [r0 + 312], m4
> >+
> >+pinsrb m3, [r2 + 4], 0
> >+
> >+pmaddubsw m3, [r5 + 28 * 16]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 316], m3
> >+
> >+; mode 22
> >+
> >+pmaddubsw m3, m1, [r5 + 19 * 16]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 320], m3
> >+
> >+movd [r0 + 324], m5
> >+
> >+pmaddubsw m3, m2, [r5 + 25 * 16]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 328], m3
> >+
> >+pmaddubsw m3, m2, [r5 + 12 * 16]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 332], m3
> >+
> >+; mode 23
> >+
> >+pmaddubsw m3, m1, [r5 + 23 * 16]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 336], m3
> >+
> >+pmaddubsw m3, m1, [r5 + 14 * 16]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 340], m3
> >+
> >+pmaddubsw m3, m1, [r5 + 5 * 16]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 344], m3
> >+
> >+pinsrb m2, [r2 + 4], 0
> >+
> >+pmaddubsw m2, [r5 + 28 * 16]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 348], m2
> >+
> >+; mode 24
> >+
> >+pmaddubsw m2, m1, [r5 + 27 * 16]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 352], m2
> >+
> >+pmaddubsw m2, m1, [r5 + 22 * 16]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 356], m2
> >+
> >+pmaddubsw m2, m1, [r5 + 17 * 16]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 360], m2
> >+
> >+pmaddubsw m2, m1, [r5 + 12 * 16]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 364], m2
> >+
> >+; mode 25
> >+
> >+pmaddubsw m2, m1, [r5 + 30 * 16]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 368], m2
> >+
> >+pmaddubsw m2, m1, [r5 + 28 * 16]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 372], m2
> >+
> >+pmaddubsw m2, m1, [r5 + 26 * 16]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 376], m2
> >+
> >+pmaddubsw m2, m1, [r5 + 24 * 16]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 380], m2
> >+
> >+; mode 26
> >+
> >+movh m1, [r1 + 1]
> >+pshufd m2, m1, 0
> >+movu [r0 + 384], m2
> >+
> >+mova m2, [tab_Zero]
> >+
> >+pshufb m3, m1, m2
> >+punpcklbw m3, m2
> >+
> >+movh m4, [r2]
> >+
> >+pshufb m5, m4, m2
> >+punpcklbw m5, m2
> >+
> >+psrldq m4, 1
> >+punpcklbw m4, m2
> >+
> >+psubw m4, m5
> >+psraw m4, 1
> >+
> >+paddw m3, m4
> >+
> >+packuswb m3, m2
> >+
> >+pextrb [r0 + 384], m3, 0
> >+pextrb [r0 + 388], m3, 1
> >+pextrb [r0 + 392], m3, 2
> >+pextrb [r0 + 396], m3, 3
> >+
> >+; mode 27
> >+
> >+palignr m2, m1, 1
> >+punpcklbw m1, m2
> >+
> >+pmaddubsw m2, m1, [r5 + 2 * 16]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 400], m2
> >+
> >+pmaddubsw m2, m1, [r5 + 4 * 16]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 404], m2
> >+
> >+pmaddubsw m2, m1, [r5 + 6 * 16]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 408], m2
> >+
> >+pmaddubsw m2, m1, [r5 + 8 * 16]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 412], m2
> >+
> >+; mode 28
> >+
> >+pmaddubsw m2, m1, [r5 + 5 * 16]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 416], m2
> >+
> >+pmaddubsw m2, m1, [r5 + 10 * 16]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 420], m2
> >+
> >+pmaddubsw m2, m1, [r5 + 15 * 16]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 424], m2
> >+
> >+pmaddubsw m2, m1, [r5 + 20 * 16]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 428], m2
> >+
> >+; mode 29
> >+
> >+pmaddubsw m2, m1, [r5 + 9 * 16]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 432], m2
> >+
> >+pmaddubsw m2, m1, [r5 + 18 * 16]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 436], m2
> >+
> >+pmaddubsw m2, m1, [r5 + 27 * 16]
> >+pmulhrsw m2, m0
> >+packuswb m2, m2
> >+movd [r0 + 440], m2
> >+
> >+palignr m2, m1, 2
> >+
> >+pmaddubsw m3, m2, [r5 + 4 * 16]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 444], m3
> >+
> >+; mode 30
> >+
> >+pmaddubsw m3, m1, [r5 + 13 * 16]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 448], m3
> >+
> >+pmaddubsw m6, m1, [r5 + 26 * 16]
> >+pmulhrsw m6, m0
> >+packuswb m6, m6
> >+movd [r0 + 452], m6
> >+
> >+pmaddubsw m3, m2, [r5 + 7 * 16]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 456], m3
> >+
> >+pmaddubsw m5, m2, [r5 + 20 * 16]
> >+pmulhrsw m5, m0
> >+packuswb m5, m5
> >+movd [r0 + 460], m5
> >+
> >+; mode 31
> >+
> >+pmaddubsw m3, m1, [r5 + 17 * 16]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 464], m3
> >+
> >+pmaddubsw m3, m2, [r5 + 2 * 16]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 468], m3
> >+
> >+pmaddubsw m3, m2, [r5 + 19 * 16]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 472], m3
> >+
> >+palignr m3, m2, 2
> >+
> >+pmaddubsw m4, m3, [r5 + 4 * 16]
> >+pmulhrsw m4, m0
> >+packuswb m4, m4
> >+movd [r0 + 476], m4
> >+
> >+; mode 32
> >+
> >+pmaddubsw m4, m1, [r5 + 21 * 16]
> >+pmulhrsw m4, m0
> >+packuswb m4, m4
> >+movd [r0 + 480], m4
> >+
> >+pmaddubsw m4, m2, [r5 + 10 * 16]
> >+pmulhrsw m4, m0
> >+packuswb m4, m4
> >+movd [r0 + 484], m4
> >+
> >+pmaddubsw m4, m2, [r5 + 31 * 16]
> >+pmulhrsw m4, m0
> >+packuswb m4, m4
> >+movd [r0 + 488], m4
> >+
> >+pmaddubsw m4, m3, [r5 + 20 * 16]
> >+pmulhrsw m4, m0
> >+packuswb m4, m4
> >+movd [r0 + 492], m4
> >+
> >+; mode 33
> >+
> >+movd [r0 + 496], m6
> >+
> >+movd [r0 + 500], m5
> >+
> >+pmaddubsw m4, m3, [r5 + 14 * 16]
> >+pmulhrsw m4, m0
> >+packuswb m4, m4
> >+movd [r0 + 504], m4
> >+
> >+psrldq m3, 2
> >+
> >+pmaddubsw m3, [r5 + 8 * 16]
> >+pmulhrsw m3, m0
> >+packuswb m3, m3
> >+movd [r0 + 508], m3
> >+
> >+; mode 34
> >+
> >+movh m0, [r1 + 2]
> >+movd [r0 + 512], m0
> >+
> >+palignr m1, m0, 1
> >+movd [r0 + 516], m1
> >+
> >+palignr m1, m0, 2
> >+movd [r0 + 520], m1
> >+
> >+palignr m1, m0, 3
> >+movd [r0 + 524], m1
> >+
> >+RET
> >diff -r 1d2d60f4eb81 -r 1e130f825cbb source/test/intrapredharness.cpp
> >--- a/source/test/intrapredharness.cpp Tue Dec 03 23:56:22 2013 -0600
> >+++ b/source/test/intrapredharness.cpp Wed Dec 04 19:26:58 2013 +0530
> >@@ -210,7 +210,7 @@
> >
> > for (int i = 0; i <= 100; i++)
> > {
> >- isLuma = (width <= 16) && (rand() % 2);
> >+ isLuma = (width <= 16) ? true : false; // bFilter is true for 4x4, 8x8, 16x16 and false for 32x32
> >
> > pixel * refAbove0 = pixel_buff + j;
> > pixel * refLeft0 = refAbove0 + 3 * width;
> >_______________________________________________
> >x265-devel mailing list
> >x265-devel at videolan.org
> >https://mailman.videolan.org/listinfo/x265-devel
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131204/7b747bd3/attachment-0001.html>
More information about the x265-devel mailing list