[x265] [PATCH] asm: code for intra_pred[BLOCK_32x32] mode 2 and 34

chen chenm003 at 163.com
Wed Jan 15 11:11:41 CET 2014


The code is correct, but it can be improved.
1. You have more free registers; try saving r1*3 in a register so you can eliminate many of the LEA instructions.
2. You can load m4 later, so you can reuse another register and reduce the number of registers used.

At 2014-01-15 18:04:02,dnyaneshwar at multicorewareinc.com wrote:
>+;---------------------------------------------------------------------------------------------------------------
>+; void intraPredAng32(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
>+;---------------------------------------------------------------------------------------------------------------
>+INIT_XMM ssse3
>+cglobal intra_pred_ang32_2, 3,3,5                      ; NOTE(review): assumes x86inc convention rN = Nth argument of the prototype above (r0=dst, r1=dstStride, r2=refLeft, r3=refAbove, r4=dirMode) -- confirm
>+    cmp             r4m, byte 34                       ; is dirMode == 34 ?
>+    cmove           r2, r3mp                           ; if so, read from refAbove instead of refLeft; r2 is called "ref" below
>+    movu            m0, [r2 + 2]                       ; m0 = ref[2..17]
>+    movu            m1, [r2 + 18]                      ; m1 = ref[18..33]
>+    movu            m3, [r2 + 34]                      ; m3 = ref[34..49]
>+    movu            m4, [r2 + 50]                      ; m4 = ref[50..65]; not used until the second half (rows 17..31)
>+    movu            [r0], m0                           ; row 0, bytes 0..15  = ref[2..17]
>+    movu            [r0 + 16], m1                      ; row 0, bytes 16..31 = ref[18..33]
>+    palignr         m2, m1, m0, 1                      ; m2 = 16 bytes of m1:m0 starting at byte 1 = ref[3..18] (presumably x86inc's 3-operand palignr wrapper -- confirm)
>+    movu            [r0 + r1], m2                      ; row 1, low half
>+    palignr         m2, m3, m1, 1                      ; m2 = ref[19..34]
>+    movu            [r0 + r1 + 16], m2                 ; row 1, high half
>+    lea             r0, [r0 + r1 * 2]                  ; advance dst by two rows
>+    palignr         m2, m1, m0, 2                      ; rows 2..15 repeat the pattern: byte shift = row index, so row k = ref[2+k .. 33+k]
>+    movu            [r0], m2
>+    palignr         m2, m3, m1, 2
>+    movu            [r0 + 16], m2
>+    palignr         m2, m1, m0, 3
>+    movu            [r0 + r1], m2
>+    palignr         m2, m3, m1, 3
>+    movu            [r0 + r1 + 16], m2
>+    lea             r0, [r0 + r1 * 2]
>+    palignr         m2, m1, m0, 4
>+    movu            [r0], m2
>+    palignr         m2, m3, m1, 4
>+    movu            [r0 + 16], m2
>+    palignr         m2, m1, m0, 5
>+    movu            [r0 + r1], m2
>+    palignr         m2, m3, m1, 5
>+    movu            [r0 + r1 + 16], m2
>+    lea             r0, [r0 + r1 * 2]
>+    palignr         m2, m1, m0, 6
>+    movu            [r0], m2
>+    palignr         m2, m3, m1, 6
>+    movu            [r0 + 16], m2
>+    palignr         m2, m1, m0, 7
>+    movu            [r0 + r1], m2
>+    palignr         m2, m3, m1, 7
>+    movu            [r0 + r1 + 16], m2
>+    lea             r0, [r0 + r1 * 2]
>+    palignr         m2, m1, m0, 8
>+    movu            [r0], m2
>+    palignr         m2, m3, m1, 8
>+    movu            [r0 + 16], m2
>+    palignr         m2, m1, m0, 9
>+    movu            [r0 + r1], m2
>+    palignr         m2, m3, m1, 9
>+    movu            [r0 + r1 + 16], m2
>+    lea             r0, [r0 + r1 * 2]
>+    palignr         m2, m1, m0, 10
>+    movu            [r0], m2
>+    palignr         m2, m3, m1, 10
>+    movu            [r0 + 16], m2
>+    palignr         m2, m1, m0, 11
>+    movu            [r0 + r1], m2
>+    palignr         m2, m3, m1, 11
>+    movu            [r0 + r1 + 16], m2
>+    lea             r0, [r0 + r1 * 2]
>+    palignr         m2, m1, m0, 12
>+    movu            [r0], m2
>+    palignr         m2, m3, m1, 12
>+    movu            [r0 + 16], m2
>+    palignr         m2, m1, m0, 13
>+    movu            [r0 + r1], m2
>+    palignr         m2, m3, m1, 13
>+    movu            [r0 + r1 + 16], m2
>+    lea             r0, [r0 + r1 * 2]
>+    palignr         m2, m1, m0, 14
>+    movu            [r0], m2
>+    palignr         m2, m3, m1, 14
>+    movu            [r0 + 16], m2
>+    palignr         m2, m1, m0, 15
>+    movu            [r0 + r1], m2
>+    palignr         m2, m3, m1, 15
>+    movu            [r0 + r1 + 16], m2
>+
>+    lea             r0, [r0 + r1 * 2]
>+    palignr         m2, m1, m0, 16                     ; a 16-byte shift selects m1 unchanged: row 16, low half = ref[18..33]
>+    movu            [r0], m2
>+    palignr         m2, m3, m1, 16                     ; likewise selects m3: row 16, high half = ref[34..49]
>+    movu            [r0 + 16], m2
>+    palignr         m2, m3, m1, 1                      ; rows 17..31: source pairs move up to m3:m1 (low) and m4:m3 (high), byte shift = row - 16
>+    movu            [r0 + r1], m2
>+    palignr         m2, m4, m3, 1
>+    movu            [r0 + r1 + 16], m2
>+    lea             r0, [r0 + r1 * 2]
>+    palignr         m2, m3, m1, 2
>+    movu            [r0], m2
>+    palignr         m2, m4, m3, 2
>+    movu            [r0 + 16], m2
>+    palignr         m2, m3, m1, 3
>+    movu            [r0 + r1], m2
>+    palignr         m2, m4, m3, 3
>+    movu            [r0 + r1 + 16], m2
>+    lea             r0, [r0 + r1 * 2]
>+    palignr         m2, m3, m1, 4
>+    movu            [r0], m2
>+    palignr         m2, m4, m3, 4
>+    movu            [r0 + 16], m2
>+    palignr         m2, m3, m1, 5
>+    movu            [r0 + r1], m2
>+    palignr         m2, m4, m3, 5
>+    movu            [r0 + r1 + 16], m2
>+    lea             r0, [r0 + r1 * 2]
>+    palignr         m2, m3, m1, 6
>+    movu            [r0], m2
>+    palignr         m2, m4, m3, 6
>+    movu            [r0 + 16], m2
>+    palignr         m2, m3, m1, 7
>+    movu            [r0 + r1], m2
>+    palignr         m2, m4, m3, 7
>+    movu            [r0 + r1 + 16], m2
>+    lea             r0, [r0 + r1 * 2]
>+    palignr         m2, m3, m1, 8
>+    movu            [r0], m2
>+    palignr         m2, m4, m3, 8
>+    movu            [r0 + 16], m2
>+    palignr         m2, m3, m1, 9
>+    movu            [r0 + r1], m2
>+    palignr         m2, m4, m3, 9
>+    movu            [r0 + r1 + 16], m2
>+    lea             r0, [r0 + r1 * 2]
>+    palignr         m2, m3, m1, 10
>+    movu            [r0], m2
>+    palignr         m2, m4, m3, 10
>+    movu            [r0 + 16], m2
>+    palignr         m2, m3, m1, 11
>+    movu            [r0 + r1], m2
>+    palignr         m2, m4, m3, 11
>+    movu            [r0 + r1 + 16], m2
>+    lea             r0, [r0 + r1 * 2]
>+    palignr         m2, m3, m1, 12
>+    movu            [r0], m2
>+    palignr         m2, m4, m3, 12
>+    movu            [r0 + 16], m2
>+    palignr         m2, m3, m1, 13
>+    movu            [r0 + r1], m2
>+    palignr         m2, m4, m3, 13
>+    movu            [r0 + r1 + 16], m2
>+    lea             r0, [r0 + r1 * 2]
>+    palignr         m2, m3, m1, 14
>+    movu            [r0], m2
>+    palignr         m2, m4, m3, 14
>+    movu            [r0 + 16], m2
>+    palignr         m2, m3, m1, 15                     ; row 31, low half = ref[33..48]
>+    movu            [r0 + r1], m2
>+    palignr         m2, m4, m3, 15                     ; row 31, high half = ref[49..64]; the last byte of m4 (ref[65]) is loaded but never stored
>+    movu            [r0 + r1 + 16], m2
>+
>+    RET
>+
> ;-----------------------------------------------------------------------------
> ; void all_angs_pred_4x4(pixel *dest, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma)
> ;-----------------------------------------------------------------------------
>_______________________________________________
>x265-devel mailing list
>x265-devel at videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20140115/d9306580/attachment-0001.html>


More information about the x265-devel mailing list