[x265] [PATCH Review only] asm: pixelsub_ps routine for 2xN blocks

chen chenm003 at 163.com
Mon Nov 11 14:31:10 CET 2013


> ;-----------------------------------------------------------------------------
>+; void pixel_sub_ps_c_2x4(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
>+;-----------------------------------------------------------------------------
>+INIT_XMM sse4
>+cglobal pixel_sub_ps_2x4, 6, 7, 2, dest, deststride, src0, src1, srcstride0, srcstride1
>+
>+add         r1,    r1
>+
>+movd        m0,    [r2]
>+movd        m1,    [r2 + r4]
>+movd        m2,    [r2 + 2 * r4]
I am not too worried about small-block performance, but if you use the code below, it is shorter and faster:
movd m0, [r2]
movhps m0, [r2 + r4]
 
>+
>+movd        m3,    [r3]
>+movd        m4,    [r3 + r5]
>+movd        m5,    [r3 + 2 * r5]
>+
>+lea         r2,    [r2 + 2 * r4]
>+lea         r3,    [r3 + 2 * r5]
>+
>+movd        m6,    [r2 + r4]
>+movd        m7,    [r3 + r5]
>+
>+pmovzxbw    m0,    m0
>+pmovzxbw    m1,    m1
>+pmovzxbw    m2,    m2
>+pmovzxbw    m3,    m3
>+pmovzxbw    m4,    m4
>+pmovzxbw    m5,    m5
>+pmovzxbw    m6,    m6
>+pmovzxbw    m7,    m7
>+
>+psubw       m0,    m3
>+psubw       m1,    m4
>+psubw       m2,    m5
>+psubw       m6,    m7
With the code above, only half as many pmovzxbw and psubw instructions are needed here.

>+movd    [r0],            m0
>+movd    [r0 + r1],       m1
>+movd    [r0 + 2* r1],    m2
>+
>+lea     r0,              [r0 + 2 * r1]
>+
>+movd    [r0 + r1],       m6
>+
>+RET
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131111/8d603171/attachment.html>


More information about the x265-devel mailing list