[x265] [PATCH Review only] asm: pixelsub_ps routine for 2xN blocks
chen
chenm003 at 163.com
Mon Nov 11 14:31:10 CET 2013
> ;-----------------------------------------------------------------------------
>+; void pixel_sub_ps_c_2x4(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
>+;-----------------------------------------------------------------------------
>+INIT_XMM sse4
>+cglobal pixel_sub_ps_2x4, 6, 7, 2, dest, deststride, src0, src1, srcstride0, srcstride1
>+
>+add r1, r1
>+
>+movd m0, [r2]
>+movd m1, [r2 + r4]
>+movd m2, [r2 + 2 * r4]
I am not too worried about small-block performance, but if you use the code below, it is shorter and faster:
movd m0, [r2]
movhps m0, [r2 + r4]
>+
>+movd m3, [r3]
>+movd m4, [r3 + r5]
>+movd m5, [r3 + 2 * r5]
>+
>+lea r2, [r2 + 2 * r4]
>+lea r3, [r3 + 2 * r5]
>+
>+movd m6, [r2 + r4]
>+movd m7, [r3 + r5]
>+
>+pmovzxbw m0, m0
>+pmovzxbw m1, m1
>+pmovzxbw m2, m2
>+pmovzxbw m3, m3
>+pmovzxbw m4, m4
>+pmovzxbw m5, m5
>+pmovzxbw m6, m6
>+pmovzxbw m7, m7
>+
>+psubw m0, m3
>+psubw m1, m4
>+psubw m2, m5
>+psubw m6, m7
With the code above, only half as many pmovzxbw and psubw instructions are needed here.
>+movd [r0], m0
>+movd [r0 + r1], m1
>+movd [r0 + 2* r1], m2
>+
>+lea r0, [r0 + 2 * r1]
>+
>+movd [r0 + r1], m6
>+
>+RET
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131111/8d603171/attachment.html>
More information about the x265-devel
mailing list