[x265] [PATCH Review only] asm: code for sse_pp_12x16 routine
chen
chenm003 at 163.com
Thu Nov 21 16:49:50 CET 2013
> ;-----------------------------------------------------------------------------
>+; int pixel_ssd_12x16( uint8_t *, intptr_t, uint8_t *, intptr_t )
>+;-----------------------------------------------------------------------------
>+INIT_XMM ssse3
>+cglobal pixel_ssd_12x16, 4, 5, 7, src1, stride1, src2, stride2
>+
>+ pxor m6, m6
>+ mov r4d, 4
>+
>+.loop
>+ movu m0, [r0]
>+ movu m1, [r2]
>+ movu m2, [r0 + r1]
>+ movu m3, [r2 + r3]
>+
>+ mova m4, m0
>+ mova m5, m1
>+ punpckhdq m4, m2
>+ punpckhdq m5, m3
punpckhdq m4, m0, m2
On AVX and later, this three-operand form is faster, since it removes the need for the separate mova.
>+
>+ pmovzxbw m0, m0
>+ pmovzxbw m1, m1
>+ pmovzxbw m2, m2
>+ pmovzxbw m3, m3
>+ pmovzxbw m4, m4
>+ pmovzxbw m5, m5
>+
>+ psubw m0, m1
>+ psubw m2, m3
>+ psubw m4, m5
>+
>+ pmaddwd m0, m0
>+ pmaddwd m2, m2
>+ pmaddwd m4, m4
>+
>+ paddd m6, m0
>+ paddd m6, m2
>+ paddd m6, m4
paddd m0, m2
paddd m6, m4
paddd m6, m0
(Adding m0 and m2 together first shortens the dependency chain on m6.)
>+ movu m0, [r0 + 2 * r1]
>+ movu m1, [r2 + 2 * r3]
>+ lea r0, [r0 + 2 * r1]
>+ lea r2, [r2 + 2 * r3]
>+ movu m2, [r0 + r1]
>+ movu m3, [r2 + r3]
>+
>+ mova m4, m0
>+ mova m5, m1
>+ punpckhdq m4, m2
>+ punpckhdq m5, m3
>+
>+ pmovzxbw m0, m0
>+ pmovzxbw m1, m1
>+ pmovzxbw m2, m2
>+ pmovzxbw m3, m3
>+ pmovzxbw m4, m4
>+ pmovzxbw m5, m5
>+
>+ psubw m0, m1
>+ psubw m2, m3
>+ psubw m4, m5
>+
>+ pmaddwd m0, m0
>+ pmaddwd m2, m2
>+ pmaddwd m4, m4
>+
>+ paddd m6, m0
>+ paddd m6, m2
>+ paddd m6, m4
>+
>+ lea r0, [r0 + 2 * r1]
>+ lea r2, [r2 + 2 * r3]
>+ dec r4d
>+ jnz .loop
The dec could be moved earlier in the loop to hide the flag register latency.
>+
>+ HADDD m6, m1
>+ movd eax, m6
>+
>+ RET
>+
>+;-----------------------------------------------------------------------------
> ; void pixel_ssd_nv12_core( uint16_t *pixuv1, intptr_t stride1, uint16_t *pixuv2, intptr_t stride2,
> ; int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
> ;
>diff -r 44d15f8ce940 -r 289b23f3ec9e source/common/x86/pixel.h
>--- a/source/common/x86/pixel.h Thu Nov 21 14:44:06 2013 +0530
>+++ b/source/common/x86/pixel.h Thu Nov 21 20:18:13 2013 +0530
>@@ -372,5 +372,6 @@
> uint32_t x265_quant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff, int32_t* lastPos);
> void x265_transpose32_sse2(pixel *dest, pixel *src, intptr_t stride);
> void x265_transpose64_sse2(pixel *dest, pixel *src, intptr_t stride);
>+int x265_pixel_ssd_12x16_ssse3(pixel *, intptr_t, pixel *, intptr_t);
>
> #endif // ifndef X265_I386_PIXEL_H
>_______________________________________________
>x265-devel mailing list
>x265-devel at videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131121/0f09ef55/attachment.html>
More information about the x265-devel
mailing list