<div style="line-height:1.7;color:#000000;font-size:14px;font-family:arial"><DIV>> ;-----------------------------------------------------------------------------<BR>>+; int pixel_ssd_12x16( uint8_t *, intptr_t, uint8_t *, intptr_t )<BR>>+;-----------------------------------------------------------------------------<BR>>+INIT_XMM ssse3<BR>>+cglobal pixel_ssd_12x16, 4, 5, 7, src1, stride1, src2, stride2<BR>>+<BR>>+ pxor m6, m6<BR>>+ mov r4d, 4<BR>>+<BR>>+.loop<BR>>+ movu m0, [r0]<BR>>+ movu m1, [r2]<BR>>+ movu m2, [r0 + r1]<BR>>+ movu m3, [r2 + r3]<BR>>+<BR>>+ mova m4, m0<BR>>+ mova m5, m1<BR>>+ punpckhdq m4, m2<BR>>+ punpckhdq m5, m3<BR>punpckhdq m4, m0, m2</DIV>
<DIV>Use the three-operand form shown above instead of mova + punpckhdq; on AVX and later it is faster.</DIV>
<DIV> </DIV>
<DIV>>+<BR>>+ pmovzxbw m0, m0<BR>>+ pmovzxbw m1, m1<BR>>+ pmovzxbw m2, m2<BR>>+ pmovzxbw m3, m3<BR>>+ pmovzxbw m4, m4<BR>>+ pmovzxbw m5, m5<BR>>+<BR>>+ psubw m0, m1<BR>>+ psubw m2, m3<BR>>+ psubw m4, m5<BR>>+<BR>>+ pmaddwd m0, m0<BR>>+ pmaddwd m2, m2<BR>>+ pmaddwd m4, m4<BR>>+<BR>>+ paddd m6, m0<BR>>+ paddd m6, m2<BR>>+ paddd m6, m4<BR>paddd m0, m2</DIV>
<DIV>paddd m6, m4</DIV>
<DIV>paddd m6, m0</DIV>
<DIV><BR>>+ movu m0, [r0 + 2 * r1]<BR>>+ movu m1, [r2 + 2 * r3]<BR>>+ lea r0, [r0 + 2 * r1]<BR>>+ lea r2, [r2 + 2 * r3]<BR>>+ movu m2, [r0 + r1]<BR>>+ movu m3, [r2 + r3]<BR>>+<BR>>+ mova m4, m0<BR>>+ mova m5, m1<BR>>+ punpckhdq m4, m2<BR>>+ punpckhdq m5, m3<BR>>+<BR>>+ pmovzxbw m0, m0<BR>>+ pmovzxbw m1, m1<BR>>+ pmovzxbw m2, m2<BR>>+ pmovzxbw m3, m3<BR>>+ pmovzxbw m4, m4<BR>>+ pmovzxbw m5, m5<BR>>+<BR>>+ psubw m0, m1<BR>>+ psubw m2, m3<BR>>+ psubw m4, m5<BR>>+<BR>>+ pmaddwd m0, m0<BR>>+ pmaddwd m2, m2<BR>>+ pmaddwd m4, m4<BR>>+<BR>>+ paddd m6, m0<BR>>+ paddd m6, m2<BR>>+ paddd m6, m4<BR>>+<BR>>+ lea r0, [r0 + 2 * r1]<BR>>+ lea r2, [r2 + 2 * r3]<BR>>+ dec r4d<BR>>+ jnz .loop<BR>The dec could be moved earlier in the loop to hide the flag-register latency before the jnz.</DIV>
<DIV> </DIV>
<DIV>>+<BR>>+ HADDD m6, m1<BR>>+ movd eax, m6<BR>>+<BR>>+ RET<BR>>+<BR>>+;-----------------------------------------------------------------------------<BR>> ; void pixel_ssd_nv12_core( uint16_t *pixuv1, intptr_t stride1, uint16_t *pixuv2, intptr_t stride2,<BR>> ; int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )<BR>> ;<BR>>diff -r 44d15f8ce940 -r 289b23f3ec9e source/common/x86/pixel.h<BR>>--- a/source/common/x86/pixel.h Thu Nov 21 14:44:06 2013 +0530<BR>>+++ b/source/common/x86/pixel.h Thu Nov 21 20:18:13 2013 +0530<BR>>@@ -372,5 +372,6 @@<BR>> uint32_t x265_quant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff, int32_t* lastPos);<BR>> void x265_transpose32_sse2(pixel *dest, pixel *src, intptr_t stride);<BR>> void x265_transpose64_sse2(pixel *dest, pixel *src, intptr_t stride);<BR>>+int x265_pixel_ssd_12x16_ssse3(pixel *, intptr_t, pixel *, intptr_t);<BR>> <BR>> #endif // ifndef X265_I386_PIXEL_H<BR>>_______________________________________________<BR>>x265-devel mailing list<BR>>x265-devel@videolan.org<BR>>https://mailman.videolan.org/listinfo/x265-devel<BR></DIV></div>